Added the first version prescott.py and two data files for it.

de8d6564 · Mustafa Tekpinar · 8c6cd547 · de8d6564 · de8d6564 · de8d6564
Commit de8d6564 authored Sep 27, 2023 by Mustafa Tekpinar
3 changed files
--- a/data/MLH1_normPred_evolCombi_singleline_1-ranksort.txt
+++ b/data/MLH1_normPred_evolCombi_singleline_1-ranksort.txt
--- a/data/gnomAD_v2.1.1_MLH1_HUMAN_ENSG00000076242.csv
+++ b/data/gnomAD_v2.1.1_MLH1_HUMAN_ENSG00000076242.csv
--- a/prescott/prescott.py
+++ b/prescott/prescott.py
+import os
+import sys
+import matplotlib.pyplot as plt
+import seaborn as sns
+import pandas as pd
+from sklearn import mixture
+from sklearn import metrics
+import numpy as np
+import argparse
+one_letter ={'VAL':'V', 'ILE':'I', 'LEU':'L', 'GLU':'E', 'GLN':'Q', \
+            'ASP':'D', 'ASN':'N', 'HIS':'H', 'TRP':'W', 'PHE':'F', 'TYR':'Y',    \
+            'ARG':'R', 'LYS':'K', 'SER':'S', 'THR':'T', 'MET':'M', 'ALA':'A',    \
+            'GLY':'G', 'PRO':'P', 'CYS':'C'}
+def plotROCandAUCV2(labels, pred):
+    fpr, tpr, thresholds = metrics.roc_curve(labels, pred, pos_label=1)
+    roc_auc = metrics.auc(fpr, tpr)
+    return fpr, tpr, roc_auc
+def getGnomADOverallFrequency(infile, usePopMax="true"):
+    # df = pd.read_csv("P53_gnomAD_v3.1.2_ENSG00000141510_2023_07_04_16_10_48.csv")
+    df = pd.read_csv(infile)
+    # print(df.columns)
+    #print(df1.columns)
+    dfMissense = df.loc[df['VEP Annotation']=='missense_variant',
+                    ['HGVS Consequence', 'Protein Consequence', 'Transcript Consequence',
+                    'VEP Annotation', 'ClinVar Clinical Significance', 'ClinVar Variation ID', 'Flags', 
+                    'Allele Count', 'Allele Number', 'Allele Frequency', 
+                    'Homozygote Count', 'Hemizygote Count',
+                    'Allele Count African/African American',
+                    'Allele Number African/African American',
+                    'Homozygote Count African/African American',
+                    'Hemizygote Count African/African American',
+                    'Allele Count Latino/Admixed American',
+                    'Allele Number Latino/Admixed American',
+                    'Homozygote Count Latino/Admixed American',
+                    'Hemizygote Count Latino/Admixed American',
+                    'Allele Count Ashkenazi Jewish', 
+                    'Allele Number Ashkenazi Jewish',
+                    'Homozygote Count Ashkenazi Jewish',
+                    'Hemizygote Count Ashkenazi Jewish', 
+                    'Allele Count East Asian',
+                    'Allele Number East Asian', 
+                    'Homozygote Count East Asian',
+                    'Hemizygote Count East Asian', 
+                    'Allele Count European (Finnish)',
+                    'Allele Number European (Finnish)',
+                    'Homozygote Count European (Finnish)',
+                    'Hemizygote Count European (Finnish)',
+                    'Allele Count European (non-Finnish)',
+                    'Allele Number European (non-Finnish)',
+                    'Homozygote Count European (non-Finnish)',
+                    'Hemizygote Count European (non-Finnish)', 
+                    'Allele Count Other',
+                    'Allele Number Other', 
+                    'Homozygote Count Other',
+                    'Hemizygote Count Other', 
+                    'Allele Count South Asian',
+                    'Allele Number South Asian', 
+                    'Homozygote Count South Asian',
+                    'Hemizygote Count South Asian']]
+    dfMissense = dfMissense.reset_index()
+    dfMissense['Allele Frequency']
+    dfMissense['Allele Frequency Log'] = ""
+    if(usePopMax.lower()=="true"):
+        alleleCountList = ['Allele Count African/African American',
+                        'Allele Count Latino/Admixed American',
+                        'Allele Count Ashkenazi Jewish', 
+                        'Allele Count East Asian',
+                        'Allele Count European (Finnish)',
+                        'Allele Count European (non-Finnish)',
+                        'Allele Count Other',
+                        'Allele Count South Asian']
+        alleleNumberList = ['Allele Number African/African American',
+                        'Allele Number Latino/Admixed American',
+                        'Allele Number Ashkenazi Jewish',
+                        'Allele Number East Asian', 
+                        'Allele Number European (Finnish)',
+                        'Allele Number European (non-Finnish)',
+                        'Allele Number Other', 
+                        'Allele Number South Asian']
+        for index, row in dfMissense.iterrows():
+            maxFreq = 0.0
+            tempIndex = 0
+            for i in range(len(alleleCountList)):
+                if(row[alleleNumberList[i]]!=0):
+                    tempValue = (row[alleleCountList[i]]/row[alleleNumberList[i]])
+                    if(tempValue>maxFreq):
+                        maxFreq=tempValue
+                        tempIndex = i
+            # Avoid zero frequency error by setting it to a very low number such as 10**-10
+            if(maxFreq==0.0):
+                maxFreq = 10**-10 # Which means 1 in 10 billion, which is the estimated population in 2050. 
+            # print(maxFreq)
+            dfMissense.at[index,'Selected Population'] = alleleCountList[tempIndex]
+            dfMissense.at[index,'Allele Frequency Log'] = np.log10(maxFreq)
+            #print(np.log10(maxFreq))
+    else:
+        # Avoid zero frequency error by setting it to a very low number such as 10**-10
+        for index, row in dfMissense.iterrows():
+            if(row['Allele Frequency']==0.0):
+                dfMissense.at[index,'Allele Frequency Log'] =  np.log10(10**-10)
+            else:
+                dfMissense.at[index,'Allele Frequency Log'] =  np.log10(row['Allele Frequency'])
+    # sys.exit(-1)
+    #print(dfMissense['Allele Frequency Log'])
+    #print(dfMissense[['Allele Frequency', 'ClinVar Clinical Significance']])
+    # dfMissense.dropna(subset = ['ClinVar Clinical Significance'], inplace=True)
+    dfMissense = dfMissense.reset_index()
+    #print(dfMissense[['Allele Frequency', 'ClinVar Clinical Significance']])
+    # plt.figure()
+    # plt.hist(dfMissense['Allele Frequency Log'], density=False, color='red', label='pathogenic')
+    # plt.show()
+    mutantList = []
+    proteinNameList = []
+    for index, row in dfMissense.iterrows():
+        source = one_letter[row['Protein Consequence'][2:5].upper()]
+        position = (row['Protein Consequence'][5:-3])
+        target = one_letter[(row['Protein Consequence'][-3:]).upper()]
+        mutant = source+position+target
+        mutantList.append(mutant)
+        # print(mutant, row['Protein Consequence'])
+        # print(infile.split('_')[2])
+        proteinNameList.append(os.path.basename(infile).split('_')[2])
+    dfMissense['mutant'] = mutantList
+    dfMissense['protein'] = proteinNameList
+    return (dfMissense)
+if __name__ == "__main__":
+    # Adding the main parser
+    main_parser = argparse.ArgumentParser(description=\
+        "A Python toolkit to prepare, modify, visualize and analyze deep mutational scanning (DMS) data of proteins.")
+    main_parser.add_argument('-e', '--escottfile', dest='escottfile', type=str, \
+        help='Main input for prescott is an ESCOTT single point mutation file.', \
+        required=True, default=None)
+    main_parser.add_argument('-g', '--gnomadfile', dest='gnomadfile', type=str, \
+        help='The second input for prescott is a gnomad frequency file for the protein\n.'+
+             'You can obtain it from https://gnomad.broadinstitute.org/ by searching your favorite protein (such as P53).\n'+
+             'After you found your protein, go to the \'Export variants to CSV\' link and click it do download the csv file you needed!', \
+        required=True, default=None)
+    main_parser.add_argument('-o', '--outputfile', dest='outputfile', type=str, \
+        help='Name of the output file without file extension. Default extension is txt.', \
+        required=False, default='prescott-scores.txt')
+    main_parser.add_argument('--usepopmax', dest='usepopmax', type=str, \
+        help='A true or false value to use population max. frequency of one of the eight populations available in GnomAD (Default is true).',
+        required=False, default="true")
+    main_parser.add_argument('--equation', dest='equation', type=int, \
+        help='An integer value. Default is 2, which means it only reduces scores if it is too frequent!',
+        required=False, default=2)
+    main_parser.add_argument('-s', '--scalingcoefficient', dest='scalingcoefficient', type=float, \
+        help='Scaling coefficient to adjust contribution of frequency information to the scores. \n It is denoted with c in the model equation. Default is 1.0', \
+        required=False, default=1.0)
+    main_parser.add_argument('-f', '--frequencycutoff', dest='frequencycutoff', type=float, \
+        help='Frequency cutoff value. \n It is denoted with Fc in the model equation. Default is -4.0.', \
+        required=False, default=-4.0)
+    main_parser.add_argument('--usefrequencies', dest='usefrequencies', type=str, \
+        help='Do not touch this if you don\'t know what you are doing! Default is true',
+        required=False, default='true')
+    # main_parser.add_argument('--colormap', dest='colormap', type=str, \
+    #     help='A colormap as defined in matplotlib',
+    #     required=False, default='coolwarm_r')
+    # main_parser.add_argument('--field', dest='field', type=int, \
+    #     help='An integer value starting from 0 for reading the rhapsody output',
+    #     required=False, default=10)
+    # main_parser.add_argument('-b', '--beginning', dest='beginning', type=int, \
+    #     help='An integer to indicate the first residue index.',
+    #     required=False, default=1)
+    # main_parser.add_argument('-e', '--end', dest='end', type=int, \
+    #     help='An integer to indicate the final residue index.',
+    #     required=False, default=None)
+    args = main_parser.parse_args()
+    print("\n\n@> Running PRESCOTT with the following parameters:\n\n")
+    print("@> ESCOTT file                      : {}".format(args.escottfile))
+    print("@> GNOMAD frequency file            : {}".format(args.gnomadfile))
+    print("@> Use population max. freq         : {}".format(str(args.usepopmax).lower()))
+    print("@> Which equation to use (Default=2): {}".format(str(args.equation)))
+    print("@> Scaling coefficient (Default=1.0): {}".format(args.scalingcoefficient))
+    print("@> Frequency cutoff (Default=-4.0) : {}".format(args.frequencycutoff))
+    print("@> Name of the output file          : {}".format(args.outputfile))
+    # End of argument parsing!
+    mainPath = "/mnt/data/tekpinar/ESGEMME_vs_EVE_all_data"
+    # dataFolder = "/esgemme-v-1-4-0-max-two-components-eve-msas-entire-single-point-mutations"
+    # dataFolder = "/egemme-v-1-3-0-eve-msas-entire-single-point-mutations"
+    # dataFolder = sys.argv[9]
+    # mutFilePath = "/mnt/data/tekpinar/research/datasets/acmgClinVar/acmg-v3p1-with-esgemme-v1-4-0-eve-MSAs/all_gene_names_v4_existing_asm_normalized.txt"
+    varFilePath = "/variant_files"
+    # MUT_FILE = open(mutFilePath, "r")
+    # allLines = MUT_FILE.readlines()
+    # MUT_FILE.close()
+    allPathogenicList = []
+    allBenignList = []
+    # selectedListFile=open("all_gene_names_v4_existing_asm.txt" ,"w")
+    escottDataPath = args.escottfile
+    protein = os.path.splitext(os.path.basename(escottDataPath))[0]
+    # esmVariantsPath="/mnt/data/tekpinar/software/esm-variants/entire-dataset/"
+    # print(escottDataPath)
+    # Check if file exists
+    usePopMaxOrNot = args.usepopmax.lower()
+    version = args.equation
+    if (os.path.exists(escottDataPath)):     
+        dfESCOTT = pd.read_table(escottDataPath, sep='\s+', header=None)
+        dfESCOTT.columns = ['mutant', 'ESCOTT']
+        dfESCOTT['protein']=protein
+    else:
+        print("ERROR: ESCOTT input file does not exist!")
+        sys.exit(-1)
+    myBigMergedDF = pd.DataFrame()
+    myBigMergedDF = pd.concat([myBigMergedDF, dfESCOTT], ignore_index=True)
+    # figPrefix="all_gene_names_eve_msas_uniref100_"+method.lower()+"_vs_eve-asm_normalized-auc-sklearn-v5"
+    figPrefix="all_gene_names_eve_msas_uniref100_vs_eve-asm_normalized-auc-sklearn-v5"
+    gnomadDF = getGnomADOverallFrequency(args.gnomadfile, usePopMax=usePopMaxOrNot)
+    # Assign labels to pathogenic/benign mutations for performance evaluation
+    gnomadDF['labels'] = ""
+    for index, row in gnomadDF.iterrows():
+        if ((row['ClinVar Clinical Significance']=='Benign/Likely benign') or \
+            (row['ClinVar Clinical Significance']=='Benign') or \
+            (row['ClinVar Clinical Significance']=='Likely benign')):
+            gnomadDF.at[index,'labels'] = 0
+        if((row['ClinVar Clinical Significance']=='Pathogenic/Likely pathogenic') or \
+            (row['ClinVar Clinical Significance']=='Pathogenic') or \
+            (row['ClinVar Clinical Significance']=='Likely pathogenic')):
+            gnomadDF.at[index,'labels'] = 1   
+    print(gnomadDF.loc[(gnomadDF['labels']==0) | (gnomadDF['labels']==1)])
+    # print(gnomadDF['ClinVar Clinical Significance'])
+    # Add frequency column and a dummy frequency to each row in myBigMergedDF
+    myBigMergedDF['frequency'] = 999.0
+    myBigMergedDF['labels'] = np.nan
+    myBigMergedDF['position'] = ""
+    useFrequencies = "true"
+    selectedPositionsList = []
+    selectedValuesList = []
+    selectedMutantsList = []
+    # Assign ESCOTT scores to PRESCOTT scores.
+    # Then, we will modify them according to different conditions. 
+    myBigMergedDF['PRESCOTT'] = myBigMergedDF['ESCOTT']
+    labelsList = []
+    if (useFrequencies.lower() == 'true'):
+        # print(myBigMergedDF)
+        for index, row in myBigMergedDF.iterrows():
+            myBigMergedDF.at[index,'position'] = row['mutant'][1:-1]
+            # print(row['mutant'], row['ESCOTT'])
+            # print(row['mutant'][1:-1])
+            temp = (gnomadDF.loc[gnomadDF['mutant'] == row['mutant'], 'Allele Frequency Log'].values)
+            #print(temp)
+            if (len(temp) > 0):
+                myBigMergedDF.at[index,'frequency'] = temp[0]
+                myBigMergedDF.at[index,'labels'] = gnomadDF.loc[gnomadDF['mutant'] == row['mutant'], 'labels'].values[0]
+        # print(myBigMergedDF)
+        scalingCoeff = args.scalingcoefficient
+        freqCutoff = args.frequencycutoff
+        for index, row in myBigMergedDF.iterrows():
+            if(row['frequency'] != 999.0):
+                # print(row['frequency'])
+                # freq = np.log10(row['frequency'])
+                freq = row['frequency']
+                # print(freq)
+                temp1 = row['PRESCOTT']
+                label = row['labels']
+                if(version==1):
+                    if(freq>freqCutoff):
+                        temp2 = temp1 - freq*scalingCoeff/freqCutoff
+                        if(temp2<0.0):
+                            myBigMergedDF.at[index,'PRESCOTT'] = 0.0
+                            selectedValuesList.append(0.0)
+                        else:
+                            myBigMergedDF.at[index,'PRESCOTT'] = temp2
+                            selectedValuesList.append(temp2)
+                        selectedPositionsList.append(row['position'])
+                        selectedMutantsList.append(row['mutant'])
+                if(version==2):
+                    if(freq>freqCutoff):
+                        temp2 = temp1 - scalingCoeff*(freqCutoff - freq)/freqCutoff
+                        if(temp2<0.0):
+                            temp2 = 0.0
+                        myBigMergedDF.at[index,'PRESCOTT'] = temp2
+                        if(label==0 or label==1):
+                            selectedValuesList.append(temp2)
+                            selectedPositionsList.append(row['position'])
+                            selectedMutantsList.append(row['mutant'])
+                if(version==3):
+                    temp2 = temp1 - scalingCoeff*(freqCutoff - freq)/freqCutoff
+                    if(freq>freqCutoff):
+                        if(temp2<0.0):
+                            temp2 = 0.0
+                        myBigMergedDF.at[index,'PRESCOTT'] = temp2
+                        if(label==0 or label==1):
+                            selectedValuesList.append(temp2)
+                            selectedPositionsList.append(row['position'])
+                            selectedMutantsList.append(row['mutant'])
+                    else:
+                        if(temp2>1.0):
+                            temp2 = 1.0
+                        myBigMergedDF.at[index,'PRESCOTT'] = temp2
+                        if(label==0 or label==1):
+                            selectedValuesList.append(temp2)
+                            selectedPositionsList.append(row['position'])
+                            selectedMutantsList.append(row['mutant'])                        
+    # myBigMergedDF.dropna(subset = ['labels'], inplace=True)
+    clinvarLabeledDF = myBigMergedDF.loc[(myBigMergedDF['labels']==0) | (myBigMergedDF['labels']==1)]
+    clinvarLabeledDF['labels'] = clinvarLabeledDF['labels'].astype('int64')
+    print(clinvarLabeledDF)
+    #print(myBigMergedDF.loc[(myBigMergedDF['labels']=='0') | (myBigMergedDF['labels']=='1'), 'labels'])
+    # print(clinvarLabeledDF['labels'].values)
+    # print(clinvarLabeledDF['ESCOTT'].values)
+    fprESCOTT, tprESCOTT, AUC_ESCOTT = plotROCandAUCV2(clinvarLabeledDF['labels'], \
+                                                       clinvarLabeledDF['ESCOTT'])
+    fprPRESCOTT, tprPRESCOTT, AUC_PRESCOTT = plotROCandAUCV2(clinvarLabeledDF['labels'], \
+                                                       clinvarLabeledDF['PRESCOTT'])
+    # fprPRESCOTT, tprPRESCOTT, AUC_PRESCOTT = plotROCandAUCV2(myBigMergedDF.loc[(myBigMergedDF['labels']==0) | (myBigMergedDF['labels']==1), 'labels'], \
+    #                                                   myBigMergedDF.loc[(myBigMergedDF['labels']==0) | (myBigMergedDF['labels']==1), 'PRESCOTT'])
+    fig = plt.figure(figsize=(12,6))
+    # plt.rcParams.update({'font.size': 18})
+    plt.grid(linestyle='--')
+    # plt.title(protName + " - "+method+" AUC={:.2f}".format(AUC_ESCOTT))
+    plt.title("AUC={:.2f} -> AUC={:.2f}".format(AUC_ESCOTT, AUC_PRESCOTT))
+    plt.ylim([0.0, 1.0])
+    #plt.xlim([1000, 1863])
+    plt.scatter(myBigMergedDF.loc[myBigMergedDF['labels'] == 1, 'position'], myBigMergedDF.loc[myBigMergedDF['labels'] == 1, 'ESCOTT'], marker='o', color='red', label='pathogenic')
+    plt.scatter(myBigMergedDF.loc[myBigMergedDF['labels'] == 0, 'position'], myBigMergedDF.loc[myBigMergedDF['labels'] == 0, 'ESCOTT'], marker='o', color='blue', label='benign')
+    if (useFrequencies.lower() == 'true'):
+        #print(selectedPositionsList)
+        #print(selectedValuesList)
+        plt.scatter(selectedPositionsList, selectedValuesList, marker='o', color='olive', label='PRESCOTT')
+        # Add vertical lines connecting old and new values
+        for i in range(len(selectedPositionsList)):
+            plt.annotate("", xy=(selectedPositionsList[i], selectedValuesList[i]), xycoords='data', \
+                         xytext=(selectedPositionsList[i], myBigMergedDF.loc[myBigMergedDF['mutant'] == selectedMutantsList[i], 'ESCOTT'].values[0]), textcoords='data',
+            arrowprops=dict(arrowstyle="->", connectionstyle="arc3"))
+    plt.xticks(rotation = 90)
+    plt.ylabel("Ranksorted Score")
+    plt.xlabel("Position")
+    plt.legend(loc='upper right')
+    plt.tight_layout()
+    plt.savefig("clinvar-vs-position.png")
+    plt.close()
+    myBigMergedDF.to_csv('myBigMergedDF-normalized-asm.csv', index=None)
+    myBigMergedDF.to_csv(args.outputfile, columns=['mutant', 'PRESCOTT'], index=False, header=None, sep=' ')
+    print("@> AUC= {:.3f} {:.3f}".format( AUC_ESCOTT, AUC_PRESCOTT))