Updated prescott.py

15b17ad6 · Mustafa Tekpinar · 67c81d47 · 15b17ad6
Commit 15b17ad6 authored Jan 02, 2025 by Mustafa Tekpinar
Hide whitespace changes
Inline Side-by-side

Showing with 180 additions and 23 deletions

prescott.py prescott/prescott.py +180 -23

No files found.
--- a/prescott/prescott.py
+++ b/prescott/prescott.py
@@ -141,7 +141,7 @@ def getGnomADOverallFrequency(infile, usePopMax="true"):
        mutantList.append(mutant)
        # print(mutant, row['Protein Consequence'])
        # print(infile.split('_')[2])
-        proteinNameList.append(os.path.basename(infile).split('_')[2])
+        proteinNameList.append(os.path.basename(infile))
    dfMissense['mutant'] = mutantList
    dfMissense['protein'] = proteinNameList
@@ -281,15 +281,15 @@ def getGnomADOverallFrequencyV2(infile, usePopMax="true"):
        mutantList.append(mutant)
        # print(mutant, row['Protein Consequence'])
        # print(infile.split('_')[2])
-        proteinNameList.append(os.path.basename(infile).split('_')[2])
+        proteinNameList.append(os.path.basename(infile))
    dfMissense['mutant'] = mutantList
    dfMissense['protein'] = proteinNameList
    return (dfMissense)
-def getGnomADV4OverallFrequency(infile, usePopMax="true"):
+def getGnomADV4p0_OverallFrequency(infile, usePopMax="true"):
    """
-        This version is for gnomAD v4. 
+        This version is for gnomAD v4.0
        -Latino/Admixed is renamed as Admixed                                                                                                                                                                          
        -I used Middle Eastern instead of Other field.
    """
@@ -407,13 +407,140 @@ def getGnomADV4OverallFrequency(infile, usePopMax="true"):
        mutantList.append(mutant)
        # print(mutant, row['Protein Consequence'])
        # print(infile.split('_')[2])
-        proteinNameList.append(os.path.basename(infile).split('_')[2])
+        proteinNameList.append(os.path.basename(infile))
    dfMissense['mutant'] = mutantList
    dfMissense['protein'] = proteinNameList
    return (dfMissense)
+###############################################################################
+def getGnomADV4p1_OverallFrequency(infile, usePopMax="true"):
+    """
+        Read a gnomAD v4.1 CSV file and return its missense variants with
+        log10 allele frequencies and mutant names.
+
+        This version is for gnomAD v4.1
+        -The Latino/Admixed population is read from the 'Admixed American' columns.
+        -I used Middle Eastern instead of Other field.
+        -The ClinVar column kept is 'ClinVar Germline Classification'.
+
+        Parameters
+        ----------
+        infile: str
+            Path of the gnomAD CSV file. Its basename is stored in the
+            'protein' column of the result.
+        usePopMax: str
+            If "true" (case-insensitive), the highest per-population
+            frequency (allele count / allele number) is used. Any other
+            value selects the overall 'Allele Frequency' column.
+
+        Returns
+        -------
+        pandas.DataFrame
+            Rows whose 'VEP Annotation' is 'missense_variant', with the extra
+            columns 'Allele Frequency Log', 'Selected Population', 'mutant'
+            and 'protein'. 'Selected Population' stays "" when usePopMax is
+            not "true".
+    """
+    populations = ['African/African American', 'Admixed American',
+                   'Ashkenazi Jewish', 'East Asian',
+                   'European (Finnish)', 'European (non-Finnish)',
+                   'Middle Eastern', 'South Asian']
+    statistics = ['Allele Count', 'Allele Number',
+                  'Homozygote Count', 'Hemizygote Count']
+    columns = ['HGVS Consequence', 'Protein Consequence', 'Transcript Consequence',
+               'VEP Annotation', 'ClinVar Germline Classification',
+               'ClinVar Variation ID', 'Flags',
+               'Allele Count', 'Allele Number', 'Allele Frequency',
+               'Homozygote Count', 'Hemizygote Count']
+    # Per-population columns, grouped by population: count, number,
+    # homozygote count, hemizygote count.
+    columns += [stat + ' ' + pop for pop in populations for stat in statistics]
+
+    df = pd.read_csv(infile)
+    dfMissense = df.loc[df['VEP Annotation']=='missense_variant', columns]
+    dfMissense = dfMissense.reset_index()
+    dfMissense['Allele Frequency Log'] = ""
+    # Always create this column: main() reads it for every matched mutant,
+    # including runs where usePopMax is not "true".
+    dfMissense['Selected Population'] = ""
+
+    # Avoid zero frequency error by setting it to a very low number such as 10**-10
+    # Which means 1 in 10 billion, which is the estimated population in 2050.
+    minFreq = 10**-10
+
+    if(usePopMax.lower()=="true"):
+        alleleCountList = ['Allele Count ' + pop for pop in populations]
+        alleleNumberList = ['Allele Number ' + pop for pop in populations]
+        for index, row in dfMissense.iterrows():
+            maxFreq = 0.0
+            # Stays on the first population when every frequency is zero.
+            selected = alleleCountList[0]
+            for countCol, numberCol in zip(alleleCountList, alleleNumberList):
+                # Skip populations without any called allele (division by zero).
+                if(row[numberCol]!=0):
+                    tempValue = row[countCol]/row[numberCol]
+                    if(tempValue>maxFreq):
+                        maxFreq = tempValue
+                        selected = countCol
+            if(maxFreq==0.0):
+                maxFreq = minFreq
+            dfMissense.at[index,'Selected Population'] = selected
+            dfMissense.at[index,'Allele Frequency Log'] = np.log10(maxFreq)
+    else:
+        for index, row in dfMissense.iterrows():
+            if(row['Allele Frequency']==0.0):
+                dfMissense.at[index,'Allele Frequency Log'] = np.log10(minFreq)
+            else:
+                dfMissense.at[index,'Allele Frequency Log'] = np.log10(row['Allele Frequency'])
+
+    # Second reset_index kept on purpose: it adds a second index column
+    # (named 'level_0' by pandas) so the returned column layout is unchanged.
+    dfMissense = dfMissense.reset_index()
+
+    # 'Protein Consequence' is sliced as <2 chars><3-letter source><position>
+    # <3-letter target>, presumably HGVS such as p.Ala123Thr - confirm against data.
+    # one_letter is defined elsewhere in this file and is indexed with
+    # upper-case three-letter codes.
+    mutantList = []
+    for consequence in dfMissense['Protein Consequence']:
+        source = one_letter[consequence[2:5].upper()]
+        position = consequence[5:-3]
+        target = one_letter[consequence[-3:].upper()]
+        mutantList.append(source+position+target)
+    dfMissense['mutant'] = mutantList
+    # Every row gets the input file's basename as its protein identifier.
+    dfMissense['protein'] = os.path.basename(infile)
+    return (dfMissense)
+###############################################################################
@@ -739,8 +866,8 @@ def main():
            'M1A 0.378\n', required=False, default='gemme')
    main_parser.add_argument('--gnomadversion', dest='gnomadversion', type=int, \
-        help='An integer value. Default is version 4 (4.0.0) of GnomAD! \n Other possible versions are 2 and 3.',
+        help='An integer value. Default is version 41 (4.1) of GnomAD! \n Other possible versions are 2, 3 or 40 (for 4.0).',
-        required=False, default=4)
+        required=False, default=41)
    # main_parser.add_argument('--colormap', dest='colormap', type=str, \
    #     help='A colormap as defined in matplotlib',
    #     required=False, default='coolwarm_r')
@@ -762,7 +889,7 @@ def main():
    args = main_parser.parse_args()
    print("\n\n@> Running PRESCOTT with the following parameters:\n\n")
    print("@> ESCOTT file                      : {}".format(args.escottfile))
-    print("@> Frequency file            : {}".format(args.gnomadfile))
+    print("@> Frequency file                   : {}".format(args.gnomadfile))
    print("@> Use population max. freq         : {}".format(str(args.usepopmax).lower()))
    print("@> Which equation to use (Default=2): {}".format(str(args.equation)))
    print("@> Scaling coefficient (Default=1.0): {}".format(args.coefficient))
@@ -841,6 +968,7 @@ def main():
    myBigMergedDF['log10frequency'] = 999.0
    myBigMergedDF['labels'] = np.nan
    myBigMergedDF['position'] = ""
+    myBigMergedDF['Selected Population'] = ""
    # Assign ESCOTT scores to PRESCOTT scores.
    # Then, we will modify them according to different conditions. 
@@ -850,27 +978,54 @@ def main():
    if(file_extension == ".csv"):
        print("@> You frequency data is in gnomAD format!")
-        print("@> GnomAD data version (Default=4)  : {}".format(str(args.gnomadversion)))
+        print("@> GnomAD data version (Default=41 for 4.1)  : {}".format(str(args.gnomadversion)))
        if (args.gnomadversion==2 or args.gnomadversion==3):
            gnomadDF = getGnomADOverallFrequency(args.gnomadfile, usePopMax=usePopMaxOrNot)
-        elif (args.gnomadversion==4):
+            # Assign labels to pathogenic/benign mutations for performance evaluation
-            gnomadDF = getGnomADV4OverallFrequency(args.gnomadfile, usePopMax=usePopMaxOrNot)
+            gnomadDF['labels'] = ""
+            for index, row in gnomadDF.iterrows():
+                if ((row['ClinVar Clinical Significance']=='Benign/Likely benign') or \
+                    (row['ClinVar Clinical Significance']=='Benign') or \
+                    (row['ClinVar Clinical Significance']=='Likely benign')):
+                    gnomadDF.at[index,'labels'] = 0
+                if((row['ClinVar Clinical Significance']=='Pathogenic/Likely pathogenic') or \
+                    (row['ClinVar Clinical Significance']=='Pathogenic') or \
+                    (row['ClinVar Clinical Significance']=='Likely pathogenic')):
+                    gnomadDF.at[index,'labels'] = 1
+        elif (args.gnomadversion==40):
+            gnomadDF = getGnomADV4p0_OverallFrequency(args.gnomadfile, usePopMax=usePopMaxOrNot)
+                    # Assign labels to pathogenic/benign mutations for performance evaluation
+            gnomadDF['labels'] = ""
+            for index, row in gnomadDF.iterrows():
+                if ((row['ClinVar Clinical Significance']=='Benign/Likely benign') or \
+                    (row['ClinVar Clinical Significance']=='Benign') or \
+                    (row['ClinVar Clinical Significance']=='Likely benign')):
+                    gnomadDF.at[index,'labels'] = 0
+                if((row['ClinVar Clinical Significance']=='Pathogenic/Likely pathogenic') or \
+                    (row['ClinVar Clinical Significance']=='Pathogenic') or \
+                    (row['ClinVar Clinical Significance']=='Likely pathogenic')):
+                    gnomadDF.at[index,'labels'] = 1
+        elif (args.gnomadversion==41):
+            gnomadDF = getGnomADV4p1_OverallFrequency(args.gnomadfile, usePopMax=usePopMaxOrNot)
+                    # Assign labels to pathogenic/benign mutations for performance evaluation
+            gnomadDF['labels'] = ""
+            for index, row in gnomadDF.iterrows():
+                if ((row['ClinVar Germline Classification']=='Benign/Likely benign') or \
+                    (row['ClinVar Germline Classification']=='Benign') or \
+                    (row['ClinVar Germline Classification']=='Likely benign')):
+                    gnomadDF.at[index,'labels'] = 0
+                if((row['ClinVar Germline Classification']=='Pathogenic/Likely pathogenic') or \
+                    (row['ClinVar Germline Classification']=='Pathogenic') or \
+                    (row['ClinVar Germline Classification']=='Likely pathogenic')):
+                    gnomadDF.at[index,'labels'] = 1
        else:
            print("ERROR: Unknown GnomAD version!")
            sys.exit(-1)
-        # Assign labels to pathogenic/benign mutations for performance evaluation
-        gnomadDF['labels'] = ""
-        for index, row in gnomadDF.iterrows():
-            if ((row['ClinVar Clinical Significance']=='Benign/Likely benign') or \
-                (row['ClinVar Clinical Significance']=='Benign') or \
-                (row['ClinVar Clinical Significance']=='Likely benign')):
-                gnomadDF.at[index,'labels'] = 0
-            if((row['ClinVar Clinical Significance']=='Pathogenic/Likely pathogenic') or \
-                (row['ClinVar Clinical Significance']=='Pathogenic') or \
-                (row['ClinVar Clinical Significance']=='Likely pathogenic')):
-                gnomadDF.at[index,'labels'] = 1
        if (len(gnomadDF.loc[(gnomadDF['labels']==0) | (gnomadDF['labels']==1)]) > 0):   
            print(gnomadDF.loc[(gnomadDF['labels']==0) | (gnomadDF['labels']==1)])
        # print(gnomadDF['ClinVar Clinical Significance'])
@@ -896,6 +1051,8 @@ def main():
                if (len(temp) > 0):
                    myBigMergedDF.at[index,'log10frequency'] = temp[0]
                    myBigMergedDF.at[index,'labels'] = gnomadDF.loc[gnomadDF['mutant'] == row['mutant'], 'labels'].values[0]
+                    myBigMergedDF.at[index,'Selected Population'] = \
+                        gnomadDF.loc[gnomadDF['mutant'] == row['mutant'], 'Selected Population'].values[0].replace('Allele Count ', '')
            # print(myBigMergedDF)
            # scalingCoeff = args.coefficient