Changes in prescott.py for csv files of gnomAD v4.0.0.

301e36d5 · Mustafa Tekpinar · 310fd718 · 301e36d5
Commit 301e36d5 authored Nov 07, 2023 by Mustafa Tekpinar
Show whitespace changes
Inline Side-by-side

Showing with 141 additions and 0 deletions

prescott.py prescott/prescott.py +141 -0

No files found.
--- a/prescott/prescott.py
+++ b/prescott/prescott.py
@@ -287,6 +287,136 @@ def getGnomADOverallFrequencyV2(infile, usePopMax="true"):
    dfMissense['protein'] = proteinNameList
    return (dfMissense)
+def getGnomADV4OverallFrequency(infile, usePopMax="true"):
+    """
+        This version is for gnomAD v4. 
+        -Latino/Admixed is renamed as Admixed                                                                                                                                                                          
+        -I used Middle Eastern instead of Other field.
+    """
+    df = pd.read_csv(infile)
+    # print(df.columns)
+    #print(df1.columns)
+    dfMissense = df.loc[df['VEP Annotation']=='missense_variant',
+                    ['HGVS Consequence', 'Protein Consequence', 'Transcript Consequence',
+                    'VEP Annotation', 'ClinVar Clinical Significance', 'ClinVar Variation ID', 'Flags', 
+                    'Allele Count', 'Allele Number', 'Allele Frequency', 
+                    'Homozygote Count', 'Hemizygote Count',
+                    'Allele Count African/African American',
+                    'Allele Number African/African American',
+                    'Homozygote Count African/African American',
+                    'Hemizygote Count African/African American',
+                    'Allele Count Admixed American',
+                    'Allele Number Admixed American',
+                    'Homozygote Count Admixed American',
+                    'Hemizygote Count Admixed American',
+                    'Allele Count Ashkenazi Jewish', 
+                    'Allele Number Ashkenazi Jewish',
+                    'Homozygote Count Ashkenazi Jewish',
+                    'Hemizygote Count Ashkenazi Jewish', 
+                    'Allele Count East Asian',
+                    'Allele Number East Asian', 
+                    'Homozygote Count East Asian',
+                    'Hemizygote Count East Asian', 
+                    'Allele Count European (Finnish)',
+                    'Allele Number European (Finnish)',
+                    'Homozygote Count European (Finnish)',
+                    'Hemizygote Count European (Finnish)',
+                    'Allele Count European (non-Finnish)',
+                    'Allele Number European (non-Finnish)',
+                    'Homozygote Count European (non-Finnish)',
+                    'Hemizygote Count European (non-Finnish)', 
+                    'Allele Count Middle Eastern',
+                    'Allele Number Middle Eastern', 
+                    'Homozygote Count Middle Eastern',
+                    'Hemizygote Count Middle Eastern', 
+                    'Allele Count South Asian',
+                    'Allele Number South Asian', 
+                    'Homozygote Count South Asian',
+                    'Hemizygote Count South Asian']]
+    dfMissense = dfMissense.reset_index()
+    dfMissense['Allele Frequency']
+    dfMissense['Allele Frequency Log'] = ""
+    if(usePopMax.lower()=="true"):
+        alleleCountList = ['Allele Count African/African American',
+                        'Allele Count Admixed American',
+                        'Allele Count Ashkenazi Jewish', 
+                        'Allele Count East Asian',
+                        'Allele Count European (Finnish)',
+                        'Allele Count European (non-Finnish)',
+                        'Allele Count Middle Eastern',
+                        'Allele Count South Asian']
+        alleleNumberList = ['Allele Number African/African American',
+                        'Allele Number Admixed American',
+                        'Allele Number Ashkenazi Jewish',
+                        'Allele Number East Asian', 
+                        'Allele Number European (Finnish)',
+                        'Allele Number European (non-Finnish)',
+                        'Allele Number Middle Eastern', 
+                        'Allele Number South Asian']
+        for index, row in dfMissense.iterrows():
+            maxFreq = 0.0
+            tempIndex = 0
+            for i in range(len(alleleCountList)):
+                if(row[alleleNumberList[i]]!=0):
+                    tempValue = (row[alleleCountList[i]]/row[alleleNumberList[i]])
+                    if(tempValue>maxFreq):
+                        maxFreq=tempValue
+                        tempIndex = i
+            # Avoid zero frequency error by setting it to a very low number such as 10**-10
+            if(maxFreq==0.0):
+                maxFreq = 10**-10 # Which means 1 in 10 billion, which is the estimated population in 2050. 
+            # print(maxFreq)
+            dfMissense.at[index,'Selected Population'] = alleleCountList[tempIndex]
+            dfMissense.at[index,'Allele Frequency Log'] = np.log10(maxFreq)
+            #print(np.log10(maxFreq))
+    else:
+        # Avoid zero frequency error by setting it to a very low number such as 10**-10
+        for index, row in dfMissense.iterrows():
+            if(row['Allele Frequency']==0.0):
+                dfMissense.at[index,'Allele Frequency Log'] =  np.log10(10**-10)
+            else:
+                dfMissense.at[index,'Allele Frequency Log'] =  np.log10(row['Allele Frequency'])
+    # sys.exit(-1)
+    #print(dfMissense['Allele Frequency Log'])
+    #print(dfMissense[['Allele Frequency', 'ClinVar Clinical Significance']])
+    # dfMissense.dropna(subset = ['ClinVar Clinical Significance'], inplace=True)
+    dfMissense = dfMissense.reset_index()
+    #print(dfMissense[['Allele Frequency', 'ClinVar Clinical Significance']])
+    # plt.figure()
+    # plt.hist(dfMissense['Allele Frequency Log'], density=False, color='red', label='pathogenic')
+    # plt.show()
+    mutantList = []
+    proteinNameList = []
+    for index, row in dfMissense.iterrows():
+        source = one_letter[row['Protein Consequence'][2:5].upper()]
+        position = (row['Protein Consequence'][5:-3])
+        target = one_letter[(row['Protein Consequence'][-3:]).upper()]
+        mutant = source+position+target
+        mutantList.append(mutant)
+        # print(mutant, row['Protein Consequence'])
+        # print(infile.split('_')[2])
+        proteinNameList.append(os.path.basename(infile).split('_')[2])
+    dfMissense['mutant'] = mutantList
+    dfMissense['protein'] = proteinNameList
+    return (dfMissense)
 alphabeticalAminoAcidsList = ['A', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'K', 'L',
                              'M', 'N', 'P', 'Q', 'R', 'S', 'T', 'V', 'W', 'Y']
 def writeSinglelineFormat(scanningMatrix, outFile, residueList, 
@@ -479,6 +609,10 @@ def main():
            'gemme: a horizontal format of 20 rows and N columns.\n'+\
            'singleline: each line contains a mutation and its value separated by a space.\n'+\
            'M1A 0.378\n', required=False, default='gemme')
+    main_parser.add_argument('--gnomadversion', dest='gnomadversion', type=int, \
+        help='An integer value. Default is version 4 (4.0.0) of GnomAD! \n Other possible versions are 2 and 3.',
+        required=False, default=4)
    # main_parser.add_argument('--colormap', dest='colormap', type=str, \
    #     help='A colormap as defined in matplotlib',
    #     required=False, default='coolwarm_r')
@@ -506,6 +640,7 @@ def main():
    print("@> Scaling coefficient (Default=1.0): {}".format(args.coefficient))
    print("@> Frequency cutoff (Default=-4.0)  : {}".format(args.frequencycutoff))
    print("@> Name of the output file          : {}".format(args.outputfile))
+    print("@> GnomAD data version (Default=4)  : {}".format(str(args.gnomadversion)))
    # End of argument parsing!
    protein = os.path.splitext(os.path.basename(args.escottfile))[0]
@@ -564,7 +699,13 @@ def main():
    myBigMergedDF = pd.DataFrame()
    myBigMergedDF = pd.concat([myBigMergedDF, dfESCOTT], ignore_index=True)
+    if (args.gnomadversion==2 or args.gnomadversion==3):
        gnomadDF = getGnomADOverallFrequency(args.gnomadfile, usePopMax=usePopMaxOrNot)
+    elif (args.gnomadversion==4):
+        gnomadDF = getGnomADV4OverallFrequency(args.gnomadfile, usePopMax=usePopMaxOrNot)
+    else:
+        print("ERROR: Unknown GnomAD version!")
+        sys.exit(-1)
    # Assign labels to pathogenic/benign mutations for performance evaluation
    gnomadDF['labels'] = ""