Added example aliMLH1.fasta and mlh1-af2.pdb files to the data directory.

15106f2c · Mustafa Tekpinar · 837ed33e · 15106f2c · 15106f2c · 15106f2c
Commit 15106f2c authored Oct 18, 2023 by Mustafa Tekpinar
Show whitespace changes
Inline Side-by-side

Showing with 163 additions and 0 deletions

aliMLH1.fasta data/aliMLH1.fasta +0 -0

mlh1-af2.pdb data/mlh1-af2.pdb +0 -0

prescott.py prescott/prescott.py +163 -0

No files found.
--- a/data/aliMLH1.fasta
+++ b/data/aliMLH1.fasta
--- a/data/mlh1-af2.pdb
+++ b/data/mlh1-af2.pdb
--- a/prescott/prescott.py
+++ b/prescott/prescott.py
@@ -146,6 +146,147 @@ def getGnomADOverallFrequency(infile, usePopMax="true"):
    dfMissense['mutant'] = mutantList
    dfMissense['protein'] = proteinNameList
    return (dfMissense)
+def getGnomADOverallFrequencyV2(infile, usePopMax="true"):
+    """
+        In V2 of this function, I'll assing (practically) zero frequency (10**-10)
+        to the mutations that are observed only in one community out of 8. 
+    """
+    # df = pd.read_csv("P53_gnomAD_v3.1.2_ENSG00000141510_2023_07_04_16_10_48.csv")
+    df = pd.read_csv(infile)
+    # print(df.columns)
+    # #Select only columns containing missense variants!
+    # df = df.loc[df[]=='']
+    #print(df1.columns)
+    dfMissense = df.loc[df['VEP Annotation']=='missense_variant',
+                    ['HGVS Consequence', 'Protein Consequence', 'Transcript Consequence',
+                    'VEP Annotation', 'ClinVar Clinical Significance', 'ClinVar Variation ID', 'Flags', 
+                    'Allele Count', 'Allele Number', 'Allele Frequency', 
+                    'Homozygote Count', 'Hemizygote Count',
+                    'Allele Count African/African American',
+                    'Allele Number African/African American',
+                    'Homozygote Count African/African American',
+                    'Hemizygote Count African/African American',
+                    'Allele Count Latino/Admixed American',
+                    'Allele Number Latino/Admixed American',
+                    'Homozygote Count Latino/Admixed American',
+                    'Hemizygote Count Latino/Admixed American',
+                    'Allele Count Ashkenazi Jewish', 
+                    'Allele Number Ashkenazi Jewish',
+                    'Homozygote Count Ashkenazi Jewish',
+                    'Hemizygote Count Ashkenazi Jewish', 
+                    'Allele Count East Asian',
+                    'Allele Number East Asian', 
+                    'Homozygote Count East Asian',
+                    'Hemizygote Count East Asian', 
+                    'Allele Count European (Finnish)',
+                    'Allele Number European (Finnish)',
+                    'Homozygote Count European (Finnish)',
+                    'Hemizygote Count European (Finnish)',
+                    'Allele Count European (non-Finnish)',
+                    'Allele Number European (non-Finnish)',
+                    'Homozygote Count European (non-Finnish)',
+                    'Hemizygote Count European (non-Finnish)', 
+                    'Allele Count Other',
+                    'Allele Number Other', 
+                    'Homozygote Count Other',
+                    'Hemizygote Count Other', 
+                    'Allele Count South Asian',
+                    'Allele Number South Asian', 
+                    'Homozygote Count South Asian',
+                    'Hemizygote Count South Asian']]
+    dfMissense = dfMissense.reset_index()
+    dfMissense['Allele Frequency']
+    dfMissense['Allele Frequency Log'] = ""
+    if(usePopMax.lower()=="true"):
+        alleleCountList = ['Allele Count African/African American',
+                        'Allele Count Latino/Admixed American',
+                        'Allele Count Ashkenazi Jewish', 
+                        'Allele Count East Asian',
+                        'Allele Count European (Finnish)',
+                        'Allele Count European (non-Finnish)',
+                        'Allele Count Other',
+                        'Allele Count South Asian']
+        alleleNumberList = ['Allele Number African/African American',
+                        'Allele Number Latino/Admixed American',
+                        'Allele Number Ashkenazi Jewish',
+                        'Allele Number East Asian', 
+                        'Allele Number European (Finnish)',
+                        'Allele Number European (non-Finnish)',
+                        'Allele Number Other', 
+                        'Allele Number South Asian']
+        for index, row in dfMissense.iterrows():
+            maxFreq = 0.0
+            tempIndex = 0
+            # Get counts for 8 populations.
+            alleleCountsAccrossPopulations = []
+            for i in range(len(alleleCountList)):
+                alleleCountsAccrossPopulations.append(row[alleleCountList[i]])
+            zeroOccuranceCount = alleleCountsAccrossPopulations.count(0)
+            if (zeroOccuranceCount>1):
+                for i in range(len(alleleCountList)):
+                    if(row[alleleNumberList[i]]!=0):
+                        tempValue = (row[alleleCountList[i]]/row[alleleNumberList[i]])
+                        if(tempValue>maxFreq):
+                            maxFreq=tempValue
+                            tempIndex = i
+                # Avoid zero frequency error by setting it to a very low number such as 10**-10
+                if(maxFreq==0.0):
+                    maxFreq = 10**-10 # Which means 1 in 10 billion, which is the estimated population in 2050. 
+                # print(maxFreq)
+                dfMissense.at[index,'Selected Population'] = alleleCountList[tempIndex]
+                dfMissense.at[index,'Allele Frequency Log'] = np.log10(maxFreq)
+            else:
+                dfMissense.at[index,'Selected Population'] = None
+                dfMissense.at[index,'Allele Frequency Log'] = np.log10(10**-10)
+            #print(np.log10(maxFreq))
+    else:
+        # Avoid zero frequency error by setting it to a very low number such as 10**-10
+        for index, row in dfMissense.iterrows():
+            if(row['Allele Frequency']==0.0):
+                dfMissense.at[index,'Allele Frequency Log'] =  np.log10(10**-10)
+            else:
+                dfMissense.at[index,'Allele Frequency Log'] =  np.log10(row['Allele Frequency'])
+    # sys.exit(-1)
+    #print(dfMissense['Allele Frequency Log'])
+    #print(dfMissense[['Allele Frequency', 'ClinVar Clinical Significance']])
+    # dfMissense.dropna(subset = ['ClinVar Clinical Significance'], inplace=True)
+    dfMissense = dfMissense.reset_index()
+    #print(dfMissense[['Allele Frequency', 'ClinVar Clinical Significance']])
+    # plt.figure()
+    # plt.hist(dfMissense['Allele Frequency Log'], density=False, color='red', label='pathogenic')
+    # plt.show()
+    mutantList = []
+    proteinNameList = []
+    for index, row in dfMissense.iterrows():
+        source = one_letter[row['Protein Consequence'][2:5].upper()]
+        position = (row['Protein Consequence'][5:-3])
+        target = one_letter[(row['Protein Consequence'][-3:]).upper()]
+        mutant = source+position+target
+        mutantList.append(mutant)
+        # print(mutant, row['Protein Consequence'])
+        # print(infile.split('_')[2])
+        proteinNameList.append(os.path.basename(infile).split('_')[2])
+    dfMissense['mutant'] = mutantList
+    dfMissense['protein'] = proteinNameList
+    return (dfMissense)
 # One-letter codes of the 20 amino acids listed below, in alphabetical order.
 alphabeticalAminoAcidsList = ['A', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'K', 'L',
                               'M', 'N', 'P', 'Q', 'R', 'S', 'T', 'V', 'W', 'Y']
 def writeSinglelineFormat(scanningMatrix, outFile, residueList, 
@@ -520,6 +661,28 @@ def main():
                            selectedValuesList.append(temp2)
                            selectedPositionsList.append(row['position'])
                            selectedMutantsList.append(row['mutant'])
+                if(version==4):
+                    temp2 = temp1 - scalingCoeff*(freqCutoff - freq)/freqCutoff
+                    if(freq>freqCutoff):
+                        if(temp2<0.0):
+                            temp2 = 0.0
+                        myBigMergedDF.at[index,'PRESCOTT'] = temp2
+                        if(label==0 or label==1):
+                            selectedValuesList.append(temp2)
+                            selectedPositionsList.append(row['position'])
+                            selectedMutantsList.append(row['mutant'])
+                    else:
+                        print(myBigMergedDF.loc[index, 'Selected Population'])
+                        sys.exit(-1)
+                        if (myBigMergedDF.iloc[index,'Selected Population'].values==None):                
+                            if(temp2>1.0):
+                                temp2 = 1.0
+                            myBigMergedDF.at[index,'PRESCOTT'] = temp2
+                            if(label==0 or label==1):
+                                selectedValuesList.append(temp2)
+                                selectedPositionsList.append(row['position'])
+                                selectedMutantsList.append(row['mutant'])                        
    # myBigMergedDF.dropna(subset = ['labels'], inplace=True)