Commit 15106f2c by Mustafa Tekpinar

Added example aliMLH1.fasta and mlh1-af2.pdb files to the data directory.

parent 837ed33e
This source diff could not be displayed because it is too large. You can view the blob instead.
This source diff could not be displayed because it is too large. You can view the blob instead.
...@@ -146,6 +146,147 @@ def getGnomADOverallFrequency(infile, usePopMax="true"): ...@@ -146,6 +146,147 @@ def getGnomADOverallFrequency(infile, usePopMax="true"):
dfMissense['mutant'] = mutantList dfMissense['mutant'] = mutantList
dfMissense['protein'] = proteinNameList dfMissense['protein'] = proteinNameList
return (dfMissense) return (dfMissense)
def getGnomADOverallFrequencyV2(infile, usePopMax="true"):
    """
    Read a gnomAD variant CSV export and return its missense variants with a
    log10 allele frequency, a one-letter mutant code and a protein name.

    Two frequency modes are supported:

    * popmax mode (usePopMax is "true", case-insensitive): for every variant
      the allele counts of the 8 gnomAD populations are inspected. When more
      than one population has a zero allele count, the highest
      per-population frequency (allele count / allele number) is used and
      the name of the winning 'Allele Count ...' column is stored in
      'Selected Population'. Otherwise 'Selected Population' is set to None
      and the (practically) zero frequency 10**-10 is used.
    * overall mode (any other value): the 'Allele Frequency' column is used.

    In both modes a frequency of exactly zero is replaced by 10**-10 before
    taking the logarithm.

    NOTE(review): the original description of V2 says the practically-zero
    frequency is meant for mutations "observed only in one community out of
    8". The condition implemented here assigns it to rows where at most one
    population has a zero count, which does not obviously match that
    sentence. Behaviour is kept as written -- confirm the intended rule.

    Parameters
    ----------
    infile : str
        Path to the gnomAD CSV file. The third '_'-separated field of the
        file name is stored as the protein name.
    usePopMax : str or bool
        "true" (any letter case) or True selects popmax mode.

    Returns
    -------
    pandas.DataFrame
        The missense rows with the added columns 'Allele Frequency Log',
        'mutant', 'protein' and, in popmax mode, 'Selected Population'.
    """
    # Which means 1 in 10 billion, which is the estimated population in 2050.
    # Used wherever a zero frequency would make log10 undefined.
    floorFreq = 10**-10
    floorLog = np.log10(floorFreq)

    populations = ['African/African American',
                   'Latino/Admixed American',
                   'Ashkenazi Jewish',
                   'East Asian',
                   'European (Finnish)',
                   'European (non-Finnish)',
                   'Other',
                   'South Asian']
    perPopulationFields = ['Allele Count ', 'Allele Number ',
                           'Homozygote Count ', 'Hemizygote Count ']
    generalColumns = ['HGVS Consequence', 'Protein Consequence',
                      'Transcript Consequence', 'VEP Annotation',
                      'ClinVar Clinical Significance', 'ClinVar Variation ID',
                      'Flags', 'Allele Count', 'Allele Number',
                      'Allele Frequency', 'Homozygote Count',
                      'Hemizygote Count']
    populationColumns = [field + population
                         for population in populations
                         for field in perPopulationFields]

    df = pd.read_csv(infile)

    # Keep only the missense variants and the columns used downstream.
    dfMissense = df.loc[df['VEP Annotation'] == 'missense_variant',
                        generalColumns + populationColumns]
    dfMissense = dfMissense.reset_index()
    dfMissense['Allele Frequency Log'] = ""

    # str() lets callers pass a real boolean as well as the "true"/"false"
    # strings this function has always accepted.
    if str(usePopMax).lower() == "true":
        alleleCountList = ['Allele Count ' + population
                           for population in populations]
        alleleNumberList = ['Allele Number ' + population
                            for population in populations]

        for index, row in dfMissense.iterrows():
            # Get counts for 8 populations.
            alleleCounts = [row[column] for column in alleleCountList]

            if alleleCounts.count(0) > 1:
                maxFreq = 0.0
                tempIndex = 0
                for i, numberColumn in enumerate(alleleNumberList):
                    alleleNumber = row[numberColumn]
                    # Skip populations without any genotyped allele to
                    # avoid a division by zero.
                    if alleleNumber != 0:
                        tempValue = alleleCounts[i] / alleleNumber
                        if tempValue > maxFreq:
                            maxFreq = tempValue
                            tempIndex = i
                # Avoid zero frequency error by using the floor value.
                if maxFreq == 0.0:
                    maxFreq = floorFreq
                dfMissense.at[index, 'Selected Population'] = alleleCountList[tempIndex]
                dfMissense.at[index, 'Allele Frequency Log'] = np.log10(maxFreq)
            else:
                dfMissense.at[index, 'Selected Population'] = None
                dfMissense.at[index, 'Allele Frequency Log'] = floorLog
    else:
        for index, row in dfMissense.iterrows():
            overallFreq = row['Allele Frequency']
            # Avoid zero frequency error by using the floor value.
            if overallFreq == 0.0:
                dfMissense.at[index, 'Allele Frequency Log'] = floorLog
            else:
                dfMissense.at[index, 'Allele Frequency Log'] = np.log10(overallFreq)

    # Second reset kept on purpose: it preserves the column layout
    # ('level_0' and 'index') that this function has always returned.
    dfMissense = dfMissense.reset_index()

    # Turn e.g. 'p.Arg175His' into 'R175H': characters 2-4 hold the source
    # residue, the last three the target residue, the rest is the position.
    mutantList = []
    for consequence in dfMissense['Protein Consequence']:
        source = one_letter[consequence[2:5].upper()]
        position = consequence[5:-3]
        target = one_letter[consequence[-3:].upper()]
        mutantList.append(source + position + target)

    # The protein name is the same for every row; parse the file name once,
    # and only when there is at least one row (as the per-row code did).
    proteinNameList = []
    if mutantList:
        proteinName = os.path.basename(infile).split('_')[2]
        proteinNameList = [proteinName] * len(mutantList)

    dfMissense['mutant'] = mutantList
    dfMissense['protein'] = proteinNameList

    return dfMissense
alphabeticalAminoAcidsList = ['A', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'K', 'L', alphabeticalAminoAcidsList = ['A', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'K', 'L',
'M', 'N', 'P', 'Q', 'R', 'S', 'T', 'V', 'W', 'Y'] 'M', 'N', 'P', 'Q', 'R', 'S', 'T', 'V', 'W', 'Y']
def writeSinglelineFormat(scanningMatrix, outFile, residueList, def writeSinglelineFormat(scanningMatrix, outFile, residueList,
...@@ -520,6 +661,28 @@ def main(): ...@@ -520,6 +661,28 @@ def main():
selectedValuesList.append(temp2) selectedValuesList.append(temp2)
selectedPositionsList.append(row['position']) selectedPositionsList.append(row['position'])
selectedMutantsList.append(row['mutant']) selectedMutantsList.append(row['mutant'])
if(version==4):
temp2 = temp1 - scalingCoeff*(freqCutoff - freq)/freqCutoff
if(freq>freqCutoff):
if(temp2<0.0):
temp2 = 0.0
myBigMergedDF.at[index,'PRESCOTT'] = temp2
if(label==0 or label==1):
selectedValuesList.append(temp2)
selectedPositionsList.append(row['position'])
selectedMutantsList.append(row['mutant'])
else:
print(myBigMergedDF.loc[index, 'Selected Population'])
sys.exit(-1)
if (myBigMergedDF.iloc[index,'Selected Population'].values==None):
if(temp2>1.0):
temp2 = 1.0
myBigMergedDF.at[index,'PRESCOTT'] = temp2
if(label==0 or label==1):
selectedValuesList.append(temp2)
selectedPositionsList.append(row['position'])
selectedMutantsList.append(row['mutant'])
# myBigMergedDF.dropna(subset = ['labels'], inplace=True) # myBigMergedDF.dropna(subset = ['labels'], inplace=True)
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment