Commit de8d6564 by Mustafa Tekpinar

Added the first version prescott.py and two data files for it.

parent 8c6cd547
This source diff could not be displayed because it is too large. You can view the blob instead.
This source diff could not be displayed because it is too large. You can view the blob instead.
import os
import sys
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
from sklearn import mixture
from sklearn import metrics
import numpy as np
import argparse
one_letter ={'VAL':'V', 'ILE':'I', 'LEU':'L', 'GLU':'E', 'GLN':'Q', \
'ASP':'D', 'ASN':'N', 'HIS':'H', 'TRP':'W', 'PHE':'F', 'TYR':'Y', \
'ARG':'R', 'LYS':'K', 'SER':'S', 'THR':'T', 'MET':'M', 'ALA':'A', \
'GLY':'G', 'PRO':'P', 'CYS':'C'}
def plotROCandAUCV2(labels, pred):
fpr, tpr, thresholds = metrics.roc_curve(labels, pred, pos_label=1)
roc_auc = metrics.auc(fpr, tpr)
return fpr, tpr, roc_auc
def getGnomADOverallFrequency(infile, usePopMax="true"):
# df = pd.read_csv("P53_gnomAD_v3.1.2_ENSG00000141510_2023_07_04_16_10_48.csv")
df = pd.read_csv(infile)
# print(df.columns)
#print(df1.columns)
dfMissense = df.loc[df['VEP Annotation']=='missense_variant',
['HGVS Consequence', 'Protein Consequence', 'Transcript Consequence',
'VEP Annotation', 'ClinVar Clinical Significance', 'ClinVar Variation ID', 'Flags',
'Allele Count', 'Allele Number', 'Allele Frequency',
'Homozygote Count', 'Hemizygote Count',
'Allele Count African/African American',
'Allele Number African/African American',
'Homozygote Count African/African American',
'Hemizygote Count African/African American',
'Allele Count Latino/Admixed American',
'Allele Number Latino/Admixed American',
'Homozygote Count Latino/Admixed American',
'Hemizygote Count Latino/Admixed American',
'Allele Count Ashkenazi Jewish',
'Allele Number Ashkenazi Jewish',
'Homozygote Count Ashkenazi Jewish',
'Hemizygote Count Ashkenazi Jewish',
'Allele Count East Asian',
'Allele Number East Asian',
'Homozygote Count East Asian',
'Hemizygote Count East Asian',
'Allele Count European (Finnish)',
'Allele Number European (Finnish)',
'Homozygote Count European (Finnish)',
'Hemizygote Count European (Finnish)',
'Allele Count European (non-Finnish)',
'Allele Number European (non-Finnish)',
'Homozygote Count European (non-Finnish)',
'Hemizygote Count European (non-Finnish)',
'Allele Count Other',
'Allele Number Other',
'Homozygote Count Other',
'Hemizygote Count Other',
'Allele Count South Asian',
'Allele Number South Asian',
'Homozygote Count South Asian',
'Hemizygote Count South Asian']]
dfMissense = dfMissense.reset_index()
dfMissense['Allele Frequency']
dfMissense['Allele Frequency Log'] = ""
if(usePopMax.lower()=="true"):
alleleCountList = ['Allele Count African/African American',
'Allele Count Latino/Admixed American',
'Allele Count Ashkenazi Jewish',
'Allele Count East Asian',
'Allele Count European (Finnish)',
'Allele Count European (non-Finnish)',
'Allele Count Other',
'Allele Count South Asian']
alleleNumberList = ['Allele Number African/African American',
'Allele Number Latino/Admixed American',
'Allele Number Ashkenazi Jewish',
'Allele Number East Asian',
'Allele Number European (Finnish)',
'Allele Number European (non-Finnish)',
'Allele Number Other',
'Allele Number South Asian']
for index, row in dfMissense.iterrows():
maxFreq = 0.0
tempIndex = 0
for i in range(len(alleleCountList)):
if(row[alleleNumberList[i]]!=0):
tempValue = (row[alleleCountList[i]]/row[alleleNumberList[i]])
if(tempValue>maxFreq):
maxFreq=tempValue
tempIndex = i
# Avoid zero frequency error by setting it to a very low number such as 10**-10
if(maxFreq==0.0):
maxFreq = 10**-10 # Which means 1 in 10 billion, which is the estimated population in 2050.
# print(maxFreq)
dfMissense.at[index,'Selected Population'] = alleleCountList[tempIndex]
dfMissense.at[index,'Allele Frequency Log'] = np.log10(maxFreq)
#print(np.log10(maxFreq))
else:
# Avoid zero frequency error by setting it to a very low number such as 10**-10
for index, row in dfMissense.iterrows():
if(row['Allele Frequency']==0.0):
dfMissense.at[index,'Allele Frequency Log'] = np.log10(10**-10)
else:
dfMissense.at[index,'Allele Frequency Log'] = np.log10(row['Allele Frequency'])
# sys.exit(-1)
#print(dfMissense['Allele Frequency Log'])
#print(dfMissense[['Allele Frequency', 'ClinVar Clinical Significance']])
# dfMissense.dropna(subset = ['ClinVar Clinical Significance'], inplace=True)
dfMissense = dfMissense.reset_index()
#print(dfMissense[['Allele Frequency', 'ClinVar Clinical Significance']])
# plt.figure()
# plt.hist(dfMissense['Allele Frequency Log'], density=False, color='red', label='pathogenic')
# plt.show()
mutantList = []
proteinNameList = []
for index, row in dfMissense.iterrows():
source = one_letter[row['Protein Consequence'][2:5].upper()]
position = (row['Protein Consequence'][5:-3])
target = one_letter[(row['Protein Consequence'][-3:]).upper()]
mutant = source+position+target
mutantList.append(mutant)
# print(mutant, row['Protein Consequence'])
# print(infile.split('_')[2])
proteinNameList.append(os.path.basename(infile).split('_')[2])
dfMissense['mutant'] = mutantList
dfMissense['protein'] = proteinNameList
return (dfMissense)
if __name__ == "__main__":
# Adding the main parser
main_parser = argparse.ArgumentParser(description=\
"A Python toolkit to prepare, modify, visualize and analyze deep mutational scanning (DMS) data of proteins.")
main_parser.add_argument('-e', '--escottfile', dest='escottfile', type=str, \
help='Main input for prescott is an ESCOTT single point mutation file.', \
required=True, default=None)
main_parser.add_argument('-g', '--gnomadfile', dest='gnomadfile', type=str, \
help='The second input for prescott is a gnomad frequency file for the protein\n.'+
'You can obtain it from https://gnomad.broadinstitute.org/ by searching your favorite protein (such as P53).\n'+
'After you found your protein, go to the \'Export variants to CSV\' link and click it do download the csv file you needed!', \
required=True, default=None)
main_parser.add_argument('-o', '--outputfile', dest='outputfile', type=str, \
help='Name of the output file without file extension. Default extension is txt.', \
required=False, default='prescott-scores.txt')
main_parser.add_argument('--usepopmax', dest='usepopmax', type=str, \
help='A true or false value to use population max. frequency of one of the eight populations available in GnomAD (Default is true).',
required=False, default="true")
main_parser.add_argument('--equation', dest='equation', type=int, \
help='An integer value. Default is 2, which means it only reduces scores if it is too frequent!',
required=False, default=2)
main_parser.add_argument('-s', '--scalingcoefficient', dest='scalingcoefficient', type=float, \
help='Scaling coefficient to adjust contribution of frequency information to the scores. \n It is denoted with c in the model equation. Default is 1.0', \
required=False, default=1.0)
main_parser.add_argument('-f', '--frequencycutoff', dest='frequencycutoff', type=float, \
help='Frequency cutoff value. \n It is denoted with Fc in the model equation. Default is -4.0.', \
required=False, default=-4.0)
main_parser.add_argument('--usefrequencies', dest='usefrequencies', type=str, \
help='Do not touch this if you don\'t know what you are doing! Default is true',
required=False, default='true')
# main_parser.add_argument('--colormap', dest='colormap', type=str, \
# help='A colormap as defined in matplotlib',
# required=False, default='coolwarm_r')
# main_parser.add_argument('--field', dest='field', type=int, \
# help='An integer value starting from 0 for reading the rhapsody output',
# required=False, default=10)
# main_parser.add_argument('-b', '--beginning', dest='beginning', type=int, \
# help='An integer to indicate the first residue index.',
# required=False, default=1)
# main_parser.add_argument('-e', '--end', dest='end', type=int, \
# help='An integer to indicate the final residue index.',
# required=False, default=None)
args = main_parser.parse_args()
print("\n\n@> Running PRESCOTT with the following parameters:\n\n")
print("@> ESCOTT file : {}".format(args.escottfile))
print("@> GNOMAD frequency file : {}".format(args.gnomadfile))
print("@> Use population max. freq : {}".format(str(args.usepopmax).lower()))
print("@> Which equation to use (Default=2): {}".format(str(args.equation)))
print("@> Scaling coefficient (Default=1.0): {}".format(args.scalingcoefficient))
print("@> Frequency cutoff (Default=-4.0) : {}".format(args.frequencycutoff))
print("@> Name of the output file : {}".format(args.outputfile))
# End of argument parsing!
mainPath = "/mnt/data/tekpinar/ESGEMME_vs_EVE_all_data"
# dataFolder = "/esgemme-v-1-4-0-max-two-components-eve-msas-entire-single-point-mutations"
# dataFolder = "/egemme-v-1-3-0-eve-msas-entire-single-point-mutations"
# dataFolder = sys.argv[9]
# mutFilePath = "/mnt/data/tekpinar/research/datasets/acmgClinVar/acmg-v3p1-with-esgemme-v1-4-0-eve-MSAs/all_gene_names_v4_existing_asm_normalized.txt"
varFilePath = "/variant_files"
# MUT_FILE = open(mutFilePath, "r")
# allLines = MUT_FILE.readlines()
# MUT_FILE.close()
allPathogenicList = []
allBenignList = []
# selectedListFile=open("all_gene_names_v4_existing_asm.txt" ,"w")
escottDataPath = args.escottfile
protein = os.path.splitext(os.path.basename(escottDataPath))[0]
# esmVariantsPath="/mnt/data/tekpinar/software/esm-variants/entire-dataset/"
# print(escottDataPath)
# Check if file exists
usePopMaxOrNot = args.usepopmax.lower()
version = args.equation
if (os.path.exists(escottDataPath)):
dfESCOTT = pd.read_table(escottDataPath, sep='\s+', header=None)
dfESCOTT.columns = ['mutant', 'ESCOTT']
dfESCOTT['protein']=protein
else:
print("ERROR: ESCOTT input file does not exist!")
sys.exit(-1)
myBigMergedDF = pd.DataFrame()
myBigMergedDF = pd.concat([myBigMergedDF, dfESCOTT], ignore_index=True)
# figPrefix="all_gene_names_eve_msas_uniref100_"+method.lower()+"_vs_eve-asm_normalized-auc-sklearn-v5"
figPrefix="all_gene_names_eve_msas_uniref100_vs_eve-asm_normalized-auc-sklearn-v5"
gnomadDF = getGnomADOverallFrequency(args.gnomadfile, usePopMax=usePopMaxOrNot)
# Assign labels to pathogenic/benign mutations for performance evaluation
gnomadDF['labels'] = ""
for index, row in gnomadDF.iterrows():
if ((row['ClinVar Clinical Significance']=='Benign/Likely benign') or \
(row['ClinVar Clinical Significance']=='Benign') or \
(row['ClinVar Clinical Significance']=='Likely benign')):
gnomadDF.at[index,'labels'] = 0
if((row['ClinVar Clinical Significance']=='Pathogenic/Likely pathogenic') or \
(row['ClinVar Clinical Significance']=='Pathogenic') or \
(row['ClinVar Clinical Significance']=='Likely pathogenic')):
gnomadDF.at[index,'labels'] = 1
print(gnomadDF.loc[(gnomadDF['labels']==0) | (gnomadDF['labels']==1)])
# print(gnomadDF['ClinVar Clinical Significance'])
# Add frequency column and a dummy frequency to each row in myBigMergedDF
myBigMergedDF['frequency'] = 999.0
myBigMergedDF['labels'] = np.nan
myBigMergedDF['position'] = ""
useFrequencies = "true"
selectedPositionsList = []
selectedValuesList = []
selectedMutantsList = []
# Assign ESCOTT scores to PRESCOTT scores.
# Then, we will modify them according to different conditions.
myBigMergedDF['PRESCOTT'] = myBigMergedDF['ESCOTT']
labelsList = []
if (useFrequencies.lower() == 'true'):
# print(myBigMergedDF)
for index, row in myBigMergedDF.iterrows():
myBigMergedDF.at[index,'position'] = row['mutant'][1:-1]
# print(row['mutant'], row['ESCOTT'])
# print(row['mutant'][1:-1])
temp = (gnomadDF.loc[gnomadDF['mutant'] == row['mutant'], 'Allele Frequency Log'].values)
#print(temp)
if (len(temp) > 0):
myBigMergedDF.at[index,'frequency'] = temp[0]
myBigMergedDF.at[index,'labels'] = gnomadDF.loc[gnomadDF['mutant'] == row['mutant'], 'labels'].values[0]
# print(myBigMergedDF)
scalingCoeff = args.scalingcoefficient
freqCutoff = args.frequencycutoff
for index, row in myBigMergedDF.iterrows():
if(row['frequency'] != 999.0):
# print(row['frequency'])
# freq = np.log10(row['frequency'])
freq = row['frequency']
# print(freq)
temp1 = row['PRESCOTT']
label = row['labels']
if(version==1):
if(freq>freqCutoff):
temp2 = temp1 - freq*scalingCoeff/freqCutoff
if(temp2<0.0):
myBigMergedDF.at[index,'PRESCOTT'] = 0.0
selectedValuesList.append(0.0)
else:
myBigMergedDF.at[index,'PRESCOTT'] = temp2
selectedValuesList.append(temp2)
selectedPositionsList.append(row['position'])
selectedMutantsList.append(row['mutant'])
if(version==2):
if(freq>freqCutoff):
temp2 = temp1 - scalingCoeff*(freqCutoff - freq)/freqCutoff
if(temp2<0.0):
temp2 = 0.0
myBigMergedDF.at[index,'PRESCOTT'] = temp2
if(label==0 or label==1):
selectedValuesList.append(temp2)
selectedPositionsList.append(row['position'])
selectedMutantsList.append(row['mutant'])
if(version==3):
temp2 = temp1 - scalingCoeff*(freqCutoff - freq)/freqCutoff
if(freq>freqCutoff):
if(temp2<0.0):
temp2 = 0.0
myBigMergedDF.at[index,'PRESCOTT'] = temp2
if(label==0 or label==1):
selectedValuesList.append(temp2)
selectedPositionsList.append(row['position'])
selectedMutantsList.append(row['mutant'])
else:
if(temp2>1.0):
temp2 = 1.0
myBigMergedDF.at[index,'PRESCOTT'] = temp2
if(label==0 or label==1):
selectedValuesList.append(temp2)
selectedPositionsList.append(row['position'])
selectedMutantsList.append(row['mutant'])
# myBigMergedDF.dropna(subset = ['labels'], inplace=True)
clinvarLabeledDF = myBigMergedDF.loc[(myBigMergedDF['labels']==0) | (myBigMergedDF['labels']==1)]
clinvarLabeledDF['labels'] = clinvarLabeledDF['labels'].astype('int64')
print(clinvarLabeledDF)
#print(myBigMergedDF.loc[(myBigMergedDF['labels']=='0') | (myBigMergedDF['labels']=='1'), 'labels'])
# print(clinvarLabeledDF['labels'].values)
# print(clinvarLabeledDF['ESCOTT'].values)
fprESCOTT, tprESCOTT, AUC_ESCOTT = plotROCandAUCV2(clinvarLabeledDF['labels'], \
clinvarLabeledDF['ESCOTT'])
fprPRESCOTT, tprPRESCOTT, AUC_PRESCOTT = plotROCandAUCV2(clinvarLabeledDF['labels'], \
clinvarLabeledDF['PRESCOTT'])
# fprPRESCOTT, tprPRESCOTT, AUC_PRESCOTT = plotROCandAUCV2(myBigMergedDF.loc[(myBigMergedDF['labels']==0) | (myBigMergedDF['labels']==1), 'labels'], \
# myBigMergedDF.loc[(myBigMergedDF['labels']==0) | (myBigMergedDF['labels']==1), 'PRESCOTT'])
fig = plt.figure(figsize=(12,6))
# plt.rcParams.update({'font.size': 18})
plt.grid(linestyle='--')
# plt.title(protName + " - "+method+" AUC={:.2f}".format(AUC_ESCOTT))
plt.title("AUC={:.2f} -> AUC={:.2f}".format(AUC_ESCOTT, AUC_PRESCOTT))
plt.ylim([0.0, 1.0])
#plt.xlim([1000, 1863])
plt.scatter(myBigMergedDF.loc[myBigMergedDF['labels'] == 1, 'position'], myBigMergedDF.loc[myBigMergedDF['labels'] == 1, 'ESCOTT'], marker='o', color='red', label='pathogenic')
plt.scatter(myBigMergedDF.loc[myBigMergedDF['labels'] == 0, 'position'], myBigMergedDF.loc[myBigMergedDF['labels'] == 0, 'ESCOTT'], marker='o', color='blue', label='benign')
if (useFrequencies.lower() == 'true'):
#print(selectedPositionsList)
#print(selectedValuesList)
plt.scatter(selectedPositionsList, selectedValuesList, marker='o', color='olive', label='PRESCOTT')
# Add vertical lines connecting old and new values
for i in range(len(selectedPositionsList)):
plt.annotate("", xy=(selectedPositionsList[i], selectedValuesList[i]), xycoords='data', \
xytext=(selectedPositionsList[i], myBigMergedDF.loc[myBigMergedDF['mutant'] == selectedMutantsList[i], 'ESCOTT'].values[0]), textcoords='data',
arrowprops=dict(arrowstyle="->", connectionstyle="arc3"))
plt.xticks(rotation = 90)
plt.ylabel("Ranksorted Score")
plt.xlabel("Position")
plt.legend(loc='upper right')
plt.tight_layout()
plt.savefig("clinvar-vs-position.png")
plt.close()
myBigMergedDF.to_csv('myBigMergedDF-normalized-asm.csv', index=None)
myBigMergedDF.to_csv(args.outputfile, columns=['mutant', 'PRESCOTT'], index=False, header=None, sep=' ')
print("@> AUC= {:.3f} {:.3f}".format( AUC_ESCOTT, AUC_PRESCOTT))
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment