Commit f2e02c2f by Mustafa Tekpinar

Changes in prescott.py for proper AUC calculation over ClinVar labels.

parent d68afcdb
...@@ -988,7 +988,7 @@ def parse_command_line(): ...@@ -988,7 +988,7 @@ def parse_command_line():
parser.add_argument('--colormap', dest='colormap', type=str, \ parser.add_argument('--colormap', dest='colormap', type=str, \
help="Select a colormap from standard matplotlib colormaps", help="Select a colormap from standard matplotlib colormaps",
required=False, default='Oranges_r') required=False, default='turbo_r')
parser.add_argument('--maxcoillength', dest='maxcoillength', type=int, \ parser.add_argument('--maxcoillength', dest='maxcoillength', type=int, \
help="If coil length is > maxcoillength, it will use JET values.", help="If coil length is > maxcoillength, it will use JET values.",
......
...@@ -295,7 +295,7 @@ def main(): ...@@ -295,7 +295,7 @@ def main():
main_parser.add_argument('-s', '--sequencefile', dest='sequencefile', type=str, \ main_parser.add_argument('-s', '--sequencefile', dest='sequencefile', type=str, \
help='Sequence file of only the query protein sequence in fasta format.', \ help='Sequence file of only the query protein sequence in fasta format.', \
required=False, default=None) required=True, default=None)
main_parser.add_argument('-g', '--gnomadfile', dest='gnomadfile', type=str, \ main_parser.add_argument('-g', '--gnomadfile', dest='gnomadfile', type=str, \
help='The second input for prescott is a gnomad frequency file for the protein\n.'+ help='The second input for prescott is a gnomad frequency file for the protein\n.'+
...@@ -355,23 +355,6 @@ def main(): ...@@ -355,23 +355,6 @@ def main():
print("@> Name of the output file : {}".format(args.outputfile)) print("@> Name of the output file : {}".format(args.outputfile))
# End of argument parsing! # End of argument parsing!
# mainPath = "/mnt/data/tekpinar/ESGEMME_vs_EVE_all_data"
# dataFolder = "/esgemme-v-1-4-0-max-two-components-eve-msas-entire-single-point-mutations"
# dataFolder = "/egemme-v-1-3-0-eve-msas-entire-single-point-mutations"
# dataFolder = sys.argv[9]
# mutFilePath = "/mnt/data/tekpinar/research/datasets/acmgClinVar/acmg-v3p1-with-esgemme-v1-4-0-eve-MSAs/all_gene_names_v4_existing_asm_normalized.txt"
varFilePath = "/variant_files"
# MUT_FILE = open(mutFilePath, "r")
# allLines = MUT_FILE.readlines()
# MUT_FILE.close()
allPathogenicList = []
allBenignList = []
# selectedListFile=open("all_gene_names_v4_existing_asm.txt" ,"w")
escottDataPath = args.escottfile escottDataPath = args.escottfile
protein = os.path.splitext(os.path.basename(escottDataPath))[0] protein = os.path.splitext(os.path.basename(escottDataPath))[0]
# esmVariantsPath="/mnt/data/tekpinar/software/esm-variants/entire-dataset/" # esmVariantsPath="/mnt/data/tekpinar/software/esm-variants/entire-dataset/"
...@@ -416,9 +399,6 @@ def main(): ...@@ -416,9 +399,6 @@ def main():
myBigMergedDF = pd.DataFrame() myBigMergedDF = pd.DataFrame()
myBigMergedDF = pd.concat([myBigMergedDF, dfESCOTT], ignore_index=True) myBigMergedDF = pd.concat([myBigMergedDF, dfESCOTT], ignore_index=True)
# figPrefix="all_gene_names_eve_msas_uniref100_"+method.lower()+"_vs_eve-asm_normalized-auc-sklearn-v5"
figPrefix="all_gene_names_eve_msas_uniref100_vs_eve-asm_normalized-auc-sklearn-v5"
gnomadDF = getGnomADOverallFrequency(args.gnomadfile, usePopMax=usePopMaxOrNot) gnomadDF = getGnomADOverallFrequency(args.gnomadfile, usePopMax=usePopMaxOrNot)
# Assign labels to pathogenic/benign mutations for performance evaluation # Assign labels to pathogenic/benign mutations for performance evaluation
...@@ -526,45 +506,50 @@ def main(): ...@@ -526,45 +506,50 @@ def main():
#print(myBigMergedDF.loc[(myBigMergedDF['labels']=='0') | (myBigMergedDF['labels']=='1'), 'labels']) #print(myBigMergedDF.loc[(myBigMergedDF['labels']=='0') | (myBigMergedDF['labels']=='1'), 'labels'])
# print(clinvarLabeledDF['labels'].values) # print(clinvarLabeledDF['labels'].values)
# print(clinvarLabeledDF['ESCOTT'].values) # print(clinvarLabeledDF['ESCOTT'].values)
fprESCOTT, tprESCOTT, AUC_ESCOTT = plotROCandAUCV2(clinvarLabeledDF['labels'], \ numPathogenic = len(myBigMergedDF.loc[(myBigMergedDF['labels']==1)])
clinvarLabeledDF['ESCOTT']) numBenign = len(myBigMergedDF.loc[(myBigMergedDF['labels']==0)])
fprPRESCOTT, tprPRESCOTT, AUC_PRESCOTT = plotROCandAUCV2(clinvarLabeledDF['labels'], \ if ((numPathogenic >=1) and (numBenign>=1)):
clinvarLabeledDF['PRESCOTT']) fprESCOTT, tprESCOTT, AUC_ESCOTT = plotROCandAUCV2(clinvarLabeledDF['labels'], \
# fprPRESCOTT, tprPRESCOTT, AUC_PRESCOTT = plotROCandAUCV2(myBigMergedDF.loc[(myBigMergedDF['labels']==0) | (myBigMergedDF['labels']==1), 'labels'], \ clinvarLabeledDF['ESCOTT'])
# myBigMergedDF.loc[(myBigMergedDF['labels']==0) | (myBigMergedDF['labels']==1), 'PRESCOTT'])
fprPRESCOTT, tprPRESCOTT, AUC_PRESCOTT = plotROCandAUCV2(clinvarLabeledDF['labels'], \
fig = plt.figure(figsize=(12,6)) clinvarLabeledDF['PRESCOTT'])
# plt.rcParams.update({'font.size': 18}) # fprPRESCOTT, tprPRESCOTT, AUC_PRESCOTT = plotROCandAUCV2(myBigMergedDF.loc[(myBigMergedDF['labels']==0) | (myBigMergedDF['labels']==1), 'labels'], \
plt.grid(linestyle='--') # myBigMergedDF.loc[(myBigMergedDF['labels']==0) | (myBigMergedDF['labels']==1), 'PRESCOTT'])
# plt.title(protName + " - "+method+" AUC={:.2f}".format(AUC_ESCOTT))
plt.title("AUC={:.2f} -> AUC={:.2f}".format(AUC_ESCOTT, AUC_PRESCOTT)) fig = plt.figure(figsize=(12,6))
plt.ylim([0.0, 1.0]) # plt.rcParams.update({'font.size': 18})
#plt.xlim([1000, 1863]) plt.grid(linestyle='--')
plt.scatter(myBigMergedDF.loc[myBigMergedDF['labels'] == 1, 'position'], myBigMergedDF.loc[myBigMergedDF['labels'] == 1, 'ESCOTT'], marker='o', color='red', label='pathogenic') # plt.title(protName + " - "+method+" AUC={:.2f}".format(AUC_ESCOTT))
plt.scatter(myBigMergedDF.loc[myBigMergedDF['labels'] == 0, 'position'], myBigMergedDF.loc[myBigMergedDF['labels'] == 0, 'ESCOTT'], marker='o', color='blue', label='benign') plt.title("AUC={:.2f} -> AUC={:.2f}".format(AUC_ESCOTT, AUC_PRESCOTT))
if (useFrequencies.lower() == 'true'): plt.ylim([0.0, 1.0])
#print(selectedPositionsList) #plt.xlim([1000, 1863])
#print(selectedValuesList) plt.scatter(myBigMergedDF.loc[myBigMergedDF['labels'] == 1, 'position'], myBigMergedDF.loc[myBigMergedDF['labels'] == 1, 'ESCOTT'], marker='o', color='red', label='pathogenic')
plt.scatter(selectedPositionsList, selectedValuesList, marker='o', color='olive', label='PRESCOTT') plt.scatter(myBigMergedDF.loc[myBigMergedDF['labels'] == 0, 'position'], myBigMergedDF.loc[myBigMergedDF['labels'] == 0, 'ESCOTT'], marker='o', color='blue', label='benign')
if (useFrequencies.lower() == 'true'):
# Add vertical lines connecting old and new values #print(selectedPositionsList)
for i in range(len(selectedPositionsList)): #print(selectedValuesList)
plt.annotate("", xy=(selectedPositionsList[i], selectedValuesList[i]), xycoords='data', \ plt.scatter(selectedPositionsList, selectedValuesList, marker='o', color='olive', label='PRESCOTT')
xytext=(selectedPositionsList[i], myBigMergedDF.loc[myBigMergedDF['mutant'] == selectedMutantsList[i], 'ESCOTT'].values[0]), textcoords='data',
arrowprops=dict(arrowstyle="->", connectionstyle="arc3")) # Add vertical lines connecting old and new values
for i in range(len(selectedPositionsList)):
plt.xticks(rotation = 90) plt.annotate("", xy=(selectedPositionsList[i], selectedValuesList[i]), xycoords='data', \
plt.ylabel("Ranksorted Score") xytext=(selectedPositionsList[i], myBigMergedDF.loc[myBigMergedDF['mutant'] == selectedMutantsList[i], 'ESCOTT'].values[0]), textcoords='data',
plt.xlabel("Position") arrowprops=dict(arrowstyle="->", connectionstyle="arc3"))
plt.legend(loc='upper right')
plt.tight_layout() plt.xticks(rotation = 90)
plt.savefig("clinvar-vs-position.png") plt.ylabel("Ranksorted Score")
plt.close() plt.xlabel("Position")
plt.legend(loc='upper right')
plt.tight_layout()
plt.savefig("clinvar-vs-position.png")
plt.close()
print("@> AUC= {:.3f} {:.3f}".format( AUC_ESCOTT, AUC_PRESCOTT))
myBigMergedDF.to_csv('myBigMergedDF-normalized-asm.csv', index=None) myBigMergedDF.to_csv('myBigMergedDF-normalized-asm.csv', index=None)
myBigMergedDF.to_csv(args.outputfile, columns=['mutant', 'PRESCOTT'], index=False, header=None, sep=' ') myBigMergedDF.to_csv(args.outputfile, columns=['mutant', 'PRESCOTT'], index=False, header=None, sep=' ')
print("@> AUC= {:.3f} {:.3f}".format( AUC_ESCOTT, AUC_PRESCOTT))
if __name__ == "__main__": if __name__ == "__main__":
main() main()
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment