Commit f2e02c2f by Mustafa Tekpinar

Changes in prescott.py for proper AUC calculation over ClinVar labels.

parent d68afcdb
......@@ -988,7 +988,7 @@ def parse_command_line():
parser.add_argument('--colormap', dest='colormap', type=str, \
help="Select a colormap from standard matplotlib colormaps",
required=False, default='Oranges_r')
required=False, default='turbo_r')
parser.add_argument('--maxcoillength', dest='maxcoillength', type=int, \
help="If coil length is > maxcoillength, it will use JET values.",
......
......@@ -295,7 +295,7 @@ def main():
main_parser.add_argument('-s', '--sequencefile', dest='sequencefile', type=str, \
help='Sequence file of only the query protein sequence in fasta format.', \
required=False, default=None)
required=True, default=None)
main_parser.add_argument('-g', '--gnomadfile', dest='gnomadfile', type=str, \
help='The second input for prescott is a gnomad frequency file for the protein\n.'+
......@@ -355,23 +355,6 @@ def main():
print("@> Name of the output file : {}".format(args.outputfile))
# End of argument parsing!
# mainPath = "/mnt/data/tekpinar/ESGEMME_vs_EVE_all_data"
# dataFolder = "/esgemme-v-1-4-0-max-two-components-eve-msas-entire-single-point-mutations"
# dataFolder = "/egemme-v-1-3-0-eve-msas-entire-single-point-mutations"
# dataFolder = sys.argv[9]
# mutFilePath = "/mnt/data/tekpinar/research/datasets/acmgClinVar/acmg-v3p1-with-esgemme-v1-4-0-eve-MSAs/all_gene_names_v4_existing_asm_normalized.txt"
varFilePath = "/variant_files"
# MUT_FILE = open(mutFilePath, "r")
# allLines = MUT_FILE.readlines()
# MUT_FILE.close()
allPathogenicList = []
allBenignList = []
# selectedListFile=open("all_gene_names_v4_existing_asm.txt" ,"w")
escottDataPath = args.escottfile
protein = os.path.splitext(os.path.basename(escottDataPath))[0]
# esmVariantsPath="/mnt/data/tekpinar/software/esm-variants/entire-dataset/"
......@@ -416,9 +399,6 @@ def main():
myBigMergedDF = pd.DataFrame()
myBigMergedDF = pd.concat([myBigMergedDF, dfESCOTT], ignore_index=True)
# figPrefix="all_gene_names_eve_msas_uniref100_"+method.lower()+"_vs_eve-asm_normalized-auc-sklearn-v5"
figPrefix="all_gene_names_eve_msas_uniref100_vs_eve-asm_normalized-auc-sklearn-v5"
gnomadDF = getGnomADOverallFrequency(args.gnomadfile, usePopMax=usePopMaxOrNot)
# Assign labels to pathogenic/benign mutations for performance evaluation
......@@ -526,6 +506,10 @@ def main():
#print(myBigMergedDF.loc[(myBigMergedDF['labels']=='0') | (myBigMergedDF['labels']=='1'), 'labels'])
# print(clinvarLabeledDF['labels'].values)
# print(clinvarLabeledDF['ESCOTT'].values)
numPathogenic = len(myBigMergedDF.loc[(myBigMergedDF['labels']==1)])
numBenign = len(myBigMergedDF.loc[(myBigMergedDF['labels']==0)])
if ((numPathogenic >=1) and (numBenign>=1)):
fprESCOTT, tprESCOTT, AUC_ESCOTT = plotROCandAUCV2(clinvarLabeledDF['labels'], \
clinvarLabeledDF['ESCOTT'])
......@@ -561,10 +545,11 @@ def main():
plt.tight_layout()
plt.savefig("clinvar-vs-position.png")
plt.close()
print("@> AUC= {:.3f} {:.3f}".format( AUC_ESCOTT, AUC_PRESCOTT))
myBigMergedDF.to_csv('myBigMergedDF-normalized-asm.csv', index=None)
myBigMergedDF.to_csv(args.outputfile, columns=['mutant', 'PRESCOTT'], index=False, header=None, sep=' ')
print("@> AUC= {:.3f} {:.3f}".format( AUC_ESCOTT, AUC_PRESCOTT))
if __name__ == "__main__":
main()
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment