Changes in prescott.py for proper AUC calculation over ClinVar labels.

f2e02c2f · Mustafa Tekpinar · d68afcdb · f2e02c2f · f2e02c2f
Commit f2e02c2f authored Oct 03, 2023 by Mustafa Tekpinar
Show whitespace changes
Inline Side-by-side

Showing 2 changed files with 8 additions and 23 deletions

escott.py prescott/escott.py +1 -1

prescott.py prescott/prescott.py +7 -22

No files found.
--- a/prescott/escott.py
+++ b/prescott/escott.py
@@ -988,7 +988,7 @@ def parse_command_line():
    parser.add_argument('--colormap', dest='colormap', type=str, \
        help="Select a colormap from standard matplotlib colormaps",
-        required=False, default='Oranges_r')
+        required=False, default='turbo_r')
    parser.add_argument('--maxcoillength', dest='maxcoillength', type=int, \
        help="If coil length is > maxcoillength, it will use JET values.",

--- a/prescott/prescott.py
+++ b/prescott/prescott.py
@@ -295,7 +295,7 @@ def main():
    main_parser.add_argument('-s', '--sequencefile', dest='sequencefile', type=str, \
        help='Sequence file of only the query protein sequence in fasta format.', \
-        required=False, default=None)
+        required=True, default=None)
    main_parser.add_argument('-g', '--gnomadfile', dest='gnomadfile', type=str, \
        help='The second input for prescott is a gnomad frequency file for the protein\n.'+
@@ -355,23 +355,6 @@ def main():
    print("@> Name of the output file          : {}".format(args.outputfile))
    # End of argument parsing!
-    # mainPath = "/mnt/data/tekpinar/ESGEMME_vs_EVE_all_data"
-    # dataFolder = "/esgemme-v-1-4-0-max-two-components-eve-msas-entire-single-point-mutations"
-    # dataFolder = "/egemme-v-1-3-0-eve-msas-entire-single-point-mutations"
-    # dataFolder = sys.argv[9]
-    # mutFilePath = "/mnt/data/tekpinar/research/datasets/acmgClinVar/acmg-v3p1-with-esgemme-v1-4-0-eve-MSAs/all_gene_names_v4_existing_asm_normalized.txt"
-    varFilePath = "/variant_files"
-    # MUT_FILE = open(mutFilePath, "r")
-    # allLines = MUT_FILE.readlines()
-    # MUT_FILE.close()
-    allPathogenicList = []
-    allBenignList = []
-    # selectedListFile=open("all_gene_names_v4_existing_asm.txt" ,"w")
    escottDataPath = args.escottfile
    protein = os.path.splitext(os.path.basename(escottDataPath))[0]
    # esmVariantsPath="/mnt/data/tekpinar/software/esm-variants/entire-dataset/"
@@ -416,9 +399,6 @@ def main():
    myBigMergedDF = pd.DataFrame()
    myBigMergedDF = pd.concat([myBigMergedDF, dfESCOTT], ignore_index=True)
-    # figPrefix="all_gene_names_eve_msas_uniref100_"+method.lower()+"_vs_eve-asm_normalized-auc-sklearn-v5"
-    figPrefix="all_gene_names_eve_msas_uniref100_vs_eve-asm_normalized-auc-sklearn-v5"
    gnomadDF = getGnomADOverallFrequency(args.gnomadfile, usePopMax=usePopMaxOrNot)
    # Assign labels to pathogenic/benign mutations for performance evaluation
@@ -526,6 +506,10 @@ def main():
    #print(myBigMergedDF.loc[(myBigMergedDF['labels']=='0') | (myBigMergedDF['labels']=='1'), 'labels'])
    # print(clinvarLabeledDF['labels'].values)
    # print(clinvarLabeledDF['ESCOTT'].values)
+    numPathogenic = len(myBigMergedDF.loc[(myBigMergedDF['labels']==1)])
+    numBenign = len(myBigMergedDF.loc[(myBigMergedDF['labels']==0)])
+    if ((numPathogenic >=1) and (numBenign>=1)):
        fprESCOTT, tprESCOTT, AUC_ESCOTT = plotROCandAUCV2(clinvarLabeledDF['labels'], \
                                                        clinvarLabeledDF['ESCOTT'])
@@ -561,10 +545,11 @@ def main():
        plt.tight_layout()
        plt.savefig("clinvar-vs-position.png")
        plt.close()
+        print("@> AUC= {:.3f} {:.3f}".format( AUC_ESCOTT, AUC_PRESCOTT))
    myBigMergedDF.to_csv('myBigMergedDF-normalized-asm.csv', index=None)
    myBigMergedDF.to_csv(args.outputfile, columns=['mutant', 'PRESCOTT'], index=False, header=None, sep=' ')
-    print("@> AUC= {:.3f} {:.3f}".format( AUC_ESCOTT, AUC_PRESCOTT))
 if __name__ == "__main__":
    main()