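Changes in prescott.py for proper AUC calculation over ClinVar labels: the ROC/AUC computation, the clinvar-vs-position plot, and the AUC printout now run only when at least one pathogenic (label 1) and at least one benign (label 0) variant are present. Also makes --sequencefile a required argument, removes unused variables and commented-out code from main(), and changes the default colormap in escott.py from Oranges_r to turbo_r.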

f2e02c2f · Mustafa Tekpinar · d68afcdb · f2e02c2f · f2e02c2f
Commit f2e02c2f authored Oct 03, 2023 by Mustafa Tekpinar
Hide whitespace changes
Inline Side-by-side

Showing with 43 additions and 58 deletions

escott.py prescott/escott.py +1 -1

prescott.py prescott/prescott.py +42 -57

No files found.
--- a/prescott/escott.py
+++ b/prescott/escott.py
@@ -988,7 +988,7 @@ def parse_command_line():

    parser.add_argument('--colormap', dest='colormap', type=str, \
        help="Select a colormap from standard matplotlib colormaps",
-        required=False, default='Oranges_r')
+        required=False, default='turbo_r')
    
    parser.add_argument('--maxcoillength', dest='maxcoillength', type=int, \
        help="If coil length is > maxcoillength, it will use JET values.",

--- a/prescott/prescott.py
+++ b/prescott/prescott.py
@@ -295,7 +295,7 @@ def main():
    
    main_parser.add_argument('-s', '--sequencefile', dest='sequencefile', type=str, \
        help='Sequence file of only the query protein sequence in fasta format.', \
-        required=False, default=None)
+        required=True, default=None)

    main_parser.add_argument('-g', '--gnomadfile', dest='gnomadfile', type=str, \
        help='The second input for prescott is a gnomad frequency file for the protein\n.'+
@@ -355,23 +355,6 @@ def main():
    print("@> Name of the output file          : {}".format(args.outputfile))
    # End of argument parsing!

-    # mainPath = "/mnt/data/tekpinar/ESGEMME_vs_EVE_all_data"
-    # dataFolder = "/esgemme-v-1-4-0-max-two-components-eve-msas-entire-single-point-mutations"
-    # dataFolder = "/egemme-v-1-3-0-eve-msas-entire-single-point-mutations"
-    # dataFolder = sys.argv[9]
-    # mutFilePath = "/mnt/data/tekpinar/research/datasets/acmgClinVar/acmg-v3p1-with-esgemme-v1-4-0-eve-MSAs/all_gene_names_v4_existing_asm_normalized.txt"
-    varFilePath = "/variant_files"
-
-    # MUT_FILE = open(mutFilePath, "r")
-    # allLines = MUT_FILE.readlines()
-    # MUT_FILE.close()
-
-
-    allPathogenicList = []
-    allBenignList = []
-
-    # selectedListFile=open("all_gene_names_v4_existing_asm.txt" ,"w")
-
    escottDataPath = args.escottfile
    protein = os.path.splitext(os.path.basename(escottDataPath))[0]
    # esmVariantsPath="/mnt/data/tekpinar/software/esm-variants/entire-dataset/"
@@ -416,9 +399,6 @@ def main():
    myBigMergedDF = pd.DataFrame()
    myBigMergedDF = pd.concat([myBigMergedDF, dfESCOTT], ignore_index=True)

-    # figPrefix="all_gene_names_eve_msas_uniref100_"+method.lower()+"_vs_eve-asm_normalized-auc-sklearn-v5"
-    figPrefix="all_gene_names_eve_msas_uniref100_vs_eve-asm_normalized-auc-sklearn-v5"
-
    gnomadDF = getGnomADOverallFrequency(args.gnomadfile, usePopMax=usePopMaxOrNot)
    
    # Assign labels to pathogenic/benign mutations for performance evaluation
@@ -526,45 +506,50 @@ def main():
    #print(myBigMergedDF.loc[(myBigMergedDF['labels']=='0') | (myBigMergedDF['labels']=='1'), 'labels'])
    # print(clinvarLabeledDF['labels'].values)
    # print(clinvarLabeledDF['ESCOTT'].values)
-    fprESCOTT, tprESCOTT, AUC_ESCOTT = plotROCandAUCV2(clinvarLabeledDF['labels'], \
-                                                       clinvarLabeledDF['ESCOTT'])
-    
-    fprPRESCOTT, tprPRESCOTT, AUC_PRESCOTT = plotROCandAUCV2(clinvarLabeledDF['labels'], \
-                                                       clinvarLabeledDF['PRESCOTT'])
-    # fprPRESCOTT, tprPRESCOTT, AUC_PRESCOTT = plotROCandAUCV2(myBigMergedDF.loc[(myBigMergedDF['labels']==0) | (myBigMergedDF['labels']==1), 'labels'], \
-    #                                                   myBigMergedDF.loc[(myBigMergedDF['labels']==0) | (myBigMergedDF['labels']==1), 'PRESCOTT'])
-
-    fig = plt.figure(figsize=(12,6))
-    # plt.rcParams.update({'font.size': 18})
-    plt.grid(linestyle='--')
-    # plt.title(protName + " - "+method+" AUC={:.2f}".format(AUC_ESCOTT))
-    plt.title("AUC={:.2f} -> AUC={:.2f}".format(AUC_ESCOTT, AUC_PRESCOTT))
-    plt.ylim([0.0, 1.0])
-    #plt.xlim([1000, 1863])
-    plt.scatter(myBigMergedDF.loc[myBigMergedDF['labels'] == 1, 'position'], myBigMergedDF.loc[myBigMergedDF['labels'] == 1, 'ESCOTT'], marker='o', color='red', label='pathogenic')
-    plt.scatter(myBigMergedDF.loc[myBigMergedDF['labels'] == 0, 'position'], myBigMergedDF.loc[myBigMergedDF['labels'] == 0, 'ESCOTT'], marker='o', color='blue', label='benign')
-    if (useFrequencies.lower() == 'true'):
-        #print(selectedPositionsList)
-        #print(selectedValuesList)
-        plt.scatter(selectedPositionsList, selectedValuesList, marker='o', color='olive', label='PRESCOTT')
-
-        # Add vertical lines connecting old and new values
-        for i in range(len(selectedPositionsList)):
-            plt.annotate("", xy=(selectedPositionsList[i], selectedValuesList[i]), xycoords='data', \
-                         xytext=(selectedPositionsList[i], myBigMergedDF.loc[myBigMergedDF['mutant'] == selectedMutantsList[i], 'ESCOTT'].values[0]), textcoords='data',
-            arrowprops=dict(arrowstyle="->", connectionstyle="arc3"))
-
-    plt.xticks(rotation = 90)
-    plt.ylabel("Ranksorted Score")
-    plt.xlabel("Position")
-    plt.legend(loc='upper right')
-    plt.tight_layout()
-    plt.savefig("clinvar-vs-position.png")
-    plt.close()
+    numPathogenic = len(myBigMergedDF.loc[(myBigMergedDF['labels']==1)])
+    numBenign = len(myBigMergedDF.loc[(myBigMergedDF['labels']==0)])
+
+    if ((numPathogenic >=1) and (numBenign>=1)):
+        fprESCOTT, tprESCOTT, AUC_ESCOTT = plotROCandAUCV2(clinvarLabeledDF['labels'], \
+                                                        clinvarLabeledDF['ESCOTT'])
+        
+        fprPRESCOTT, tprPRESCOTT, AUC_PRESCOTT = plotROCandAUCV2(clinvarLabeledDF['labels'], \
+                                                        clinvarLabeledDF['PRESCOTT'])
+        # fprPRESCOTT, tprPRESCOTT, AUC_PRESCOTT = plotROCandAUCV2(myBigMergedDF.loc[(myBigMergedDF['labels']==0) | (myBigMergedDF['labels']==1), 'labels'], \
+        #                                                   myBigMergedDF.loc[(myBigMergedDF['labels']==0) | (myBigMergedDF['labels']==1), 'PRESCOTT'])
+
+        fig = plt.figure(figsize=(12,6))
+        # plt.rcParams.update({'font.size': 18})
+        plt.grid(linestyle='--')
+        # plt.title(protName + " - "+method+" AUC={:.2f}".format(AUC_ESCOTT))
+        plt.title("AUC={:.2f} -> AUC={:.2f}".format(AUC_ESCOTT, AUC_PRESCOTT))
+        plt.ylim([0.0, 1.0])
+        #plt.xlim([1000, 1863])
+        plt.scatter(myBigMergedDF.loc[myBigMergedDF['labels'] == 1, 'position'], myBigMergedDF.loc[myBigMergedDF['labels'] == 1, 'ESCOTT'], marker='o', color='red', label='pathogenic')
+        plt.scatter(myBigMergedDF.loc[myBigMergedDF['labels'] == 0, 'position'], myBigMergedDF.loc[myBigMergedDF['labels'] == 0, 'ESCOTT'], marker='o', color='blue', label='benign')
+        if (useFrequencies.lower() == 'true'):
+            #print(selectedPositionsList)
+            #print(selectedValuesList)
+            plt.scatter(selectedPositionsList, selectedValuesList, marker='o', color='olive', label='PRESCOTT')
+
+            # Add vertical lines connecting old and new values
+            for i in range(len(selectedPositionsList)):
+                plt.annotate("", xy=(selectedPositionsList[i], selectedValuesList[i]), xycoords='data', \
+                            xytext=(selectedPositionsList[i], myBigMergedDF.loc[myBigMergedDF['mutant'] == selectedMutantsList[i], 'ESCOTT'].values[0]), textcoords='data',
+                arrowprops=dict(arrowstyle="->", connectionstyle="arc3"))
+
+        plt.xticks(rotation = 90)
+        plt.ylabel("Ranksorted Score")
+        plt.xlabel("Position")
+        plt.legend(loc='upper right')
+        plt.tight_layout()
+        plt.savefig("clinvar-vs-position.png")
+        plt.close()
+        print("@> AUC= {:.3f} {:.3f}".format( AUC_ESCOTT, AUC_PRESCOTT))

    myBigMergedDF.to_csv('myBigMergedDF-normalized-asm.csv', index=None)
    myBigMergedDF.to_csv(args.outputfile, columns=['mutant', 'PRESCOTT'], index=False, header=None, sep=' ')
-    print("@> AUC= {:.3f} {:.3f}".format( AUC_ESCOTT, AUC_PRESCOTT))
+

 if __name__ == "__main__":
    main()