Added escottformat and ranksorted args to prescott UI.

42dbf51a · Mustafa Tekpinar · f98b9cb1 · 42dbf51a · 42dbf51a
Commit 42dbf51a authored Oct 06, 2023 by Mustafa Tekpinar
Hide whitespace changes
Inline Side-by-side

Showing with 51 additions and 32 deletions

prescott.py prescott/prescott.py +50 -30

requirements.txt requirements.txt +1 -2

No files found.
--- a/prescott/prescott.py
+++ b/prescott/prescott.py
@@ -328,7 +328,16 @@ def main():
    main_parser.add_argument('--usefrequencies', dest='usefrequencies', type=str, \
        help='Do not touch this if you don\'t know what you are doing! Default is true',
        required=False, default='true')
+    main_parser.add_argument('--ranksorted', dest='ranksorted', type=str, \
+        help='If your data is already ranksorted, change this argument to true. Default is false',
+        required=False, default='false')
+    main_parser.add_argument('--escottformat', dest='escottformat', type=str, \
+        help='Main format of escott file. There are two possibilities: gemme or singleline. \n'+\
+            'gemme: a horizontal format of 20 rows and N columns.\n'+\
+            'singleline: each line contains a mutation and its value separated by a space.\n'+\
+            'M1A 0.378\n', required=False, default='gemme')
    # main_parser.add_argument('--colormap', dest='colormap', type=str, \
    #     help='A colormap as defined in matplotlib',
    #     required=False, default='coolwarm_r')
@@ -354,7 +363,7 @@ def main():
    print("@> Use population max. freq         : {}".format(str(args.usepopmax).lower()))
    print("@> Which equation to use (Default=2): {}".format(str(args.equation)))
    print("@> Scaling coefficient (Default=1.0): {}".format(args.coefficient))
-    print("@> Frequency cutoff (Default=-4.0) : {}".format(args.frequencycutoff))
+    print("@> Frequency cutoff (Default=-4.0)  : {}".format(args.frequencycutoff))
    print("@> Name of the output file          : {}".format(args.outputfile))
    # End of argument parsing!
@@ -366,32 +375,45 @@ def main():
    usePopMaxOrNot = args.usepopmax.lower()
    version = args.equation
    if (os.path.exists(escottDataPath)):
-        #Parse the file containing raw ESCOTT scores. 
-        scanningMatrix = parseGEMMEoutput(args.escottfile, verbose=False)
+        if(args.escottformat=='gemme'):
-        #Convert the matrix format to singleline format
+            #Parse the file containing raw ESCOTT scores. 
-        localResidueList = None
+            scanningMatrix = parseGEMMEoutput(args.escottfile, verbose=False)
-        if(args.sequencefile != None):
-            referenceSeq = SeqIO.read(args.sequencefile, 'fasta')
+            #Convert the matrix format to singleline format
-            localResidueList = list(referenceSeq.seq)
+            localResidueList = None
-        aaOrderList = list('ACDEFGHIKLMNPQRSTVWY')
+            if(args.sequencefile != None):
-        writeSinglelineFormat(scanningMatrix, protein+'_singleline.txt', residueList = localResidueList,\
+                referenceSeq = SeqIO.read(args.sequencefile, 'fasta')
-                        beg=0, end=None, aaOrder = aaOrderList, \
+                localResidueList = list(referenceSeq.seq)
-                        offSet=0)
+            aaOrderList = list('ACDEFGHIKLMNPQRSTVWY')
+            writeSinglelineFormat(scanningMatrix, protein+'_singleline.txt', residueList = localResidueList,\
-        #Mostyl, I am using normPred_Combi_singleline as input file and it doesn't have a header.
+                            beg=0, end=None, aaOrder = aaOrderList, \
-        df = pd.read_table(protein+'_singleline.txt', sep="\s+", header=None)
+                            offSet=0)
-        #data = np.genfromtxt(args.input,dtype=None)
+            #Mostly, I am using normPred_Combi_singleline as input file and it doesn't have a header.
-        data = df.to_numpy()
+            df = pd.read_table(protein+'_singleline.txt', sep="\s+", header=None)
-        rawData = data.T[1]
-        processedData = 1.0 - rankSortData(rawData)
+        elif(args.escottformat=='singleline'):
-        with open(protein+'_singleline_1-ranksort.txt', 'w') as f:
-            #f.write("#Resid Value\n")
+            df = pd.read_table(args.escottfile, sep="\s+", header=None)
-            for i in range (len(processedData)):
+        else:
-                f.write("{:} {:6.2f}\n".format(data.T[0][i], processedData[i]))
+            print('@> ERROR: Unknown escott format. It should be gemme or singleline!')
+            sys.exit(-1)
-        dfESCOTT = pd.read_table(protein+'_singleline_1-ranksort.txt', sep='\s+', header=None)
+        if(args.ranksorted == 'false'):
+            #data = np.genfromtxt(args.input,dtype=None)
+            data = df.to_numpy()
+            rawData = data.T[1]
+            processedData = 1.0 - rankSortData(rawData)
+            with open(protein+'_singleline_1-ranksort.txt', 'w') as f:
+                #f.write("#Resid Value\n")
+                for i in range (len(processedData)):
+                    f.write("{:} {:6.2f}\n".format(data.T[0][i], processedData[i]))
+            dfESCOTT = pd.read_table(protein+'_singleline_1-ranksort.txt', sep='\s+', header=None)
+        else:
+            dfESCOTT = df
        dfESCOTT.columns = ['mutant', 'ESCOTT']
        dfESCOTT['protein']=protein
@@ -416,10 +438,8 @@ def main():
            (row['ClinVar Clinical Significance']=='Pathogenic') or \
            (row['ClinVar Clinical Significance']=='Likely pathogenic')):
            gnomadDF.at[index,'labels'] = 1   
    print(gnomadDF.loc[(gnomadDF['labels']==0) | (gnomadDF['labels']==1)])
    # print(gnomadDF['ClinVar Clinical Significance'])
    # Add frequency column and a dummy frequency to each row in myBigMergedDF
    myBigMergedDF['frequency'] = 999.0
    myBigMergedDF['labels'] = np.nan

--- a/requirements.txt
+++ b/requirements.txt
@@ -5,5 +5,4 @@ scipy
 pandas
 biopython<=1.79
 biotite
-sklearn
+scikit-learn