Residue ID correction for already ranksorted scores.

When your MSA file covers only a subset of the real protein (for example, residues 520-5207 instead of 1-5207), it becomes problematic to write the CSV output with the correct residue IDs. I fixed this issue, even though it is not a common one.

Residue ID correction for already ranksorted scores.
When your MSA file covers only a subset of the real protein (for example, residues 520-5207 instead of 1-5207), it becomes problematic to write the CSV output with the correct residue IDs. I fixed this issue, even though it is not a common one.
83fcd07e · Mustafa Tekpinar · 09e6079f · 83fcd07e
Commit 83fcd07e authored Nov 21, 2023 by Mustafa Tekpinar
Hide whitespace changes
Inline Side-by-side

Showing with 23 additions and 16 deletions

prescott.py prescott/prescott.py +23 -16

No files found.
--- a/prescott/prescott.py
+++ b/prescott/prescott.py
@@ -650,18 +650,20 @@ def main():
    usePopMaxOrNot = args.usepopmax.lower()
    version = args.equation
    if (os.path.exists(args.escottfile)):
+        #Convert the matrix format to singleline format
+        localResidueList = None
+        if(args.sequencefile != None):
+            referenceSeq = SeqIO.read(args.sequencefile, 'fasta')
+            localResidueList = list(referenceSeq.seq)

+        aaOrderList = list('ACDEFGHIKLMNPQRSTVWY')
        if(args.escottformat=='gemme'):

            #Parse the file containing raw ESCOTT scores. 
            scanningMatrix = parseGEMMEoutput(args.escottfile, verbose=False)
            
-            #Convert the matrix format to singleline format
-            localResidueList = None
-            if(args.sequencefile != None):
-                referenceSeq = SeqIO.read(args.sequencefile, 'fasta')
-                localResidueList = list(referenceSeq.seq)
-            aaOrderList = list('ACDEFGHIKLMNPQRSTVWY')
+
+
            writeSinglelineFormat(scanningMatrix, protein+'_singleline.txt', residueList = localResidueList,\
                            beg=0, end=None, aaOrder = aaOrderList, \
                            offSet=0)
@@ -887,10 +889,12 @@ def main():
    # is actually log10 frequencies. Normally, one can deduce it from the values as well
    # but it is always better to be clear. 
    myBigMergedDF = myBigMergedDF.rename(columns={'frequency': 'log10frequency'})
-
+    myBigMergedDF['mutant'] = myBigMergedDF['mutant'].str.upper()
+    # myBigMergedDF = myBigMergedDF['mutant'].apply(lambda x: x.upper())
    myBigMergedDF.to_csv(outfile+'-details.csv', index=None)
    
-    
+    print(localResidueList)
+
    with open(outfile+'.csv', 'w') as my_file:
        my_file.write(",")

@@ -900,24 +904,27 @@ def main():
            else:
                my_file.write(item+",")

+        posList = myBigMergedDF['position'].unique().tolist()
+        # print(posList)
        for pos in range(len(localResidueList)):
-            resAndPos = str(localResidueList[pos])+str(pos+1)
+            resAndPos = str(localResidueList[pos])+str(posList[pos])
            my_file.write("{},".format(resAndPos))
            for item in alphabeticalAminoAcidsList:
-                variant = str(localResidueList[pos]).upper()+str(pos+1)+item
+                variant = str(localResidueList[pos]).upper()+str(posList[pos])+item
+                # print(variant)
                if(item=='Y'):
-                    #print(variant)
+                   
                    #print(myBigMergedDF.loc[myBigMergedDF['mutant']==variant, 'PRESCOTT'].values[0])
                    my_file.write("{:.2f}\n".format(float(myBigMergedDF.loc[myBigMergedDF['mutant']==variant, 'PRESCOTT'].values[0])))
                else:
-                    #print(myBigMergedDF.loc[myBigMergedDF['mutant']==variant, 'PRESCOTT'].values[0])
+                    # print(myBigMergedDF.loc[myBigMergedDF['mutant']==variant, 'PRESCOTT'].values)
                    my_file.write("{:.2f},".format(float(myBigMergedDF.loc[myBigMergedDF['mutant']==variant, 'PRESCOTT'].values[0])))


-    if(os.path.exists(protein+'_singleline.txt')):
-        os.remove(protein+'_singleline.txt')
-    if(os.path.exists(protein+'_singleline_1-ranksort.txt')):
-        os.remove(protein+'_singleline_1-ranksort.txt')
+    # if(os.path.exists(protein+'_singleline.txt')):
+    #     os.remove(protein+'_singleline.txt')
+    # if(os.path.exists(protein+'_singleline_1-ranksort.txt')):
+    #     os.remove(protein+'_singleline_1-ranksort.txt')

 if __name__ == "__main__":
    main()