Residue ID correction for already ranksorted scores.

When your MSA file covers only a subset of the real protein (for example, residues 520-5207 instead of 1-5207), it becomes problematic to write the CSV output with the correct residue IDs. I corrected this problem, even though it is not a common one.

Residue ID correction for already ranksorted scores.
When your MSA file covers only a subset of the real protein (for example, residues 520-5207 instead of 1-5207), it becomes problematic to write the CSV output with the correct residue IDs. I corrected this problem, even though it is not a common one.
83fcd07e · Mustafa Tekpinar · 09e6079f · 83fcd07e
Commit 83fcd07e authored Nov 21, 2023 by Mustafa Tekpinar
Show whitespace changes
Inline Side-by-side

Showing with 22 additions and 15 deletions

prescott.py prescott/prescott.py +22 -15

No files found.
--- a/prescott/prescott.py
+++ b/prescott/prescott.py
@@ -650,18 +650,20 @@ def main():
    usePopMaxOrNot = args.usepopmax.lower()
    version = args.equation
    if (os.path.exists(args.escottfile)):
-        if(args.escottformat=='gemme'):
-            #Parse the file containing raw ESCOTT scores. 
-            scanningMatrix = parseGEMMEoutput(args.escottfile, verbose=False)
        #Convert the matrix format to singleline format
        localResidueList = None
        if(args.sequencefile != None):
            referenceSeq = SeqIO.read(args.sequencefile, 'fasta')
            localResidueList = list(referenceSeq.seq)
        aaOrderList = list('ACDEFGHIKLMNPQRSTVWY')
+        if(args.escottformat=='gemme'):
+            #Parse the file containing raw ESCOTT scores. 
+            scanningMatrix = parseGEMMEoutput(args.escottfile, verbose=False)
            writeSinglelineFormat(scanningMatrix, protein+'_singleline.txt', residueList = localResidueList,\
                            beg=0, end=None, aaOrder = aaOrderList, \
                            offSet=0)
@@ -887,9 +889,11 @@ def main():
    # is actually log10 frequencies. Normally, one can deduce it from the values as well
    # but it is always better to be clear. 
    myBigMergedDF = myBigMergedDF.rename(columns={'frequency': 'log10frequency'})
+    myBigMergedDF['mutant'] = myBigMergedDF['mutant'].str.upper()
+    # myBigMergedDF = myBigMergedDF['mutant'].apply(lambda x: x.upper())
    myBigMergedDF.to_csv(outfile+'-details.csv', index=None)
+    print(localResidueList)
    with open(outfile+'.csv', 'w') as my_file:
        my_file.write(",")
@@ -900,24 +904,27 @@ def main():
            else:
                my_file.write(item+",")
+        posList = myBigMergedDF['position'].unique().tolist()
+        # print(posList)
        for pos in range(len(localResidueList)):
-            resAndPos = str(localResidueList[pos])+str(pos+1)
+            resAndPos = str(localResidueList[pos])+str(posList[pos])
            my_file.write("{},".format(resAndPos))
            for item in alphabeticalAminoAcidsList:
-                variant = str(localResidueList[pos]).upper()+str(pos+1)+item
+                variant = str(localResidueList[pos]).upper()+str(posList[pos])+item
+                # print(variant)
                if(item=='Y'):
-                    #print(variant)
                    #print(myBigMergedDF.loc[myBigMergedDF['mutant']==variant, 'PRESCOTT'].values[0])
                    my_file.write("{:.2f}\n".format(float(myBigMergedDF.loc[myBigMergedDF['mutant']==variant, 'PRESCOTT'].values[0])))
                else:
-                    #print(myBigMergedDF.loc[myBigMergedDF['mutant']==variant, 'PRESCOTT'].values[0])
+                    # print(myBigMergedDF.loc[myBigMergedDF['mutant']==variant, 'PRESCOTT'].values)
                    my_file.write("{:.2f},".format(float(myBigMergedDF.loc[myBigMergedDF['mutant']==variant, 'PRESCOTT'].values[0])))
-    if(os.path.exists(protein+'_singleline.txt')):
+    # if(os.path.exists(protein+'_singleline.txt')):
-        os.remove(protein+'_singleline.txt')
+    #     os.remove(protein+'_singleline.txt')
-    if(os.path.exists(protein+'_singleline_1-ranksort.txt')):
+    # if(os.path.exists(protein+'_singleline_1-ranksort.txt')):
-        os.remove(protein+'_singleline_1-ranksort.txt')
+    #     os.remove(protein+'_singleline_1-ranksort.txt')
 if __name__ == "__main__":
    main()