Removed Linux grep command calls.

cce7cb89 · Mustafa Tekpinar · 2d117e24 · cce7cb89 · cce7cb89 · cce7cb89
Commit cce7cb89 authored Apr 03, 2023 by Mustafa Tekpinar
4 changed files
--- a/README.md
+++ b/README.md
@@ -21,7 +21,7 @@ ESGEMME requires two files:
 One of the fastest ways to obtain both input MSA and a PDB file is to run colabfold:
 https://colab.research.google.com/github/sokrypton/ColabFold/blob/main/AlphaFold2.ipynb
-Please note that the MSA file produced by colabfold (a3m file) can contain gaps in the query sequence. You have to remove them before using it in ESGEMME. 
+Please note that the MSA file produced by colabfold (a3m file) can contain gaps in the query sequence. You have to remove them before using it in ESGEMME. You can remove the gaps with pragrams that have a GUI, such as ugene (http://ugene.net/) or jalview (https://www.jalview.org/). 
 For testing purpose, you can find example input files for BLAT protein in data/ folder of this repository. 
@@ -81,9 +81,12 @@ export ESGEMME_PATH=/path-to-ESGEMME-directory/
 #### Installing the dependencies:
 ESGEMME has the following external dependencies:
-* Joint Evolutionary Trees: http://www.lcqb.upmc.fr/JET2/
+* Joint Evolutionary Trees: http://www.lcqb.upmc.fr/JET2/ and its dependencies:
-* seqinr R package: https://cran.r-project.org/web/packages/seqinr/index.html
+    - naccess: http://www.bioinf.manchester.ac.uk/naccess/
-* naccess: http://www.bioinf.manchester.ac.uk/naccess/
+    - muscle: https://www.drive5.com/muscle/
+**
+* seqinr R package: https://cran.r-project.org/web/packages/seqinr/index.html 
 These tools should be installed to be able to use ESGEMME.

--- a/data/BLAT_ECOLX_Stiffler_2015_experimental.dat
+++ b/data/BLAT_ECOLX_Stiffler_2015_experimental.dat
--- a/esgemme.py
+++ b/esgemme.py
@@ -19,7 +19,7 @@ import glob
 from prody import *
 from scipy.stats import rankdata
-from Bio import AlignIO
+from Bio import AlignIO, SeqIO
 import biotite.structure.io as strucio
 import biotite.application.dssp as dssp
@@ -29,11 +29,40 @@ import biotite.structure.io.pdb as pdb
 import pandas as pd
+def selectMaxFastaSequences(inputMSAfile, outputMSAfile, maxNumSeqs=40001):
+    """
+        Given a multiple sequence alignment file in FASTA format,
+        select the top maxNumSeqs sequences and write them to outputMSAfile.
+    Parameters
+    ----------
+	inputMSAfile: string 
+        Name of the input multiple sequence alignment file in FASTA format.
+    outputMSAfile: string
+        Name of the output multiple sequence alignment file in FASTA format.
+	maxNumSeqs: int
+		Maximum number of sequences to be selected from the input MSA file.
+        Default is 40000 + 1 (for query sequence) .
+    Returns
+    -------
+	Nothing 
+    """
+    alignment = AlignIO.read(inputMSAfile, 'fasta')
+    # print(alignment[0].seq)
+    numSeqInMSA = len(alignment)
+    # print("Number of sequences read: "+str(numSeqInMSA))
+    with open(outputMSAfile, "w") as handle:
+        SeqIO.write(alignment[0:maxNumSeqs], handle, "fasta")
 #############The following part between # signs is moved from gemmeAnal.py to 
 # make the code more simple.  
 def extractQuerySeq(filename):
 	"""
-		# Extract the query sequence from the input alignment
+		Extract the query sequence from the input alignment
 	"""
 	fIN = open(filename,"r")
@@ -209,13 +238,17 @@ def launchJET(prot, retMet, bFile, fFile, pdbfile, chains, n, N, nl):
                shutil.copy2(fFile, fFile+".orig")
                #I think this subprocess call causes overwriting of the fasta file. 
                #subprocess.call("cp "+fFile+" "+prot+"_"+chainID+".fasta",shell=True)
-                grpcmd="grep -m "+str(int(N)+1)+" -A "+str(nl)+" '^>' "+fFile+".orig > "+prot+"_"+chainID+".fasta"
+                # grpcmd="grep -m "+str(int(N)+1)+" -A "+str(nl)+" '^>' "+fFile+".orig > "+prot+"_"+chainID+".fasta"
+                # print("\nRunning:\n"+grpcmd)
+                # subprocess.call(grpcmd,shell=True)
+                selectMaxFastaSequences(fFile+".orig", prot+"_"+chainID+".fasta", maxNumSeqs=(int(N)+1))
            else:
                #subprocess.call("cp "+fFile+" "+prot+"_"+chainID+".fasta",shell=True)
-                grpcmd="grep -m "+str(int(N)+1)+" -A "+str(nl)+" '^>' "+fFile+" > "+prot+"_"+chainID+".fasta"
+                #grpcmd="grep -m "+str(int(N)+1)+" -A "+str(nl)+" '^>' "+fFile+" > "+prot+"_"+chainID+".fasta"
-            print("\nRunning:\n"+grpcmd)
+                #print("\nRunning:\n"+grpcmd)
-            subprocess.call(grpcmd,shell=True)
+                #subprocess.call(grpcmd,shell=True)
+                selectMaxFastaSequences(fFile, prot+"_"+chainID+".fasta", maxNumSeqs=(int(N)+1))
            if(pdbfile == None):
                jetcmd = "java -Xmx4096m -cp $JET2_PATH:$JET2_PATH/jet/extLibs/vecmath.jar jet.JET -c default.conf -i "+\
                        prot+".pdb -o `pwd` -p J -r input -f "+prot+"_"+chainID+".fasta -d chain -n "+n+" > "+prot+".out"

--- a/examples/example-esgemme-script.sh
+++ b/examples/example-esgemme-script.sh
@@ -48,7 +48,7 @@ then
    python $ESGEMME_PATH/esgemme.py ../data/aliBLAT.fasta -r input -f ../data/aliBLAT.fasta \
 	   --pdbfile ../data/blat-af2.pdb --normweightmode sstjetormax \
 	   -m ../data/Stiffler_2015_BLAT_ECOLX.mut
+    #demust compare -i ../data/BLAT_ECOLX_Stiffler_2015_experimental.dat --itype singleline -j BLAT_normPred_evolCombi.txt --jtype singleline
 else
    echo "Running EGEMME with a user-provided alignment file."
    echo "Since a pdb file is not provided, only evolutionary information will be used!"