Commit cce7cb89 by Mustafa Tekpinar

Removed Linux grep command calls.

parent 2d117e24
...@@ -21,7 +21,7 @@ ESGEMME requires two files: ...@@ -21,7 +21,7 @@ ESGEMME requires two files:
One of the fastest ways to obtain both input MSA and a PDB file is to run colabfold: One of the fastest ways to obtain both input MSA and a PDB file is to run colabfold:
https://colab.research.google.com/github/sokrypton/ColabFold/blob/main/AlphaFold2.ipynb https://colab.research.google.com/github/sokrypton/ColabFold/blob/main/AlphaFold2.ipynb
Please note that the MSA file produced by colabfold (a3m file) can contain gaps in the query sequence. You have to remove them before using it in ESGEMME. Please note that the MSA file produced by colabfold (a3m file) can contain gaps in the query sequence. You have to remove them before using it in ESGEMME. You can remove the gaps with pragrams that have a GUI, such as ugene (http://ugene.net/) or jalview (https://www.jalview.org/).
For testing purpose, you can find example input files for BLAT protein in data/ folder of this repository. For testing purpose, you can find example input files for BLAT protein in data/ folder of this repository.
...@@ -81,9 +81,12 @@ export ESGEMME_PATH=/path-to-ESGEMME-directory/ ...@@ -81,9 +81,12 @@ export ESGEMME_PATH=/path-to-ESGEMME-directory/
#### Installing the dependencies: #### Installing the dependencies:
ESGEMME has the following external dependencies: ESGEMME has the following external dependencies:
* Joint Evolutionary Trees: http://www.lcqb.upmc.fr/JET2/ * Joint Evolutionary Trees: http://www.lcqb.upmc.fr/JET2/ and its dependencies:
* seqinr R package: https://cran.r-project.org/web/packages/seqinr/index.html - naccess: http://www.bioinf.manchester.ac.uk/naccess/
* naccess: http://www.bioinf.manchester.ac.uk/naccess/ - muscle: https://www.drive5.com/muscle/
**
* seqinr R package: https://cran.r-project.org/web/packages/seqinr/index.html
These tools should be installed to be able to use ESGEMME. These tools should be installed to be able to use ESGEMME.
......
...@@ -19,7 +19,7 @@ import glob ...@@ -19,7 +19,7 @@ import glob
from prody import * from prody import *
from scipy.stats import rankdata from scipy.stats import rankdata
from Bio import AlignIO from Bio import AlignIO, SeqIO
import biotite.structure.io as strucio import biotite.structure.io as strucio
import biotite.application.dssp as dssp import biotite.application.dssp as dssp
...@@ -29,11 +29,40 @@ import biotite.structure.io.pdb as pdb ...@@ -29,11 +29,40 @@ import biotite.structure.io.pdb as pdb
import pandas as pd import pandas as pd
def selectMaxFastaSequences(inputMSAfile, outputMSAfile, maxNumSeqs=40001):
"""
Given a multiple sequence alignment file in FASTA format,
select the top maxNumSeqs sequences and write them to outputMSAfile.
Parameters
----------
inputMSAfile: string
Name of the input multiple sequence alignment file in FASTA format.
outputMSAfile: string
Name of the output multiple sequence alignment file in FASTA format.
maxNumSeqs: int
Maximum number of sequences to be selected from the input MSA file.
Default is 40000 + 1 (for query sequence) .
Returns
-------
Nothing
"""
alignment = AlignIO.read(inputMSAfile, 'fasta')
# print(alignment[0].seq)
numSeqInMSA = len(alignment)
# print("Number of sequences read: "+str(numSeqInMSA))
with open(outputMSAfile, "w") as handle:
SeqIO.write(alignment[0:maxNumSeqs], handle, "fasta")
#############The following part between # signs is moved from gemmeAnal.py to #############The following part between # signs is moved from gemmeAnal.py to
# make the code more simple. # make the code more simple.
def extractQuerySeq(filename): def extractQuerySeq(filename):
""" """
# Extract the query sequence from the input alignment Extract the query sequence from the input alignment
""" """
fIN = open(filename,"r") fIN = open(filename,"r")
...@@ -209,13 +238,17 @@ def launchJET(prot, retMet, bFile, fFile, pdbfile, chains, n, N, nl): ...@@ -209,13 +238,17 @@ def launchJET(prot, retMet, bFile, fFile, pdbfile, chains, n, N, nl):
shutil.copy2(fFile, fFile+".orig") shutil.copy2(fFile, fFile+".orig")
#I think this subprocess call causes overwriting of the fasta file. #I think this subprocess call causes overwriting of the fasta file.
#subprocess.call("cp "+fFile+" "+prot+"_"+chainID+".fasta",shell=True) #subprocess.call("cp "+fFile+" "+prot+"_"+chainID+".fasta",shell=True)
grpcmd="grep -m "+str(int(N)+1)+" -A "+str(nl)+" '^>' "+fFile+".orig > "+prot+"_"+chainID+".fasta" # grpcmd="grep -m "+str(int(N)+1)+" -A "+str(nl)+" '^>' "+fFile+".orig > "+prot+"_"+chainID+".fasta"
# print("\nRunning:\n"+grpcmd)
# subprocess.call(grpcmd,shell=True)
selectMaxFastaSequences(fFile+".orig", prot+"_"+chainID+".fasta", maxNumSeqs=(int(N)+1))
else: else:
#subprocess.call("cp "+fFile+" "+prot+"_"+chainID+".fasta",shell=True) #subprocess.call("cp "+fFile+" "+prot+"_"+chainID+".fasta",shell=True)
grpcmd="grep -m "+str(int(N)+1)+" -A "+str(nl)+" '^>' "+fFile+" > "+prot+"_"+chainID+".fasta" #grpcmd="grep -m "+str(int(N)+1)+" -A "+str(nl)+" '^>' "+fFile+" > "+prot+"_"+chainID+".fasta"
print("\nRunning:\n"+grpcmd) #print("\nRunning:\n"+grpcmd)
subprocess.call(grpcmd,shell=True) #subprocess.call(grpcmd,shell=True)
selectMaxFastaSequences(fFile, prot+"_"+chainID+".fasta", maxNumSeqs=(int(N)+1))
if(pdbfile == None): if(pdbfile == None):
jetcmd = "java -Xmx4096m -cp $JET2_PATH:$JET2_PATH/jet/extLibs/vecmath.jar jet.JET -c default.conf -i "+\ jetcmd = "java -Xmx4096m -cp $JET2_PATH:$JET2_PATH/jet/extLibs/vecmath.jar jet.JET -c default.conf -i "+\
prot+".pdb -o `pwd` -p J -r input -f "+prot+"_"+chainID+".fasta -d chain -n "+n+" > "+prot+".out" prot+".pdb -o `pwd` -p J -r input -f "+prot+"_"+chainID+".fasta -d chain -n "+n+" > "+prot+".out"
......
...@@ -48,7 +48,7 @@ then ...@@ -48,7 +48,7 @@ then
python $ESGEMME_PATH/esgemme.py ../data/aliBLAT.fasta -r input -f ../data/aliBLAT.fasta \ python $ESGEMME_PATH/esgemme.py ../data/aliBLAT.fasta -r input -f ../data/aliBLAT.fasta \
--pdbfile ../data/blat-af2.pdb --normweightmode sstjetormax \ --pdbfile ../data/blat-af2.pdb --normweightmode sstjetormax \
-m ../data/Stiffler_2015_BLAT_ECOLX.mut -m ../data/Stiffler_2015_BLAT_ECOLX.mut
#demust compare -i ../data/BLAT_ECOLX_Stiffler_2015_experimental.dat --itype singleline -j BLAT_normPred_evolCombi.txt --jtype singleline
else else
echo "Running EGEMME with a user-provided alignment file." echo "Running EGEMME with a user-provided alignment file."
echo "Since a pdb file is not provided, only evolutionary information will be used!" echo "Since a pdb file is not provided, only evolutionary information will be used!"
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment