Commit cce7cb89 by Mustafa Tekpinar

Removed Linux grep command calls.

parent 2d117e24
......@@ -21,7 +21,7 @@ ESGEMME requires two files:
One of the fastest ways to obtain both input MSA and a PDB file is to run colabfold:
https://colab.research.google.com/github/sokrypton/ColabFold/blob/main/AlphaFold2.ipynb
Please note that the MSA file produced by colabfold (a3m file) can contain gaps in the query sequence. You have to remove them before using it in ESGEMME.
Please note that the MSA file produced by colabfold (a3m file) can contain gaps in the query sequence. You have to remove them before using it in ESGEMME. You can remove the gaps with pragrams that have a GUI, such as ugene (http://ugene.net/) or jalview (https://www.jalview.org/).
For testing purpose, you can find example input files for BLAT protein in data/ folder of this repository.
......@@ -81,9 +81,12 @@ export ESGEMME_PATH=/path-to-ESGEMME-directory/
#### Installing the dependencies:
ESGEMME has the following external dependencies:
* Joint Evolutionary Trees: http://www.lcqb.upmc.fr/JET2/
* Joint Evolutionary Trees: http://www.lcqb.upmc.fr/JET2/ and its dependencies:
- naccess: http://www.bioinf.manchester.ac.uk/naccess/
- muscle: https://www.drive5.com/muscle/
**
* seqinr R package: https://cran.r-project.org/web/packages/seqinr/index.html
* naccess: http://www.bioinf.manchester.ac.uk/naccess/
These tools should be installed to be able to use ESGEMME.
......
......@@ -19,7 +19,7 @@ import glob
from prody import *
from scipy.stats import rankdata
from Bio import AlignIO
from Bio import AlignIO, SeqIO
import biotite.structure.io as strucio
import biotite.application.dssp as dssp
......@@ -29,11 +29,40 @@ import biotite.structure.io.pdb as pdb
import pandas as pd
def selectMaxFastaSequences(inputMSAfile, outputMSAfile, maxNumSeqs=40001):
"""
Given a multiple sequence alignment file in FASTA format,
select the top maxNumSeqs sequences and write them to outputMSAfile.
Parameters
----------
inputMSAfile: string
Name of the input multiple sequence alignment file in FASTA format.
outputMSAfile: string
Name of the output multiple sequence alignment file in FASTA format.
maxNumSeqs: int
Maximum number of sequences to be selected from the input MSA file.
Default is 40000 + 1 (for query sequence) .
Returns
-------
Nothing
"""
alignment = AlignIO.read(inputMSAfile, 'fasta')
# print(alignment[0].seq)
numSeqInMSA = len(alignment)
# print("Number of sequences read: "+str(numSeqInMSA))
with open(outputMSAfile, "w") as handle:
SeqIO.write(alignment[0:maxNumSeqs], handle, "fasta")
#############The following part between # signs is moved from gemmeAnal.py to
# make the code more simple.
def extractQuerySeq(filename):
"""
# Extract the query sequence from the input alignment
Extract the query sequence from the input alignment
"""
fIN = open(filename,"r")
......@@ -209,13 +238,17 @@ def launchJET(prot, retMet, bFile, fFile, pdbfile, chains, n, N, nl):
shutil.copy2(fFile, fFile+".orig")
#I think this subprocess call causes overwriting of the fasta file.
#subprocess.call("cp "+fFile+" "+prot+"_"+chainID+".fasta",shell=True)
grpcmd="grep -m "+str(int(N)+1)+" -A "+str(nl)+" '^>' "+fFile+".orig > "+prot+"_"+chainID+".fasta"
# grpcmd="grep -m "+str(int(N)+1)+" -A "+str(nl)+" '^>' "+fFile+".orig > "+prot+"_"+chainID+".fasta"
# print("\nRunning:\n"+grpcmd)
# subprocess.call(grpcmd,shell=True)
selectMaxFastaSequences(fFile+".orig", prot+"_"+chainID+".fasta", maxNumSeqs=(int(N)+1))
else:
#subprocess.call("cp "+fFile+" "+prot+"_"+chainID+".fasta",shell=True)
grpcmd="grep -m "+str(int(N)+1)+" -A "+str(nl)+" '^>' "+fFile+" > "+prot+"_"+chainID+".fasta"
#grpcmd="grep -m "+str(int(N)+1)+" -A "+str(nl)+" '^>' "+fFile+" > "+prot+"_"+chainID+".fasta"
print("\nRunning:\n"+grpcmd)
subprocess.call(grpcmd,shell=True)
#print("\nRunning:\n"+grpcmd)
#subprocess.call(grpcmd,shell=True)
selectMaxFastaSequences(fFile, prot+"_"+chainID+".fasta", maxNumSeqs=(int(N)+1))
if(pdbfile == None):
jetcmd = "java -Xmx4096m -cp $JET2_PATH:$JET2_PATH/jet/extLibs/vecmath.jar jet.JET -c default.conf -i "+\
prot+".pdb -o `pwd` -p J -r input -f "+prot+"_"+chainID+".fasta -d chain -n "+n+" > "+prot+".out"
......
......@@ -48,7 +48,7 @@ then
python $ESGEMME_PATH/esgemme.py ../data/aliBLAT.fasta -r input -f ../data/aliBLAT.fasta \
--pdbfile ../data/blat-af2.pdb --normweightmode sstjetormax \
-m ../data/Stiffler_2015_BLAT_ECOLX.mut
#demust compare -i ../data/BLAT_ECOLX_Stiffler_2015_experimental.dat --itype singleline -j BLAT_normPred_evolCombi.txt --jtype singleline
else
echo "Running EGEMME with a user-provided alignment file."
echo "Since a pdb file is not provided, only evolutionary information will be used!"
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment