Commit faafebd8 by Mustafa Tekpinar

Added a timer function to sgemme.py to show total compute time.

parent a53a653e
This source diff could not be displayed because it is too large. You can view the blob instead.
This source diff could not be displayed because it is too large. You can view the blob instead.
...@@ -9,27 +9,33 @@ then ...@@ -9,27 +9,33 @@ then
# rm -rf BLAT.fasta # rm -rf BLAT.fasta
rm -rf BLAT* rm -rf BLAT*
rm -f default.conf caracTest.dat rm -f default.conf caracTest.dat
rm -rf bin*.fasta
elif [ "$1" == "jetoff" ] elif [ "$1" == "jetoff" ]
then then
# If you have your own JET2 score file, you can turn off JET2 as follows: # If you have your own JET2 score file, you can turn off JET2 as follows:
# In ../tests folder, you can find a sample JET2 file called BLAT_jet.res # In ../tests folder, you can find a sample JET2 file called BLAT_jet.res
echo "Running GEMME with a user-provided alignment file." echo "Running SGEMME with a user-provided alignment file."
echo "Using a previously produced prot_jet.res file to check reproducibility!" echo "Using a previously produced prot_jet.res file to check reproducibility!"
cp ../tests/BLAT_jet.res . cp ../tests/BLAT_jet.res .
python $GEMME_PATH/gemme.py aliBLAT.fasta -r input -f aliBLAT.fasta --jetfile BLAT_jet.res python $SGEMME_PATH/sgemme.py aliBLAT.fasta -r input -f aliBLAT.fasta --jetfile BLAT_jet.res
elif [ "$1" == "withpdb" ] elif [ "$1" == "withpdb" ]
then then
#Please note that CV isa structural feature and it can not be calculated if you don't specify a pdb file. #Please note that CV isa structural feature and it can not be calculated if you don't specify a pdb file.
echo "Using blat-curated.pdb for the structural feature calculations!" echo "Using blat-af2.pdb for the structural feature calculations!"
python $GEMME_PATH/gemme.py aliBLAT.fasta -r input -f aliBLAT.fasta --pdbfile blat-curated.pdb --normweightmode dfi python $SGEMME_PATH/sgemme.py aliBLAT.fasta -r input -f aliBLAT.fasta --pdbfile blat-af2.pdb --normweightmode maxhalftracepchalftracecvhalfcvpc
elif [ "$1" == "withpdb-withmutfile" ]
then
#Please note that CV isa structural feature and it can not be calculated if you don't specify a pdb file.
echo "Using blat-af2.pdb for the structural feature calculations!"
python $SGEMME_PATH/sgemme.py aliBLAT.fasta -r input -f aliBLAT.fasta --pdbfile blat-af2.pdb --normweightmode maxhalftracepchalftracecvhalfcvpc -m Stiffler_2015_BLAT_ECOLX.mut
else else
echo "Running GEMME with a user-provided alignment file." echo "Running SGEMME with a user-provided alignment file."
python $GEMME_PATH/gemme.py aliBLAT.fasta -r input -f aliBLAT.fasta python $SGEMME_PATH/sgemme.py aliBLAT.fasta -r input -f aliBLAT.fasta
fi fi
...@@ -43,8 +43,10 @@ def extractQuerySeq(filename): ...@@ -43,8 +43,10 @@ def extractQuerySeq(filename):
def getNbSeq(filename): def getNbSeq(filename):
#TODO: Remove bash dependency and count the sequences within Python!
""" """
# Get the number of sequences in a multi-fasta file # Get the number of sequences in a multi-fasta file
""" """
if filename!='': if filename!='':
proc=subprocess.Popen("grep -c '^>' "+filename,stdout=subprocess.PIPE,shell=True) proc=subprocess.Popen("grep -c '^>' "+filename,stdout=subprocess.PIPE,shell=True)
...@@ -131,7 +133,7 @@ def launchJET(prot, retMet, bFile, fFile, pdbfile, chains, n, N, nl): ...@@ -131,7 +133,7 @@ def launchJET(prot, retMet, bFile, fFile, pdbfile, chains, n, N, nl):
chainID = chains[0] chainID = chains[0]
subprocess.call("cp $GEMME_PATH/default.conf .",shell=True) subprocess.call("cp $SGEMME_PATH/default.conf .",shell=True)
if retMet=="input": if retMet=="input":
if bFile!='': if bFile!='':
#TODO: I think these two lines must be here as well but I am not sure. #TODO: I think these two lines must be here as well but I am not sure.
...@@ -184,7 +186,7 @@ def launchJET(prot, retMet, bFile, fFile, pdbfile, chains, n, N, nl): ...@@ -184,7 +186,7 @@ def launchJET(prot, retMet, bFile, fFile, pdbfile, chains, n, N, nl):
print("\nRunning for SC1:\n"+jetcmd) print("\nRunning for SC1:\n"+jetcmd)
reCode=subprocess.call(jetcmd,shell=True) reCode=subprocess.call(jetcmd,shell=True)
if os.path.isfile(prot+"/"+prot+"_jet.res"): if os.path.isfile(prot+"/"+prot+"_jet.res"):
os.rename(prot+"/"+prot+"_jet.res",prot+"_jet_sc1.res") os.rename(prot+"/"+prot+"_jet.res",prot+"_jet.res")
dir_name = prot+"/" dir_name = prot+"/"
if os.path.isdir(dir_name): if os.path.isdir(dir_name):
...@@ -193,153 +195,153 @@ def launchJET(prot, retMet, bFile, fFile, pdbfile, chains, n, N, nl): ...@@ -193,153 +195,153 @@ def launchJET(prot, retMet, bFile, fFile, pdbfile, chains, n, N, nl):
if os.path.isfile(f_path): if os.path.isfile(f_path):
os.remove(f_path) os.remove(f_path)
os.rmdir(dir_name) os.rmdir(dir_name)
# Calculate SC2 # # Calculate SC2
jetcmd = "java -Xmx4096m -cp $JET2_PATH:$JET2_PATH/jet/extLibs/vecmath.jar jet.JET -c default.conf -i "+\ # jetcmd = "java -Xmx4096m -cp $JET2_PATH:$JET2_PATH/jet/extLibs/vecmath.jar jet.JET -c default.conf -i "+\
prot+".pdb -o `pwd` -p AVJCG -r input -f "+prot+"_"+chainID+".fasta -d chain -n "+n+" -a 4"+" > "+prot+".out" # prot+".pdb -o `pwd` -p AVJCG -r input -f "+prot+"_"+chainID+".fasta -d chain -n "+n+" -a 4"+" > "+prot+".out"
#One can also add: -g 'trace,pc,cv,clusters,axs' # #One can also add: -g 'trace,pc,cv,clusters,axs'
print("\nRunning for SC2:\n"+jetcmd) # print("\nRunning for SC2:\n"+jetcmd)
reCode=subprocess.call(jetcmd,shell=True) # reCode=subprocess.call(jetcmd,shell=True)
if os.path.isfile(prot+"/"+prot+"_jet.res"): # if os.path.isfile(prot+"/"+prot+"_jet.res"):
os.rename(prot+"/"+prot+"_jet.res",prot+"_jet_sc2.res") # os.rename(prot+"/"+prot+"_jet.res",prot+"_jet_sc2.res")
dir_name = prot+"/" # dir_name = prot+"/"
if os.path.isdir(dir_name): # if os.path.isdir(dir_name):
for f in os.listdir(dir_name): # for f in os.listdir(dir_name):
f_path = os.path.join(dir_name, f) # f_path = os.path.join(dir_name, f)
if os.path.isfile(f_path): # if os.path.isfile(f_path):
os.remove(f_path) # os.remove(f_path)
os.rmdir(dir_name) # os.rmdir(dir_name)
# Calculate SC3 # # Calculate SC3
jetcmd = "java -Xmx4096m -cp $JET2_PATH:$JET2_PATH/jet/extLibs/vecmath.jar jet.JET -c default.conf -i "+\ # jetcmd = "java -Xmx4096m -cp $JET2_PATH:$JET2_PATH/jet/extLibs/vecmath.jar jet.JET -c default.conf -i "+\
prot+".pdb -o `pwd` -p AVJCG -r input -f "+prot+"_"+chainID+".fasta -d chain -n "+n+" -a 5"+" > "+prot+".out" # prot+".pdb -o `pwd` -p AVJCG -r input -f "+prot+"_"+chainID+".fasta -d chain -n "+n+" -a 5"+" > "+prot+".out"
#One can also add: -g 'trace,pc,cv,clusters,axs' # #One can also add: -g 'trace,pc,cv,clusters,axs'
print("\nRunning for SC3:\n"+jetcmd) # print("\nRunning for SC3:\n"+jetcmd)
reCode=subprocess.call(jetcmd,shell=True) # reCode=subprocess.call(jetcmd,shell=True)
if os.path.isfile(prot+"/"+prot+"_jet.res"): # if os.path.isfile(prot+"/"+prot+"_jet.res"):
os.rename(prot+"/"+prot+"_jet.res",prot+"_jet_sc3.res") # os.rename(prot+"/"+prot+"_jet.res",prot+"_jet_sc3.res")
dir_name = prot+"/" # dir_name = prot+"/"
if os.path.isdir(dir_name): # if os.path.isdir(dir_name):
for f in os.listdir(dir_name): # for f in os.listdir(dir_name):
f_path = os.path.join(dir_name, f) # f_path = os.path.join(dir_name, f)
if os.path.isfile(f_path): # if os.path.isfile(f_path):
os.remove(f_path) # os.remove(f_path)
os.rmdir(dir_name) # os.rmdir(dir_name)
dfSC1 = pd.read_table(prot+"_jet_sc1.res", delimiter=r"\s+") # dfSC1 = pd.read_table(prot+"_jet_sc1.res", delimiter=r"\s+")
print(dfSC1) # print(dfSC1)
dfSC2 = pd.read_table(prot+"_jet_sc2.res", delimiter=r"\s+") # # dfSC2 = pd.read_table(prot+"_jet_sc2.res", delimiter=r"\s+")
print(dfSC2) # # print(dfSC2)
dfSC3 = pd.read_table(prot+"_jet_sc3.res", delimiter=r"\s+") # # dfSC3 = pd.read_table(prot+"_jet_sc3.res", delimiter=r"\s+")
print(dfSC3) # # print(dfSC3)
dfSC1['sc1'] =dfSC1['clusters'] # dfSC1['sc1'] =dfSC1['clusters']
dfSC1['sc2'] =dfSC2['clusters'] # # dfSC1['sc2'] =dfSC2['clusters']
dfSC1['sc3'] =dfSC3['clusters'] # # dfSC1['sc3'] =dfSC3['clusters']
print(dfSC1) # print(dfSC1)
#Calculate combined values of SC1, SC2 and SC3 and normalize them. # #Calculate combined values of SC1, SC2 and SC3 and normalize them.
combinedValuesV1 = [] # combinedValuesV1 = []
combinedValuesV2 = [] # combinedValuesV2 = []
value1List = [] # value1List = []
value2List = [] # value2List = []
value3List = [] # value3List = []
value4List = [] # value4List = []
value5List = [] # value5List = []
value6List = [] # value6List = []
temp1List = [] # temp1List = []
temp2List = [] # temp2List = []
temp3List = [] # temp3List = []
for i in range(len(dfSC1)): # for i in range(len(dfSC1)):
value1=(dfSC1.loc[i, "sc1"]*dfSC1.loc[i, "trace"]) + dfSC1.loc[i, "trace"] # value1=(dfSC1.loc[i, "sc1"]*dfSC1.loc[i, "trace"]) + dfSC1.loc[i, "trace"]
value1List.append(value1) # value1List.append(value1)
value2=(dfSC1.loc[i, "sc2"]*(dfSC1.loc[i, "trace"]+dfSC1.loc[i, "cv"])/2)**2 + ((dfSC1.loc[i, "trace"]+dfSC1.loc[i, "cv"])/2) # value2=(dfSC1.loc[i, "sc2"]*(dfSC1.loc[i, "trace"]+dfSC1.loc[i, "cv"])/2)**2 + ((dfSC1.loc[i, "trace"]+dfSC1.loc[i, "cv"])/2)
value2List.append(value2) # value2List.append(value2)
value3=(dfSC1.loc[i, "sc3"]*(dfSC1.loc[i, "pc"]+dfSC1.loc[i, "cv"])/2)**2 + ((dfSC1.loc[i, "pc"]+dfSC1.loc[i, "cv"])/2) # value3=(dfSC1.loc[i, "sc3"]*(dfSC1.loc[i, "pc"]+dfSC1.loc[i, "cv"])/2)**2 + ((dfSC1.loc[i, "pc"]+dfSC1.loc[i, "cv"])/2)
value3List.append(value3) # value3List.append(value3)
value4=(dfSC1.loc[i, "sc1"]*(dfSC1.loc[i, "trace"]+dfSC1.loc[i, "pc"])/2)**2 + ((dfSC1.loc[i, "trace"]+dfSC1.loc[i, "pc"])/2) # value4=(dfSC1.loc[i, "sc1"]*(dfSC1.loc[i, "trace"]+dfSC1.loc[i, "pc"])/2)**2 + ((dfSC1.loc[i, "trace"]+dfSC1.loc[i, "pc"])/2)
value4List.append(value4) # value4List.append(value4)
value5=(dfSC1.loc[i, "sc2"]*(dfSC1.loc[i, "trace"]+dfSC1.loc[i, "cv"])/2) + ((dfSC1.loc[i, "trace"]+dfSC1.loc[i, "pc"])/2) # value5=(dfSC1.loc[i, "sc2"]*(dfSC1.loc[i, "trace"]+dfSC1.loc[i, "cv"])/2) + ((dfSC1.loc[i, "trace"]+dfSC1.loc[i, "pc"])/2)
value5List.append(value5) # value5List.append(value5)
value6=(dfSC1.loc[i, "sc3"]*(dfSC1.loc[i, "pc"]+dfSC1.loc[i, "cv"])/2) + ((dfSC1.loc[i, "trace"]+dfSC1.loc[i, "pc"])/2) # value6=(dfSC1.loc[i, "sc3"]*(dfSC1.loc[i, "pc"]+dfSC1.loc[i, "cv"])/2) + ((dfSC1.loc[i, "trace"]+dfSC1.loc[i, "pc"])/2)
value6List.append(value6) # value6List.append(value6)
temp1 = (dfSC1.loc[i, "sc1"]*(dfSC1.loc[i, "trace"]+dfSC1.loc[i, "pc"])/2) + ((dfSC1.loc[i, "trace"]+dfSC1.loc[i, "pc"])/2)**2 # temp1 = (dfSC1.loc[i, "sc1"]*(dfSC1.loc[i, "trace"]+dfSC1.loc[i, "pc"])/2) + ((dfSC1.loc[i, "trace"]+dfSC1.loc[i, "pc"])/2)**2
temp1List.append(temp1) # temp1List.append(temp1)
temp2 = (dfSC1.loc[i, "sc2"]*(dfSC1.loc[i, "trace"]+dfSC1.loc[i, "cv"])/2) + ((dfSC1.loc[i, "trace"]+dfSC1.loc[i, "cv"])/2)**2 # temp2 = (dfSC1.loc[i, "sc2"]*(dfSC1.loc[i, "trace"]+dfSC1.loc[i, "cv"])/2) + ((dfSC1.loc[i, "trace"]+dfSC1.loc[i, "cv"])/2)**2
temp2List.append(temp2) # temp2List.append(temp2)
temp3 = (dfSC1.loc[i, "sc3"]*(dfSC1.loc[i, "pc"]+dfSC1.loc[i, "cv"])/2) + ((dfSC1.loc[i, "pc"]+dfSC1.loc[i, "cv"])/2)**2 # temp3 = (dfSC1.loc[i, "sc3"]*(dfSC1.loc[i, "pc"]+dfSC1.loc[i, "cv"])/2) + ((dfSC1.loc[i, "pc"]+dfSC1.loc[i, "cv"])/2)**2
temp3List.append(temp3) # temp3List.append(temp3)
combinedValuesV1.append(max([value1, value2, value3])) # combinedValuesV1.append(max([value1, value2, value3]))
combinedValuesV2.append(max([value2, value3, value4])) # combinedValuesV2.append(max([value2, value3, value4]))
#MinMax normalize the list # #MinMax normalize the list
combinedValuesV1MinMaxed = minMaxNormalization(combinedValuesV1) # combinedValuesV1MinMaxed = minMaxNormalization(combinedValuesV1)
combinedValuesV2MinMaxed = minMaxNormalization(combinedValuesV2) # combinedValuesV2MinMaxed = minMaxNormalization(combinedValuesV2)
# value1ListMinMaxed = minMaxNormalization(value1List) # # value1ListMinMaxed = minMaxNormalization(value1List)
# value2ListMinMaxed = minMaxNormalization(value2List) # # value2ListMinMaxed = minMaxNormalization(value2List)
# value3ListMinMaxed = minMaxNormalization(value3List) # # value3ListMinMaxed = minMaxNormalization(value3List)
# value4ListMinMaxed = minMaxNormalization(value4List) # # value4ListMinMaxed = minMaxNormalization(value4List)
# # combinedValuesV3MinMaxed = []
# # for i in range(len(dfSC1)):
# # combinedValuesV3MinMaxed.append(max([value2ListMinMaxed[i], value3ListMinMaxed[i], value4ListMinMaxed[i]]))
# #Let's skip that minmax normalization part
# combinedValuesV3MinMaxed = [] # combinedValuesV3MinMaxed = []
# for i in range(len(dfSC1)): # for i in range(len(dfSC1)):
# combinedValuesV3MinMaxed.append(max([value2ListMinMaxed[i], value3ListMinMaxed[i], value4ListMinMaxed[i]])) # maxTemp = max([value2List[i], value3List[i], value4List[i]])
# if(maxTemp > 1.0):
#Let's skip that minmax normalization part # combinedValuesV3MinMaxed.append(1.0)
combinedValuesV3MinMaxed = [] # else:
for i in range(len(dfSC1)): # combinedValuesV3MinMaxed.append(maxTemp)
maxTemp = max([value2List[i], value3List[i], value4List[i]])
if(maxTemp > 1.0): # combinedValuesV4MinMaxed = []
combinedValuesV3MinMaxed.append(1.0) # for i in range(len(dfSC1)):
else: # maxTemp = max([temp1List[i], temp2List[i], temp3List[i]])
combinedValuesV3MinMaxed.append(maxTemp) # if(maxTemp > 1.0):
# combinedValuesV4MinMaxed.append(1.0)
combinedValuesV4MinMaxed = [] # else:
for i in range(len(dfSC1)): # combinedValuesV4MinMaxed.append(maxTemp)
maxTemp = max([temp1List[i], temp2List[i], temp3List[i]])
if(maxTemp > 1.0): # dfSC1['combinedv1'] = combinedValuesV1MinMaxed.round(4)
combinedValuesV4MinMaxed.append(1.0) # dfSC1['combinedv2'] = combinedValuesV2MinMaxed.round(4)
else: # dfSC1['combinedv3'] = list(np.around(np.array(combinedValuesV3MinMaxed), 4))
combinedValuesV4MinMaxed.append(maxTemp) # dfSC1['combinedv4'] = list(np.around(np.array(combinedValuesV4MinMaxed), 4))
# #dfSC1['combinedv3'] = combinedValuesV3MinMaxed.round(4)
dfSC1['combinedv1'] = combinedValuesV1MinMaxed.round(4)
dfSC1['combinedv2'] = combinedValuesV2MinMaxed.round(4)
dfSC1['combinedv3'] = list(np.around(np.array(combinedValuesV3MinMaxed), 4))
dfSC1['combinedv4'] = list(np.around(np.array(combinedValuesV4MinMaxed), 4)) # # dfSC1['cv'] = dfSC1['cv'].round(4)
#dfSC1['combinedv3'] = combinedValuesV3MinMaxed.round(4) # # dfSC1['pc'] = dfSC1['pc'].round(4)
# # dfSC1['tr'] = dfSC1['tr'].round(4)
# # dfSC1['freq'] = dfSC1['freq'].round(1)
# # dfSC1['trace'] = dfSC1['trace'].round(4)
# dfSC1['cv'] = dfSC1['cv'].round(4) # # dfSC1['clusters'] = dfSC1['clusters'].round(4)
# dfSC1['pc'] = dfSC1['pc'].round(4) # # dfSC1['clusnumber'] = dfSC1['clusnumber'].round(1)
# dfSC1['tr'] = dfSC1['tr'].round(4) # # dfSC1['sc1'] = dfSC1['sc1'].round(4)
# dfSC1['freq'] = dfSC1['freq'].round(1) # # dfSC1['sc2'] = dfSC1['sc2'].round(4)
# dfSC1['trace'] = dfSC1['trace'].round(4) # # dfSC1['sc3'] = dfSC1['sc3'].round(4)
# dfSC1['clusters'] = dfSC1['clusters'].round(4)
# dfSC1['clusnumber'] = dfSC1['clusnumber'].round(1)
# dfSC1['sc1'] = dfSC1['sc1'].round(4)
# dfSC1['sc2'] = dfSC1['sc2'].round(4) # dfSC1.to_csv(prot+"_jet.res", header=True, index=None, sep='\t', mode='w')
# dfSC1['sc3'] = dfSC1['sc3'].round(4)
dfSC1.to_csv(prot+"_jet.res", header=True, index=None, sep='\t', mode='w')
#dfSC1.to_csv(prot+"_jet.res", header=True, index=None, mode='w') #dfSC1.to_csv(prot+"_jet.res", header=True, index=None, mode='w')
#sys.exit(-1) #sys.exit(-1)
...@@ -363,9 +365,9 @@ def launchJET(prot, retMet, bFile, fFile, pdbfile, chains, n, N, nl): ...@@ -363,9 +365,9 @@ def launchJET(prot, retMet, bFile, fFile, pdbfile, chains, n, N, nl):
def launchPred(prot,inAli,mutFile, normWeightMode, alphabet): def launchPred(prot,inAli,mutFile, normWeightMode, alphabet):
if mutFile!='': if mutFile!='':
rcmd="Rscript --save $GEMME_PATH/computePred.R "+prot+" "+inAli+" FALSE "+mutFile+" "+normWeightMode+" "+alphabet rcmd="Rscript --save $SGEMME_PATH/computePred.R "+prot+" "+inAli+" FALSE "+mutFile+" "+normWeightMode+" "+alphabet
else: else:
rcmd="Rscript --save $GEMME_PATH/computePred.R "+prot+" "+inAli+" TRUE none "+normWeightMode+" "+alphabet rcmd="Rscript --save $SGEMME_PATH/computePred.R "+prot+" "+inAli+" TRUE none "+normWeightMode+" "+alphabet
print("\nRunning: \n"+rcmd) print("\nRunning: \n"+rcmd)
reCode=subprocess.call(rcmd,shell=True) reCode=subprocess.call(rcmd,shell=True)
......
# -*- coding: utf-8 -*- # -*- coding: utf-8 -*-
# Copyright (c) 2018: Elodie Laine # Copyright (c) 2018: Elodie Laine
# Copyright (c) 2022: Mustafa Tekpinar # Copyright (c) 2022-2023: Mustafa Tekpinar
# This code is part of the gemme package and governed by its license. # This code is part of the gemme package and governed by its license.
# Please see the LICENSE.txt file included as part of this package. # Please see the LICENSE.txt file included as part of this package.
import sys import sys
import os import os
import time
import argparse import argparse
import re import re
import subprocess import subprocess
...@@ -672,11 +673,17 @@ def doit(inAli,mutFile,retMet,bFile,fFile,n,N, jetfile, pdbfile, normWeightMode, ...@@ -672,11 +673,17 @@ def doit(inAli,mutFile,retMet,bFile,fFile,n,N, jetfile, pdbfile, normWeightMode,
def main(): def main():
tic = time.perf_counter()
args = parse_command_line() args = parse_command_line()
doit(args.input,args.mutations,args.retrievingMethod,args.blastFile,\ doit(args.input,args.mutations,args.retrievingMethod,args.blastFile,\
args.fastaFile,args.nIter,args.NSeqs, args.jetfile, args.pdbfile,\ args.fastaFile,args.nIter,args.NSeqs, args.jetfile, args.pdbfile,\
args.normweightmode, args.alphabet) args.normweightmode, args.alphabet)
toc = time.perf_counter()
print(f"S-GEMME computation finished in {toc - tic:0.4f} seconds!")
if (__name__ == '__main__'): if (__name__ == '__main__'):
main() main()
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment