Added a timer function to sgemme.py to show total compute time.

faafebd8 · Mustafa Tekpinar · a53a653e · faafebd8 · faafebd8 · faafebd8
Commit faafebd8 authored Jan 19, 2023 by Mustafa Tekpinar
5 changed files
--- a/example/aliBLAT.fasta
+++ b/example/aliBLAT.fasta
--- a/example/blat-af2.pdb
+++ b/example/blat-af2.pdb
--- a/example/example-gemme-script.sh
+++ b/example/example-gemme-script.sh
@@ -9,27 +9,33 @@ then
    # rm -rf BLAT.fasta
    rm -rf BLAT*
    rm -f default.conf caracTest.dat
+    rm -rf bin*.fasta
 elif [ "$1" == "jetoff" ]
 then
    # If you have your own JET2 score file, you can turn off JET2 as follows:
    # In ../tests folder, you can find a sample JET2 file called BLAT_jet.res 
-    echo "Running GEMME with a user-provided alignment file."
+    echo "Running SGEMME with a user-provided alignment file."
    echo "Using a previously produced prot_jet.res file to check reproducibility!"
    cp ../tests/BLAT_jet.res .
-    python $GEMME_PATH/gemme.py aliBLAT.fasta -r input -f aliBLAT.fasta --jetfile BLAT_jet.res
+    python $SGEMME_PATH/sgemme.py aliBLAT.fasta -r input -f aliBLAT.fasta --jetfile BLAT_jet.res
 elif [ "$1" == "withpdb" ]
 then
    #Please note that CV isa structural feature and it can not be calculated if you don't specify a pdb file.
-    echo "Using blat-curated.pdb for the structural feature calculations!"
+    echo "Using blat-af2.pdb for the structural feature calculations!"
-    python $GEMME_PATH/gemme.py aliBLAT.fasta -r input -f aliBLAT.fasta --pdbfile blat-curated.pdb --normweightmode dfi
+    python $SGEMME_PATH/sgemme.py aliBLAT.fasta -r input -f aliBLAT.fasta --pdbfile blat-af2.pdb --normweightmode maxhalftracepchalftracecvhalfcvpc
+elif [ "$1" == "withpdb-withmutfile" ]
+then
+    #Please note that CV isa structural feature and it can not be calculated if you don't specify a pdb file.
+    echo "Using blat-af2.pdb for the structural feature calculations!"
+    python $SGEMME_PATH/sgemme.py aliBLAT.fasta -r input -f aliBLAT.fasta --pdbfile blat-af2.pdb --normweightmode maxhalftracepchalftracecvhalfcvpc -m Stiffler_2015_BLAT_ECOLX.mut
 else
-    echo "Running GEMME with a user-provided alignment file."
+    echo "Running SGEMME with a user-provided alignment file."
-    python $GEMME_PATH/gemme.py aliBLAT.fasta -r input -f aliBLAT.fasta
+    python $SGEMME_PATH/sgemme.py aliBLAT.fasta -r input -f aliBLAT.fasta
 fi
--- a/gemmeAnal.py
+++ b/gemmeAnal.py
@@ -43,8 +43,10 @@ def extractQuerySeq(filename):
 def getNbSeq(filename):
+	#TODO: Remove bash dependency and count the sequences within Python!
 	"""
 		# Get the number of sequences in a multi-fasta file
 	"""
 	if filename!='':
 		proc=subprocess.Popen("grep -c '^>' "+filename,stdout=subprocess.PIPE,shell=True)
@@ -131,7 +133,7 @@ def launchJET(prot, retMet, bFile, fFile, pdbfile, chains, n, N, nl):
 	chainID = chains[0]
-	subprocess.call("cp $GEMME_PATH/default.conf .",shell=True)
+	subprocess.call("cp $SGEMME_PATH/default.conf .",shell=True)
 	if retMet=="input":
 		if bFile!='':
 			#TODO: I think these two lines must be here as well but I am not sure. 
@@ -184,7 +186,7 @@ def launchJET(prot, retMet, bFile, fFile, pdbfile, chains, n, N, nl):
 				print("\nRunning for SC1:\n"+jetcmd)
 				reCode=subprocess.call(jetcmd,shell=True)
 				if os.path.isfile(prot+"/"+prot+"_jet.res"):
-					os.rename(prot+"/"+prot+"_jet.res",prot+"_jet_sc1.res")
+					os.rename(prot+"/"+prot+"_jet.res",prot+"_jet.res")
 				dir_name = prot+"/"
 				if os.path.isdir(dir_name):
@@ -193,153 +195,153 @@ def launchJET(prot, retMet, bFile, fFile, pdbfile, chains, n, N, nl):
 						if os.path.isfile(f_path):
 							os.remove(f_path)
 					os.rmdir(dir_name)
-				# Calculate SC2
+				# # Calculate SC2
-				jetcmd = "java -Xmx4096m -cp $JET2_PATH:$JET2_PATH/jet/extLibs/vecmath.jar jet.JET -c default.conf -i "+\
+				# jetcmd = "java -Xmx4096m -cp $JET2_PATH:$JET2_PATH/jet/extLibs/vecmath.jar jet.JET -c default.conf -i "+\
-					    prot+".pdb -o `pwd` -p AVJCG -r input -f "+prot+"_"+chainID+".fasta -d chain -n "+n+" -a 4"+" > "+prot+".out"
+				# 	    prot+".pdb -o `pwd` -p AVJCG -r input -f "+prot+"_"+chainID+".fasta -d chain -n "+n+" -a 4"+" > "+prot+".out"
-						#One can also add: -g 'trace,pc,cv,clusters,axs' 
+				# 		#One can also add: -g 'trace,pc,cv,clusters,axs' 
-				print("\nRunning for SC2:\n"+jetcmd)
+				# print("\nRunning for SC2:\n"+jetcmd)
-				reCode=subprocess.call(jetcmd,shell=True)
+				# reCode=subprocess.call(jetcmd,shell=True)
-				if os.path.isfile(prot+"/"+prot+"_jet.res"):
+				# if os.path.isfile(prot+"/"+prot+"_jet.res"):
-					os.rename(prot+"/"+prot+"_jet.res",prot+"_jet_sc2.res")
+				# 	os.rename(prot+"/"+prot+"_jet.res",prot+"_jet_sc2.res")
-				dir_name = prot+"/"
+				# dir_name = prot+"/"
-				if os.path.isdir(dir_name):
+				# if os.path.isdir(dir_name):
-					for f in os.listdir(dir_name): 
+				# 	for f in os.listdir(dir_name): 
-						f_path = os.path.join(dir_name, f)
+				# 		f_path = os.path.join(dir_name, f)
-						if os.path.isfile(f_path):
+				# 		if os.path.isfile(f_path):
-							os.remove(f_path)
+				# 			os.remove(f_path)
-					os.rmdir(dir_name)
+				# 	os.rmdir(dir_name)
-				# Calculate SC3
+				# # Calculate SC3
-				jetcmd = "java -Xmx4096m -cp $JET2_PATH:$JET2_PATH/jet/extLibs/vecmath.jar jet.JET -c default.conf -i "+\
+				# jetcmd = "java -Xmx4096m -cp $JET2_PATH:$JET2_PATH/jet/extLibs/vecmath.jar jet.JET -c default.conf -i "+\
-					    prot+".pdb -o `pwd` -p AVJCG -r input -f "+prot+"_"+chainID+".fasta -d chain -n "+n+" -a 5"+" > "+prot+".out"
+				# 	    prot+".pdb -o `pwd` -p AVJCG -r input -f "+prot+"_"+chainID+".fasta -d chain -n "+n+" -a 5"+" > "+prot+".out"
-						#One can also add: -g 'trace,pc,cv,clusters,axs' 
+				# 		#One can also add: -g 'trace,pc,cv,clusters,axs' 
-				print("\nRunning for SC3:\n"+jetcmd)
+				# print("\nRunning for SC3:\n"+jetcmd)
-				reCode=subprocess.call(jetcmd,shell=True)
+				# reCode=subprocess.call(jetcmd,shell=True)
-				if os.path.isfile(prot+"/"+prot+"_jet.res"):
+				# if os.path.isfile(prot+"/"+prot+"_jet.res"):
-					os.rename(prot+"/"+prot+"_jet.res",prot+"_jet_sc3.res")
+				# 	os.rename(prot+"/"+prot+"_jet.res",prot+"_jet_sc3.res")
-				dir_name = prot+"/"
+				# dir_name = prot+"/"
-				if os.path.isdir(dir_name):
+				# if os.path.isdir(dir_name):
-					for f in os.listdir(dir_name): 
+				# 	for f in os.listdir(dir_name): 
-						f_path = os.path.join(dir_name, f)
+				# 		f_path = os.path.join(dir_name, f)
-						if os.path.isfile(f_path):
+				# 		if os.path.isfile(f_path):
-							os.remove(f_path)
+				# 			os.remove(f_path)
-					os.rmdir(dir_name)
+				# 	os.rmdir(dir_name)
-				dfSC1 = pd.read_table(prot+"_jet_sc1.res", delimiter=r"\s+")
+				# dfSC1 = pd.read_table(prot+"_jet_sc1.res", delimiter=r"\s+")
-				print(dfSC1)
+				# print(dfSC1)
-				dfSC2 = pd.read_table(prot+"_jet_sc2.res", delimiter=r"\s+")
+				# # dfSC2 = pd.read_table(prot+"_jet_sc2.res", delimiter=r"\s+")
-				print(dfSC2)
+				# # print(dfSC2)
-				dfSC3 = pd.read_table(prot+"_jet_sc3.res", delimiter=r"\s+")
+				# # dfSC3 = pd.read_table(prot+"_jet_sc3.res", delimiter=r"\s+")
-				print(dfSC3)
+				# # print(dfSC3)
-				dfSC1['sc1'] =dfSC1['clusters']
+				# dfSC1['sc1'] =dfSC1['clusters']
-				dfSC1['sc2'] =dfSC2['clusters']
+				# # dfSC1['sc2'] =dfSC2['clusters']
-				dfSC1['sc3'] =dfSC3['clusters']
+				# # dfSC1['sc3'] =dfSC3['clusters']
-				print(dfSC1)
+				# print(dfSC1)
-				#Calculate combined values of SC1, SC2 and SC3 and normalize them. 
+				# #Calculate combined values of SC1, SC2 and SC3 and normalize them. 
-				combinedValuesV1 = []
+				# combinedValuesV1 = []
-				combinedValuesV2 = []
+				# combinedValuesV2 = []
-				value1List = []
+				# value1List = []
-				value2List = []
+				# value2List = []
-				value3List = []
+				# value3List = []
-				value4List = []
+				# value4List = []
-				value5List = []
+				# value5List = []
-				value6List = []
+				# value6List = []
-				temp1List = []
+				# temp1List = []
-				temp2List = []
+				# temp2List = []
-				temp3List = []
+				# temp3List = []
-				for i in range(len(dfSC1)):
+				# for i in range(len(dfSC1)):
-					value1=(dfSC1.loc[i, "sc1"]*dfSC1.loc[i, "trace"]) + dfSC1.loc[i, "trace"]
+				# 	value1=(dfSC1.loc[i, "sc1"]*dfSC1.loc[i, "trace"]) + dfSC1.loc[i, "trace"]
-					value1List.append(value1)
+				# 	value1List.append(value1)
-					value2=(dfSC1.loc[i, "sc2"]*(dfSC1.loc[i, "trace"]+dfSC1.loc[i, "cv"])/2)**2 + ((dfSC1.loc[i, "trace"]+dfSC1.loc[i, "cv"])/2)
+				# 	value2=(dfSC1.loc[i, "sc2"]*(dfSC1.loc[i, "trace"]+dfSC1.loc[i, "cv"])/2)**2 + ((dfSC1.loc[i, "trace"]+dfSC1.loc[i, "cv"])/2)
-					value2List.append(value2)
+				# 	value2List.append(value2)
-					value3=(dfSC1.loc[i, "sc3"]*(dfSC1.loc[i, "pc"]+dfSC1.loc[i, "cv"])/2)**2  + ((dfSC1.loc[i, "pc"]+dfSC1.loc[i, "cv"])/2)
+				# 	value3=(dfSC1.loc[i, "sc3"]*(dfSC1.loc[i, "pc"]+dfSC1.loc[i, "cv"])/2)**2  + ((dfSC1.loc[i, "pc"]+dfSC1.loc[i, "cv"])/2)
-					value3List.append(value3)
+				# 	value3List.append(value3)
-					value4=(dfSC1.loc[i, "sc1"]*(dfSC1.loc[i, "trace"]+dfSC1.loc[i, "pc"])/2)**2  + ((dfSC1.loc[i, "trace"]+dfSC1.loc[i, "pc"])/2)
+				# 	value4=(dfSC1.loc[i, "sc1"]*(dfSC1.loc[i, "trace"]+dfSC1.loc[i, "pc"])/2)**2  + ((dfSC1.loc[i, "trace"]+dfSC1.loc[i, "pc"])/2)
-					value4List.append(value4)
+				# 	value4List.append(value4)
-					value5=(dfSC1.loc[i, "sc2"]*(dfSC1.loc[i, "trace"]+dfSC1.loc[i, "cv"])/2) + ((dfSC1.loc[i, "trace"]+dfSC1.loc[i, "pc"])/2)
+				# 	value5=(dfSC1.loc[i, "sc2"]*(dfSC1.loc[i, "trace"]+dfSC1.loc[i, "cv"])/2) + ((dfSC1.loc[i, "trace"]+dfSC1.loc[i, "pc"])/2)
-					value5List.append(value5)
+				# 	value5List.append(value5)
-					value6=(dfSC1.loc[i, "sc3"]*(dfSC1.loc[i, "pc"]+dfSC1.loc[i, "cv"])/2) + ((dfSC1.loc[i, "trace"]+dfSC1.loc[i, "pc"])/2)
+				# 	value6=(dfSC1.loc[i, "sc3"]*(dfSC1.loc[i, "pc"]+dfSC1.loc[i, "cv"])/2) + ((dfSC1.loc[i, "trace"]+dfSC1.loc[i, "pc"])/2)
-					value6List.append(value6)
+				# 	value6List.append(value6)
-					temp1 = (dfSC1.loc[i, "sc1"]*(dfSC1.loc[i, "trace"]+dfSC1.loc[i, "pc"])/2)  + ((dfSC1.loc[i, "trace"]+dfSC1.loc[i, "pc"])/2)**2
+				# 	temp1 = (dfSC1.loc[i, "sc1"]*(dfSC1.loc[i, "trace"]+dfSC1.loc[i, "pc"])/2)  + ((dfSC1.loc[i, "trace"]+dfSC1.loc[i, "pc"])/2)**2
-					temp1List.append(temp1)
+				# 	temp1List.append(temp1)
-					temp2 = (dfSC1.loc[i, "sc2"]*(dfSC1.loc[i, "trace"]+dfSC1.loc[i, "cv"])/2)  + ((dfSC1.loc[i, "trace"]+dfSC1.loc[i, "cv"])/2)**2
+				# 	temp2 = (dfSC1.loc[i, "sc2"]*(dfSC1.loc[i, "trace"]+dfSC1.loc[i, "cv"])/2)  + ((dfSC1.loc[i, "trace"]+dfSC1.loc[i, "cv"])/2)**2
-					temp2List.append(temp2)
+				# 	temp2List.append(temp2)
-					temp3 = (dfSC1.loc[i, "sc3"]*(dfSC1.loc[i, "pc"]+dfSC1.loc[i, "cv"])/2)  + ((dfSC1.loc[i, "pc"]+dfSC1.loc[i, "cv"])/2)**2
+				# 	temp3 = (dfSC1.loc[i, "sc3"]*(dfSC1.loc[i, "pc"]+dfSC1.loc[i, "cv"])/2)  + ((dfSC1.loc[i, "pc"]+dfSC1.loc[i, "cv"])/2)**2
-					temp3List.append(temp3)
+				# 	temp3List.append(temp3)
-					combinedValuesV1.append(max([value1, value2, value3]))
+				# 	combinedValuesV1.append(max([value1, value2, value3]))
-					combinedValuesV2.append(max([value2, value3, value4]))
+				# 	combinedValuesV2.append(max([value2, value3, value4]))
-				#MinMax normalize the list
+				# #MinMax normalize the list
-				combinedValuesV1MinMaxed = minMaxNormalization(combinedValuesV1)
+				# combinedValuesV1MinMaxed = minMaxNormalization(combinedValuesV1)
-				combinedValuesV2MinMaxed = minMaxNormalization(combinedValuesV2)
+				# combinedValuesV2MinMaxed = minMaxNormalization(combinedValuesV2)
-				# value1ListMinMaxed = minMaxNormalization(value1List)
+				# # value1ListMinMaxed = minMaxNormalization(value1List)
-				# value2ListMinMaxed = minMaxNormalization(value2List)
+				# # value2ListMinMaxed = minMaxNormalization(value2List)
-				# value3ListMinMaxed = minMaxNormalization(value3List)
+				# # value3ListMinMaxed = minMaxNormalization(value3List)
-				# value4ListMinMaxed = minMaxNormalization(value4List)
+				# # value4ListMinMaxed = minMaxNormalization(value4List)
+				# # combinedValuesV3MinMaxed = []
+				# # for i in range(len(dfSC1)):
+				# # 	combinedValuesV3MinMaxed.append(max([value2ListMinMaxed[i], value3ListMinMaxed[i], value4ListMinMaxed[i]]))
+				# #Let's skip that minmax normalization part
 				# combinedValuesV3MinMaxed = []
 				# for i in range(len(dfSC1)):
-				# 	combinedValuesV3MinMaxed.append(max([value2ListMinMaxed[i], value3ListMinMaxed[i], value4ListMinMaxed[i]]))
+				# 	maxTemp = max([value2List[i], value3List[i], value4List[i]])
+				# 	if(maxTemp > 1.0):
-				#Let's skip that minmax normalization part
+				# 		combinedValuesV3MinMaxed.append(1.0)
-				combinedValuesV3MinMaxed = []
+				# 	else:
-				for i in range(len(dfSC1)):
+				# 		combinedValuesV3MinMaxed.append(maxTemp)
-					maxTemp = max([value2List[i], value3List[i], value4List[i]])
-					if(maxTemp > 1.0):
+				# combinedValuesV4MinMaxed = []
-						combinedValuesV3MinMaxed.append(1.0)
+				# for i in range(len(dfSC1)):
-					else:
+				# 	maxTemp = max([temp1List[i], temp2List[i], temp3List[i]])
-						combinedValuesV3MinMaxed.append(maxTemp)
+				# 	if(maxTemp > 1.0):
+				# 		combinedValuesV4MinMaxed.append(1.0)
-				combinedValuesV4MinMaxed = []
+				# 	else:
-				for i in range(len(dfSC1)):
+				# 		combinedValuesV4MinMaxed.append(maxTemp)
-					maxTemp = max([temp1List[i], temp2List[i], temp3List[i]])
-					if(maxTemp > 1.0):
+				# dfSC1['combinedv1'] = combinedValuesV1MinMaxed.round(4)
-						combinedValuesV4MinMaxed.append(1.0)
+				# dfSC1['combinedv2'] = combinedValuesV2MinMaxed.round(4)
-					else:
+				# dfSC1['combinedv3'] = list(np.around(np.array(combinedValuesV3MinMaxed), 4))
-						combinedValuesV4MinMaxed.append(maxTemp)
+				# dfSC1['combinedv4'] = list(np.around(np.array(combinedValuesV4MinMaxed), 4))
+				# #dfSC1['combinedv3'] = combinedValuesV3MinMaxed.round(4)
-				dfSC1['combinedv1'] = combinedValuesV1MinMaxed.round(4)
-				dfSC1['combinedv2'] = combinedValuesV2MinMaxed.round(4)
-				dfSC1['combinedv3'] = list(np.around(np.array(combinedValuesV3MinMaxed), 4))
-				dfSC1['combinedv4'] = list(np.around(np.array(combinedValuesV4MinMaxed), 4))
+				# # dfSC1['cv'] = dfSC1['cv'].round(4)
-				#dfSC1['combinedv3'] = combinedValuesV3MinMaxed.round(4)
+				# # dfSC1['pc'] = dfSC1['pc'].round(4)
+				# # dfSC1['tr'] = dfSC1['tr'].round(4)
+				# # dfSC1['freq'] = dfSC1['freq'].round(1)
+				# # dfSC1['trace'] = dfSC1['trace'].round(4)
-				# dfSC1['cv'] = dfSC1['cv'].round(4)
+				# # dfSC1['clusters'] = dfSC1['clusters'].round(4)
-				# dfSC1['pc'] = dfSC1['pc'].round(4)
+				# # dfSC1['clusnumber'] = dfSC1['clusnumber'].round(1)
-				# dfSC1['tr'] = dfSC1['tr'].round(4)
+				# # dfSC1['sc1'] = dfSC1['sc1'].round(4)
-				# dfSC1['freq'] = dfSC1['freq'].round(1)
+				# # dfSC1['sc2'] = dfSC1['sc2'].round(4)
-				# dfSC1['trace'] = dfSC1['trace'].round(4)
+				# # dfSC1['sc3'] = dfSC1['sc3'].round(4)
-				# dfSC1['clusters'] = dfSC1['clusters'].round(4)
-				# dfSC1['clusnumber'] = dfSC1['clusnumber'].round(1)
-				# dfSC1['sc1'] = dfSC1['sc1'].round(4)
-				# dfSC1['sc2'] = dfSC1['sc2'].round(4)
+				# dfSC1.to_csv(prot+"_jet.res", header=True, index=None, sep='\t', mode='w')
-				# dfSC1['sc3'] = dfSC1['sc3'].round(4)
-				dfSC1.to_csv(prot+"_jet.res", header=True, index=None, sep='\t', mode='w')
 				#dfSC1.to_csv(prot+"_jet.res", header=True, index=None, mode='w')
 				#sys.exit(-1)
@@ -363,9 +365,9 @@ def launchJET(prot, retMet, bFile, fFile, pdbfile, chains, n, N, nl):
 def launchPred(prot,inAli,mutFile, normWeightMode, alphabet):
 	if mutFile!='':
-		rcmd="Rscript --save $GEMME_PATH/computePred.R "+prot+" "+inAli+" FALSE "+mutFile+" "+normWeightMode+" "+alphabet
+		rcmd="Rscript --save $SGEMME_PATH/computePred.R "+prot+" "+inAli+" FALSE "+mutFile+" "+normWeightMode+" "+alphabet
 	else:
-		rcmd="Rscript --save $GEMME_PATH/computePred.R "+prot+" "+inAli+" TRUE none "+normWeightMode+" "+alphabet
+		rcmd="Rscript --save $SGEMME_PATH/computePred.R "+prot+" "+inAli+" TRUE none "+normWeightMode+" "+alphabet
 	print("\nRunning: \n"+rcmd)
 	reCode=subprocess.call(rcmd,shell=True)

--- a/sgemme.py
+++ b/sgemme.py
 # -*- coding: utf-8 -*-
 # Copyright (c) 2018: Elodie Laine
-# Copyright (c) 2022: Mustafa Tekpinar
+# Copyright (c) 2022-2023: Mustafa Tekpinar
 # This code is part of the gemme package and governed by its license.
 # Please see the LICENSE.txt file included as part of this package.
 import sys
 import os
+import time
 import argparse
 import re
 import subprocess
@@ -672,11 +673,17 @@ def doit(inAli,mutFile,retMet,bFile,fFile,n,N, jetfile, pdbfile, normWeightMode,
 def main():
+    tic = time.perf_counter()
    args = parse_command_line()
    doit(args.input,args.mutations,args.retrievingMethod,args.blastFile,\
         args.fastaFile,args.nIter,args.NSeqs, args.jetfile, args.pdbfile,\
         args.normweightmode, args.alphabet)
+    toc = time.perf_counter()
+    print(f"S-GEMME computation finished in {toc - tic:0.4f} seconds!")
 if (__name__ == '__main__'):
    main()