Commit 2af63caf by Mustafa Tekpinar

Implemented --isjet2on option for reproducibility checks.

parent 05fb8c57
# Copyright (c) 2018: Elodie Laine # Copyright (c) 2018: Elodie Laine
# Copyright (c) 2022: Mustafa Tekpinar
# This code is part of the gemme package and governed by its license. # This code is part of the gemme package and governed by its license.
# Please see the LICENSE.txt file included as part of this package. # Please see the LICENSE.txt file included as part of this package.
...@@ -43,12 +44,6 @@ write.table(res[[3]][[3]],paste0(prot,"_pssm80.txt")) ...@@ -43,12 +44,6 @@ write.table(res[[3]][[3]],paste0(prot,"_pssm80.txt"))
jet=read.table(paste(prot,"_jet.res",sep=""),head=TRUE) jet=read.table(paste(prot,"_jet.res",sep=""),head=TRUE)
if(sum(colnames(jet)=="traceMax")==1){trace=jet[,"traceMax"]}else{trace=jet[,"trace"]} if(sum(colnames(jet)=="traceMax")==1){trace=jet[,"traceMax"]}else{trace=jet[,"trace"]}
#You should comment line 44 to use this functionality. Or maybe, it should go into normalize functions
#That was what we originally decided with Alessandra.
#To get max values of PC, CV or Trace
#trace = c()
#for (row in 1:nrow(jet)) { trace<-append(trace, max(jet[row, "trace"], jet[row, "pc"])) }
#traceAli = sweep(binAli, MARGIN=2, trace, `*`) #traceAli = sweep(binAli, MARGIN=2, trace, `*`)
# compute evolutionary distances of all sequences with respect to the query # compute evolutionary distances of all sequences with respect to the query
distTrace = binAli[2:N[1],] %*% trace^2 distTrace = binAli[2:N[1],] %*% trace^2
...@@ -75,44 +70,56 @@ nbGaps = N[1] - apply(nbSeqs,2,sum) ...@@ -75,44 +70,56 @@ nbGaps = N[1] - apply(nbSeqs,2,sum)
# output the conservation values # output the conservation values
dat=rbind(trace,KL=res[[1]][[2]],SE=res[[1]][[1]],gap=nbGaps/N[1],KL60=res[[2]][[2]],SE60=res[[2]][[1]],KL80=res[[3]][[2]],SE80=res[[3]][[1]]) dat=rbind(trace,KL=res[[1]][[2]],SE=res[[1]][[1]],gap=nbGaps/N[1],KL60=res[[2]][[2]],SE60=res[[2]][[1]],KL80=res[[3]][[2]],SE80=res[[3]][[1]])
write.table(dat,paste0(prot,"_conservation.txt")) write.table(dat,paste0(prot,"_conservation.txt"))
# compute log-odd ratios between mutated and wt sequence counts # compute log-odd ratios between mutated and wt sequence counts
predInd = computePredNbSeqs(wt,nbSeqs) predInd = computePredNbSeqs(wt,nbSeqs)
# output the sequence counts log-odd ratios
write.table(predInd,paste0(prot,"_pred_evolInd.txt"))
# Do the normalization for the independent component here!
rownames(predInd)=aa rownames(predInd)=aa
if(simple){
normPredInd = normalizePredWithNbSeqsZero(predInd,trace,wt)
rownames(normPredInd)=aa
}else{
normPredInd = normalizePredWithNbSeqsZeroSelMult(predInd, trace, wt, list(pos,subsaa))
names(normPredInd)=rawMut
}
# output the predicted mutational effects based on sequence counts (conservation at the bottom) # output the sequence counts log-odd ratios
write.table(normPredInd,paste0(prot,"_normPred_evolInd.txt")) write.table(predInd,paste0(prot,"_pred_evolInd.txt"))
print("done") print("done")
print("running global epistatic model...") print("running global epistatic model...")
pred=computePredSimple(ali,distTrace,wt,5) pred=computePredSimple(ali,distTrace,wt,5)
rownames(pred)=aa
# output the evolutionary distances between the query and the closest variants # output the evolutionary distances between the query and the closest variants
evolDist = pred/sum(trace^2) evolDist = pred/sum(trace^2)
evolDist[is.na(evolDist)] = 1 evolDist[is.na(evolDist)] = 1
write.table(evolDist,paste0(prot,"_pred_evolEpi.txt")) write.table(evolDist,paste0(prot,"_pred_evolEpi.txt"))
print("done")
print("running normalization...")
#You should comment line 44 to use this functionality. Or maybe, it should go into normalize functions
#That was what we originally decided with Alessandra.
#To get max values of PC, or Trace
# trace = c()
# print(trace)
# for (row in 1:nrow(jet)) { trace<-append(trace, max(jet[row, "trace"], jet[row, "pc"])) }
# print(trace)
# Do the normalization for the epistatic component here!
rownames(pred)=aa
if(simple){ if(simple){
#Independent model normalization
normPredInd = normalizePredWithNbSeqsZero(predInd,trace,wt)
rownames(normPredInd)=aa
# Epistatic model normalization
normPred=normalizePred(pred, trace, wt) normPred=normalizePred(pred, trace, wt)
rownames(normPred)=aa rownames(normPred)=aa
}else{ }else{
#Independent model normalization
normPredInd = normalizePredWithNbSeqsZeroSelMult(predInd, trace, wt, list(pos,subsaa))
names(normPredInd)=rawMut
# Epistatic model normalization
normPred=normalizePredSelMult(pred, trace, wt, list(pos,subsaa)) normPred=normalizePredSelMult(pred, trace, wt, list(pos,subsaa))
names(normPred)=rawMut names(normPred)=rawMut
} }
# output the predicted mutational effects based on evolutionary distance (conservation at the bottom)
# output the normalized predicted mutational effects based on sequence counts (conservation at the bottom)
write.table(normPredInd,paste0(prot,"_normPred_evolInd.txt"))
# output the predicted normalized mutational effects based on evolutionary distance (conservation at the bottom)
write.table(normPred,paste0(prot,"_normPred_evolEpi.txt")) write.table(normPred,paste0(prot,"_normPred_evolEpi.txt"))
print("done") print("done")
......
...@@ -9,3 +9,8 @@ else ...@@ -9,3 +9,8 @@ else
echo "Running GEMME with a user-provided alignment file." echo "Running GEMME with a user-provided alignment file."
python $GEMME_PATH/gemme.py aliBLAT.fasta -r input -f aliBLAT.fasta python $GEMME_PATH/gemme.py aliBLAT.fasta -r input -f aliBLAT.fasta
fi fi
# If you have your own JET2 score file, you can turn off JET2 as follows:
# python $GEMME_PATH/gemme.py aliBLAT.fasta -r input -f aliBLAT.fasta --isjet2on false
# This option can be useful for testing reproducibility!
...@@ -11,6 +11,7 @@ import argparse ...@@ -11,6 +11,7 @@ import argparse
import re import re
import subprocess import subprocess
import math import math
from xmlrpc.client import boolean
import numpy as np import numpy as np
import matplotlib.pylab as plt import matplotlib.pylab as plt
...@@ -254,6 +255,9 @@ def parse_command_line(): ...@@ -254,6 +255,9 @@ def parse_command_line():
help='fasta file containing related sequences', help='fasta file containing related sequences',
default='' default=''
) )
retMet_args.add_argument('--isjet2on', dest='isjet2on', type=str, \
help="If false, it will skip JET2 calculation and use a precalculated JET2 file. Default is true",
required=False, default="True")
args = parser.parse_args() args = parser.parse_args()
...@@ -268,22 +272,32 @@ def parse_command_line(): ...@@ -268,22 +272,32 @@ def parse_command_line():
return args return args
def doit(inAli,mutFile,retMet,bFile,fFile,n,N): def doit(inAli,mutFile,retMet,bFile,fFile,n,N, isjet2on):
""" """
doit(args.input,args.mutations,args.retrievingMethod,args.blastFile,args.fastaFile) Fonksiyon aciklamasi ile taniminin ayni olmasi super olmus!
doit(args.input,args.mutations,args.retrievingMethod,args.blastFile,args.fastaFile, args.isjet2on)
""" """
simple = True
prot,seq,nl=extractQuerySeq(inAli) prot,seq,nl=extractQuerySeq(inAli)
createPDB(prot,seq) createPDB(prot,seq)
print("query protein: "+prot) print("query protein: "+prot)
print("computing conservation levels...")
if((isjet2on.lower()) == "true"):
#I intend to run JET2 completely externally!! #I intend to run JET2 completely externally!!
#It is too much buggy and it has too many dependencies. #It is too much buggy and it has too many dependencies.
#Using it with a Docker or Singularity may be the best solution! #Using it with a Docker or Singularity may be the best solution!
launchJET(prot,retMet,bFile,fFile,n,N,nl) print("computing conservation levels...")
print("done") launchJET(prot,retMet,bFile,fFile,n,N,nl)
print("done")
elif((isjet2on.lower()) == "false"):
print("using previously calculated JET2 conservation levels...")
print("done")
else:
print("ERROR: You can only use true or false after --isjet2on!")
sys.exit(-1)
launchPred(prot,inAli,mutFile) launchPred(prot,inAli,mutFile)
#Do Python plotting here #Do Python plotting here
...@@ -292,9 +306,8 @@ def doit(inAli,mutFile,retMet,bFile,fFile,n,N): ...@@ -292,9 +306,8 @@ def doit(inAli,mutFile,retMet,bFile,fFile,n,N):
#TODO: Mark the original (wildtype) residue locations with a dot or something #TODO: Mark the original (wildtype) residue locations with a dot or something
# special to show the original amino acid. # special to show the original amino acid.
#TODO: You can even put letters on the top line like in EVmutation output. #TODO: You can even put letters on the top line like in EVmutation output.
simple = True
if(simple): if(simple):
print("generating the plots...") print("generating the plots...")
gemmeData = parseGEMMEoutput(prot+"_normPred_evolEpi.txt", verbose=False) gemmeData = parseGEMMEoutput(prot+"_normPred_evolEpi.txt", verbose=False)
...@@ -315,7 +328,8 @@ def doit(inAli,mutFile,retMet,bFile,fFile,n,N): ...@@ -315,7 +328,8 @@ def doit(inAli,mutFile,retMet,bFile,fFile,n,N):
def main(): def main():
args = parse_command_line() args = parse_command_line()
doit(args.input,args.mutations,args.retrievingMethod,args.blastFile,args.fastaFile,args.nIter,args.NSeqs) doit(args.input,args.mutations,args.retrievingMethod,args.blastFile,\
args.fastaFile,args.nIter,args.NSeqs, args.isjet2on)
if (__name__ == '__main__'): if (__name__ == '__main__'):
main() main()
......
# Copyright (c) 2018: Elodie Laine # Copyright (c) 2018: Elodie Laine
# Copyright (c) 2022: Mustafa Tekpinar
# This code is part of the gemme package and governed by its license. # This code is part of the gemme package and governed by its license.
# Please see the LICENSE.txt file included as part of this package. # Please see the LICENSE.txt file included as part of this package.
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment