Commit 2d117e24 by Mustafa Tekpinar

Commented out unused code in pred.R and computePred.R

parent 5647766c
......@@ -55,9 +55,7 @@ JET2 configuration file is: default.conf.
JET2 output file is: myProt_jet.res.
### Analyzing the ESGEMME output
By default, ESGEMME will output the following files:
* myProt_pred_evolEpi.txt
* myProt_normPred_evolEpi.txt
* myProt_pred_evolInd.txt
* myProt_normPred_evolInd.txt
* myProt_normPred_evolCombi.txt
......
......@@ -36,11 +36,11 @@ aliCons = ali[pId>0.6,]
aliVeryCons = ali[pId>0.8,]
# number of sequences
N = c(dim(ali)[[1]],dim(aliCons)[[1]],dim(aliVeryCons)[[1]])
#resAliCons = computePSSM(aliCons)
res = list(computePSSM(ali,N[1],npos),computePSSM(aliCons,N[2],npos),computePSSM(aliVeryCons,N[3],npos))
# write.table(res[[1]][[3]],paste0(prot,"_pssm.txt"))
# write.table(res[[2]][[3]],paste0(prot,"_pssm60.txt"))
# write.table(res[[3]][[3]],paste0(prot,"_pssm80.txt"))
# #resAliCons = computePSSM(aliCons)
# res = list(computePSSM(ali,N[1],npos),computePSSM(aliCons,N[2],npos),computePSSM(aliVeryCons,N[3],npos))
# # write.table(res[[1]][[3]],paste0(prot,"_pssm.txt"))
# # write.table(res[[2]][[3]],paste0(prot,"_pssm60.txt"))
# # write.table(res[[3]][[3]],paste0(prot,"_pssm80.txt"))
# read evolutionary traces computed by JET
jet=read.table(paste(prot,"_jet.res",sep=""),head=TRUE)
......@@ -176,14 +176,14 @@ print("done")
print("running combined model...")
#Read frequencies from the jet output file.
frequencies = c()
print(paste("Reading frequencies:"))
for (row in 1:nrow(jet)) {
frequencies<-append(frequencies, jet[row, "freq"] )
}
#print(frequencies)
freq_mean = mean(frequencies)
# #Read frequencies from the jet output file.
# frequencies = c()
# print(paste("Reading frequencies:"))
# for (row in 1:nrow(jet)) {
# frequencies<-append(frequencies, jet[row, "freq"] )
# }
# #print(frequencies)
# freq_mean = mean(frequencies)
alpha = 0.6
# alpha = c()
......
......@@ -10,7 +10,7 @@ then
rm -rf BLAT*
rm -f default.conf caracTest.dat
rm -rf bin*.fasta
rm -rf blat-af2.pdb.dssp blat-af2.pdb.dssp.new
rm -rf ../data/blat-af2.pdb.dssp ../data/blat-af2.pdb.dssp.new
elif [ "$1" == "jetoff" ]
then
# If you have your own JET2 score file, you can turn off JET2 as follows:
......
......@@ -91,87 +91,87 @@ computeNbSeqs<-function(mat,gap=FALSE){
return(res)
}
# Compute per-column variability as the Shannon entropy.
# mat: letter-count matrix (letters x positions); N: total number of sequences.
# Zero counts are bumped to 1 beforehand so that log2() is never applied to 0.
computeConsSE<-function(mat,N){
counts = mat
counts[counts==0] = 1
probs = counts/N
apply(probs,2,function(col){-sum(col*log2(col))})
}
# # compute variability levels as the Shanon entropy
# computeConsSE<-function(mat,N){
# mat[mat==0] = 1
# return(apply(mat/N,2,f<-function(x){return(-sum(x*log2(x)))}))
# }
# Compute per-column conservation as the Kullback-Leibler divergence
# between observed letter frequencies and background frequencies.
# mat: letter-count matrix (letters x positions). Reads the global
# blosum62 table; background = its first column, rows in alphabetical order.
computeConsKL<-function(mat){
colTotals = apply(mat,2,sum)
freqs = t(t(mat)/colTotals)
# small pseudo-frequency so log2() never sees zero
freqs = freqs + 0.000001
bgFreqs = blosum62[,1][order(rownames(blosum62))]
apply(freqs,2,function(col){sum(col*log2(col/bgFreqs))})
}
# # compute conservation levels as the Kullback-Leibler divergence
# computeConsKL<-function(mat){
# n = apply(mat,2,sum)
# mat = t(t(mat)/n)
# mat = mat + 0.000001
# bg = blosum62[,1][order(rownames(blosum62))]
# return(apply(mat,2,f<-function(x){return(sum(x*log2(x/bg)))}))
# }
# Compute position-specific sequence weights (Henikoff-style scheme).
# mat: alignment matrix (N sequences x npos positions, single-letter cells).
# N: number of sequences; npos: number of alignment positions.
# Returns list(weightMat, indObs):
#   weightMat - N x npos matrix; cell (k,i) = 1 / (occurrences of mat[k,i]
#               in column i * number of distinct letters in column i)
#   indObs    - mean number of distinct letters per column (used downstream
#               as a pseudo-count mixing weight)
computeSeqWeights<-function(mat,N,npos){
# compute the number of different observed amino acids in each column
r = apply(mat,2,f<-function(x){return(length(unique(x)))})
# compute occurrence of each letter at each position
# (second arg TRUE presumably includes gaps -- see computeNbSeqs)
occMat = computeNbSeqs(mat,TRUE)
#print(occMat)
# compute weights for each sequence
# full argument names (nrow/ncol) instead of partial matching (nr/nc)
weightMat=matrix(nrow=N,ncol=npos)
# seq_len() is safe when N == 0, unlike 1:N which would yield c(1, 0)
for(k in seq_len(N)){
midx=cbind(mat[k,],seq_len(npos))
weightMat[k,] = 1 / (occMat[midx] * r)
}
indObs = sum(r)/npos
return(list(weightMat,indObs))
}
# computeSeqWeights<-function(mat,N,npos){
# # compute the number of different observed amino acids in each column
# r = apply(mat,2,f<-function(x){return(length(unique(x)))})
# # compute occurrence of each letter at each position
# occMat = computeNbSeqs(mat,TRUE)
# #print(occMat)
# # compute weights for each sequence
# weightMat=matrix(nr=N,nc=npos)
# for(k in 1:N){
# midx=cbind(mat[k,],1:npos)
# weightMat[k,] = 1 / (occMat[midx] * r)
# }
# indObs = sum(r)/npos
# return(list(weightMat,indObs))
# }
# Build a pseudo-count matrix: one row per amino acid, one column per
# alignment position. For each target amino acid a, observed column
# frequencies are mixed via the background tables (globals bg and bgp).
# freqmat: letter-frequency matrix (letters x positions); npos: #positions.
computePseudoCounts<-function(freqmat,npos){
pc = matrix(nrow=length(aa),ncol=npos)
rownames(pc) = aa
for(a in aa){
pc[a,] = apply(freqmat,2,function(col){sum(col/bg*bgp[a,])})
}
return(pc)
}
# computePseudoCounts<-function(freqmat,npos){
# PC = matrix(nr=length(aa),nc=npos)
# rownames(PC)= aa
# for(a in aa){
# PC[a,] = apply(freqmat,2,f<-function(x){return(sum(x/bg*bgp[a,]))})
# }
# return(PC)
# }
# Build a sequence-weighted PSSM with pseudo-counts from an alignment.
# mat: alignment matrix (N sequences x npos positions, single-letter cells,
#      gaps encoded as "-"); N: number of sequences; npos: #positions.
# Relies on globals aa (amino-acid alphabet) and bg (background
# frequencies; assumed length 20 -- TODO confirm), plus the helpers
# computeSeqWeights() and computePseudoCounts() defined in this file.
# Returns list(SE, KL, pssm): per-position Shannon entropy, per-position
# Kullback-Leibler divergence vs bg, and the 2*log2 odds-score matrix.
computePSSM<-function(mat,N,npos){
# compute per-sequence position weights and mean #distinct letters/column
res = computeSeqWeights(mat,N,npos)
weights = res[[1]]
indObs = res[[2]]
# extend amino acid alphabet with gaps (local copy shadows the global aa)
aa=c(aa,"-")
# compute weighted occurrences of each letter at each position
occMat = matrix(0,nr=length(aa),nc=npos)
rownames(occMat) = aa
for(i in 1:npos){
counts = tapply(weights[,i],mat[,i],sum)
occMat[names(counts),i] = counts
}
# distribute each column's gap mass over the 20 amino acids,
# proportionally to the background frequencies bg
occMat = occMat[1:20,] + t(occMat["-",]%*%t(bg))
# compute pseudo-counts
PC = computePseudoCounts(occMat,npos)
# distribute pseudo-counts according to an empirical (?) weight
occMat = (occMat*(indObs-1) + PC) /indObs
# normalize to relative frequencies
n = apply(occMat,2,sum)
freq = t(t(occMat)/n)
# compute Shannon entropy
SE = apply(freq,2,f<-function(x){return(-sum(x*log2(x)))})
# compute Kullback-Leibler divergence
KL = apply(freq,2,f<-function(x){return(sum(x*log2(x/bg)))})
# divide by bg freqs and convert to log-scores
pssm = 2 * log2(freq/bg)
return(list(SE,KL,pssm))
}
# computePSSM<-function(mat,N,npos){
# # compute sequence weights
# res = computeSeqWeights(mat,N,npos)
# weights = res[[1]]
# indObs = res[[2]]
# # extend amino acid alphabet with gaps
# aa=c(aa,"-")
# # compute weighted occurrences
# occMat = matrix(0,nr=length(aa),nc=npos)
# rownames(occMat) = aa
# for(i in 1:npos){
# counts = tapply(weights[,i],mat[,i],sum)
# occMat[names(counts),i] = counts
# }
# # distribute gaps
# occMat = occMat[1:20,] + t(occMat["-",]%*%t(bg))
# # compute pseudo-counts
# PC = computePseudoCounts(occMat,npos)
# # distribute pseudo-counts according to an empirical (?) weight
# occMat = (occMat*(indObs-1) + PC) /indObs
# # normalize to relative frequencies
# n = apply(occMat,2,sum)
# freq = t(t(occMat)/n)
# # compute Shannon entropy
# SE = apply(freq,2,f<-function(x){return(-sum(x*log2(x)))})
# # compute Kullback-Leibler divergence
# KL = apply(freq,2,f<-function(x){return(sum(x*log2(x/bg)))})
# # divide by bg freqs and convertto log-scores
# pssm = 2 * log2(freq/bg)
# return(list(SE,KL,pssm))
# }
# Build a simple (unweighted, no pseudo-count mixing) PSSM.
# mat: alignment matrix of single-letter cells. Uses computeNbSeqs() for
# raw counts and the global blosum62 table for background frequencies.
# Returns list(pssm, res): relative frequencies (plus a tiny
# pseudo-frequency) and the 2*log2 odds-score matrix.
computePSSM2<-function(mat){
counts = computeNbSeqs(mat)
totals = apply(counts,2,sum)
# normalize to relative frequencies, then avoid log2(0)
freqs = t(t(counts)/totals) + 0.000001
background = blosum62[,1][order(rownames(blosum62))]
scores = 2 * log2(freqs/background)
return(list(freqs,scores))
}
# computePSSM2<-function(mat){
# occMat = computeNbSeqs(mat)
# n = apply(occMat,2,sum)
# pssm = t(t(occMat)/n)
# pssm = pssm + 0.000001
# bg = blosum62[,1][order(rownames(blosum62))]
# res = 2 * log2(pssm/bg)
# return(list(pssm,res))
# }
computeNbSeqsAlph<-function(nbSeqs,alphabet){
path=paste(Sys.getenv("ESGEMME_PATH"),"/data/alphabets/",sep="")
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment