Skip to content
Projects
Groups
Snippets
Help
This project
Loading...
Sign in / Register
Toggle navigation
P
PRESCOTT
Overview
Overview
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
Mustafa Tekpinar
PRESCOTT
Commits
2d117e24
Commit
2d117e24
authored
Mar 31, 2023
by
Mustafa Tekpinar
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
Commented unused code in pred.R and computePred.R
parent
5647766c
Hide whitespace changes
Inline
Side-by-side
Showing
4 changed files
with
90 additions
and
92 deletions
+90
-92
README.md
README.md
+0
-2
computePred.R
computePred.R
+13
-13
example-esgemme-script.sh
examples/example-esgemme-script.sh
+1
-1
pred.R
pred.R
+76
-76
No files found.
README.md
View file @
2d117e24
...
@@ -55,9 +55,7 @@ JET2 configuration file is: default.conf.
...
@@ -55,9 +55,7 @@ JET2 configuration file is: default.conf.
JET2 output file is: myProt_jet.res.
JET2 output file is: myProt_jet.res.
### Analyzing the ESGEMME output
### Analyzing the ESGEMME output
By default, ESGEMME will output the following files:
By default, ESGEMME will output the following files:
*
myProt_pred_evolEpi.txt
*
myProt_normPred_evolEpi.txt
*
myProt_normPred_evolEpi.txt
*
myProt_pred_evolInd.txt
*
myProt_normPred_evolInd.txt
*
myProt_normPred_evolInd.txt
*
myProt_normPred_evolCombi.txt
*
myProt_normPred_evolCombi.txt
...
...
computePred.R
View file @
2d117e24
...
@@ -36,11 +36,11 @@ aliCons = ali[pId>0.6,]
...
@@ -36,11 +36,11 @@ aliCons = ali[pId>0.6,]
aliVeryCons
=
ali
[
pId
>
0.8
,]
aliVeryCons
=
ali
[
pId
>
0.8
,]
# number of sequences
# number of sequences
N
=
c
(
dim
(
ali
)[[
1
]],
dim
(
aliCons
)[[
1
]],
dim
(
aliVeryCons
)[[
1
]])
N
=
c
(
dim
(
ali
)[[
1
]],
dim
(
aliCons
)[[
1
]],
dim
(
aliVeryCons
)[[
1
]])
#resAliCons = computePSSM(aliCons)
#
#
resAliCons = computePSSM(aliCons)
res
=
list
(
computePSSM
(
ali
,
N
[
1
],
npos
),
computePSSM
(
aliCons
,
N
[
2
],
npos
),
computePSSM
(
aliVeryCons
,
N
[
3
],
npos
))
#
res = list(computePSSM(ali,N[1],npos),computePSSM(aliCons,N[2],npos),computePSSM(aliVeryCons,N[3],npos))
# write.table(res[[1]][[3]],paste0(prot,"_pssm.txt"))
#
#
write.table(res[[1]][[3]],paste0(prot,"_pssm.txt"))
# write.table(res[[2]][[3]],paste0(prot,"_pssm60.txt"))
#
#
write.table(res[[2]][[3]],paste0(prot,"_pssm60.txt"))
# write.table(res[[3]][[3]],paste0(prot,"_pssm80.txt"))
#
#
write.table(res[[3]][[3]],paste0(prot,"_pssm80.txt"))
# read evolutionary traces computed by JET
# read evolutionary traces computed by JET
jet
=
read.table
(
paste
(
prot
,
"_jet.res"
,
sep
=
""
),
head
=
TRUE
)
jet
=
read.table
(
paste
(
prot
,
"_jet.res"
,
sep
=
""
),
head
=
TRUE
)
...
@@ -176,14 +176,14 @@ print("done")
...
@@ -176,14 +176,14 @@ print("done")
print
(
"running combined model..."
)
print
(
"running combined model..."
)
#Read frequencies from the jet output file.
#
#
Read frequencies from the jet output file.
frequencies
=
c
()
#
frequencies = c()
print
(
paste
(
"Reading frequencies:"
))
#
print(paste("Reading frequencies:"))
for
(
row
in
1
:
nrow
(
jet
))
{
#
for (row in 1:nrow(jet)) {
frequencies
<-
append
(
frequencies
,
jet
[
row
,
"freq"
]
)
#
frequencies<-append(frequencies, jet[row, "freq"] )
}
#
}
#print(frequencies)
#
#
print(frequencies)
freq_mean
=
mean
(
frequencies
)
#
freq_mean = mean(frequencies)
alpha
=
0.6
alpha
=
0.6
# alpha = c()
# alpha = c()
...
...
examples/example-esgemme-script.sh
View file @
2d117e24
...
@@ -10,7 +10,7 @@ then
...
@@ -10,7 +10,7 @@ then
rm
-rf
BLAT
*
rm
-rf
BLAT
*
rm
-f
default.conf caracTest.dat
rm
-f
default.conf caracTest.dat
rm
-rf
bin
*
.fasta
rm
-rf
bin
*
.fasta
rm
-rf
blat-af2.pdb.dssp
blat-af2.pdb.dssp.new
rm
-rf
../data/blat-af2.pdb.dssp ../data/
blat-af2.pdb.dssp.new
elif
[
"
$1
"
==
"jetoff"
]
elif
[
"
$1
"
==
"jetoff"
]
then
then
# If you have your own JET2 score file, you can turn off JET2 as follows:
# If you have your own JET2 score file, you can turn off JET2 as follows:
...
...
pred.R
View file @
2d117e24
...
@@ -91,87 +91,87 @@ computeNbSeqs<-function(mat,gap=FALSE){
...
@@ -91,87 +91,87 @@ computeNbSeqs<-function(mat,gap=FALSE){
return
(
res
)
return
(
res
)
}
}
# compute variability levels as the Shanon entropy
#
#
compute variability levels as the Shanon entropy
computeConsSE
<-
function
(
mat
,
N
){
#
computeConsSE<-function(mat,N){
mat
[
mat
==
0
]
=
1
#
mat[mat==0] = 1
return
(
apply
(
mat
/
N
,
2
,
f
<-
function
(
x
){
return
(
-
sum
(
x
*
log2
(
x
)))}))
#
return(apply(mat/N,2,f<-function(x){return(-sum(x*log2(x)))}))
}
#
}
# compute conservation levels as the Kullback-Leibler divergence
#
#
compute conservation levels as the Kullback-Leibler divergence
computeConsKL
<-
function
(
mat
){
#
computeConsKL<-function(mat){
n
=
apply
(
mat
,
2
,
sum
)
#
n = apply(mat,2,sum)
mat
=
t
(
t
(
mat
)
/
n
)
#
mat = t(t(mat)/n)
mat
=
mat
+
0.000001
#
mat = mat + 0.000001
bg
=
blosum62
[,
1
][
order
(
rownames
(
blosum62
))]
#
bg = blosum62[,1][order(rownames(blosum62))]
return
(
apply
(
mat
,
2
,
f
<-
function
(
x
){
return
(
sum
(
x
*
log2
(
x
/
bg
)))}))
#
return(apply(mat,2,f<-function(x){return(sum(x*log2(x/bg)))}))
}
#
}
computeSeqWeights
<-
function
(
mat
,
N
,
npos
){
#
computeSeqWeights<-function(mat,N,npos){
# compute the number of different observed amino acids in each column
#
# compute the number of different observed amino acids in each column
r
=
apply
(
mat
,
2
,
f
<-
function
(
x
){
return
(
length
(
unique
(
x
)))})
#
r = apply(mat,2,f<-function(x){return(length(unique(x)))})
# compute occurrence of each letter at each position
#
# compute occurrence of each letter at each position
occMat
=
computeNbSeqs
(
mat
,
TRUE
)
#
occMat = computeNbSeqs(mat,TRUE)
#print(occMat)
#
#print(occMat)
# compute weights for each sequence
#
# compute weights for each sequence
weightMat
=
matrix
(
nr
=
N
,
nc
=
npos
)
#
weightMat=matrix(nr=N,nc=npos)
for
(
k
in
1
:
N
){
#
for(k in 1:N){
midx
=
cbind
(
mat
[
k
,],
1
:
npos
)
#
midx=cbind(mat[k,],1:npos)
weightMat
[
k
,]
=
1
/
(
occMat
[
midx
]
*
r
)
#
weightMat[k,] = 1 / (occMat[midx] * r)
}
#
}
indObs
=
sum
(
r
)
/
npos
#
indObs = sum(r)/npos
return
(
list
(
weightMat
,
indObs
))
#
return(list(weightMat,indObs))
}
#
}
computePseudoCounts
<-
function
(
freqmat
,
npos
){
#
computePseudoCounts<-function(freqmat,npos){
PC
=
matrix
(
nr
=
length
(
aa
),
nc
=
npos
)
#
PC = matrix(nr=length(aa),nc=npos)
rownames
(
PC
)
=
aa
#
rownames(PC)= aa
for
(
a
in
aa
){
#
for(a in aa){
PC
[
a
,]
=
apply
(
freqmat
,
2
,
f
<-
function
(
x
){
return
(
sum
(
x
/
bg
*
bgp
[
a
,]))})
#
PC[a,] = apply(freqmat,2,f<-function(x){return(sum(x/bg*bgp[a,]))})
}
#
}
return
(
PC
)
#
return(PC)
}
#
}
computePSSM
<-
function
(
mat
,
N
,
npos
){
#
computePSSM<-function(mat,N,npos){
# compute sequence weights
#
# compute sequence weights
res
=
computeSeqWeights
(
mat
,
N
,
npos
)
#
res = computeSeqWeights(mat,N,npos)
weights
=
res
[[
1
]]
#
weights = res[[1]]
indObs
=
res
[[
2
]]
#
indObs = res[[2]]
# extend amino acid alphabet with gaps
#
# extend amino acid alphabet with gaps
aa
=
c
(
aa
,
"-"
)
#
aa=c(aa,"-")
# compute weighted occurrences
#
# compute weighted occurrences
occMat
=
matrix
(
0
,
nr
=
length
(
aa
),
nc
=
npos
)
#
occMat = matrix(0,nr=length(aa),nc=npos)
rownames
(
occMat
)
=
aa
#
rownames(occMat) = aa
for
(
i
in
1
:
npos
){
#
for(i in 1:npos){
counts
=
tapply
(
weights
[,
i
],
mat
[,
i
],
sum
)
#
counts = tapply(weights[,i],mat[,i],sum)
occMat
[
names
(
counts
),
i
]
=
counts
#
occMat[names(counts),i] = counts
}
#
}
# distribute gaps
#
# distribute gaps
occMat
=
occMat
[
1
:
20
,]
+
t
(
occMat
[
"-"
,]
%*%
t
(
bg
))
#
occMat = occMat[1:20,] + t(occMat["-",]%*%t(bg))
# compute pseudo-counts
#
# compute pseudo-counts
PC
=
computePseudoCounts
(
occMat
,
npos
)
#
PC = computePseudoCounts(occMat,npos)
# distribute pseudo-counts according to an empirical (?) weight
#
# distribute pseudo-counts according to an empirical (?) weight
occMat
=
(
occMat
*
(
indObs
-1
)
+
PC
)
/
indObs
#
occMat = (occMat*(indObs-1) + PC) /indObs
# normalize to relative frequencies
#
# normalize to relative frequencies
n
=
apply
(
occMat
,
2
,
sum
)
#
n = apply(occMat,2,sum)
freq
=
t
(
t
(
occMat
)
/
n
)
#
freq = t(t(occMat)/n)
# compute Shannon entropy
#
# compute Shannon entropy
SE
=
apply
(
freq
,
2
,
f
<-
function
(
x
){
return
(
-
sum
(
x
*
log2
(
x
)))})
#
SE = apply(freq,2,f<-function(x){return(-sum(x*log2(x)))})
# compute Kullback-Leibler divergence
#
# compute Kullback-Leibler divergence
KL
=
apply
(
freq
,
2
,
f
<-
function
(
x
){
return
(
sum
(
x
*
log2
(
x
/
bg
)))})
#
KL = apply(freq,2,f<-function(x){return(sum(x*log2(x/bg)))})
# divide by bg freqs and convertto log-scores
#
# divide by bg freqs and convertto log-scores
pssm
=
2
*
log2
(
freq
/
bg
)
#
pssm = 2 * log2(freq/bg)
return
(
list
(
SE
,
KL
,
pssm
))
#
return(list(SE,KL,pssm))
}
#
}
computePSSM2
<-
function
(
mat
){
#
computePSSM2<-function(mat){
occMat
=
computeNbSeqs
(
mat
)
#
occMat = computeNbSeqs(mat)
n
=
apply
(
occMat
,
2
,
sum
)
#
n = apply(occMat,2,sum)
pssm
=
t
(
t
(
occMat
)
/
n
)
#
pssm = t(t(occMat)/n)
pssm
=
pssm
+
0.000001
#
pssm = pssm + 0.000001
bg
=
blosum62
[,
1
][
order
(
rownames
(
blosum62
))]
#
bg = blosum62[,1][order(rownames(blosum62))]
res
=
2
*
log2
(
pssm
/
bg
)
#
res = 2 * log2(pssm/bg)
return
(
list
(
pssm
,
res
))
#
return(list(pssm,res))
}
#
}
computeNbSeqsAlph
<-
function
(
nbSeqs
,
alphabet
){
computeNbSeqsAlph
<-
function
(
nbSeqs
,
alphabet
){
path
=
paste
(
Sys.getenv
(
"ESGEMME_PATH"
),
"/data/alphabets/"
,
sep
=
""
)
path
=
paste
(
Sys.getenv
(
"ESGEMME_PATH"
),
"/data/alphabets/"
,
sep
=
""
)
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment