Skip to content
Projects
Groups
Snippets
Help
This project
Loading...
Sign in / Register
Toggle navigation
P
PRESCOTT
Overview
Overview
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
Mustafa Tekpinar
PRESCOTT
Commits
2d117e24
Commit
2d117e24
authored
Mar 31, 2023
by
Mustafa Tekpinar
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
Commented unused code in pred.R and computePred.R
parent
5647766c
Show whitespace changes
Inline
Side-by-side
Showing
4 changed files
with
90 additions
and
92 deletions
+90
-92
README.md
README.md
+0
-2
computePred.R
computePred.R
+13
-13
example-esgemme-script.sh
examples/example-esgemme-script.sh
+1
-1
pred.R
pred.R
+76
-76
No files found.
README.md
View file @
2d117e24
...
...
@@ -55,9 +55,7 @@ JET2 configuration file is: default.conf.
JET2 output file is: myProt_jet.res.
### Analyzing the ESGEMME output
By default, ESGEMME will output the following files:
* myProt_pred_evolEpi.txt
* myProt_normPred_evolEpi.txt
* myProt_pred_evolInd.txt
* myProt_normPred_evolInd.txt
* myProt_normPred_evolCombi.txt
...
...
computePred.R
View file @
2d117e24
...
...
@@ -36,11 +36,11 @@ aliCons = ali[pId>0.6,]
# Sub-alignment restricted to the rows whose pId exceeds 0.8.
aliVeryCons <- ali[pId > 0.8, ]

# Number of sequences (rows) in the full alignment, in aliCons and in
# aliVeryCons, in that order.
N <- c(
  dim(ali)[[1]],
  dim(aliCons)[[1]],
  dim(aliVeryCons)[[1]]
)

#resAliCons = computePSSM(aliCons)
# One computePSSM() result per alignment, in the same order as N.
res <- list(
  computePSSM(ali, N[1], npos),
  computePSSM(aliCons, N[2], npos),
  computePSSM(aliVeryCons, N[3], npos)
)
# write.table(res[[1]][[3]],paste0(prot,"_pssm.txt"))
# write.table(res[[2]][[3]],paste0(prot,"_pssm60.txt"))
# write.table(res[[3]][[3]],paste0(prot,"_pssm80.txt"))
# #resAliCons = computePSSM(aliCons)
# res = list(computePSSM(ali,N[1],npos),computePSSM(aliCons,N[2],npos),computePSSM(aliVeryCons,N[3],npos))
# # write.table(res[[1]][[3]],paste0(prot,"_pssm.txt"))
# # write.table(res[[2]][[3]],paste0(prot,"_pssm60.txt"))
# # write.table(res[[3]][[3]],paste0(prot,"_pssm80.txt"))
# Load the evolutionary traces computed by JET from "<prot>_jet.res";
# the first line of that file is treated as the column header.
jet <- read.table(paste0(prot, "_jet.res"), header = TRUE)
...
...
@@ -176,14 +176,14 @@ print("done")
print("running combined model...")

# Read frequencies from the jet output file.
print(paste("Reading frequencies:"))

# Take the whole "freq" column in one step instead of appending one row at a
# time: growing a vector inside a loop copies it on every iteration, and
# 1:nrow(jet) would iterate over c(1, 0) if jet had no rows.
frequencies <- jet[, "freq"]
#print(frequencies)

# Mean of the "freq" column over all rows of jet.
freq_mean <- mean(frequencies)
# # Read frequencies from the jet output file.
# frequencies = c()
# print(paste("Reading frequencies:"))
# for (row in 1:nrow(jet)) {
# frequencies<-append(frequencies, jet[row, "freq"] )
# }
# #print(frequencies)
# freq_mean = mean(frequencies)
alpha <- 0.6
# alpha = c()
...
...
examples/example-esgemme-script.sh
View file @
2d117e24
...
...
@@ -10,7 +10,7 @@ then
rm
-rf
BLAT
*
rm
-f
default.conf caracTest.dat
rm
-rf
bin
*
.fasta
rm
-rf
blat-af2.pdb.dssp
blat-af2.pdb.dssp.new
rm
-rf
../data/blat-af2.pdb.dssp ../data/
blat-af2.pdb.dssp.new
elif
[
"
$1
"
==
"jetoff"
]
then
# If you have your own JET2 score file, you can turn off JET2 as follows:
...
...
pred.R
View file @
2d117e24
...
...
@@ -91,87 +91,87 @@ computeNbSeqs<-function(mat,gap=FALSE){
return
(
res
)
}
# Shannon entropy (in bits) of every column of a count matrix.
#
# mat: numeric matrix; the entropy is computed column by column.
# N:   value every entry is divided by before taking logarithms.
#
# Entries equal to 0 are replaced by 1 before the division, so log2() never
# receives an exact zero; such a cell therefore enters the sum as 1/N.
# Returns one value per column of mat.
computeConsSE <- function(mat, N) {
  entropy_bits <- function(p) {
    -sum(p * log2(p))
  }
  mat[mat == 0] <- 1
  apply(mat / N, 2, entropy_bits)
}
# # compute variability levels as the Shannon entropy
# computeConsSE<-function(mat,N){
# mat[mat==0] = 1
# return(apply(mat/N,2,f<-function(x){return(-sum(x*log2(x)))}))
# }
# Per-column Kullback-Leibler divergence (in bits) against a background
# distribution.
#
# mat: numeric matrix; each column is rescaled to sum to 1, then 0.000001 is
#      added to every entry before the logarithm is taken.
#
# The background is the first column of the global `blosum62`, reordered by
# its sorted row names.
# NOTE(review): this assumes the rows of mat follow that same sorted order -
# confirm against the caller.
# Returns one value per column of mat.
computeConsKL <- function(mat) {
  col_totals <- apply(mat, 2, sum)
  rel_freq <- t(t(mat) / col_totals) + 0.000001
  row_order <- order(rownames(blosum62))
  bg <- blosum62[, 1][row_order]
  kl_bits <- function(p) {
    sum(p * log2(p / bg))
  }
  apply(rel_freq, 2, kl_bits)
}
# # compute conservation levels as the Kullback-Leibler divergence
# computeConsKL<-function(mat){
# n = apply(mat,2,sum)
# mat = t(t(mat)/n)
# mat = mat + 0.000001
# bg = blosum62[,1][order(rownames(blosum62))]
# return(apply(mat,2,f<-function(x){return(sum(x*log2(x/bg)))}))
# }
# Position-specific sequence weights for an alignment.
#
# mat:  alignment matrix, one row per sequence and one column per position.
# N:    number of rows of mat to weight.
# npos: number of columns (positions).
#
# Returns list(weightMat, indObs):
#   weightMat - N x npos matrix; entry [k, j] is
#               1 / (occurrences of the symbol mat[k, j] in column j *
#                    number of distinct symbols in column j).
#   indObs    - mean number of distinct symbols per column.
#
# NOTE(review): occurrences come from computeNbSeqs(mat, TRUE) and are looked
# up with a two-column (symbol, position) index matrix; this presumably relies
# on that matrix carrying matching row/column names - confirm.
computeSeqWeights <- function(mat, N, npos) {
  # number of different observed symbols in each column
  distinct_per_col <- apply(mat, 2, function(column) length(unique(column)))

  # occurrence of each symbol at each position
  occMat <- computeNbSeqs(mat, TRUE)

  weightMat <- matrix(nrow = N, ncol = npos)
  # seq_len() keeps the loop empty when N is 0; 1:N would visit k = 1 and 0
  # and fail on mat[1, ].
  for (k in seq_len(N)) {
    midx <- cbind(mat[k, ], seq_len(npos))
    weightMat[k, ] <- 1 / (occMat[midx] * distinct_per_col)
  }

  indObs <- sum(distinct_per_col) / npos
  list(weightMat, indObs)
}
# computeSeqWeights<-function(mat,N,npos){
# # compute the number of different observed amino acids in each column
# r = apply(mat,2,f<-function(x){return(length(unique(x)))})
# # compute occurrence of each letter at each position
# occMat = computeNbSeqs(mat,TRUE)
# #print(occMat)
# # compute weights for each sequence
# weightMat=matrix(nr=N,nc=npos)
# for(k in 1:N){
# midx=cbind(mat[k,],1:npos)
# weightMat[k,] = 1 / (occMat[midx] * r)
# }
# indObs = sum(r)/npos
# return(list(weightMat,indObs))
# }
# Pseudo-count matrix derived from a frequency matrix.
#
# freqmat: numeric matrix with one column per position.
# npos:    number of columns of the result.
#
# Reads `aa`, `bg` and `bgp` from the enclosing environment. For every symbol
# a in aa and every column x of freqmat the entry is sum(x / bg * bgp[a, ]).
# Returns a length(aa) x npos matrix whose row names are aa.
computePseudoCounts <- function(freqmat, npos) {
  pseudo <- matrix(nrow = length(aa), ncol = npos)
  rownames(pseudo) <- aa
  for (residue in aa) {
    column_score <- function(column) {
      sum(column / bg * bgp[residue, ])
    }
    pseudo[residue, ] <- apply(freqmat, 2, column_score)
  }
  pseudo
}
# computePseudoCounts<-function(freqmat,npos){
# PC = matrix(nr=length(aa),nc=npos)
# rownames(PC)= aa
# for(a in aa){
# PC[a,] = apply(freqmat,2,f<-function(x){return(sum(x/bg*bgp[a,]))})
# }
# return(PC)
# }
# Build a position-specific scoring matrix from an alignment.
#
# mat:  alignment matrix, one row per sequence and one column per position.
# N:    number of sequences, forwarded to computeSeqWeights().
# npos: number of positions (columns).
#
# Reads `aa` and `bg` from the enclosing environment.
# Returns an unnamed list: per-column Shannon entropy, per-column
# Kullback-Leibler divergence against bg, and the matrix 2 * log2(freq / bg).
computePSSM <- function(mat, N, npos) {
  # sequence weights and mean number of distinct symbols per column
  seq_info <- computeSeqWeights(mat, N, npos)
  weights <- seq_info[[1]]
  indObs <- seq_info[[2]]

  # alphabet extended with the gap symbol (local copy only)
  alphabet <- c(aa, "-")

  # weighted occurrences of every symbol at every position
  occMat <- matrix(0, nrow = length(alphabet), ncol = npos)
  rownames(occMat) <- alphabet
  for (pos in 1:npos) {
    counts <- tapply(weights[, pos], mat[, pos], sum)
    occMat[names(counts), pos] <- counts
  }

  # spread the gap weight of each position over the first 20 rows in
  # proportion to bg (presumably the 20 amino acids - confirm length(aa))
  gap_share <- t(occMat["-", ] %*% t(bg))
  occMat <- occMat[1:20, ] + gap_share

  # blend observed counts (weight indObs - 1) with pseudo-counts (weight 1)
  PC <- computePseudoCounts(occMat, npos)
  occMat <- (occMat * (indObs - 1) + PC) / indObs

  # normalize every column to relative frequencies
  col_totals <- apply(occMat, 2, sum)
  freq <- t(t(occMat) / col_totals)

  # per-column Shannon entropy and Kullback-Leibler divergence
  SE <- apply(freq, 2, function(p) -sum(p * log2(p)))
  KL <- apply(freq, 2, function(p) sum(p * log2(p / bg)))

  # divide by background frequencies and convert to log-scores
  pssm <- 2 * log2(freq / bg)

  list(SE, KL, pssm)
}
# computePSSM<-function(mat,N,npos){
# # compute sequence weights
# res = computeSeqWeights(mat,N,npos)
# weights = res[[1]]
# indObs = res[[2]]
# # extend amino acid alphabet with gaps
# aa=c(aa,"-")
# # compute weighted occurrences
# occMat = matrix(0,nr=length(aa),nc=npos)
# rownames(occMat) = aa
# for(i in 1:npos){
# counts = tapply(weights[,i],mat[,i],sum)
# occMat[names(counts),i] = counts
# }
# # distribute gaps
# occMat = occMat[1:20,] + t(occMat["-",]%*%t(bg))
# # compute pseudo-counts
# PC = computePseudoCounts(occMat,npos)
# # distribute pseudo-counts according to an empirical (?) weight
# occMat = (occMat*(indObs-1) + PC) /indObs
# # normalize to relative frequencies
# n = apply(occMat,2,sum)
# freq = t(t(occMat)/n)
# # compute Shannon entropy
# SE = apply(freq,2,f<-function(x){return(-sum(x*log2(x)))})
# # compute Kullback-Leibler divergence
# KL = apply(freq,2,f<-function(x){return(sum(x*log2(x/bg)))})
# # divide by bg freqs and convert to log-scores
# pssm = 2 * log2(freq/bg)
# return(list(SE,KL,pssm))
# }
# Unweighted PSSM built directly from symbol counts.
#
# mat: alignment matrix passed to computeNbSeqs() to obtain the counts.
#
# Every column of the counts is rescaled to sum to 1 and 0.000001 is added to
# each entry. The background is the first column of the global `blosum62`,
# reordered by its sorted row names.
# Returns an unnamed list: the adjusted frequency matrix and the log-odds
# matrix 2 * log2(frequency / background).
computePSSM2 <- function(mat) {
  counts <- computeNbSeqs(mat)
  col_totals <- apply(counts, 2, sum)
  pssm <- t(t(counts) / col_totals) + 0.000001
  row_order <- order(rownames(blosum62))
  bg <- blosum62[, 1][row_order]
  log_odds <- 2 * log2(pssm / bg)
  list(pssm, log_odds)
}
# computePSSM2<-function(mat){
# occMat = computeNbSeqs(mat)
# n = apply(occMat,2,sum)
# pssm = t(t(occMat)/n)
# pssm = pssm + 0.000001
# bg = blosum62[,1][order(rownames(blosum62))]
# res = 2 * log2(pssm/bg)
# return(list(pssm,res))
# }
computeNbSeqsAlph
<-
function
(
nbSeqs
,
alphabet
){
path
=
paste
(
Sys.getenv
(
"ESGEMME_PATH"
),
"/data/alphabets/"
,
sep
=
""
)
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment