Commit 2d1df3ad by Mustafa Tekpinar

Initial Commit

parents
*~
._*
\ No newline at end of file
MIT License
Copyright (c) 2018: Elodie Laine, Yasaman Karami and Alessandra Carbone.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
\ No newline at end of file
############################################################################################################
# #
# GEMME: a tool to predict mutational outcomes using evolutionary conservation and global epistasis #
# #
############################################################################################################
#
#
# GEMME is implemented in Python and R.
# https://www.python.org/
# https://cran.r-project.org/
#
#
##################
# Dependencies: #
##################
#
# Joint Evolutionary Trees: http://www.lcqb.upmc.fr/JET2/
# seqinr R package: https://cran.r-project.org/web/packages/seqinr/index.html
# These tools should be installed to be able to use GEMME.
#
#
##################
# Installation: #
##################
#
# Download the GEMME.tgz archive from http://www.lcqb.upmc.fr/GEMME/.
# Uncompress the archive in the directory of your choice.
# Define and export the environment variable GEMME_PATH=/path-to-GEMME-directory/
# Run the program by typing "python $GEMME_PATH/gemme.py inputAli.fasta".
# A help can be accessed by typing "python $GEMME_PATH/gemme.py --help".
#
#
#################
# Usage notes: #
#################
#
# The inputAli.fasta is a mandatory argument that corresponds to the input multiple sequence
# alignment file, in FASTA format. The query sequence is taken as the first sequence in the alignment.
#
# By default, GEMME will predict the effect of all possible single mutations at all positions in the
# query sequence. Alternatively, a set of single or multiple mutations can be given with the option -m.
# Eachline of the file should contain a mutation (e.g. D136R) or combination of mutations separated
# by commas and ordered according to their positions in the sequence (e.g. D136R,V271A).
#
# GEMME calls JET2 to compute evolutionary conservation levels. By default, JET2 will retrieve a set
# of sequences related to the query, independent from the input set, according to specific criteria.
# The retrieval method used in JET2 is PSI-BLAST, which can perform the search either locally (by
# default) or remotely (-r server). Alternatively, the user can provide her/his own psiblast file
# (-r input-b pFile) or her/his own multiple sequence alignment in FASTA format (-r input -f fFile).
# JET is run in its iterative mode, iJET, 10 times and the final conservation levels are the maxium
# values obtained over the 10 iterations.
# JET2 configuration file is: default.conf.
# JET2 output file is: myProt_jet.res.
#
# By default, GEMME will output mutational effects predictions obtained from the global epistatic model,
# the independent model, and a combination of those two using a reduced alphabet (alphabets/lw-i.11.txt):
# myProt_pred_evolEpi.txt
# myProt_normPred_evolEpi.txt
# myProt_pred_evolInd.txt
# myProt_normPred_evolInd.txt
# myProt_normPred_evolCombi.txt
# The values of interest are the normalized predictions (normPred). Each file contains a 20 x n matrix,
# where n is the number of positions in the query sequence.
# If the user provides her/his own list of mutations, then only the global epistatic model will be run
# and the output file will contain 2 columns, the first one with the mutations, the second one with the
# normalized predicted effects.
#
#
####################
# Main reference: #
####################
#
# Laine E, Karami Y, Carbone A. GEMME: A Simple and Fast Global Epistatic Model Predicting Mutational Effects
# Molecular Biology and Evolution, Volume 36, Issue 11, November 2019, Pages 2604–2619
#
#
#
############################################################################################################
P
G
EKRQ
DSN
T
HC
IV
WYF
A
LM
P
G
EKRQ
D
SN
T
HC
IV
WYF
A
LM
P
G
EKRQ
D
SN
T
H
C
IV
WYF
A
LM
P
G
E
KRQ
D
SN
T
H
C
IV
WYF
A
LM
P
G
E
KRQ
D
S
N
T
H
C
IV
WYF
A
LM
P
G
E
KRQ
D
S
N
T
H
C
IV
W
YF
A
LM
P
G
E
KRQ
D
S
N
T
H
C
IV
W
YF
A
L
M
P
G
E
K
RQ
D
S
N
T
H
C
IV
W
YF
A
L
M
P
G
E
K
RQ
D
S
N
T
H
C
I
V
W
YF
A
L
M
P
G
E
K
R
Q
D
S
N
T
H
C
I
V
W
YF
A
L
M
PGEKRQDSNTHC
IVWYFALM
PG
EKRQDSNTHC
IVWYFALM
PG
EKRQDSNTHC
IVWYF
ALM
PG
EKRQ
DSNTHC
IVWYF
ALM
PG
EKRQ
DSN
THC
IVWYF
ALM
PG
EKRQ
DSN
THC
IVWYF
A
LM
P
G
EKRQ
DSN
THC
IVWYF
A
LM
P
G
EKRQ
DSN
THC
IV
WYF
A
LM
LFIMVWCY
HATGPRQSNEDK
LFI
MVWCY
HA
TGPRQSNED
K
EKQR
IV
LY
F
AM
W
HT
C
DNS
GP
AEKQR
IV
LM
F
Y
W
C
H
T
DNS
GP
AEKQR
I
V
LM
F
Y
W
C
H
T
DNS
GP
AEKQR
I
V
L
M
F
Y
W
C
H
T
DNS
GP
DEKQ
R
A
I
V
L
M
F
Y
W
C
H
T
GNPS
ACEFHIKLMQRVWY
DGNPST
\ No newline at end of file
AEHKQR
CFILMVWY
DGNPST
\ No newline at end of file
AEHKQR
CFILMVWY
DNST
GP
\ No newline at end of file
AEHKQR
FILMVWY
CST
DN
GP
\ No newline at end of file
AEKQR
FIV
LMWY
HCT
DNS
GP
EKQR
FIV
LMWY
ACH
ST
DN
GP
EKQR
IV
LWY
AM
CF
HT
DNS
GP
EKQR
IV
L
F
AMW
CY
HT
DNS
GP
G
D
N
AEFIKLMQRVW
Y
H
C
T
S
P
\ No newline at end of file
G
D
N
AEFIKLMQRV
W
Y
H
C
T
S
P
\ No newline at end of file
G
D
N
AEFIKLMQV
R
W
Y
H
C
T
S
P
\ No newline at end of file
G
D
N
AEFIKLMV
Q
R
W
Y
H
C
T
S
P
\ No newline at end of file
G
D
N
AEFIKLV
M
Q
R
W
Y
H
C
T
S
P
\ No newline at end of file
GP
DNAEFIKLVMQRWYHCTS
G
DNAEFIKLVMQRWYHCTS
P
G
ADEKNQRST
CFHILMVWY
P
G
ND
AEHKQRST
CFILMVWY
P
\ No newline at end of file
G
DN
AEFHIKLMQRVWY
CT
S
P
\ No newline at end of file
G
DN
AEFIKLMQRVWY
CH
T
S
P
\ No newline at end of file
G
D
N
AEFIKLMQRVWY
CH
T
S
P
\ No newline at end of file
G
D
N
AEFIKLMQRVWY
H
C
T
S
P
\ No newline at end of file
FWYM
CLIV
AP
NST
GH
DE
KQR
FWYM
H
LIVC
A
NST
P
G
DE
KQR
LIVFM
Y
W
C
DN
TSKEQR
A
G
P
H
\ No newline at end of file
LIVF
M
Y
W
C
DN
TSKEQ
R
A
G
P
H
\ No newline at end of file
LIV
F
M
Y
W
C
DN
TS
KEQ
R
A
G
P
H
\ No newline at end of file
LIV
F
M
Y
W
C
D
N
TS
KEQ
R
A
G
P
H
\ No newline at end of file
LIV
F
M
Y
W
C
D
N
TS
KE
Q
R
A
G
P
H
\ No newline at end of file
LIV
F
M
Y
W
C
D
N
T
S
KE
Q
R
A
G
P
H
\ No newline at end of file
LIVFMYWC
DNTSKEQRAGPH
\ No newline at end of file
LIVFMYW
C
DNTSKEQRAGPH
\ No newline at end of file
LIVFMYW
C
DNTSKEQRAGP
H
\ No newline at end of file
LIVFMY
W
C
DNTSKEQRAGP
H
\ No newline at end of file
LIVFMY
W
C
DNTSKEQRAG
P
H
\ No newline at end of file
LIVFMY
W
C
DNTSKEQRA
G
P
H
\ No newline at end of file
LIVFM
Y
W
C
DNTSKEQRA
G
P
H
\ No newline at end of file
LIVFM
Y
W
C
DNTSKEQR
A
G
P
H
\ No newline at end of file
AST
C
DEN
FY
G
H
ILMV
KQR
P
W
FWY
ML
IV
CA
TS
NH
P
G
DE
QRK
FWY
ML
IV
CA
TS
NH
P
G
D
QE
RK
FWY
ML
IV
C
A
TS
NH
P
G
D
QE
RK
FWY
ML
IV
C
A
T
S
NH
P
G
D
QE
RK
FWY
ML
IV
C
A
T
S
NH
P
G
D
QE
R
K
FWY
ML
IV
C
A
T
S
N
H
P
G
D
QE
R
K
W
FY
ML
IV
C
A
T
S
N
H
P
G
D
QE
R
K
W
FY
ML
IV
C
A
T
S
N
H
P
G
D
Q
E
R
K
W
FY
M
L
IV
C
A
T
S
N
H
P
G
D
Q
E
R
K
W
F
Y
M
L
IV
C
A
T
S
N
H
P
G
D
Q
E
R
K
CMFILVWY
AGTSNQDEHRKP
CMFILVWY
AGTSP
NQDEHRK
CMFWY
ILV
AGTS
NQDEHRKP
FWYH
MILV
CATSP
G
NQDERK
FWYH
MILV
CATS
P
G
NQDERK
FWYH
MILV
CATS
P
G
NQDE
RK
FWYH
MILV
CA
NTS
P
G
DE
QRK
FWYH
ML
IV
CA
NTS
P
G
DE
QRK
C
FYW
ML
IV
G
P
ATS
NH
QED
RK
C
FYW
ML
IV
G
P
A
TS
NH
QED
RK
C
FYW
ML
IV
G
P
A
TS
NH
QE
D
RK
C
FYW
ML
IV
G
P
A
T
S
NH
QE
D
RK
C
FYW
ML
IV
G
P
A
T
S
N
H
QE
D
RK
FWY
ML
IV
C
A
T
S
N
H
P
G
D
QE
R
K
W
FY
ML
IV
C
A
T
S
N
H
P
G
D
QE
R
K
W
FY
ML
IV
C
A
T
S
N
H
P
G
D
Q
E
R
K
W
FY
M
L
IV
C
A
T
S
N
H
P
G
D
Q
E
R
K
W
F
Y
M
L
IV
C
A
T
S
N
H
P
G
D
Q
E
R
K
CFYWMLIV
GPATSNHQEDRK
CFYWMLIV
GPATS
NHQEDRK
CFYW
MLIV
GPATS
NHQEDRK
CFYW
MLIV
G
PATS
NHQEDRK
CFYW
MLIV
G
P
ATS
NHQEDRK
CFYW
MLIV
G
P
ATS
NHQED
RK
CFYW
MLIV
G
P
ATS
NH
QED
RK
CFYW
ML
IV
G
P
ATS
NH
QED
RK
IMV
L
FWY
G
P
C
A
STNH
RKQE
D
IMV
L
FWY
G
P
C
A
STNH
RKQ
E
D
IMV
L
FWY
G
P
C
A
ST
N
HRKQ
E
D
IMV
L
F
WY
G
P
C
A
ST
N
HRKQ
E
D
IMV
L
F
WY
G
P
C
A
S
T
N
HRKQ
E
D
IMV
L
F
WY
G
P
C
A
S
T
N
H
RKQ
E
D
IMV
L
F
W
Y
G
P
C
A
S
T
N
H
RKQ
E
D
IMVLFWY
GPCASTNHQEDRK
IMVLFWY
GPCAST
NHQEDRK
IMVLFWY
G
PCAST
NHQEDRK
IMVL
FWY
G
PCAST
NHQEDRK
IMVL
FWY
G
P
CAST
NHQEDRK
IMVL
FWY
G
P
CAST
NHQED
RK
IMV
L
FWY
G
P
CAST
NHQED
RK
IMV
L
FWY
G
P
C
AST
NHQED
RK
MF
ILV
A
C
WYQHP
G
TSN
RK
D
E
MF
IL
V
A
C
WYQHP
G
TSN
RK
D
E
MF
IL
V
A
C
WYQHP
G
TS
N
RK
D
E
MF
IL
V
A
C
WYQHP
G
T
S
N
RK
D
E
MF
I
L
V
A
C
WYQHP
G
T
S
N
RK
D
E
MF
IL
V
A
C
WYQ
H
P
G
T
S
N
RK
D
E
MF
I
L
V
A
C
WYQ
H
P
G
T
S
N
RK
D
E
MFILVAW
CYQHPGTSNRKDE
MFILVAW
CYQHPGTSNRK
DE
MFILV
ACW
YQHPGTSNRK
DE
MFILV
ACW
YQHPGTSN
RK
DE
MFILV
A
C
WYQHPGTSN
RK
DE
MFILV
A
C
WYQHP
GTSN
RK
DE
MFILV
A
C
WYQHP
G
TSN
RK
DE
MF
ILV
A
C
WYQHP
G
TSN
RK
DE
LVIM
C
A
G
ST
P
FYW
EDNQ
KR
H
LVIM
C
A
G
S
T
P
FY
W
E
D
N
Q
KR
H
LVIMCAGSTPFYW
EDNQKRH
LVIMC
AGSTP
FYW
EDNQKRH
LVIMC
AG
ST
P
FYW
EDNQ
KR
H
AG
C
DEKNPQRST
FILMVWY
H
AVLIMC
FWYH
STNQ
KR
DE
GP
YF
LIVM
C
W
DN
TSQKER
A
G
H
P
\ No newline at end of file
YF
LIVM
C
W
DN
TSQ
KER
A
G
H
P
\ No newline at end of file
YF
LIVM
C
W
D
N
TSQ
KER
A
G
H
P
\ No newline at end of file
Y
F
LIVM
C
W
D
N
TSQ
KER
A
G
H
P
\ No newline at end of file
Y
F
LIV
M
C
W
D
N
TSQ
KER
A
G
H
P
\ No newline at end of file
YFLIVMCW
DNTSQKERAGHP
\ No newline at end of file
YFLIVMC
W
DNTSQKERAGHP
\ No newline at end of file
YFLIVMC
W
DNTSQKERAGH
P
\ No newline at end of file
YFLIVM
C
W
DNTSQKERAG
H
P
\ No newline at end of file
YFLIVM
C
W
DNTSQKERA
G
H
P
\ No newline at end of file
YFLIVM
C
W
DN
TSQKERA
G
H
P
\ No newline at end of file
P
G
EKRQ
DSN
T
HC
IV
WYF
A
LM
P
G
EKRQ
D
SN
T
HC
IV
WYF
A
LM
P
G
EKRQ
D
SN
T
H
C
IV
WYF
A
LM
P
G
E
KRQ
D
SN
T
H
C
IV
WYF
A
LM
P
G
E
KRQ
D
S
N
T
H
C
IV
WYF
A
LM
P
G
E
KRQ
D
S
N
T
H
C
IV
W
YF
A
LM
P
G
E
KRQ
D
S
N
T
H
C
IV
W
YF
A
L
M
P
G
E
K
RQ
D
S
N
T
H
C
IV
W
YF
A
L
M
P
G
E
K
RQ
D
S
N
T
H
C
I
V
W
YF
A
L
M
P
G
E
K
R
Q
D
S
N
T
H
C
I
V
W
YF
A
L
M
PGEKRQDSNTHC
IVWYFALM
PG
EKRQDSNTHC
IVWYFALM
PG
EKRQDSNTHC
IVWYF
ALM
PG
EKRQ
DSNTHC
IVWYF
ALM
PG
EKRQ
DSN
THC
IVWYF
ALM
PG
EKRQ
DSN
THC
IVWYF
A
LM
P
G
EKRQ
DSN
THC
IVWYF
A
LM
P
G
EKRQ
DSN
THC
IV
WYF
A
LM
CMFILVWY
ATH
GP
DE
SNQRK
a r n d c q e g h i l k m f p s t w y v
a 0.0215 0.0023 0.0019 0.0022 0.0016 0.0019 0.003 0.0058 0.0011 0.0032 0.0044 0.0033 0.0013 0.0016 0.0022 0.0063 0.0037 4e-04 0.0013 0.0051
r 0.0023 0.0178 0.002 0.0016 4e-04 0.0025 0.0027 0.0017 0.0012 0.0012 0.0024 0.0062 8e-04 9e-04 0.001 0.0023 0.0018 3e-04 9e-04 0.0016
n 0.0019 0.002 0.0141 0.0037 4e-04 0.0015 0.0022 0.0029 0.0014 0.001 0.0014 0.0024 5e-04 8e-04 9e-04 0.0031 0.0022 2e-04 7e-04 0.0012
d 0.0022 0.0016 0.0037 0.0213 4e-04 0.0016 0.0049 0.0025 0.001 0.0012 0.0015 0.0024 5e-04 8e-04 0.0012 0.0028 0.0019 2e-04 6e-04 0.0013
c 0.0016 4e-04 4e-04 4e-04 0.0119 3e-04 4e-04 8e-04 2e-04 0.0011 0.0016 5e-04 4e-04 5e-04 4e-04 0.001 9e-04 1e-04 3e-04 0.0014
q 0.0019 0.0025 0.0015 0.0016 3e-04 0.0073 0.0035 0.0014 0.001 9e-04 0.0016 0.0031 7e-04 5e-04 8e-04 0.0019 0.0014 2e-04 7e-04 0.0012
e 0.003 0.0027 0.0022 0.0049 4e-04 0.0035 0.0161 0.0019 0.0014 0.0012 0.002 0.0041 7e-04 9e-04 0.0014 0.003 0.002 3e-04 9e-04 0.0017
g 0.0058 0.0017 0.0029 0.0025 8e-04 0.0014 0.0019 0.0378 0.001 0.0014 0.0021 0.0025 7e-04 0.0012 0.0014 0.0038 0.0022 4e-04 8e-04 0.0018
h 0.0011 0.0012 0.0014 0.001 2e-04 0.001 0.0014 0.001 0.0093 6e-04 0.001 0.0012 4e-04 8e-04 5e-04 0.0011 7e-04 2e-04 0.0015 6e-04
i 0.0032 0.0012 0.001 0.0012 0.0011 9e-04 0.0012 0.0014 6e-04 0.0184 0.0114 0.0016 0.0025 0.003 0.001 0.0017 0.0027 4e-04 0.0014 0.012
l 0.0044 0.0024 0.0014 0.0015 0.0016 0.0016 0.002 0.0021 0.001 0.0114 0.0371 0.0025 0.0049 0.0054 0.0014 0.0024 0.0033 7e-04 0.0022 0.0095
k 0.0033 0.0062 0.0024 0.0024 5e-04 0.0031 0.0041 0.0025 0.0012 0.0016 0.0025 0.0161 9e-04 9e-04 0.0016 0.0031 0.0023 3e-04 0.001 0.0019
m 0.0013 8e-04 5e-04 5e-04 4e-04 7e-04 7e-04 7e-04 4e-04 0.0025 0.0049 9e-04 0.004 0.0012 4e-04 9e-04 0.001 2e-04 6e-04 0.0023
f 0.0016 9e-04 8e-04 8e-04 5e-04 5e-04 9e-04 0.0012 8e-04 0.003 0.0054 9e-04 0.0012 0.0183 5e-04 0.0012 0.0012 8e-04 0.0042 0.0026
p 0.0022 0.001 9e-04 0.0012 4e-04 8e-04 0.0014 0.0014 5e-04 0.001 0.0014 0.0016 4e-04 5e-04 0.0191 0.0017 0.0014 1e-04 5e-04 0.0012
s 0.0063 0.0023 0.0031 0.0028 0.001 0.0019 0.003 0.0038 0.0011 0.0017 0.0024 0.0031 9e-04 0.0012 0.0017 0.0126 0.0047 3e-04 0.001 0.0024
t 0.0037 0.0018 0.0022 0.0019 9e-04 0.0014 0.002 0.0022 7e-04 0.0027 0.0033 0.0023 0.001 0.0012 0.0014 0.0047 0.0125 3e-04 9e-04 0.0036
w 4e-04 3e-04 2e-04 2e-04 1e-04 2e-04 3e-04 4e-04 2e-04 4e-04 7e-04 3e-04 2e-04 8e-04 1e-04 3e-04 3e-04 0.0065 9e-04 4e-04
y 0.0013 9e-04 7e-04 6e-04 3e-04 7e-04 9e-04 8e-04 0.0015 0.0014 0.0022 0.001 6e-04 0.0042 5e-04 0.001 9e-04 9e-04 0.0102 0.0015
v 0.0051 0.0016 0.0012 0.0013 0.0014 0.0012 0.0017 0.0018 6e-04 0.012 0.0095 0.0019 0.0023 0.0026 0.0012 0.0024 0.0036 4e-04 0.0015 0.0196
# Copyright (c) 2018: Elodie Laine
# This code is part of the gemme package and governed by its license.
# Please see the LICENSE.txt file included as part of this package.
# Usage: Rscript --save runPred.R XXXX
library("seqinr")
source(paste(Sys.getenv("GEMME_PATH"),"/pred.R",sep=""))
# amino acid full alphabet
aa = c("a","c","d","e","f","g","h","i","k","l","m","n","p","q","r","s","t","v","w","y")
# BLOSUM62 matrix
blosum62p = as.matrix(read.table(paste0(Sys.getenv("GEMME_PATH"),"/blosum62p.txt"),row.names=1))
bgp = blosum62p[order(rownames(blosum62p)),order(colnames(blosum62p))]
bg = apply(bgp,1,sum)
# get arguemnts
args = commandArgs(trailingOnly = TRUE)
prot = args[1]
aliFile = args[2]
simple = args[3]
fname = args[4]
# read alignment and convert to matrix
ali=as.matrix.alignment(read.alignment(aliFile,format="fasta"))
ali[!ali%in%aa] = "-"
npos = dim(ali)[[2]]
# convert to binary matrix with respect to the query (0: identical, 1: different)
binAli = t(apply(ali,1,f<-function(x){return(x!=ali[1,])}))
pId =apply(binAli,1,f<-function(x){return(1-sum(x)/npos)})
aliCons = ali[pId>0.6,]
aliVeryCons = ali[pId>0.8,]
# number of sequences
N = c(dim(ali)[[1]],dim(aliCons)[[1]],dim(aliVeryCons)[[1]])
#resAliCons = computePSSM(aliCons)
res = list(computePSSM(ali,N[1],npos),computePSSM(aliCons,N[2],npos),computePSSM(aliVeryCons,N[3],npos))
write.table(res[[1]][[3]],paste0(prot,"_pssm.txt"))
write.table(res[[2]][[3]],paste0(prot,"_pssm60.txt"))
write.table(res[[3]][[3]],paste0(prot,"_pssm80.txt"))
# read evolutionary traces computed by JET
jet=read.table(paste(prot,"_jet.res",sep=""),head=TRUE)
if(sum(colnames(jet)=="traceMax")==1){trace=jet[,"traceMax"]}else{trace=jet[,"trace"]}
#traceAli = sweep(binAli, MARGIN=2, trace, `*`)
# compute evolutionary distances of all sequences with respect to the query
distTrace = binAli[2:N[1],] %*% trace^2
wt=read.fasta(paste(prot,".fasta",sep=""))
n = length(wt[[1]])
wt = wt[[1]][1:n]
#indicesQ = getIndicesAli(ali$seq,1)
if(simple==FALSE){
print("reading selected mutations...")
res=readMut(fname)
pos = res[[1]]
subsaa = res[[2]]
rawMut = res[[3]]
print("done")
}
print("running independent model...")
# compute sequence counts for each position and each aa
nbSeqs = computeNbSeqs(ali)
# compute gap counts for each position
nbGaps = N[1] - apply(nbSeqs,2,sum)
# output the conservation values
dat=rbind(trace,KL=res[[1]][[2]],SE=res[[1]][[1]],gap=nbGaps/N[1],KL60=res[[2]][[2]],SE60=res[[2]][[1]],KL80=res[[3]][[2]],SE80=res[[3]][[1]])
write.table(dat,paste0(prot,"_conservation.txt"))
# compute log-odd ratios between mutated and wt sequence counts
predInd = computePredNbSeqs(wt,nbSeqs)
rownames(predInd)=aa
if(simple){
normPredInd = normalizePredWithNbSeqsZero(predInd,trace,wt)
rownames(normPredInd)=aa
}else{
normPredInd = normalizePredWithNbSeqsZeroSelMult(predInd, trace, wt, list(pos,subsaa))
names(normPredInd)=rawMut
}
# output the sequence counts log-odd ratios
write.table(predInd,paste0(prot,"_pred_evolInd.txt"))
# output the predicted mutational effects based on sequence counts (conservation at the bottom)
write.table(normPredInd,paste0(prot,"_normPred_evolInd.txt"))
print("done")
print("running global epistatic model...")
pred=computePredSimple(ali,distTrace,wt,5)
rownames(pred)=aa
if(simple){
normPred=normalizePred(pred, trace, wt)
rownames(normPred)=aa
}else{
normPred=normalizePredSelMult(pred, trace, wt, list(pos,subsaa))
names(normPred)=rawMut
}
# output the evolutionary distances between the query and the closest variants
evolDist = pred/sum(trace^2)
evolDist[is.na(evolDist)] = 1
write.table(evolDist,paste0(prot,"_pred_evolEpi.txt"))
# output the predicted mutational effects based on evolutionary distance (conservation at the bottom)
write.table(normPred,paste0(prot,"_normPred_evolEpi.txt"))
print("done")
print("running combined model...")
alpha = 0.6
alphabet = "lz-bl.7"
if(simple){
normPredCombi = normalizePredWithNbSeqsPC(pred,trace,wt,alpha,nbSeqs,alphabet)
rownames(normPredCombi)=aa
}else{
normPredCombi = normalizePredWithNbSeqsPCSelMult(pred,trace,wt,list(pos,subsaa),alpha,nbSeqs[[1]],alphabet)
names(normPredCombi)=rawMut
}
# output the predicted mutational effects based on sequence counts (conservation at the bottom)
write.table(normPredCombi,paste0(prot,"_normPred_evolCombi.txt"))
print("done")
if(simple){
print("generating the plots...")
plotMatBlues(paste(prot,"_normPred_evolEpi",sep=""))
plotMatGreens(paste(prot,"_normPred_evolInd",sep=""))
plotMatOranges(paste(prot,"_normPred_evolCombi",sep=""))
print("done")
}
*****************************************
>SequenceRetrieving
method input change this parameter with -r option of JET. See usage of JET to obtain description of this parameter
format fasta fasta: fasta input file, fasta: fasta input file
*****************************************
>QBlast
eValue 1.0E-5 psiblast maximum expected value threshold
results 5000 maximum number of results
url http://www.ncbi.nlm.nih.gov/BLAST/Blast.cgi BlastQ server URL
database nr database used
matrix blosum62 matrix used to fetch homologs
gap_existence 11 BLOSUM62=11, PAM30=9, BLOSUM45=15, PAM70=BLOSUM80=10
gap_extension 1 BLOSUM62=1, PAM30=1, BLOSUM45=2, PAM70=BLOSUM80=1
max_iter 3 number of iteration for psi-blast
****************************************
>PDB
url http://www.rcsb.org/pdb/downloadFile.do URL of PDB server
*****************************************
>Filter
min_identity 0.20 min sequence identity
max_identity 0.98 max sequence identity
*****************************************
>Sample
length_cutoff 0.8 minimum sequence length expressed in number of residues
*****************************************
>Software
clustalW /usr/local/bin/clustalw2 clustalW system dependent command
muscle /usr/bin/muscle muscle system dependent command
naccess /opt/JET2/naccess2.1.1/naccess naccess system dependent command
psiblast /opt/blast-2.2.27+/bin/psiblast psiblast system dependent command
*****************************************
>Data
substMatrix /opt/JET2/matrix directory location of matrices used in JET (Blosum62, gonnet and hsdm)
blastDatabases /disk1/blastdb/ directory location of databases used for local blast (nr{0-7})
*****************************************
>ET
coverage 0.95 maximum coverage percentage of trace
freq_cutoff 0.0 minimum frequency of trace residue
msaNumber -1 number of alignments (trees), -1 for JET computting
seqNumber -1 number of sequences in alignments, -1 for JET computting
*****************************************
>Access
probe_radius 1.4 radius of probe used for accessible surface detection
res_cutoff 0.05 minimum percentage accessible surface of a residu
atom_cutoff 0.01 minimum accessible surface of an atom
accessType chain change this parameter with -d option of JET. See usage of JET to obtain description of this parameter
*****************************************
>CV
max_dist 20.0 max distance
*****************************************
>Interface
cutoff 0 minimum percentage accessible surface variation of an interface residu
ligand no (yes|no) keep contact of ligand (SUBSTRATE, PRODUCT and COFACTOR of database ENZYME) to compute interface of protein
enzymeCpd /opt/JET/jet/data/enzyme.txt location of file containing database ENZYME
homologousPDB no (yes|no) add interface residues of homologous structures (find in pdb database clustered at 95% of identities) to interface of protein
clusteredPDB /opt/JET/jet/data/clusters95.txt location of pdb database clustered at 95% of identities
*****************************************
>Cluster
max_dist 5.0 max distance between atoms to aggregate
analysis 2 change this parameter with -a option of JET. See usage of JET to obtain description of this parameter
namePcCol pc name of the column in results file containing the phisical-chemical score of residues (do not change this parameter)
namePcLCol pcL name of the column in results file containing the residues propensities to be found at prot-lig interfaces (do not change this parameter)
nameTraceCol trace name of the column in results file containing the conservation score of residues (do not change this parameter)
coverage -1 change this parameter with -s option of JET. See usage of JET to obtain description of this parameter
This source diff could not be displayed because it is too large. You can view the blob instead.
# -*- coding: utf-8 -*-
# Copyright (c) 2018: Elodie Laine
# This code is part of the gemme package and governed by its license.
# Please see the LICENSE.txt file included as part of this package.
import sys
import os
import argparse
import re
import subprocess
import math
from gemmeAnal import *
def check_argument_groups(parser, arg_dict, group, argument):
"""
Check for use of arguments.
Raise an error if the parser uses an argument that belongs to an argument
group (i.e. mode) but the group flag is not used or if the argument is
required in the argument group and it's not used.
Notes:
- argument and group should be strings and should start with - or --
- group should be a flag with action 'store_true'
- argument shouldn't be a flag, and should be None if it isn't used.
>>> import argparse
>>> parser = argparse.ArgumentParser()
>>> parser.add_argument('--phylo', action='store_true')
_StoreTrueAction(...)
>>> parser.add_argument('--inseq')
_StoreAction(...)
>>> args = parser.parse_args(['--phylo', '--inseq', 'not_none'])
>>> check_argument_groups(parser, vars(args), '--phylo', '--inseq', True)
"""
c = 0
for myArg in argument:
arg_name = myArg.replace("-", "")
if arg_dict[arg_name]!='':
c = c+1
group_name = group.replace("-", "")
if arg_dict[group_name]=="input":
if c!=1:
parser.error("gemme requires " + str(argument) +
" if " + group + " is set to input.")
else:
if c>0:
parser.error("gemme requires " + group +
" to be set to input if " + str(argument) + " is used.")
return None
def parse_command_line():
"""
Parse command line.
It uses argparse to parse phylosofs' command line arguments and returns the
argparse parser.
"""
parser = argparse.ArgumentParser(
prog="gemme",
description="""
GEMME (Global Epistasis Model for predicting Mutational Effects)
is a tool to predict mutational outcomes based on sequence analysis
""",
epilog="""
It has been developed at LCQB (Laboratory of Computational and
Quantitative Biology), UMR 7238 CNRS, Sorbonne Université.
If you use it, please cite:
Laine E, Karami Y, Carbone A. Predicting Protein Mutational Landscapes
using Evolutionary Conservation and Global Epistasis
""",
)
parser.add_argument(
'input', metavar='aliFile', type=str,
help='input alignment file in FASTA format (first sequence = ungapped query sequence)',
)
parser.add_argument(
'-m', '--mutations',
help='text file containing the mutations of interest. Each line of the file should contain a mutation (e.g. D136R) or combination of mutations separated by commas and ordered according to their positions in the sequence (e.g. D136R,V271A)',
default=''
)
retMet_args = parser.add_argument_group('Conservation levels calculation', """
Arguments used for computing the conservation levels.
""")
retMet_args.add_argument(
'-n', '--nIter',
help='number of iterations to compute the conservation levels',
default='1'
)
retMet_args.add_argument(
'-N', '--NSeqs',
help='maximum number of sequences to compute the conservation levels',
default='20000'
)
retMet_args.add_argument(
'-r', '--retrievingMethod',
help='mode to retrieve related sequences for conservation levels calculation (input, local or server)',
default='local'
)
retMet_args.add_argument(
'-b', '--blastFile',
help='psiblast file containing related sequences',
default=''
)
retMet_args.add_argument(
'-f', '--fastaFile',
help='fasta file containing related sequences',
default=''
)
args = parser.parse_args()
arg_dict = vars(args)
check_argument_groups(parser, arg_dict, '--retrievingMethod', ['--blastFile','--fastaFile'])
# Check flag arguments
if args.input is None:
parser.error("gemme requires an input alignment.")
return args
def doit(inAli,mutFile,retMet,bFile,fFile,n,N):
"""
doit(args.input,args.mutations,args.retrievingMethod,args.blastFile,args.fastaFile)
"""
prot,seq,nl=extractQuerySeq(inAli)
createPDB(prot,seq)
print "query protein: "+prot
print "computing conservation levels..."
launchJET(prot,retMet,bFile,fFile,n,N,nl)
print "done"
launchPred(prot,inAli,mutFile)
cleanTheMess(prot,bFile,fFile)
def main():
args = parse_command_line()
doit(args.input,args.mutations,args.retrievingMethod,args.blastFile,args.fastaFile,args.nIter,args.NSeqs)
if (__name__ == '__main__'):
main()
# -*- coding: utf-8 -*-
# Copyright (c) 2018: Elodie Laine
# This code is part of the gemme package and governed by its license.
# Please see the LICENSE.txt file included as part of this package.
import sys
import os
import argparse
import re
import math
import subprocess
# Extract the query sequence from the input alignment
def extractQuerySeq(filename):
fIN = open(filename,"r")
lines = fIN.readlines()
fIN.close()
if lines[0][0]!=">":
raise Exception('bad FASTA format')
else:
prot = re.compile("[^A-Z0-9a-z]").split(lines[0][1:])[0]
fOUT = open(prot+".fasta","w")
fOUT.write(">"+prot+"\n")
seq=""
i = 1
while lines[i][0]!=">":
seq = seq + lines[i].strip().strip(".").strip("-")
fOUT.write(lines[i])
i = i + 1
fOUT.close()
return prot,seq,i-1
# Get the number of sequences in a multi-fasta file
def getNbSeq(filename):
if filename!='':
proc=subprocess.Popen("grep -c '^>' "+filename,stdout=subprocess.PIPE,shell=True)
return int(proc.stdout.read())
else:
return 0
# Create a PDB file with dummy CA atoms based on the query sequence
def createPDB(prot,seq):
d = {'C': 'CYS', 'D': 'ASP', 'S': 'SER', 'Q': 'GLN', 'K': 'LYS',
'I': 'ILE', 'P': 'PRO', 'T': 'THR', 'F': 'PHE', 'N': 'ASN',
'G': 'GLY', 'H': 'HIS', 'L': 'LEU', 'R': 'ARG', 'W': 'TRP',
'A': 'ALA', 'V': 'VAL', 'E': 'GLU', 'Y': 'TYR', 'M': 'MET'}
fOUT = open(prot+'.pdb','w')
i = 1
for let in seq:
fOUT.write('ATOM%7d CA %s A%4d 43.524 70.381 46.465 1.00 0.0\n'%(i,d[let],i))
i += 1
fOUT.close()
# Edit JET configuration file with correct number of Seqs & MSA
def editConfJET(N):
reCode=subprocess.call("sed -i 's/results\t\t5000/results\t\t"+str(N)+"/' default.conf",shell=True)
return(reCode)
# Run JET to compute TJET values
def launchJET(prot,retMet,bFile,fFile,n,N,nl):
subprocess.call("cp $GEMME_PATH/default.conf .",shell=True)
if retMet=="input":
if bFile!='':
subprocess.call("cp "+bFile+" "+prot+"_A.psiblast",shell=True)
jetcmd = "java -Xmx1000m -cp $JET_PATH:$JET_PATH/jet/extLibs/vecmath.jar jet.JET -c default.conf -i "+prot+".pdb -o `pwd` -p J -r input -b "+prot+"_A.psiblast -d chain -n "+n+" > "+prot+".out"
else:
print N
editConfJET(N)
#subprocess.call("cp "+fFile+" "+prot+"_A.fasta",shell=True)
subprocess.call("grep -m "+str(int(N)+1)+" -A "+str(nl)+" '^>' "+fFile+" > "+prot+"_A.fasta",shell=True)
jetcmd = "java -Xmx1000m -cp $JET_PATH:$JET_PATH/jet/extLibs/vecmath.jar jet.JET -c default.conf -i "+prot+".pdb -o `pwd` -p J -r input -f "+prot+"_A.fasta -d chain -n "+n+" > "+prot+".out"
else:
jetcmd = "java -Xmx1000m -cp $JET_PATH:$JET_PATH/jet/extLibs/vecmath.jar jet.JET -c default.conf -i "+prot+".pdb -o `pwd` -p J -r "+retMet+" -d chain -n "+n+" > "+prot+".out"
print jetcmd
reCode=subprocess.call(jetcmd,shell=True)
if os.path.isfile(prot+"/"+prot+"_jet.res"):
os.rename(prot+"/"+prot+"_jet.res",prot+"_jet.res")
return(reCode)
# Run Rscript to compute predictions
def launchPred(prot,inAli,mutFile):
if mutFile!='':
rcmd="Rscript --save $GEMME_PATH/computePred.R "+prot+" "+inAli+" FALSE "+mutFile
else:
rcmd="Rscript --save $GEMME_PATH/computePred.R "+prot+" "+inAli+" TRUE none"
print rcmd
reCode=subprocess.call(rcmd,shell=True)
return(reCode)
# Remove temporary files
def cleanTheMess(prot,bFile,fFile):
if bFile!='':
if bFile!=prot+"_A.psiblast":
os.remove(prot+"_A.psiblast")
else:
if os.path.isfile(prot+"/"+prot+"_A.psiblast"):
os.rename(prot+"/"+prot+"_A.psiblast",prot+"_A.psiblast")
if fFile!='':
if fFile!=prot+"_A.fasta":
if os.path.isfile(prot+"_A.fasta"):
os.remove(prot+"_A.fasta")
# if os.path.isfile(prot+"/"+prot+"_jet.res"):
# os.rename(prot+"/"+prot+"_jet.res",prot+"_jet.res")
os.remove(prot+".pdb")
dir_name = prot+"/"
if os.path.isdir(dir_name):
for f in os.listdir(dir_name):
f_path = os.path.join(dir_name, f)
if os.path.isfile(f_path):
os.remove(f_path)
os.rmdir(dir_name)
# Copyright (c) 2018: Elodie Laine
# This code is part of the gemme package and governed by its license.
# Please see the LICENSE.txt file included as part of this package.
# module containing all R functions to manipulate data and compute pedictions
###########################################################################################
###################### Functions to manipulate alignment data #############################
###########################################################################################
# Given an alignment seqAli, the number of the sequence of interest (i)
# and a position (pos) in the sequence, get the corresponding
# index/position in the alignment (used by countSeq, see below)
getIndexMatAli<-function(matAli, i, pos){
seq = matAli[i,]
countAll = 0
count = 0
while(count<pos){
countAll = countAll + 1
if(seq[countAll]!="-"){count = count + 1}
}
return(countAll)
}
# Given an alignment seqAli, the number of the sequence of interest (i)
# and a vector of positions (pos) in the sequence, get the corresponding
# indices/positions in the alignment (used by countSeq, see below)
getIndexAli<-function(seqAli, i, pos){
seq = strsplit(seqAli[i][[1]],"")[[1]]
res = c()
countAll = 0
count = 0
for(p in pos){
while(count<p){
countAll = countAll + 1
if(seq[countAll]!="-"){count = count + 1}
}
res = c(res,countAll)
}
return(res)
}
# Given an alignment seqAli and the number of the sequence of interest (i)
# get all the indices/positions in the alignment
getIndicesAli<-function(seqAli, i){
seq = strsplit(seqAli[i][[1]],"")[[1]]
n = sum(seq!="-"&seq!=".")
res = c()
countAll = 0
count = 0
for(p in 1:n){
while(count<p){
countAll = countAll + 1
if(seq[countAll]!="-"&seq[countAll]!="."){count = count + 1}
}
res = c(res,countAll)
}
return(res)
}
# Compute the weights for sequences in the alignment
computeWeights<-function(matAli,theta=0.2){
nbSeq = dim(matAli)[[1]]
nbPos = dim(matAli)[[2]]
res=c()
matD=matrix(0,nr=nbSeq,nc=nbSeq)
for(i in 1:(nbSeq-1)){
for(j in (i+1):nbSeq){
matD[i,j]=sum(matAli[i,]!=matAli[j,])/nbPos
matD[j,i]=matD[i,j]}
val=1/(sum(matD[i,]<theta))
res=c(res,val)
#print(val)
}
return(res)
}
computeNbSeqs<-function(mat,gap=FALSE){
if(gap){aa=c(aa,"-")}
occ = apply(mat,2,table)
n = length(occ)
res = matrix(0,nc=n,nr=length(aa))
colnames(res) = 1:n
rownames(res) = aa
for(i in 1:n){
reducedAA = intersect(aa,names(occ[[i]]))
res[reducedAA,i] = occ[[i]][reducedAA]
}
return(res)
}
# compute variability levels as the Shanon entropy
computeConsSE<-function(mat,N){
mat[mat==0] = 1
return(apply(mat/N,2,f<-function(x){return(-sum(x*log2(x)))}))
}
# compute conservation levels as the Kullback-Leibler divergence
computeConsKL<-function(mat){
n = apply(mat,2,sum)
mat = t(t(mat)/n)
mat = mat + 0.000001
bg = blosum62[,1][order(rownames(blosum62))]
return(apply(mat,2,f<-function(x){return(sum(x*log2(x/bg)))}))
}
computeSeqWeights<-function(mat,N,npos){
# compute the number of different observed amino acids in each column
r = apply(mat,2,f<-function(x){return(length(unique(x)))})
# compute occurrence of each letter at each position
occMat = computeNbSeqs(mat,TRUE)
#print(occMat)
# compute weights for each sequence
weightMat=matrix(nr=N,nc=npos)
for(k in 1:N){
midx=cbind(mat[k,],1:npos)
weightMat[k,] = 1 / (occMat[midx] * r)
}
indObs = sum(r)/npos
return(list(weightMat,indObs))
}
computePseudoCounts<-function(freqmat,npos){
PC = matrix(nr=length(aa),nc=npos)
rownames(PC)= aa
for(a in aa){
PC[a,] = apply(freqmat,2,f<-function(x){return(sum(x/bg*bgp[a,]))})
}
return(PC)
}
computePSSM<-function(mat,N,npos){
# compute sequence weights
res = computeSeqWeights(mat,N,npos)
weights = res[[1]]
indObs = res[[2]]
# extend amino acid alphabet with gaps
aa=c(aa,"-")
# compute weighted occurrences
occMat = matrix(0,nr=length(aa),nc=npos)
rownames(occMat) = aa
for(i in 1:npos){
counts = tapply(weights[,i],mat[,i],sum)
occMat[names(counts),i] = counts
}
# distribute gaps
occMat = occMat[1:20,] + t(occMat["-",]%*%t(bg))
# compute pseudo-counts
PC = computePseudoCounts(occMat,npos)
# distribute pseudo-counts according to an empirical (?) weight
occMat = (occMat*(indObs-1) + PC) /indObs
# normalize to relative frequencies
n = apply(occMat,2,sum)
freq = t(t(occMat)/n)
# compute Shannon entropy
SE = apply(freq,2,f<-function(x){return(-sum(x*log2(x)))})
# compute Kullback-Leibler divergence
KL = apply(freq,2,f<-function(x){return(sum(x*log2(x/bg)))})
# divide by bg freqs and convertto log-scores
pssm = 2 * log2(freq/bg)
return(list(SE,KL,pssm))
}
computePSSM2<-function(mat){
occMat = computeNbSeqs(mat)
n = apply(occMat,2,sum)
pssm = t(t(occMat)/n)
pssm = pssm + 0.000001
bg = blosum62[,1][order(rownames(blosum62))]
res = 2 * log2(pssm/bg)
return(list(pssm,res))
}
computeNbSeqsAlph<-function(nbSeqs,alphabet){
path=paste(Sys.getenv("GEMME_PATH"),"/alphabets/",sep="")
AA = read.table(paste(path,alphabet,".txt",sep=""))$V1
facAA = rep(NA,20)
for(Let in AA){
Let = toString(Let)
lets = strsplit(Let,"")[[1]]
for(let in lets){
facAA[aa==tolower(let)]=Let
}
}
newNbSeqs = apply(nbSeqs,2,f<-function(x){return(tapply(x,facAA,sum))})
return(newNbSeqs)
}
readMut<-function(fname){
rawMut = read.table(fname,colClass="character")$V1
n = length(rawMut)
pos = list()
subsaa = list()
for(i in 1:n){
mut = strsplit(rawMut[i],",")[[1]]
tmpPos = c()
tmpSubsaa = c()
for(m in mut){
le=nchar(m)
tmpPos = c(tmpPos,as.numeric(substr(m,2,le-1)))
tmpSubsaa = c(tmpSubsaa,substr(m,le,le))
}
pos[[i]] = tmpPos
subsaa[[i]] = tmpSubsaa
}
return(list(pos,subsaa,rawMut))
}
which.class<-function(a,alphabet){
path=paste(Sys.getenv("GEMME_PATH"),"/alphabets/",sep="")
AA = read.table(paste(path,alphabet,".txt",sep=""))$V1
for(Let in AA){
Let = toString(Let)
lets = tolower(strsplit(Let,"")[[1]])
if(sum(lets==a)>0){return(Let)}
}
}
###########################################################################################
###################### Functions to compute predictions ##################################
###########################################################################################
# log-odd ratio between the mutated and wild-type sequence counts
computePredNbSeqs<-function(wt, nbseqs){
n = length(wt)
pred = matrix(nc=n,nr=20)
rownames(pred)=aa
for(i in 1:n){
for(a in aa){
if(a!=wt[i]){
pred[a,i] = log(max(1,nbseqs[a,i])/nbseqs[wt[i],i])}
else{
pred[a,i] = 0}
}
}
return(pred)
}
computePredNbSeqsSelMult<-function(wt, nbseqs, listMut){
# check mutation list
pos = listMut[[1]]
let = listMut[[2]]
N = length(pos)
if(N!=length(let)){print("Problem!!!! not the same number of positions and aas !!")}
# compute predictions
pred = list()
for(i in 1:N){
myPos = pos[[i]]
myAA = tolower(let[[i]])
n = length(myPos)
predTMP = c()
subsStr=""
# treat individual mutations
for(k in 1:n){
# subsStr=paste(subsStr,paste(j[k],myAA[k],sep=""),",")
predTMP = c(predTMP,log(max(1,nbseqs[myAA[k],myPos[k]])/nbseqs[wt[myPos[k]],myPos[k]]))
}
pred[[i]]=predTMP
}
return(pred)
}
normalizePredWithNbSeqsZero<-function(pred, trace, wt){
n = length(trace)
normPred = matrix(nc=n,nr=20)
rownames(normPred) = aa
for(i in 1:n){
normPred[,i] = pred[,i] * trace[i]
normPred[aa==wt[i],i] = NA
}
return(normPred)
}
normalizePredWithNbSeqsZeroSelMult<-function(pred, trace, wt, listMut){
pos = listMut[[1]]
let = listMut[[2]]
N = length(pos)
normPred = rep(NA,N)
for(i in 1:N){
myPos = pos[[i]]
myAA = tolower(let[[i]])
n = length(myPos)
predTMP = c()
for(k in 1:n){predTMP=c(predTMP,pred[myAA[k],myPos[k]])}
normPred[i] = sum(predTMP * trace[myPos])
}
return(normPred)
}
normalizePredWithNbSeqs<-function(pred, trace, wt, nbseqs, alpha){
n = length(trace)
normPred = matrix(nc=n,nr=20)
rownames(normPred) = aa
maxVal = max(apply(nbseqs,2,sum))
for(i in 1:n){
normPred[,i] = pred[,i] / max(pred[!is.na(pred)]) * log(1/maxVal) * (-1)
normPred[is.na(normPred[,i]),i] = - log(1/maxVal)
for(a in aa){
normPred[a,i] = alpha * normPred[a,i] - (1-alpha) * log(max(1,nbseqs[a,i])/nbseqs[wt[i],i])
}
normPred[,i] = normPred[,i] * trace[i]
normPred[aa==wt[i],i] = NA
}
return(-normPred)
}
findMinDist<-function(vec,delta){
le = length(vec)
# if no sequence was found with the substituting aa
# then no predicted value is computed
if(le==0){res = NA}
else{
# if only one sequence was found
# then just take this distance
if(le==1){res = NA}
else{
# if the minimum distance is null
# we are in the presence of the wild type
if(vec[1]==0){res = vec[1]}
# otherwise, it means we have at least two non-zero values
else{
# if there is no significant gap between the first and second values
# take the first value
if((vec[2]-vec[1])<=delta){res = vec[1]}
# otherwise, take the second one
else{res = vec[2]}
}
}
}
return(res)
}
# Compute all evolutionary distances with all homologous
# sequences in an alignment (mat) with respect to all positions
# and all possible amino acid substitutions
# returns a matrix with aa rows and nbPos columns
# for each aa and each position, we retain only the smallest distance
computePredSimple<-function(mat, distTrace, wt, thresh){
# number of positions
leSeq = length(wt)
# result matrix
pred = matrix(nc=leSeq,nr=20)
# count nb seqs
N = dim(mat)[[2]]
# the resulting matrix contains trace values
# only in mutated sequences and at relevant positions
# matAli = convertAliToTrace(mat,trace,wt)
# go over all positions
for(i in 1:leSeq){
res = vector()
for(a in aa){
if(a!=wt[i]){
sel = which(mat[,i]==a)
if(length(sel)>0){sortedDist=sort(distTrace[sel-1])}
else{sortedDist = c()}
res = c(res,findMinDist(sortedDist,thresh))
}
else{
res = c(res,0)
}
}
pred[,i] = res
}
return(pred)
}
# Normalize predicted values
# scale between 0 and 2, with 2 replacing NAs,
# and then multiply by the trace of each position
# if we have not found any sequence with the substitution of interest
# we assume the effect is maximal (2)
# then the effects are scaled according to the trace of the positions
# the more conserved a position, the more important the effect
normalizePred<-function(pred, trace, wt){
n = length(trace)
normPred = matrix(nc=n,nr=20)
for(i in 1:n){
normPred[,i] = pred[,i] / max(pred[!is.na(pred)])
normPred[is.na(normPred[,i]),i] = 1
normPred[,i] = normPred[,i] * trace[i]
normPred[aa==wt[i],i] = NA
}
return(-normPred)
}
normalizePredWithNbSeqsPC<-function(pred, trace, wt, alpha, nbSeqs, alphabet){
n = length(trace)
normPred = matrix(nc=n,nr=20)
rownames(normPred) = aa
nbseqs = computeNbSeqsAlph(nbSeqs,alphabet)
maxRefVal = max(apply(nbseqs,2,sum))
# for each position in the sequence
for(i in 1:n){
normPred[,i] = pred[,i] / max(pred[!is.na(pred)]) * log(1/maxRefVal) * (-1)
normPred[is.na(normPred[,i]),i] = - log(1/maxRefVal)
# for each possible substitution
for(a in aa){
A = which.class(wt[i],alphabet)
B = which.class(a,alphabet)
normPred[a,i] = alpha * normPred[a,i] - (1-alpha) * log(max(1,nbseqs[B,i])/nbseqs[A,i])
}
normPred[,i] = normPred[,i] * trace[i]
normPred[aa==wt[i],i] = NA
}
return(-normPred)
}
normalizePredWithNbSeqsPCSelMult<-function(pred, trace, wt, listMut, alpha, nbSeqs, alphabet){
# check mutation list
pos = listMut[[1]]
let = listMut[[2]]
N = length(pos)
if(N!=length(let)){print("Problem!!!! not the same number of positions and aas !!")}
normPred = rep(NA,N)
allVal = unlist(pred)
maxVal = max(allVal[!is.na(allVal)])
nbseqs = computeNbSeqsAlph(nbSeqs,alphabet)
maxRefVal = max(apply(nbseqs,2,sum))
# for each position in the sequence
for(i in 1:N){
predTMP=c()
myPos = pos[[i]]
myAA = tolower(let[[i]])
n = length(myPos)
for(k in 1:n){
A = which.class(wt[myPos[k]],alphabet)
B = which.class(myAA[k],alphabet)
ratio = log(max(1,nbseqs[B,myPos[k]])/nbseqs[A,myPos[k]])
val = pred[myAA[k],myPos[k]] / maxVal * log(1/maxRefVal) * (-1)
if(is.na(val)){val=- log(1/maxRefVal)}
predTMP=c(predTMP, alpha * val - (1-alpha) * ratio)
}
normPred[i] = sum(predTMP * trace[myPos])
}
return(-normPred)
}
normalizePredSelMult<-function(pred, trace, wt, listMut){
pos = listMut[[1]]
let = listMut[[2]]
N = length(pos)
allVal = unlist(pred)
maxVal = max(allVal[!is.na(allVal)])
normPred=rep(NA,N)
for(i in 1:N){
myPos = pos[[i]]
myAA = tolower(let[[i]])
n = length(myPos)
predTMP = c()
for(k in 1:n){
val = pred[myAA[k],myPos[k]] / maxVal
if(is.na(val)){val = 1}
predTMP=c(predTMP,val)}
normPred[i] = sum(predTMP * trace[myPos])
}
return(-normPred)
}
library("RColorBrewer")
##############################################
### functions for ploting the results
##############################################
plotMatOranges<-function(pred_name){
pred = as.matrix(read.table(paste(pred_name, ".txt", sep="")))
sel=seq(1,dim(pred)[[2]])
Iwidth = round(sqrt(dim(pred)[[2]])) + 3
prop=99
pred[is.na(pred)]=0
minVal = floor(min(pred))
maxVal = ceiling(max(pred))
p = (100-prop)/100
cutVal = quantile(pred,c(0,p))[2]
jpeg(paste(pred_name, '.jpg', sep=""), width = Iwidth, height = 5, units = 'in', res = 300)
print(c(-maxVal,-cutVal,-minVal))
image(sel,1:20,-t(pred[20:1,sel]),breaks=c(seq(-maxVal,-cutVal,le=80),-minVal),col=c(colorRampPalette(brewer.pal(9,"Greys"))(80)[1:10],colorRampPalette(brewer.pal(9,"Oranges"))(80)[11:70],colorRampPalette(brewer.pal(9,"Greys"))(80)[61:70]),yaxt="n",xaxt="n",ylab="",xlab="",bty="n")
axis(1,c(sel[1]-1,sel[seq(0,length(sel),by=10)]),las=2)
axis(2,1:20,toupper(rownames(pred)[20:1]),las=2,tick=FALSE)
dev.off()
}
plotMatGreens<-function(pred_name){
pred = as.matrix(read.table(paste(pred_name, ".txt", sep="")))
Iwidth = round(sqrt(dim(pred)[[2]])) + 3
sel=seq(1,dim(pred)[[2]])
prop=99
pred[is.na(pred)]=0
minVal = floor(min(pred))
maxVal = ceiling(max(pred))
p = (100-prop)/100
cutVal = quantile(pred,c(0,p))[2]
jpeg(paste(pred_name, '.jpg', sep=""), width = Iwidth, height = 5, units = 'in', res = 300)
print(c(-maxVal,-cutVal,-minVal))
image(sel,1:20,-t(pred[20:1,sel]),breaks=c(seq(-maxVal,-cutVal,le=80),-minVal),col=c(colorRampPalette(brewer.pal(9,"Greys"))(80)[1:10],colorRampPalette(brewer.pal(9,"Greens"))(80)[11:70],colorRampPalette(brewer.pal(9,"Greys"))(80)[61:70]),yaxt="n",xaxt="n",ylab="",xlab="",bty="n")
axis(1,c(sel[1]-1,sel[seq(0,length(sel),by=10)]),las=2)
axis(2,1:20,toupper(rownames(pred)[20:1]),las=2,tick=FALSE)
dev.off()
}
plotMatBlues<-function(pred_name){
pred = as.matrix(read.table(paste(pred_name, ".txt", sep="")))
sel=seq(1,dim(pred)[[2]])
Iwidth = round(sqrt(dim(pred)[[2]])) + 3
prop=99 #95
pred[is.na(pred)]=0
minVal = floor(min(pred))
maxVal = ceiling(max(pred))
p = (100-prop)/100
cutVal = quantile(pred,seq(0,p,by=p))[2]
print(c(-maxVal,-minVal,-cutVal))
jpeg(paste(pred_name, '.jpg', sep=""), width = Iwidth, height = 5, units = 'in', res = 300)
#image(sel,1:20,-t(pred[,sel]),breaks=c(seq(-maxVal,-cutVal,le=80),-minVal),col=c(colorRampPalette(brewer.pal(9,"Greys"))(80)[11:20],colorRampPalette(brewer.pal(9,"Blues"))(80)[11:70],colorRampPalette(brewer.pal(9,"Greys"))(80)[71:80]),yaxt="n",xaxt="n",ylab="",xlab="",bty="n")
image(sel,1:20,-t(pred[20:1,sel]),breaks=c(seq(-maxVal,-cutVal,le=80),-minVal),col=c(colorRampPalette(brewer.pal(9,"Greys"))(80)[1:10],colorRampPalette(brewer.pal(9,"Blues"))(80)[11:70],colorRampPalette(brewer.pal(9,"Greys"))(80)[61:70]),yaxt="n",xaxt="n",ylab="",xlab="",bty="n")
axis(1,c(sel[1]-1,sel[seq(0,length(sel),by=10)]),las=2)
axis(2,1:20,toupper(rownames(pred)[20:1]),las=2,tick=FALSE)
dev.off()
}
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment