Initial Commit

2d1df3ad · Mustafa Tekpinar · 2d1df3ad · 2d1df3ad · 2d1df3ad · 2d1df3ad
Commit 2d1df3ad authored Apr 25, 2022 by Mustafa Tekpinar
176 changed files
--- a/.gitignore
+++ b/.gitignore
+*~
+._*
\ No newline at end of file
--- a/LICENSE.txt
+++ b/LICENSE.txt
+MIT License
+
+Copyright (c) 2018: Elodie Laine, Yasaman Karami and Alessandra Carbone.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
\ No newline at end of file
--- a/README
+++ b/README
+############################################################################################################
+#                                                                                                          #
+#    GEMME: a tool to predict mutational outcomes using evolutionary conservation and global epistasis     #
+#                                                                                                          #
+############################################################################################################
+#
+#
+# GEMME is implemented in Python and R.
+# https://www.python.org/
+# https://cran.r-project.org/
+#
+#
+##################
+# Dependencies:  #
+##################
+#
+#  Joint Evolutionary Trees: http://www.lcqb.upmc.fr/JET2/
+#  seqinr R package: https://cran.r-project.org/web/packages/seqinr/index.html
+#  These tools should be installed to be able to use GEMME.
+#
+#
+##################
+# Installation:  #
+##################
+#
+# Download the GEMME.tgz archive from http://www.lcqb.upmc.fr/GEMME/.
+# Uncompress the archive in the directory of your choice.
+# Define and export the environment variable GEMME_PATH=/path-to-GEMME-directory/
+# Run the program by typing "python $GEMME_PATH/gemme.py inputAli.fasta".
+# A help can be accessed by typing "python $GEMME_PATH/gemme.py --help".
+#
+#
+#################
+# Usage notes:  #
+#################
+# 
+# The inputAli.fasta is a mandatory argument that corresponds to the input multiple sequence 
+# alignment file, in FASTA format. The query sequence is taken as the first sequence in the alignment.
+#
+# By default, GEMME will predict the effect of all possible single mutations at all positions in the 
+# query sequence. Alternatively, a set of single or multiple mutations can be given with the option -m.
+# Eachline of the file should contain a mutation (e.g. D136R) or combination of mutations separated 
+# by commas and ordered according to their positions in the sequence (e.g. D136R,V271A).
+#
+# GEMME calls JET2 to compute evolutionary conservation levels. By default, JET2 will retrieve a set
+# of sequences related to the query, independent from the input set, according to specific criteria.
+# The retrieval method used in JET2 is PSI-BLAST, which can perform the search either locally (by 
+# default) or remotely (-r server). Alternatively, the user can provide her/his own psiblast file  
+# (-r input-b pFile) or her/his own multiple sequence alignment in FASTA format (-r input -f fFile).
+# JET is run in its iterative mode, iJET, 10 times and the final conservation levels are the maxium 
+# values obtained over the 10 iterations. 
+# JET2 configuration file is: default.conf.
+# JET2 output file is: myProt_jet.res.
+#
+# By default, GEMME will output mutational effects predictions obtained from the global epistatic model,
+# the independent model, and a combination of those two using a reduced alphabet (alphabets/lw-i.11.txt):
+# myProt_pred_evolEpi.txt
+# myProt_normPred_evolEpi.txt
+# myProt_pred_evolInd.txt
+# myProt_normPred_evolInd.txt
+# myProt_normPred_evolCombi.txt
+# The values of interest are the normalized predictions (normPred). Each file contains a 20 x n matrix, 
+# where n is the number of positions in the query sequence.
+# If the user provides her/his own list of mutations, then only the global epistatic model will be run 
+# and the output file will contain 2 columns, the first one with the mutations, the second one with the 
+# normalized predicted effects.
+#
+#
+####################
+# Main reference:  #
+####################
+#
+# Laine E, Karami Y, Carbone A. GEMME: A Simple and Fast Global Epistatic Model Predicting Mutational Effects
+# Molecular Biology and Evolution, Volume 36, Issue 11, November 2019, Pages 2604–2619
+#
+#
+#
+############################################################################################################
+
--- a/alphabets/ab.10.txt
+++ b/alphabets/ab.10.txt
+P
+G
+EKRQ
+DSN
+T
+HC
+IV
+WYF
+A
+LM
--- a/alphabets/ab.11.txt
+++ b/alphabets/ab.11.txt
+P
+G
+EKRQ
+D
+SN
+T
+HC
+IV
+WYF
+A
+LM
--- a/alphabets/ab.12.txt
+++ b/alphabets/ab.12.txt
+P
+G
+EKRQ
+D
+SN
+T
+H
+C
+IV
+WYF
+A
+LM
--- a/alphabets/ab.13.txt
+++ b/alphabets/ab.13.txt
+P
+G
+E
+KRQ
+D
+SN
+T
+H
+C
+IV
+WYF
+A
+LM
--- a/alphabets/ab.14.txt
+++ b/alphabets/ab.14.txt
+P
+G
+E
+KRQ
+D
+S
+N
+T
+H
+C
+IV
+WYF
+A
+LM
--- a/alphabets/ab.15.txt
+++ b/alphabets/ab.15.txt
+P
+G
+E
+KRQ
+D
+S
+N
+T
+H
+C
+IV
+W
+YF
+A
+LM
--- a/alphabets/ab.16.txt
+++ b/alphabets/ab.16.txt
+P
+G
+E
+KRQ
+D
+S
+N
+T
+H
+C
+IV
+W
+YF
+A
+L
+M
--- a/alphabets/ab.17.txt
+++ b/alphabets/ab.17.txt
+P
+G
+E
+K
+RQ
+D
+S
+N
+T
+H
+C
+IV
+W
+YF
+A
+L
+M
--- a/alphabets/ab.18.txt
+++ b/alphabets/ab.18.txt
+P
+G
+E
+K
+RQ
+D
+S
+N
+T
+H
+C
+I
+V
+W
+YF
+A
+L
+M
--- a/alphabets/ab.19.txt
+++ b/alphabets/ab.19.txt
+P
+G
+E
+K
+R
+Q
+D
+S
+N
+T
+H
+C
+I
+V
+W
+YF
+A
+L
+M
--- a/alphabets/ab.2.txt
+++ b/alphabets/ab.2.txt
+PGEKRQDSNTHC
+IVWYFALM
--- a/alphabets/ab.3.txt
+++ b/alphabets/ab.3.txt
+PG
+EKRQDSNTHC
+IVWYFALM
--- a/alphabets/ab.4.txt
+++ b/alphabets/ab.4.txt
+PG
+EKRQDSNTHC
+IVWYF
+ALM
--- a/alphabets/ab.5.txt
+++ b/alphabets/ab.5.txt
+PG
+EKRQ
+DSNTHC
+IVWYF
+ALM
--- a/alphabets/ab.6.txt
+++ b/alphabets/ab.6.txt
+PG
+EKRQ
+DSN
+THC
+IVWYF
+ALM
--- a/alphabets/ab.7.txt
+++ b/alphabets/ab.7.txt
+PG
+EKRQ
+DSN
+THC
+IVWYF
+A
+LM
--- a/alphabets/ab.8.txt
+++ b/alphabets/ab.8.txt
+P
+G
+EKRQ
+DSN
+THC
+IVWYF
+A
+LM
--- a/alphabets/ab.9.txt
+++ b/alphabets/ab.9.txt
+P
+G
+EKRQ
+DSN
+THC
+IV
+WYF
+A
+LM
--- a/alphabets/cb.2.txt
+++ b/alphabets/cb.2.txt
+LFIMVWCY
+HATGPRQSNEDK
--- a/alphabets/cb.5.txt
+++ b/alphabets/cb.5.txt
+LFI
+MVWCY
+HA
+TGPRQSNED
+K
--- a/alphabets/dssp.10.txt
+++ b/alphabets/dssp.10.txt
+EKQR
+IV
+LY
+F
+AM
+W
+HT
+C
+DNS
+GP
--- a/alphabets/dssp.11.txt
+++ b/alphabets/dssp.11.txt
+AEKQR
+IV
+LM
+F
+Y
+W
+C
+H
+T
+DNS
+GP
--- a/alphabets/dssp.12.txt
+++ b/alphabets/dssp.12.txt
+AEKQR
+I
+V
+LM
+F
+Y
+W
+C
+H
+T
+DNS
+GP
--- a/alphabets/dssp.13.txt
+++ b/alphabets/dssp.13.txt
+AEKQR
+I
+V
+L
+M
+F
+Y
+W
+C
+H
+T
+DNS
+GP
--- a/alphabets/dssp.14.txt
+++ b/alphabets/dssp.14.txt
+DEKQ
+R
+A
+I
+V
+L
+M
+F
+Y
+W
+C
+H
+T
+GNPS
--- a/alphabets/dssp.2.txt
+++ b/alphabets/dssp.2.txt
+ACEFHIKLMQRVWY
+DGNPST
\ No newline at end of file
--- a/alphabets/dssp.3.txt
+++ b/alphabets/dssp.3.txt
+AEHKQR
+CFILMVWY
+DGNPST
\ No newline at end of file
--- a/alphabets/dssp.4.txt
+++ b/alphabets/dssp.4.txt
+AEHKQR
+CFILMVWY
+DNST
+GP
\ No newline at end of file
--- a/alphabets/dssp.5.txt
+++ b/alphabets/dssp.5.txt
+AEHKQR
+FILMVWY
+CST
+DN
+GP
\ No newline at end of file
--- a/alphabets/dssp.6.txt
+++ b/alphabets/dssp.6.txt
+AEKQR
+FIV
+LMWY
+HCT
+DNS
+GP
--- a/alphabets/dssp.7.txt
+++ b/alphabets/dssp.7.txt
+EKQR
+FIV
+LMWY
+ACH
+ST
+DN
+GP
--- a/alphabets/dssp.8.txt
+++ b/alphabets/dssp.8.txt
+EKQR
+IV
+LWY
+AM
+CF
+HT
+DNS
+GP
--- a/alphabets/dssp.9.txt
+++ b/alphabets/dssp.9.txt
+EKQR
+IV
+L
+F
+AMW
+CY
+HT
+DNS
+GP
--- a/alphabets/gbmr.10.txt
+++ b/alphabets/gbmr.10.txt
+G
+D
+N
+AEFIKLMQRVW
+Y
+H
+C
+T
+S
+P
\ No newline at end of file
--- a/alphabets/gbmr.11.txt
+++ b/alphabets/gbmr.11.txt
+G
+D
+N
+AEFIKLMQRV
+W
+Y
+H
+C
+T
+S
+P
\ No newline at end of file
--- a/alphabets/gbmr.12.txt
+++ b/alphabets/gbmr.12.txt
+G
+D
+N
+AEFIKLMQV
+R
+W
+Y
+H
+C
+T
+S
+P
\ No newline at end of file
--- a/alphabets/gbmr.13.txt
+++ b/alphabets/gbmr.13.txt
+G
+D
+N
+AEFIKLMV
+Q
+R
+W
+Y
+H
+C
+T
+S
+P
\ No newline at end of file
--- a/alphabets/gbmr.14.txt
+++ b/alphabets/gbmr.14.txt
+G
+D
+N
+AEFIKLV
+M
+Q
+R
+W
+Y
+H
+C
+T
+S
+P
\ No newline at end of file
--- a/alphabets/gbmr.2.txt
+++ b/alphabets/gbmr.2.txt
+GP
+DNAEFIKLVMQRWYHCTS
--- a/alphabets/gbmr.3.txt
+++ b/alphabets/gbmr.3.txt
+G
+DNAEFIKLVMQRWYHCTS
+P
--- a/alphabets/gbmr.4.txt
+++ b/alphabets/gbmr.4.txt
+G
+ADEKNQRST
+CFHILMVWY
+P
--- a/alphabets/gbmr.5.txt
+++ b/alphabets/gbmr.5.txt
+G
+ND
+AEHKQRST
+CFILMVWY
+P
\ No newline at end of file
--- a/alphabets/gbmr.6.txt
+++ b/alphabets/gbmr.6.txt
+G
+DN
+AEFHIKLMQRVWY
+CT
+S
+P
\ No newline at end of file
--- a/alphabets/gbmr.7.txt
+++ b/alphabets/gbmr.7.txt
+G
+DN
+AEFIKLMQRVWY
+CH
+T
+S
+P
\ No newline at end of file
--- a/alphabets/gbmr.8.txt
+++ b/alphabets/gbmr.8.txt
+G
+D
+N
+AEFIKLMQRVWY
+CH
+T
+S
+P
\ No newline at end of file
--- a/alphabets/gbmr.9.txt
+++ b/alphabets/gbmr.9.txt
+G
+D
+N
+AEFIKLMQRVWY
+H
+C
+T
+S
+P
\ No newline at end of file
--- a/alphabets/grud7.txt
+++ b/alphabets/grud7.txt
+FWYM
+CLIV
+AP
+NST
+GH
+DE
+KQR
--- a/alphabets/grud9.txt
+++ b/alphabets/grud9.txt
+FWYM
+H
+LIVC
+A
+NST
+P
+G
+DE
+KQR
--- a/alphabets/hsdm.10.txt
+++ b/alphabets/hsdm.10.txt
+LIVFM
+Y
+W
+C
+DN
+TSKEQR
+A
+G
+P
+H
\ No newline at end of file
--- a/alphabets/hsdm.12.txt
+++ b/alphabets/hsdm.12.txt
+LIVF
+M
+Y
+W
+C
+DN
+TSKEQ
+R
+A
+G
+P
+H
\ No newline at end of file
--- a/alphabets/hsdm.14.txt
+++ b/alphabets/hsdm.14.txt
+LIV
+F
+M
+Y
+W
+C
+DN
+TS
+KEQ
+R
+A
+G
+P
+H
\ No newline at end of file
--- a/alphabets/hsdm.15.txt
+++ b/alphabets/hsdm.15.txt
+LIV
+F
+M
+Y
+W
+C
+D
+N
+TS
+KEQ
+R
+A
+G
+P
+H
\ No newline at end of file
--- a/alphabets/hsdm.16.txt
+++ b/alphabets/hsdm.16.txt
+LIV
+F
+M
+Y
+W
+C
+D
+N
+TS
+KE
+Q
+R
+A
+G
+P
+H
\ No newline at end of file
--- a/alphabets/hsdm.17.txt
+++ b/alphabets/hsdm.17.txt
+LIV
+F
+M
+Y
+W
+C
+D
+N
+T
+S
+KE
+Q
+R
+A
+G
+P
+H
\ No newline at end of file
--- a/alphabets/hsdm.2.txt
+++ b/alphabets/hsdm.2.txt
+LIVFMYWC
+DNTSKEQRAGPH
\ No newline at end of file
--- a/alphabets/hsdm.3.txt
+++ b/alphabets/hsdm.3.txt
+LIVFMYW
+C
+DNTSKEQRAGPH
\ No newline at end of file
--- a/alphabets/hsdm.4.txt
+++ b/alphabets/hsdm.4.txt
+LIVFMYW
+C
+DNTSKEQRAGP
+H
\ No newline at end of file
--- a/alphabets/hsdm.5.txt
+++ b/alphabets/hsdm.5.txt
+LIVFMY
+W
+C
+DNTSKEQRAGP
+H
\ No newline at end of file
--- a/alphabets/hsdm.6.txt
+++ b/alphabets/hsdm.6.txt
+LIVFMY
+W
+C
+DNTSKEQRAG
+P
+H
\ No newline at end of file
--- a/alphabets/hsdm.7.txt
+++ b/alphabets/hsdm.7.txt
+LIVFMY
+W
+C
+DNTSKEQRA
+G
+P
+H
\ No newline at end of file
--- a/alphabets/hsdm.8.txt
+++ b/alphabets/hsdm.8.txt
+LIVFM
+Y
+W
+C
+DNTSKEQRA
+G
+P
+H
\ No newline at end of file
--- a/alphabets/hsdm.9.txt
+++ b/alphabets/hsdm.9.txt
+LIVFM
+Y
+W
+C
+DNTSKEQR
+A
+G
+P
+H
\ No newline at end of file
--- a/alphabets/lr.10.txt
+++ b/alphabets/lr.10.txt
+AST
+C
+DEN
+FY
+G
+H
+ILMV
+KQR
+P
+W
--- a/alphabets/lw-i.10.txt
+++ b/alphabets/lw-i.10.txt
+FWY
+ML
+IV
+CA
+TS
+NH
+P
+G
+DE
+QRK
--- a/alphabets/lw-i.11.txt
+++ b/alphabets/lw-i.11.txt
+FWY
+ML
+IV
+CA
+TS
+NH
+P
+G
+D
+QE
+RK
--- a/alphabets/lw-i.12.txt
+++ b/alphabets/lw-i.12.txt
+FWY
+ML
+IV
+C
+A
+TS
+NH
+P
+G
+D
+QE
+RK
--- a/alphabets/lw-i.13.txt
+++ b/alphabets/lw-i.13.txt
+FWY
+ML
+IV
+C
+A
+T
+S
+NH
+P
+G
+D
+QE
+RK
--- a/alphabets/lw-i.14.txt
+++ b/alphabets/lw-i.14.txt
+FWY
+ML
+IV
+C
+A
+T
+S
+NH
+P
+G
+D
+QE
+R
+K
--- a/alphabets/lw-i.15.txt
+++ b/alphabets/lw-i.15.txt
+FWY
+ML
+IV
+C
+A
+T
+S
+N
+H
+P
+G
+D
+QE
+R
+K
--- a/alphabets/lw-i.16.txt
+++ b/alphabets/lw-i.16.txt
+W
+FY
+ML
+IV
+C
+A
+T
+S
+N
+H
+P
+G
+D
+QE
+R
+K
--- a/alphabets/lw-i.17.txt
+++ b/alphabets/lw-i.17.txt
+W
+FY
+ML
+IV
+C
+A
+T
+S
+N
+H
+P
+G
+D
+Q
+E
+R
+K
--- a/alphabets/lw-i.18.txt
+++ b/alphabets/lw-i.18.txt
+W
+FY
+M
+L
+IV
+C
+A
+T
+S
+N
+H
+P
+G
+D
+Q
+E
+R
+K
--- a/alphabets/lw-i.19.txt
+++ b/alphabets/lw-i.19.txt
+W
+F
+Y
+M
+L
+IV
+C
+A
+T
+S
+N
+H
+P
+G
+D
+Q
+E
+R
+K
--- a/alphabets/lw-i.2.txt
+++ b/alphabets/lw-i.2.txt
+CMFILVWY
+AGTSNQDEHRKP
--- a/alphabets/lw-i.3.txt
+++ b/alphabets/lw-i.3.txt
+CMFILVWY
+AGTSP
+NQDEHRK
--- a/alphabets/lw-i.4.txt
+++ b/alphabets/lw-i.4.txt
+CMFWY
+ILV
+AGTS
+NQDEHRKP
--- a/alphabets/lw-i.5.txt
+++ b/alphabets/lw-i.5.txt
+FWYH
+MILV
+CATSP
+G
+NQDERK
--- a/alphabets/lw-i.6.txt
+++ b/alphabets/lw-i.6.txt
+FWYH
+MILV
+CATS
+P
+G
+NQDERK
--- a/alphabets/lw-i.7.txt
+++ b/alphabets/lw-i.7.txt
+FWYH
+MILV
+CATS
+P
+G
+NQDE
+RK
--- a/alphabets/lw-i.8.txt
+++ b/alphabets/lw-i.8.txt
+FWYH
+MILV
+CA
+NTS
+P
+G
+DE
+QRK
--- a/alphabets/lw-i.9.txt
+++ b/alphabets/lw-i.9.txt
+FWYH
+ML
+IV
+CA
+NTS
+P
+G
+DE
+QRK
--- a/alphabets/lw-ni.10.txt
+++ b/alphabets/lw-ni.10.txt
+C
+FYW
+ML
+IV
+G
+P
+ATS
+NH
+QED
+RK
--- a/alphabets/lw-ni.11.txt
+++ b/alphabets/lw-ni.11.txt
+C
+FYW
+ML
+IV
+G
+P
+A
+TS
+NH
+QED
+RK
--- a/alphabets/lw-ni.12.txt
+++ b/alphabets/lw-ni.12.txt
+C
+FYW
+ML
+IV
+G
+P
+A
+TS
+NH
+QE
+D
+RK
--- a/alphabets/lw-ni.13.txt
+++ b/alphabets/lw-ni.13.txt
+C
+FYW
+ML
+IV
+G
+P
+A
+T
+S
+NH
+QE
+D
+RK
--- a/alphabets/lw-ni.14.txt
+++ b/alphabets/lw-ni.14.txt
+C
+FYW
+ML
+IV
+G
+P
+A
+T
+S
+N
+H
+QE
+D
+RK
--- a/alphabets/lw-ni.15.txt
+++ b/alphabets/lw-ni.15.txt
+FWY
+ML
+IV
+C
+A
+T
+S
+N
+H
+P
+G
+D
+QE
+R
+K
--- a/alphabets/lw-ni.16.txt
+++ b/alphabets/lw-ni.16.txt
+W
+FY
+ML
+IV
+C
+A
+T
+S
+N
+H
+P
+G
+D
+QE
+R
+K
--- a/alphabets/lw-ni.17.txt
+++ b/alphabets/lw-ni.17.txt
+W
+FY
+ML
+IV
+C
+A
+T
+S
+N
+H
+P
+G
+D
+Q
+E
+R
+K
--- a/alphabets/lw-ni.18.txt
+++ b/alphabets/lw-ni.18.txt
+W
+FY
+M
+L
+IV
+C
+A
+T
+S
+N
+H
+P
+G
+D
+Q
+E
+R
+K
--- a/alphabets/lw-ni.19.txt
+++ b/alphabets/lw-ni.19.txt
+W
+F
+Y
+M
+L
+IV
+C
+A
+T
+S
+N
+H
+P
+G
+D
+Q
+E
+R
+K
--- a/alphabets/lw-ni.2.txt
+++ b/alphabets/lw-ni.2.txt
+CFYWMLIV
+GPATSNHQEDRK
--- a/alphabets/lw-ni.3.txt
+++ b/alphabets/lw-ni.3.txt
+CFYWMLIV
+GPATS
+NHQEDRK
--- a/alphabets/lw-ni.4.txt
+++ b/alphabets/lw-ni.4.txt
+CFYW
+MLIV
+GPATS
+NHQEDRK
--- a/alphabets/lw-ni.5.txt
+++ b/alphabets/lw-ni.5.txt
+CFYW
+MLIV
+G
+PATS
+NHQEDRK
--- a/alphabets/lw-ni.6.txt
+++ b/alphabets/lw-ni.6.txt
+CFYW
+MLIV
+G
+P
+ATS
+NHQEDRK
--- a/alphabets/lw-ni.7.txt
+++ b/alphabets/lw-ni.7.txt
+CFYW
+MLIV
+G
+P
+ATS
+NHQED
+RK
--- a/alphabets/lw-ni.8.txt
+++ b/alphabets/lw-ni.8.txt
+CFYW
+MLIV
+G
+P
+ATS
+NH
+QED
+RK
--- a/alphabets/lw-ni.9.txt
+++ b/alphabets/lw-ni.9.txt
+CFYW
+ML
+IV
+G
+P
+ATS
+NH
+QED
+RK
--- a/alphabets/lz-bl.10.txt
+++ b/alphabets/lz-bl.10.txt
+IMV
+L
+FWY
+G
+P
+C
+A
+STNH
+RKQE
+D
--- a/alphabets/lz-bl.11.txt
+++ b/alphabets/lz-bl.11.txt
+IMV
+L
+FWY
+G
+P
+C
+A
+STNH
+RKQ
+E
+D
--- a/alphabets/lz-bl.12.txt
+++ b/alphabets/lz-bl.12.txt
+IMV
+L
+FWY
+G
+P
+C
+A
+ST
+N
+HRKQ
+E
+D
--- a/alphabets/lz-bl.13.txt
+++ b/alphabets/lz-bl.13.txt
+IMV
+L
+F
+WY
+G
+P
+C
+A
+ST
+N
+HRKQ
+E
+D
--- a/alphabets/lz-bl.14.txt
+++ b/alphabets/lz-bl.14.txt
+IMV
+L
+F
+WY
+G
+P
+C
+A
+S
+T
+N
+HRKQ
+E
+D
--- a/alphabets/lz-bl.15.txt
+++ b/alphabets/lz-bl.15.txt
+IMV
+L
+F
+WY
+G
+P
+C
+A
+S
+T
+N
+H
+RKQ
+E
+D
--- a/alphabets/lz-bl.16.txt
+++ b/alphabets/lz-bl.16.txt
+IMV
+L
+F
+W
+Y
+G
+P
+C
+A
+S
+T
+N
+H
+RKQ
+E
+D
--- a/alphabets/lz-bl.2.txt
+++ b/alphabets/lz-bl.2.txt
+IMVLFWY
+GPCASTNHQEDRK
--- a/alphabets/lz-bl.3.txt
+++ b/alphabets/lz-bl.3.txt
+IMVLFWY
+GPCAST
+NHQEDRK
--- a/alphabets/lz-bl.4.txt
+++ b/alphabets/lz-bl.4.txt
+IMVLFWY
+G
+PCAST
+NHQEDRK
--- a/alphabets/lz-bl.5.txt
+++ b/alphabets/lz-bl.5.txt
+IMVL
+FWY
+G
+PCAST
+NHQEDRK
--- a/alphabets/lz-bl.6.txt
+++ b/alphabets/lz-bl.6.txt
+IMVL
+FWY
+G
+P
+CAST
+NHQEDRK
--- a/alphabets/lz-bl.7.txt
+++ b/alphabets/lz-bl.7.txt
+IMVL
+FWY
+G
+P
+CAST
+NHQED
+RK
--- a/alphabets/lz-bl.8.txt
+++ b/alphabets/lz-bl.8.txt
+IMV
+L
+FWY
+G
+P
+CAST
+NHQED
+RK
--- a/alphabets/lz-bl.9.txt
+++ b/alphabets/lz-bl.9.txt
+IMV
+L
+FWY
+G
+P
+C
+AST
+NHQED
+RK
--- a/alphabets/lz-mj.10.txt
+++ b/alphabets/lz-mj.10.txt
+MF
+ILV
+A
+C
+WYQHP
+G
+TSN
+RK
+D
+E
--- a/alphabets/lz-mj.11.txt
+++ b/alphabets/lz-mj.11.txt
+MF
+IL
+V
+A
+C
+WYQHP
+G
+TSN
+RK
+D
+E
--- a/alphabets/lz-mj.12.txt
+++ b/alphabets/lz-mj.12.txt
+MF
+IL
+V
+A
+C
+WYQHP
+G
+TS
+N
+RK
+D
+E
--- a/alphabets/lz-mj.13.txt
+++ b/alphabets/lz-mj.13.txt
+MF
+IL
+V
+A
+C
+WYQHP
+G
+T
+S
+N
+RK
+D
+E
--- a/alphabets/lz-mj.14.txt
+++ b/alphabets/lz-mj.14.txt
+MF
+I
+L
+V
+A
+C
+WYQHP
+G
+T
+S
+N
+RK
+D
+E
--- a/alphabets/lz-mj.15.txt
+++ b/alphabets/lz-mj.15.txt
+MF
+IL
+V
+A
+C
+WYQ
+H
+P
+G
+T
+S
+N
+RK
+D
+E
--- a/alphabets/lz-mj.16.txt
+++ b/alphabets/lz-mj.16.txt
+MF
+I
+L
+V
+A
+C
+WYQ
+H
+P
+G
+T
+S
+N
+RK
+D
+E
--- a/alphabets/lz-mj.2.txt
+++ b/alphabets/lz-mj.2.txt
+MFILVAW
+CYQHPGTSNRKDE
--- a/alphabets/lz-mj.3.txt
+++ b/alphabets/lz-mj.3.txt
+MFILVAW
+CYQHPGTSNRK
+DE
--- a/alphabets/lz-mj.4.txt
+++ b/alphabets/lz-mj.4.txt
+MFILV
+ACW
+YQHPGTSNRK
+DE
--- a/alphabets/lz-mj.5.txt
+++ b/alphabets/lz-mj.5.txt
+MFILV
+ACW
+YQHPGTSN
+RK
+DE
--- a/alphabets/lz-mj.6.txt
+++ b/alphabets/lz-mj.6.txt
+MFILV
+A
+C
+WYQHPGTSN
+RK
+DE
--- a/alphabets/lz-mj.7.txt
+++ b/alphabets/lz-mj.7.txt
+MFILV
+A
+C
+WYQHP
+GTSN
+RK
+DE
--- a/alphabets/lz-mj.8.txt
+++ b/alphabets/lz-mj.8.txt
+MFILV
+A
+C
+WYQHP
+G
+TSN
+RK
+DE
--- a/alphabets/lz-mj.9.txt
+++ b/alphabets/lz-mj.9.txt
+MF
+ILV
+A
+C
+WYQHP
+G
+TSN
+RK
+DE
--- a/alphabets/ml.10.txt
+++ b/alphabets/ml.10.txt
+LVIM
+C
+A
+G
+ST
+P
+FYW
+EDNQ
+KR
+H
--- a/alphabets/ml.15.txt
+++ b/alphabets/ml.15.txt
+LVIM
+C
+A
+G
+S
+T
+P
+FY
+W
+E
+D
+N
+Q
+KR
+H
--- a/alphabets/ml.2.txt
+++ b/alphabets/ml.2.txt
+LVIMCAGSTPFYW
+EDNQKRH
--- a/alphabets/ml.4.txt
+++ b/alphabets/ml.4.txt
+LVIMC
+AGSTP
+FYW
+EDNQKRH
--- a/alphabets/ml.8.txt
+++ b/alphabets/ml.8.txt
+LVIMC
+AG
+ST
+P
+FYW
+EDNQ
+KR
+H
--- a/alphabets/mm.5.txt
+++ b/alphabets/mm.5.txt
+AG
+C
+DEKNPQRST
+FILMVWY
+H
--- a/alphabets/ms.6.txt
+++ b/alphabets/ms.6.txt
+AVLIMC
+FWYH
+STNQ
+KR
+DE
+GP
--- a/alphabets/sdm.10.txt
+++ b/alphabets/sdm.10.txt
+YF
+LIVM
+C
+W
+DN
+TSQKER
+A
+G
+H
+P
\ No newline at end of file
--- a/alphabets/sdm.11.txt
+++ b/alphabets/sdm.11.txt
+YF
+LIVM
+C
+W
+DN
+TSQ
+KER
+A
+G
+H
+P
\ No newline at end of file
--- a/alphabets/sdm.12.txt
+++ b/alphabets/sdm.12.txt
+YF
+LIVM
+C
+W
+D
+N
+TSQ
+KER
+A
+G
+H
+P
\ No newline at end of file
--- a/alphabets/sdm.13.txt
+++ b/alphabets/sdm.13.txt
+Y
+F
+LIVM
+C
+W
+D
+N
+TSQ
+KER
+A
+G
+H
+P
\ No newline at end of file
--- a/alphabets/sdm.14.txt
+++ b/alphabets/sdm.14.txt
+Y
+F
+LIV
+M
+C
+W
+D
+N
+TSQ
+KER
+A
+G
+H
+P
\ No newline at end of file
--- a/alphabets/sdm.2.txt
+++ b/alphabets/sdm.2.txt
+YFLIVMCW
+DNTSQKERAGHP
\ No newline at end of file
--- a/alphabets/sdm.3.txt
+++ b/alphabets/sdm.3.txt
+YFLIVMC
+W
+DNTSQKERAGHP
\ No newline at end of file
--- a/alphabets/sdm.4.txt
+++ b/alphabets/sdm.4.txt
+YFLIVMC
+W
+DNTSQKERAGH
+P
\ No newline at end of file
--- a/alphabets/sdm.6.txt
+++ b/alphabets/sdm.6.txt
+YFLIVM
+C
+W
+DNTSQKERAG
+H
+P
\ No newline at end of file
--- a/alphabets/sdm.7.txt
+++ b/alphabets/sdm.7.txt
+YFLIVM
+C
+W
+DNTSQKERA
+G
+H
+P
\ No newline at end of file
--- a/alphabets/sdm.8.txt
+++ b/alphabets/sdm.8.txt
+YFLIVM
+C
+W
+DN
+TSQKERA
+G
+H
+P
\ No newline at end of file
--- a/alphabets/td.10.txt
+++ b/alphabets/td.10.txt
+P
+G
+EKRQ
+DSN
+T
+HC
+IV
+WYF
+A
+LM
--- a/alphabets/td.11.txt
+++ b/alphabets/td.11.txt
+P
+G
+EKRQ
+D
+SN
+T
+HC
+IV
+WYF
+A
+LM
--- a/alphabets/td.12.txt
+++ b/alphabets/td.12.txt
+P
+G
+EKRQ
+D
+SN
+T
+H
+C
+IV
+WYF
+A
+LM
--- a/alphabets/td.13.txt
+++ b/alphabets/td.13.txt
+P
+G
+E
+KRQ
+D
+SN
+T
+H
+C
+IV
+WYF
+A
+LM
--- a/alphabets/td.14.txt
+++ b/alphabets/td.14.txt
+P
+G
+E
+KRQ
+D
+S
+N
+T
+H
+C
+IV
+WYF
+A
+LM
--- a/alphabets/td.15.txt
+++ b/alphabets/td.15.txt
+P
+G
+E
+KRQ
+D
+S
+N
+T
+H
+C
+IV
+W
+YF
+A
+LM
--- a/alphabets/td.16.txt
+++ b/alphabets/td.16.txt
+P
+G
+E
+KRQ
+D
+S
+N
+T
+H
+C
+IV
+W
+YF
+A
+L
+M
--- a/alphabets/td.17.txt
+++ b/alphabets/td.17.txt
+P
+G
+E
+K
+RQ
+D
+S
+N
+T
+H
+C
+IV
+W
+YF
+A
+L
+M
--- a/alphabets/td.18.txt
+++ b/alphabets/td.18.txt
+P
+G
+E
+K
+RQ
+D
+S
+N
+T
+H
+C
+I
+V
+W
+YF
+A
+L
+M
--- a/alphabets/td.19.txt
+++ b/alphabets/td.19.txt
+P
+G
+E
+K
+R
+Q
+D
+S
+N
+T
+H
+C
+I
+V
+W
+YF
+A
+L
+M
--- a/alphabets/td.2.txt
+++ b/alphabets/td.2.txt
+PGEKRQDSNTHC
+IVWYFALM
--- a/alphabets/td.3.txt
+++ b/alphabets/td.3.txt
+PG
+EKRQDSNTHC
+IVWYFALM
--- a/alphabets/td.4.txt
+++ b/alphabets/td.4.txt
+PG
+EKRQDSNTHC
+IVWYF
+ALM
--- a/alphabets/td.5.txt
+++ b/alphabets/td.5.txt
+PG
+EKRQ
+DSNTHC
+IVWYF
+ALM
--- a/alphabets/td.6.txt
+++ b/alphabets/td.6.txt
+PG
+EKRQ
+DSN
+THC
+IVWYF
+ALM
--- a/alphabets/td.7.txt
+++ b/alphabets/td.7.txt
+PG
+EKRQ
+DSN
+THC
+IVWYF
+A
+LM
--- a/alphabets/td.8.txt
+++ b/alphabets/td.8.txt
+P
+G
+EKRQ
+DSN
+THC
+IVWYF
+A
+LM
--- a/alphabets/td.9.txt
+++ b/alphabets/td.9.txt
+P
+G
+EKRQ
+DSN
+THC
+IV
+WYF
+A
+LM
--- a/alphabets/ww.5.txt
+++ b/alphabets/ww.5.txt
+CMFILVWY
+ATH
+GP
+DE
+SNQRK
--- a/blosum62p.txt
+++ b/blosum62p.txt
+a r n d c q e g h i l k m f p s t w y v
+a 0.0215 0.0023 0.0019 0.0022 0.0016 0.0019 0.003 0.0058 0.0011 0.0032 0.0044 0.0033 0.0013 0.0016 0.0022 0.0063 0.0037 4e-04 0.0013 0.0051
+r 0.0023 0.0178 0.002 0.0016 4e-04 0.0025 0.0027 0.0017 0.0012 0.0012 0.0024 0.0062 8e-04 9e-04 0.001 0.0023 0.0018 3e-04 9e-04 0.0016
+n 0.0019 0.002 0.0141 0.0037 4e-04 0.0015 0.0022 0.0029 0.0014 0.001 0.0014 0.0024 5e-04 8e-04 9e-04 0.0031 0.0022 2e-04 7e-04 0.0012
+d 0.0022 0.0016 0.0037 0.0213 4e-04 0.0016 0.0049 0.0025 0.001 0.0012 0.0015 0.0024 5e-04 8e-04 0.0012 0.0028 0.0019 2e-04 6e-04 0.0013
+c 0.0016 4e-04 4e-04 4e-04 0.0119 3e-04 4e-04 8e-04 2e-04 0.0011 0.0016 5e-04 4e-04 5e-04 4e-04 0.001 9e-04 1e-04 3e-04 0.0014
+q 0.0019 0.0025 0.0015 0.0016 3e-04 0.0073 0.0035 0.0014 0.001 9e-04 0.0016 0.0031 7e-04 5e-04 8e-04 0.0019 0.0014 2e-04 7e-04 0.0012
+e 0.003 0.0027 0.0022 0.0049 4e-04 0.0035 0.0161 0.0019 0.0014 0.0012 0.002 0.0041 7e-04 9e-04 0.0014 0.003 0.002 3e-04 9e-04 0.0017
+g 0.0058 0.0017 0.0029 0.0025 8e-04 0.0014 0.0019 0.0378 0.001 0.0014 0.0021 0.0025 7e-04 0.0012 0.0014 0.0038 0.0022 4e-04 8e-04 0.0018
+h 0.0011 0.0012 0.0014 0.001 2e-04 0.001 0.0014 0.001 0.0093 6e-04 0.001 0.0012 4e-04 8e-04 5e-04 0.0011 7e-04 2e-04 0.0015 6e-04
+i 0.0032 0.0012 0.001 0.0012 0.0011 9e-04 0.0012 0.0014 6e-04 0.0184 0.0114 0.0016 0.0025 0.003 0.001 0.0017 0.0027 4e-04 0.0014 0.012
+l 0.0044 0.0024 0.0014 0.0015 0.0016 0.0016 0.002 0.0021 0.001 0.0114 0.0371 0.0025 0.0049 0.0054 0.0014 0.0024 0.0033 7e-04 0.0022 0.0095
+k 0.0033 0.0062 0.0024 0.0024 5e-04 0.0031 0.0041 0.0025 0.0012 0.0016 0.0025 0.0161 9e-04 9e-04 0.0016 0.0031 0.0023 3e-04 0.001 0.0019
+m 0.0013 8e-04 5e-04 5e-04 4e-04 7e-04 7e-04 7e-04 4e-04 0.0025 0.0049 9e-04 0.004 0.0012 4e-04 9e-04 0.001 2e-04 6e-04 0.0023
+f 0.0016 9e-04 8e-04 8e-04 5e-04 5e-04 9e-04 0.0012 8e-04 0.003 0.0054 9e-04 0.0012 0.0183 5e-04 0.0012 0.0012 8e-04 0.0042 0.0026
+p 0.0022 0.001 9e-04 0.0012 4e-04 8e-04 0.0014 0.0014 5e-04 0.001 0.0014 0.0016 4e-04 5e-04 0.0191 0.0017 0.0014 1e-04 5e-04 0.0012
+s 0.0063 0.0023 0.0031 0.0028 0.001 0.0019 0.003 0.0038 0.0011 0.0017 0.0024 0.0031 9e-04 0.0012 0.0017 0.0126 0.0047 3e-04 0.001 0.0024
+t 0.0037 0.0018 0.0022 0.0019 9e-04 0.0014 0.002 0.0022 7e-04 0.0027 0.0033 0.0023 0.001 0.0012 0.0014 0.0047 0.0125 3e-04 9e-04 0.0036
+w 4e-04 3e-04 2e-04 2e-04 1e-04 2e-04 3e-04 4e-04 2e-04 4e-04 7e-04 3e-04 2e-04 8e-04 1e-04 3e-04 3e-04 0.0065 9e-04 4e-04
+y 0.0013 9e-04 7e-04 6e-04 3e-04 7e-04 9e-04 8e-04 0.0015 0.0014 0.0022 0.001 6e-04 0.0042 5e-04 0.001 9e-04 9e-04 0.0102 0.0015
+v 0.0051 0.0016 0.0012 0.0013 0.0014 0.0012 0.0017 0.0018 6e-04 0.012 0.0095 0.0019 0.0023 0.0026 0.0012 0.0024 0.0036 4e-04 0.0015 0.0196
--- a/computePred.R
+++ b/computePred.R
+# Copyright (c) 2018: Elodie Laine
+# This code is part of the gemme package and governed by its license.
+# Please see the LICENSE.txt file included as part of this package.
+
+# Usage: Rscript --save runPred.R XXXX
+
+library("seqinr")
+source(paste(Sys.getenv("GEMME_PATH"),"/pred.R",sep=""))
+
+# amino acid full alphabet
+aa = c("a","c","d","e","f","g","h","i","k","l","m","n","p","q","r","s","t","v","w","y")
+
+# BLOSUM62  matrix
+blosum62p = as.matrix(read.table(paste0(Sys.getenv("GEMME_PATH"),"/blosum62p.txt"),row.names=1))
+bgp = blosum62p[order(rownames(blosum62p)),order(colnames(blosum62p))]
+bg = apply(bgp,1,sum)
+
+# get arguemnts 
+args = commandArgs(trailingOnly = TRUE)
+prot = args[1]
+aliFile = args[2]
+simple = args[3]
+fname = args[4]
+
+# read alignment and convert to matrix
+ali=as.matrix.alignment(read.alignment(aliFile,format="fasta"))
+ali[!ali%in%aa] = "-"
+npos = dim(ali)[[2]]
+# convert to binary matrix with respect to the query (0: identical, 1: different)
+binAli = t(apply(ali,1,f<-function(x){return(x!=ali[1,])}))
+pId =apply(binAli,1,f<-function(x){return(1-sum(x)/npos)})
+aliCons = ali[pId>0.6,]
+aliVeryCons = ali[pId>0.8,]
+# number of sequences
+N = c(dim(ali)[[1]],dim(aliCons)[[1]],dim(aliVeryCons)[[1]])
+#resAliCons = computePSSM(aliCons)
+res = list(computePSSM(ali,N[1],npos),computePSSM(aliCons,N[2],npos),computePSSM(aliVeryCons,N[3],npos))
+write.table(res[[1]][[3]],paste0(prot,"_pssm.txt"))
+write.table(res[[2]][[3]],paste0(prot,"_pssm60.txt"))
+write.table(res[[3]][[3]],paste0(prot,"_pssm80.txt"))
+
+# read evolutionary traces computed by JET
+jet=read.table(paste(prot,"_jet.res",sep=""),head=TRUE)
+if(sum(colnames(jet)=="traceMax")==1){trace=jet[,"traceMax"]}else{trace=jet[,"trace"]}
+#traceAli = sweep(binAli, MARGIN=2, trace, `*`)
+# compute evolutionary distances of all sequences with respect to the query
+distTrace = binAli[2:N[1],] %*% trace^2
+
+wt=read.fasta(paste(prot,".fasta",sep=""))
+n = length(wt[[1]])
+wt = wt[[1]][1:n]
+#indicesQ = getIndicesAli(ali$seq,1)
+
+if(simple==FALSE){
+  print("reading selected mutations...")
+  res=readMut(fname)
+  pos = res[[1]]
+  subsaa = res[[2]]
+  rawMut = res[[3]]
+  print("done")
+}
+
+print("running independent model...")
+# compute sequence counts for each position and each aa
+nbSeqs = computeNbSeqs(ali)
+# compute gap counts for each position
+nbGaps = N[1] - apply(nbSeqs,2,sum)
+# output the conservation values
+dat=rbind(trace,KL=res[[1]][[2]],SE=res[[1]][[1]],gap=nbGaps/N[1],KL60=res[[2]][[2]],SE60=res[[2]][[1]],KL80=res[[3]][[2]],SE80=res[[3]][[1]])
+write.table(dat,paste0(prot,"_conservation.txt"))
+# compute log-odd ratios between mutated and wt sequence counts
+predInd = computePredNbSeqs(wt,nbSeqs)
+rownames(predInd)=aa
+if(simple){
+  normPredInd = normalizePredWithNbSeqsZero(predInd,trace,wt)
+  rownames(normPredInd)=aa
+}else{
+  normPredInd = normalizePredWithNbSeqsZeroSelMult(predInd, trace, wt, list(pos,subsaa))
+  names(normPredInd)=rawMut
+}
+# output the sequence counts log-odd ratios
+write.table(predInd,paste0(prot,"_pred_evolInd.txt"))
+# output the predicted mutational effects based on sequence counts (conservation at the bottom)
+write.table(normPredInd,paste0(prot,"_normPred_evolInd.txt"))
+print("done")
+
+print("running global epistatic model...")
+pred=computePredSimple(ali,distTrace,wt,5)
+rownames(pred)=aa
+if(simple){
+  normPred=normalizePred(pred, trace, wt)
+  rownames(normPred)=aa
+}else{
+  normPred=normalizePredSelMult(pred, trace, wt, list(pos,subsaa))
+  names(normPred)=rawMut
+}
+# output the evolutionary distances between the query and the closest variants
+evolDist = pred/sum(trace^2)
+evolDist[is.na(evolDist)] = 1
+write.table(evolDist,paste0(prot,"_pred_evolEpi.txt"))
+# output the predicted mutational effects based on evolutionary distance (conservation at the bottom)
+write.table(normPred,paste0(prot,"_normPred_evolEpi.txt"))
+print("done")
+
+print("running combined model...")
+alpha = 0.6
+alphabet = "lz-bl.7"
+if(simple){
+  normPredCombi = normalizePredWithNbSeqsPC(pred,trace,wt,alpha,nbSeqs,alphabet)
+  rownames(normPredCombi)=aa
+}else{
+  normPredCombi = normalizePredWithNbSeqsPCSelMult(pred,trace,wt,list(pos,subsaa),alpha,nbSeqs[[1]],alphabet)
+  names(normPredCombi)=rawMut
+}
+# output the predicted mutational effects based on sequence counts (conservation at the bottom)
+write.table(normPredCombi,paste0(prot,"_normPred_evolCombi.txt"))
+print("done")
+
+if(simple){
+  print("generating the plots...")
+  plotMatBlues(paste(prot,"_normPred_evolEpi",sep=""))
+  plotMatGreens(paste(prot,"_normPred_evolInd",sep=""))
+  plotMatOranges(paste(prot,"_normPred_evolCombi",sep=""))
+  print("done")
+}
+
+
+	
--- a/default.conf
+++ b/default.conf
+*****************************************
+>SequenceRetrieving
+method		input						change this parameter with -r option of JET. See usage of JET to obtain description of this parameter
+format		fasta						fasta: fasta input file, fasta: fasta input file
+*****************************************
+>QBlast					
+eValue 		1.0E-5						psiblast maximum expected value threshold
+results		5000	            				maximum number of results 
+url 		http://www.ncbi.nlm.nih.gov/BLAST/Blast.cgi	BlastQ server URL
+database 	nr						database used
+matrix 		blosum62					matrix used to fetch homologs
+gap_existence	11						BLOSUM62=11, PAM30=9, BLOSUM45=15, PAM70=BLOSUM80=10  
+gap_extension	1						BLOSUM62=1, PAM30=1, BLOSUM45=2, PAM70=BLOSUM80=1
+max_iter	3						number of iteration for psi-blast
+****************************************
+>PDB
+url		http://www.rcsb.org/pdb/downloadFile.do		URL of PDB server
+
+*****************************************
+>Filter
+min_identity	0.20						min sequence identity
+max_identity	0.98						max sequence identity
+*****************************************
+>Sample
+length_cutoff	0.8						minimum sequence length expressed in number of residues	
+						
+*****************************************
+>Software
+clustalW	/usr/local/bin/clustalw2			clustalW system dependent command
+muscle		/usr/bin/muscle					muscle system dependent command
+naccess		/opt/JET2/naccess2.1.1/naccess			naccess system dependent command
+psiblast	/opt/blast-2.2.27+/bin/psiblast 		psiblast system dependent command     
+*****************************************
+>Data
+substMatrix	/opt/JET2/matrix				directory location of matrices used in JET (Blosum62, gonnet and hsdm)
+blastDatabases	/disk1/blastdb/					directory location of databases used for local blast (nr{0-7})
+
+*****************************************
+>ET
+coverage	0.95						maximum coverage percentage of trace
+freq_cutoff	0.0						minimum frequency of trace residue
+msaNumber	-1						number of alignments (trees), -1 for JET computting
+seqNumber	-1						number of sequences in alignments, -1 for JET computting
+
+*****************************************
+>Access
+probe_radius	1.4						radius of probe used for accessible surface detection
+res_cutoff	0.05						minimum percentage accessible surface of a residu
+atom_cutoff	0.01						minimum accessible surface of an atom 
+accessType	chain						change this parameter with -d option of JET. See usage of JET to obtain description of this parameter
+
+*****************************************
+>CV
+max_dist        20.0                                            max distance 
+*****************************************
+>Interface
+cutoff		0         					minimum percentage accessible surface variation of an interface residu
+ligand		no						(yes|no) keep contact of ligand (SUBSTRATE, PRODUCT and COFACTOR of database ENZYME) to compute interface of protein
+enzymeCpd	/opt/JET/jet/data/enzyme.txt			location of file containing database ENZYME
+homologousPDB	no						(yes|no) add interface residues of homologous structures (find in pdb database clustered at 95% of identities) to interface of protein
+clusteredPDB	/opt/JET/jet/data/clusters95.txt		location of pdb database clustered at 95% of identities
+
+*****************************************
+>Cluster
+max_dist	5.0						max distance between atoms to aggregate
+analysis	2						change this parameter with -a option of JET. See usage of JET to obtain description of this parameter
+namePcCol	pc						name of the column in results file containing the phisical-chemical score of residues (do not change this parameter)
+namePcLCol       pcL                                            name of the column in results file containing the residues propensities to be found at prot-lig interfaces (do not change this parameter)
+nameTraceCol	trace						name of the column in results file containing the conservation score of residues (do not change this parameter)
+coverage	-1						change this parameter with -s option of JET. See usage of JET to obtain description of this parameter
+	
--- a/example/aliBLAT.fasta
+++ b/example/aliBLAT.fasta
--- a/gemme.py
+++ b/gemme.py
+# -*- coding: utf-8 -*-
+
+# Copyright (c) 2018: Elodie Laine
+# This code is part of the gemme package and governed by its license.
+# Please see the LICENSE.txt file included as part of this package.
+
+import sys
+import os
+import argparse
+import re
+import subprocess
+import math
+from gemmeAnal import *
+
+
+def check_argument_groups(parser, arg_dict, group, argument):
+    """
+    Check for use of arguments.
+    Raise an error if the parser uses an argument that belongs to an argument
+    group (i.e. mode) but the group flag is not used or if the argument is
+    required in the argument group and it's not used.
+    Notes:
+    - argument and group should be strings and should start with - or --
+    - group should be a flag with action 'store_true'
+    - argument shouldn't be a flag, and should be None if it isn't used.
+    >>> import argparse
+    >>> parser = argparse.ArgumentParser()
+    >>> parser.add_argument('--phylo', action='store_true')
+    _StoreTrueAction(...)
+    >>> parser.add_argument('--inseq')
+    _StoreAction(...)
+    >>> args = parser.parse_args(['--phylo', '--inseq', 'not_none'])
+    >>> check_argument_groups(parser, vars(args), '--phylo', '--inseq', True)
+    """
+    c = 0
+    for myArg in argument:
+    	arg_name = myArg.replace("-", "")
+    	if arg_dict[arg_name]!='':
+    		c = c+1
+    group_name = group.replace("-", "")
+    if arg_dict[group_name]=="input":
+    	if c!=1:
+    		parser.error("gemme requires " + str(argument) +
+                         " if " + group + " is set to input.")
+    else:
+        if c>0:
+            parser.error("gemme requires " + group +
+                         " to be set to input if " + str(argument) + " is used.")
+    return None
+
+def parse_command_line():
+    """
+    Parse command line.
+    It uses argparse to parse phylosofs' command line arguments and returns the
+    argparse parser.
+    """
+    parser = argparse.ArgumentParser(
+        prog="gemme",
+        description="""
+        GEMME (Global Epistasis Model for predicting Mutational Effects) 
+        is a tool to predict mutational outcomes based on sequence analysis
+        """,
+        epilog="""
+        It has been developed at LCQB (Laboratory of Computational and
+        Quantitative Biology), UMR 7238 CNRS, Sorbonne Université.
+        If you use it, please cite:
+        Laine E, Karami Y, Carbone A. Predicting Protein Mutational Landscapes
+        using Evolutionary Conservation and Global Epistasis
+        """,
+    )
+
+    parser.add_argument(
+    	'input', metavar='aliFile', type=str,
+        help='input alignment file in FASTA format (first sequence = ungapped query sequence)',
+    )
+
+    parser.add_argument(
+        '-m', '--mutations',
+        help='text file containing the mutations of interest. Each line of the file should contain a mutation (e.g. D136R) or combination of mutations separated by commas and ordered according to their positions in the sequence (e.g. D136R,V271A)',
+        default=''
+    )
+
+    retMet_args = parser.add_argument_group('Conservation levels calculation', """
+        Arguments used for computing the conservation levels.
+        """)
+    
+    retMet_args.add_argument(
+        '-n', '--nIter',
+        help='number of iterations to compute the conservation levels',
+        default='1'
+    )
+
+    retMet_args.add_argument(
+        '-N', '--NSeqs',
+        help='maximum number of sequences to compute the conservation levels',
+        default='20000'
+    )
+        
+    retMet_args.add_argument(
+        '-r', '--retrievingMethod',
+        help='mode to retrieve related sequences for conservation levels calculation (input, local or server)',
+        default='local'
+    )
+
+    retMet_args.add_argument(
+        '-b', '--blastFile',
+        help='psiblast file containing related sequences',
+        default=''
+    )
+
+    retMet_args.add_argument(
+        '-f', '--fastaFile',
+        help='fasta file containing related sequences',
+        default=''
+    )
+
+    args = parser.parse_args()
+
+    arg_dict = vars(args)
+
+    check_argument_groups(parser, arg_dict, '--retrievingMethod', ['--blastFile','--fastaFile'])
+ 
+    # Check flag arguments
+    if args.input is None:
+    	parser.error("gemme requires an input alignment.")
+
+    return args
+
+
+def doit(inAli,mutFile,retMet,bFile,fFile,n,N):
+    """
+    doit(args.input,args.mutations,args.retrievingMethod,args.blastFile,args.fastaFile)
+    """
+    prot,seq,nl=extractQuerySeq(inAli)
+    createPDB(prot,seq)
+    print "query protein: "+prot
+    print "computing conservation levels..."
+    launchJET(prot,retMet,bFile,fFile,n,N,nl)
+    print "done"
+    launchPred(prot,inAli,mutFile)
+    cleanTheMess(prot,bFile,fFile)
+
+
+def main():
+    args = parse_command_line()
+    doit(args.input,args.mutations,args.retrievingMethod,args.blastFile,args.fastaFile,args.nIter,args.NSeqs)
+
+if (__name__ == '__main__'):
+    main()
+
--- a/gemmeAnal.py
+++ b/gemmeAnal.py
+# -*- coding: utf-8 -*-
+
+# Copyright (c) 2018: Elodie Laine
+# This code is part of the gemme package and governed by its license.
+# Please see the LICENSE.txt file included as part of this package.
+
+import sys
+import os
+import argparse
+import re
+import math
+import subprocess
+
+
+# Extract the query sequence from the input alignment
+def extractQuerySeq(filename):
+
+	fIN = open(filename,"r")
+	lines = fIN.readlines()
+	fIN.close()
+
+	if lines[0][0]!=">":
+		raise Exception('bad FASTA format')
+	else:
+		prot = re.compile("[^A-Z0-9a-z]").split(lines[0][1:])[0]
+		fOUT = open(prot+".fasta","w")
+		fOUT.write(">"+prot+"\n")
+		seq=""
+		i = 1
+		while lines[i][0]!=">":
+			seq = seq + lines[i].strip().strip(".").strip("-")
+			fOUT.write(lines[i])
+			i = i + 1
+	fOUT.close()
+	return prot,seq,i-1
+
+# Get the number of sequences in a multi-fasta file
+def getNbSeq(filename):
+
+	if filename!='':
+		proc=subprocess.Popen("grep -c '^>' "+filename,stdout=subprocess.PIPE,shell=True)
+		return int(proc.stdout.read())
+	else:
+		return 0
+
+
+# Create a PDB file with dummy CA atoms based on the query sequence
+def createPDB(prot,seq):
+
+	d = {'C': 'CYS', 'D': 'ASP', 'S': 'SER',  'Q': 'GLN', 'K': 'LYS',
+	'I': 'ILE', 'P': 'PRO', 'T': 'THR', 'F': 'PHE', 'N': 'ASN', 
+	'G': 'GLY',  'H': 'HIS', 'L': 'LEU', 'R': 'ARG', 'W': 'TRP', 
+	'A': 'ALA', 'V': 'VAL', 'E': 'GLU', 'Y': 'TYR', 'M': 'MET'}
+
+	fOUT = open(prot+'.pdb','w')
+	i = 1
+	for let in seq:
+		fOUT.write('ATOM%7d  CA  %s A%4d      43.524  70.381  46.465  1.00   0.0\n'%(i,d[let],i))
+		i += 1
+	fOUT.close()
+
+# Edit JET configuration file with correct number of Seqs & MSA
+def editConfJET(N):
+
+        reCode=subprocess.call("sed -i 's/results\t\t5000/results\t\t"+str(N)+"/' default.conf",shell=True) 
+	return(reCode)
+
+# Run JET to compute TJET values
+def launchJET(prot,retMet,bFile,fFile,n,N,nl):
+
+	subprocess.call("cp $GEMME_PATH/default.conf .",shell=True)
+	if retMet=="input":
+		if bFile!='':
+			subprocess.call("cp "+bFile+" "+prot+"_A.psiblast",shell=True)
+			jetcmd = "java -Xmx1000m -cp $JET_PATH:$JET_PATH/jet/extLibs/vecmath.jar jet.JET -c default.conf -i "+prot+".pdb -o `pwd` -p J -r input -b "+prot+"_A.psiblast -d chain -n "+n+" > "+prot+".out"
+		else:
+                        print N
+			editConfJET(N)
+			#subprocess.call("cp "+fFile+" "+prot+"_A.fasta",shell=True)
+			subprocess.call("grep -m "+str(int(N)+1)+" -A "+str(nl)+" '^>' "+fFile+" > "+prot+"_A.fasta",shell=True)
+			jetcmd = "java -Xmx1000m -cp $JET_PATH:$JET_PATH/jet/extLibs/vecmath.jar jet.JET -c default.conf -i "+prot+".pdb -o `pwd` -p J -r input -f "+prot+"_A.fasta -d chain -n "+n+" > "+prot+".out"
+	else:
+		jetcmd = "java -Xmx1000m -cp $JET_PATH:$JET_PATH/jet/extLibs/vecmath.jar jet.JET -c default.conf -i "+prot+".pdb -o `pwd` -p J -r "+retMet+" -d chain -n "+n+" > "+prot+".out"
+	print jetcmd
+	reCode=subprocess.call(jetcmd,shell=True)
+	if os.path.isfile(prot+"/"+prot+"_jet.res"):
+		os.rename(prot+"/"+prot+"_jet.res",prot+"_jet.res")
+	return(reCode)
+
+# Run Rscript to compute predictions
+def launchPred(prot,inAli,mutFile):
+
+	if mutFile!='':
+		rcmd="Rscript --save $GEMME_PATH/computePred.R "+prot+" "+inAli+" FALSE "+mutFile
+	else:
+		rcmd="Rscript --save $GEMME_PATH/computePred.R "+prot+" "+inAli+" TRUE none"
+	print rcmd
+	reCode=subprocess.call(rcmd,shell=True)
+	return(reCode)
+
+# Remove temporary files
+def cleanTheMess(prot,bFile,fFile):
+
+	if bFile!='':
+		if bFile!=prot+"_A.psiblast":
+			os.remove(prot+"_A.psiblast")
+	else:
+		if os.path.isfile(prot+"/"+prot+"_A.psiblast"):
+			os.rename(prot+"/"+prot+"_A.psiblast",prot+"_A.psiblast")
+	if fFile!='':
+		if fFile!=prot+"_A.fasta":
+			if os.path.isfile(prot+"_A.fasta"):
+				os.remove(prot+"_A.fasta")		
+#	if os.path.isfile(prot+"/"+prot+"_jet.res"):
+#		os.rename(prot+"/"+prot+"_jet.res",prot+"_jet.res")
+	os.remove(prot+".pdb")
+	dir_name = prot+"/"
+	if os.path.isdir(dir_name):
+		for f in os.listdir(dir_name): 
+			f_path = os.path.join(dir_name, f)
+			if os.path.isfile(f_path):
+				os.remove(f_path)
+		os.rmdir(dir_name)
--- a/pred.R
+++ b/pred.R
+# Copyright (c) 2018: Elodie Laine
+# This code is part of the gemme package and governed by its license.
+# Please see the LICENSE.txt file included as part of this package.
+
+# module containing all R functions to manipulate data and compute pedictions
+
+
+###########################################################################################
+###################### Functions to manipulate alignment data #############################
+###########################################################################################
+
+# Given an alignment seqAli, the number of the sequence of interest (i) 
+# and a position (pos) in the sequence, get the corresponding 
+# index/position in the alignment (used by countSeq, see below)
+getIndexMatAli<-function(matAli, i, pos){
+        seq = matAli[i,]
+        countAll = 0
+        count = 0
+        while(count<pos){
+                countAll = countAll + 1
+                if(seq[countAll]!="-"){count = count + 1}
+        }
+        return(countAll)
+}
+
+# Given an alignment seqAli, the number of the sequence of interest (i) 
+# and a vector of positions (pos) in the sequence, get the corresponding 
+# indices/positions in the alignment (used by countSeq, see below)
+getIndexAli<-function(seqAli, i, pos){
+  seq = strsplit(seqAli[i][[1]],"")[[1]]
+  res = c()
+  countAll = 0
+  count = 0
+  for(p in pos){
+    while(count<p){
+      countAll = countAll + 1
+      if(seq[countAll]!="-"){count = count + 1}
+    }
+    res = c(res,countAll)
+  }
+  return(res)
+}
+
+# Given an alignment seqAli and the number of the sequence of interest (i) 
+# get all the indices/positions in the alignment 
+getIndicesAli<-function(seqAli, i){
+  seq = strsplit(seqAli[i][[1]],"")[[1]]
+  n = sum(seq!="-"&seq!=".")
+  res = c()
+  countAll = 0
+  count = 0
+  for(p in 1:n){
+    while(count<p){
+      countAll = countAll + 1
+      if(seq[countAll]!="-"&seq[countAll]!="."){count = count + 1}
+    }
+    res = c(res,countAll)
+  }
+  return(res)
+}
+
+# Compute the weights for sequences in the alignment
+computeWeights<-function(matAli,theta=0.2){
+  nbSeq = dim(matAli)[[1]]
+  nbPos = dim(matAli)[[2]]
+  res=c()
+  matD=matrix(0,nr=nbSeq,nc=nbSeq)
+  for(i in 1:(nbSeq-1)){
+    for(j in (i+1):nbSeq){
+      matD[i,j]=sum(matAli[i,]!=matAli[j,])/nbPos
+      matD[j,i]=matD[i,j]}
+    val=1/(sum(matD[i,]<theta))
+    res=c(res,val)
+    #print(val)
+  }
+  return(res)
+}
+
+computeNbSeqs<-function(mat,gap=FALSE){
+  if(gap){aa=c(aa,"-")}
+  occ = apply(mat,2,table)
+  n = length(occ)
+  res = matrix(0,nc=n,nr=length(aa))
+  colnames(res) = 1:n
+  rownames(res) = aa
+  for(i in 1:n){
+    reducedAA = intersect(aa,names(occ[[i]]))
+    res[reducedAA,i] = occ[[i]][reducedAA]
+  }
+  return(res)
+}
+
+# compute variability levels as the Shanon entropy
+computeConsSE<-function(mat,N){
+  mat[mat==0] = 1
+  return(apply(mat/N,2,f<-function(x){return(-sum(x*log2(x)))}))
+}
+
+# compute conservation levels as the Kullback-Leibler divergence
+computeConsKL<-function(mat){
+  n = apply(mat,2,sum)
+  mat = t(t(mat)/n)
+  mat = mat + 0.000001
+  bg = blosum62[,1][order(rownames(blosum62))]
+  return(apply(mat,2,f<-function(x){return(sum(x*log2(x/bg)))}))
+}
+
+computeSeqWeights<-function(mat,N,npos){
+  # compute the number of different observed amino acids in each column
+  r = apply(mat,2,f<-function(x){return(length(unique(x)))})
+  # compute occurrence of each letter at each position
+  occMat = computeNbSeqs(mat,TRUE)
+  #print(occMat)
+  # compute weights for each sequence
+  weightMat=matrix(nr=N,nc=npos)
+  for(k in 1:N){
+    midx=cbind(mat[k,],1:npos)
+    weightMat[k,] = 1 / (occMat[midx] * r)
+  }
+  indObs = sum(r)/npos
+  return(list(weightMat,indObs))
+}
+
+computePseudoCounts<-function(freqmat,npos){
+  PC = matrix(nr=length(aa),nc=npos)
+  rownames(PC)= aa
+  for(a in aa){
+    PC[a,] = apply(freqmat,2,f<-function(x){return(sum(x/bg*bgp[a,]))})
+  }
+  return(PC)
+}
+
+computePSSM<-function(mat,N,npos){
+  # compute sequence weights
+  res  = computeSeqWeights(mat,N,npos)
+  weights = res[[1]]
+  indObs = res[[2]]
+  # extend amino acid alphabet with gaps
+  aa=c(aa,"-")
+  # compute  weighted occurrences
+  occMat = matrix(0,nr=length(aa),nc=npos)
+  rownames(occMat) = aa
+  for(i in 1:npos){
+    counts = tapply(weights[,i],mat[,i],sum)
+    occMat[names(counts),i] = counts
+  }
+  # distribute gaps
+  occMat = occMat[1:20,] + t(occMat["-",]%*%t(bg))
+  # compute pseudo-counts
+  PC = computePseudoCounts(occMat,npos)
+  # distribute pseudo-counts according to an empirical (?) weight
+  occMat = (occMat*(indObs-1) + PC) /indObs
+  # normalize to relative frequencies
+  n = apply(occMat,2,sum)
+  freq = t(t(occMat)/n)
+  # compute Shannon entropy
+  SE = apply(freq,2,f<-function(x){return(-sum(x*log2(x)))})
+  # compute Kullback-Leibler divergence
+  KL = apply(freq,2,f<-function(x){return(sum(x*log2(x/bg)))})
+  # divide by bg freqs and convertto log-scores
+  pssm = 2 * log2(freq/bg)
+  return(list(SE,KL,pssm))
+}
+
+computePSSM2<-function(mat){
+  occMat = computeNbSeqs(mat)
+  n = apply(occMat,2,sum)
+  pssm = t(t(occMat)/n)
+  pssm = pssm + 0.000001
+  bg = blosum62[,1][order(rownames(blosum62))]
+  res = 2 * log2(pssm/bg)
+  return(list(pssm,res))
+}
+
+computeNbSeqsAlph<-function(nbSeqs,alphabet){
+	path=paste(Sys.getenv("GEMME_PATH"),"/alphabets/",sep="")
+        AA = read.table(paste(path,alphabet,".txt",sep=""))$V1
+        facAA = rep(NA,20)
+        for(Let in AA){
+                Let = toString(Let)
+                lets = strsplit(Let,"")[[1]]
+                for(let in lets){
+                        facAA[aa==tolower(let)]=Let
+                }
+        }
+        newNbSeqs = apply(nbSeqs,2,f<-function(x){return(tapply(x,facAA,sum))})
+        return(newNbSeqs)
+}
+
+readMut<-function(fname){
+	rawMut = read.table(fname,colClass="character")$V1
+	n = length(rawMut)
+	pos = list()
+	subsaa = list()
+	for(i in 1:n){
+		mut = strsplit(rawMut[i],",")[[1]]
+		tmpPos = c()
+		tmpSubsaa = c()
+		for(m in mut){
+			le=nchar(m)
+			tmpPos = c(tmpPos,as.numeric(substr(m,2,le-1)))
+			tmpSubsaa = c(tmpSubsaa,substr(m,le,le))
+		}
+		pos[[i]] = tmpPos
+                subsaa[[i]] = tmpSubsaa
+	}
+	return(list(pos,subsaa,rawMut))
+}
+
+which.class<-function(a,alphabet){
+        path=paste(Sys.getenv("GEMME_PATH"),"/alphabets/",sep="")
+        AA = read.table(paste(path,alphabet,".txt",sep=""))$V1
+        for(Let in AA){
+                Let = toString(Let)
+                lets = tolower(strsplit(Let,"")[[1]])
+                if(sum(lets==a)>0){return(Let)}
+        }
+}
+
+###########################################################################################
+###################### Functions to compute predictions  ##################################
+###########################################################################################
+
+# log-odd ratio between the mutated and wild-type sequence counts
+computePredNbSeqs<-function(wt, nbseqs){
+  n = length(wt)
+  pred = matrix(nc=n,nr=20)
+  rownames(pred)=aa
+  for(i in 1:n){
+    for(a in aa){
+       if(a!=wt[i]){
+         pred[a,i] = log(max(1,nbseqs[a,i])/nbseqs[wt[i],i])}
+      else{
+        pred[a,i] = 0}
+    }
+  }
+  return(pred)
+}
+
+computePredNbSeqsSelMult<-function(wt, nbseqs, listMut){
+  # check mutation list
+  pos = listMut[[1]]
+  let = listMut[[2]]
+  N = length(pos)
+  if(N!=length(let)){print("Problem!!!! not the same number of positions and aas !!")}
+  # compute predictions
+  pred = list()
+  for(i in 1:N){
+    myPos = pos[[i]]
+    myAA = tolower(let[[i]])
+    n = length(myPos)
+    predTMP = c()
+    subsStr=""
+    # treat individual mutations
+    for(k in 1:n){
+    #  subsStr=paste(subsStr,paste(j[k],myAA[k],sep=""),",")
+      predTMP = c(predTMP,log(max(1,nbseqs[myAA[k],myPos[k]])/nbseqs[wt[myPos[k]],myPos[k]]))
+    }
+    pred[[i]]=predTMP
+  }
+  return(pred)
+}
+
+normalizePredWithNbSeqsZero<-function(pred, trace, wt){
+  n = length(trace)
+  normPred = matrix(nc=n,nr=20)
+  rownames(normPred) = aa
+  for(i in 1:n){
+    normPred[,i] = pred[,i] * trace[i]
+    normPred[aa==wt[i],i] = NA
+  }
+  return(normPred)
+}
+
+normalizePredWithNbSeqsZeroSelMult<-function(pred, trace, wt, listMut){
+  pos = listMut[[1]]
+  let = listMut[[2]]
+  N = length(pos)
+  normPred = rep(NA,N)
+  for(i in 1:N){
+    myPos = pos[[i]]
+    myAA = tolower(let[[i]])
+    n = length(myPos)
+    predTMP = c()
+    for(k in 1:n){predTMP=c(predTMP,pred[myAA[k],myPos[k]])}
+    normPred[i] = sum(predTMP * trace[myPos])
+  }
+  return(normPred)
+}
+
+normalizePredWithNbSeqs<-function(pred, trace, wt, nbseqs, alpha){
+  n = length(trace)
+  normPred = matrix(nc=n,nr=20)
+  rownames(normPred) = aa
+  maxVal = max(apply(nbseqs,2,sum))
+  for(i in 1:n){
+    normPred[,i] = pred[,i] / max(pred[!is.na(pred)]) * log(1/maxVal) * (-1)
+    normPred[is.na(normPred[,i]),i] = - log(1/maxVal)
+    for(a in aa){
+      normPred[a,i] = alpha * normPred[a,i] - (1-alpha) * log(max(1,nbseqs[a,i])/nbseqs[wt[i],i])
+    }
+    normPred[,i] = normPred[,i] * trace[i]
+    normPred[aa==wt[i],i] = NA
+  }
+  return(-normPred)
+}
+
+findMinDist<-function(vec,delta){
+  le = length(vec)
+    # if no sequence was found with the substituting aa
+    # then no predicted value is computed  
+    if(le==0){res = NA}
+    else{
+    	# if only one sequence was found
+    	# then just take this distance
+    	if(le==1){res = NA}
+    	else{
+    		# if the minimum distance is null
+            # we are in the presence of the wild type
+            if(vec[1]==0){res = vec[1]}
+            	# otherwise, it means we have at least two non-zero values
+                else{
+                	# if there is no significant gap between the first and second values 
+                	# take the first value 
+                	if((vec[2]-vec[1])<=delta){res = vec[1]}
+                    	# otherwise, take the second one
+                	else{res = vec[2]}
+                }
+            }
+        }
+    return(res)
+}
+
+# Compute all evolutionary distances with all homologous 
+# sequences in an alignment (mat) with respect to all positions
+# and all possible amino acid substitutions
+# returns a matrix with aa rows and nbPos columns
+# for each aa and each position, we retain only the smallest distance
+computePredSimple<-function(mat, distTrace, wt, thresh){
+  # number of positions
+  leSeq = length(wt)
+  # result matrix
+  pred = matrix(nc=leSeq,nr=20)
+  # count nb seqs
+  N = dim(mat)[[2]]
+  # the resulting matrix contains trace values
+  # only in mutated sequences and at relevant positions
+ # matAli = convertAliToTrace(mat,trace,wt)
+  # go over all positions
+  for(i in 1:leSeq){
+    res = vector()
+    for(a in aa){
+      if(a!=wt[i]){
+        sel = which(mat[,i]==a)
+        if(length(sel)>0){sortedDist=sort(distTrace[sel-1])}
+        else{sortedDist = c()}
+        res = c(res,findMinDist(sortedDist,thresh))
+      }
+      else{
+        res = c(res,0)
+      }
+    }
+    pred[,i] = res
+  }
+  return(pred)
+}
+
+# Normalize predicted values 
+# scale between 0 and 2, with 2 replacing NAs, 
+# and then multiply by the trace of each position
+# if we have not found any sequence with the substitution of interest
+# we assume the effect is maximal (2)
+# then the effects are scaled according to the trace of the positions
+# the more conserved a position, the more important the effect
+normalizePred<-function(pred, trace, wt){
+	n = length(trace)
+	normPred = matrix(nc=n,nr=20)
+	for(i in 1:n){
+		normPred[,i] = pred[,i] / max(pred[!is.na(pred)])
+		normPred[is.na(normPred[,i]),i] = 1
+		normPred[,i] = normPred[,i] * trace[i]
+		normPred[aa==wt[i],i] = NA
+	}
+	return(-normPred)
+}
+
+normalizePredWithNbSeqsPC<-function(pred, trace, wt, alpha, nbSeqs, alphabet){
+	n = length(trace)
+	normPred = matrix(nc=n,nr=20)
+	rownames(normPred) = aa
+	nbseqs = computeNbSeqsAlph(nbSeqs,alphabet)
+	maxRefVal = max(apply(nbseqs,2,sum))
+	# for each position in the sequence
+	for(i in 1:n){
+		normPred[,i] = pred[,i] / max(pred[!is.na(pred)]) * log(1/maxRefVal) * (-1)
+		normPred[is.na(normPred[,i]),i] = - log(1/maxRefVal)
+		# for each possible substitution
+		for(a in aa){
+			A = which.class(wt[i],alphabet)
+			B = which.class(a,alphabet)
+			normPred[a,i] = alpha * normPred[a,i] - (1-alpha) * log(max(1,nbseqs[B,i])/nbseqs[A,i])
+		}
+		normPred[,i] = normPred[,i] * trace[i]
+		normPred[aa==wt[i],i] = NA
+	}
+	return(-normPred)
+}
+
+normalizePredWithNbSeqsPCSelMult<-function(pred, trace, wt, listMut, alpha, nbSeqs, alphabet){
+  # check mutation list
+  pos = listMut[[1]]
+  let = listMut[[2]]
+  N = length(pos)
+  if(N!=length(let)){print("Problem!!!! not the same number of positions and aas !!")}
+  normPred = rep(NA,N)
+  allVal = unlist(pred)
+  maxVal = max(allVal[!is.na(allVal)])
+  nbseqs = computeNbSeqsAlph(nbSeqs,alphabet)
+  maxRefVal = max(apply(nbseqs,2,sum))
+  # for each position in the sequence
+  for(i in 1:N){
+    predTMP=c()
+    myPos = pos[[i]]
+    myAA = tolower(let[[i]])
+    n = length(myPos)
+    for(k in 1:n){
+	    A = which.class(wt[myPos[k]],alphabet)
+	    B = which.class(myAA[k],alphabet)
+	    ratio = log(max(1,nbseqs[B,myPos[k]])/nbseqs[A,myPos[k]])
+	    val = pred[myAA[k],myPos[k]] / maxVal * log(1/maxRefVal) * (-1)
+	    if(is.na(val)){val=- log(1/maxRefVal)}
+	    predTMP=c(predTMP, alpha * val - (1-alpha) * ratio)
+    }
+    normPred[i] = sum(predTMP * trace[myPos])
+  }
+  return(-normPred)
+}
+
+normalizePredSelMult<-function(pred, trace, wt, listMut){
+	pos = listMut[[1]]
+ 	let = listMut[[2]]
+  	N = length(pos)
+	allVal = unlist(pred)
+	maxVal = max(allVal[!is.na(allVal)])
+	normPred=rep(NA,N)
+	for(i in 1:N){
+		myPos = pos[[i]]
+    		myAA = tolower(let[[i]])
+    		n = length(myPos)
+    		predTMP = c()
+    		for(k in 1:n){
+			val = pred[myAA[k],myPos[k]] / maxVal
+			if(is.na(val)){val = 1}
+			predTMP=c(predTMP,val)}
+		normPred[i] = sum(predTMP * trace[myPos])
+	}
+	return(-normPred)
+}
+
+
+library("RColorBrewer")
+
+##############################################
+### functions for ploting the results
+##############################################
+plotMatOranges<-function(pred_name){
+  
+	pred = as.matrix(read.table(paste(pred_name, ".txt", sep="")))
+	sel=seq(1,dim(pred)[[2]])
+        Iwidth = round(sqrt(dim(pred)[[2]])) + 3
+	prop=99
+	pred[is.na(pred)]=0
+	minVal = floor(min(pred))
+	maxVal = ceiling(max(pred))
+	p = (100-prop)/100
+	cutVal = quantile(pred,c(0,p))[2]
+	jpeg(paste(pred_name, '.jpg', sep=""), width = Iwidth, height = 5, units = 'in', res = 300)
+	print(c(-maxVal,-cutVal,-minVal))
+	image(sel,1:20,-t(pred[20:1,sel]),breaks=c(seq(-maxVal,-cutVal,le=80),-minVal),col=c(colorRampPalette(brewer.pal(9,"Greys"))(80)[1:10],colorRampPalette(brewer.pal(9,"Oranges"))(80)[11:70],colorRampPalette(brewer.pal(9,"Greys"))(80)[61:70]),yaxt="n",xaxt="n",ylab="",xlab="",bty="n")
+	axis(1,c(sel[1]-1,sel[seq(0,length(sel),by=10)]),las=2)
+	axis(2,1:20,toupper(rownames(pred)[20:1]),las=2,tick=FALSE)
+	dev.off()
+}
+
+plotMatGreens<-function(pred_name){
+
+	pred = as.matrix(read.table(paste(pred_name, ".txt", sep="")))
+        Iwidth = round(sqrt(dim(pred)[[2]])) + 3
+	sel=seq(1,dim(pred)[[2]])
+	prop=99
+        pred[is.na(pred)]=0
+        minVal = floor(min(pred))
+        maxVal = ceiling(max(pred))
+        p = (100-prop)/100
+        cutVal = quantile(pred,c(0,p))[2]
+        jpeg(paste(pred_name, '.jpg', sep=""), width = Iwidth, height = 5, units = 'in', res = 300)
+        print(c(-maxVal,-cutVal,-minVal))
+        image(sel,1:20,-t(pred[20:1,sel]),breaks=c(seq(-maxVal,-cutVal,le=80),-minVal),col=c(colorRampPalette(brewer.pal(9,"Greys"))(80)[1:10],colorRampPalette(brewer.pal(9,"Greens"))(80)[11:70],colorRampPalette(brewer.pal(9,"Greys"))(80)[61:70]),yaxt="n",xaxt="n",ylab="",xlab="",bty="n")
+        axis(1,c(sel[1]-1,sel[seq(0,length(sel),by=10)]),las=2)
+        axis(2,1:20,toupper(rownames(pred)[20:1]),las=2,tick=FALSE)
+        dev.off()
+}
+
+plotMatBlues<-function(pred_name){
+  
+	pred = as.matrix(read.table(paste(pred_name, ".txt", sep="")))
+	sel=seq(1,dim(pred)[[2]])
+        Iwidth = round(sqrt(dim(pred)[[2]])) + 3
+	prop=99 #95
+	pred[is.na(pred)]=0
+	minVal = floor(min(pred))
+	maxVal = ceiling(max(pred))
+	p = (100-prop)/100
+	cutVal = quantile(pred,seq(0,p,by=p))[2]
+	print(c(-maxVal,-minVal,-cutVal))
+	jpeg(paste(pred_name, '.jpg', sep=""), width = Iwidth, height = 5, units = 'in', res = 300)
+	#image(sel,1:20,-t(pred[,sel]),breaks=c(seq(-maxVal,-cutVal,le=80),-minVal),col=c(colorRampPalette(brewer.pal(9,"Greys"))(80)[11:20],colorRampPalette(brewer.pal(9,"Blues"))(80)[11:70],colorRampPalette(brewer.pal(9,"Greys"))(80)[71:80]),yaxt="n",xaxt="n",ylab="",xlab="",bty="n")
+	image(sel,1:20,-t(pred[20:1,sel]),breaks=c(seq(-maxVal,-cutVal,le=80),-minVal),col=c(colorRampPalette(brewer.pal(9,"Greys"))(80)[1:10],colorRampPalette(brewer.pal(9,"Blues"))(80)[11:70],colorRampPalette(brewer.pal(9,"Greys"))(80)[61:70]),yaxt="n",xaxt="n",ylab="",xlab="",bty="n")
+	axis(1,c(sel[1]-1,sel[seq(0,length(sel),by=10)]),las=2)
+	axis(2,1:20,toupper(rownames(pred)[20:1]),las=2,tick=FALSE)
+	dev.off()
+}