Commit ed79a631 by vice1987

First commit

parents
Riccardo Vicedomini
Jean-Pierre Bouly
Soizic Cheminant-Navarro
Elodie Laine
Angela Falciatore
Alessandra Carbone
This diff is collapsed. Click to expand it.
#!/usr/bin/env bash
#
# This file is part of ProfileView.
#
# ProfileView is free software: you can redistribute it and/or modify
# it under the terms of the CeCILL 2.1 Licence
#
# ProfileView is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
#
# You should have received a copy of the Licence CeCILL 2.1 along
# with ProfileView. If not, see <https://cecill.info/>.
#
CMD_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}" )" && pwd)
CMD_NAME=$(basename "${BASH_SOURCE[0]}")
SCRIPTS_DIR="${CMD_DIR}"/scripts
# Include common definitions
source "${CMD_DIR}/profileview-common"
# Pressing CTRL-C will stop the whole execution of the script
trap ctrl_c INT; function ctrl_c() { exit 5; }
# Definition of functions and global variables specific to this script
PVLIB_FORCE=false
PVLIB_PATH=""
PVLIB_DOMID=""
NTHREADS=2
NJOBS=8
PEXEC_CMD="parallel --halt now,fail=1 -j ${NJOBS}"
if ! command -v parallel >/dev/null 2>&1; then
print_warning "cannot find GNU parallel, all jobs will be run sequentially"
PEXEC_CMD="/usr/bin/env bash --"
fi
function print_usage() {
echo -en "\n USAGE: ${CMD_NAME} -i <input_fasta> -d <hhblits_db> -n <lib_name> [options]\n"
echo -en "\n"
echo -en " MANDATORY OPTIONS:\n
-i, --input <name>\tInput file of domain sequences in FASTA format\n
-D, --domain-id <name>\tDomain identifier\n
\t(e.g., Pfam accession number PF03441)\n
-d, --db <name>\thhblits database name\n
\t(multiple databases may be specified with '-d <name1> -d <name2> ...')\n
-n, --lib-name <name>\tName of ProfileView library\n" | column -t -s $'\t'
echo -en "\n"
echo -en " OTHER OPTIONS:\n
--force\tForce the construction of ProfileView's library even if the directory already exists\n
\tpossibly overwriting previously generated files\n
-t, --threads <num>\tNumber of threads for each hhblits job (default:2)\n
-j, --max-jobs <num>\tNumber of parallel jobs (default:8)\n
-h, --help\tPrint this help message\n
-V, --version\tPrint version\n" | column -t -s $'\t'
echo -en "\n"
}
# retrieve provided arguments
opts="i:D:d:n:t:j:hV"
longopts="input:,domain-id:,db:,lib-name:,force,threads:,max-jobs:,help,version"
ARGS=$(getopt -o "${opts}" -l "${longopts}" -n "${CMD_NAME}" -- "${@}")
if [ $? -ne 0 ] || [ $# -eq 0 ]; then # do not change the order of this test!
print_usage
exit 1
fi
eval set -- "${ARGS}"
declare -a DBNAMES
while [ -n "${1}" ]; do
case ${1} in
-i|--input)
shift
INPUT_FASTA=${1}
;;
-D|--domain-id)
shift
PVLIB_DOMID=${1}
;;
-d|--db)
shift
DBNAMES+=(${1})
;;
-n|--lib-name)
shift
PVLIB_PATH=${1}
;;
--force)
PVLIB_FORCE=true
;;
-t|--threads)
shift
NTHREADS=${1}
;;
-j|--max-jobs)
shift
NJOBS=${1}
;;
-h|--help)
print_usage
exit 0
;;
-V|--version)
print_version
exit 0
;;
--)
shift
break
;;
esac
shift
done
# Input parameters validation
if [ ${#DBNAMES[@]} -eq 0 ]; then
print_error "no hh-suite database provided with the -d|--db parameter"
exit 1
else
for db in "${DBNAMES[@]}"; do
db_dir=$(dirname "$db")
if [ ! -d "${db_dir}" ]; then
print_error "hh-suite database path \"${dbn_dir}\" cannot be found"
exit 1
fi
done
fi
if [ -z "${INPUT_FASTA}" ] || [ ! -f ${INPUT_FASTA} ]; then
print_error "-i|--input file is missing or does not exist"
exit 1
fi
if [ -z "${PVLIB_DOMID}" ]; then
print_error "-D|--domain-id parameter is mandatory"
exit 1
fi
if [ -z "${PVLIB_PATH}" ]; then
print_error "-n|--name parameter is mandatory"
exit 1
elif [ "${PVLIB_FORCE}" = false ] && [ -d "${PVLIB_PATH}" ]; then
print_error "it seems that profileview library \"${PVLIB_PATH}\" already exists; delete its directory or use a different name."
exit 1
fi
if ! [[ "${NTHREADS}" =~ ^[0-9]+$ ]] || [ ${NTHREADS} -lt 1 ] ; then
print_warning "-t|--threads parameter should be a positive integer; the default value of 1 will be used."
NTHREADS=1
fi
if ! [[ "${NJOBS}" =~ ^[0-9]+$ ]] || [ ${NJOBS} -lt 1 ] ; then
print_warning "-j|--max-jobs parameter should be a positive integer; the default value of 1 will be used."
NJOBS=1
fi
check_cmds "hhblits" "reformat.pl" "hmmbuild" "python3"
# Create ProfileView library directory
PVLIB_NAME=$(basename "${PVLIB_PATH}")
PVLIB_PREFIX="$(dirname "${PVLIB_PATH}")"/"${PVLIB_NAME}"
mkdir -p ${PVLIB_PREFIX}/{query,a3m,hhm,afa,fas,hmm}
if [ $? -ne 0 ]; then
print_error "cannot create all library directories"
exit 1
fi
PVLIB_DIR=$(abspath "${PVLIB_PREFIX}")
PVLIB_QUERY="${PVLIB_DIR}"/query
# First split input fasta file
print_status "splitting input file: ${INPUT_FASTA}"
awk -v queryDir="${PVLIB_QUERY}" '
/^>/ {
idstr=substr($0,2)
gsub(/[^A-Za-z0-9._-]/,"_",idstr)
split(idstr,ida)
sname=ida[1]
printf(">%s\n",idstr) >queryDir"/"sname".fa"
next
}
sname!="" {
print >queryDir"/"sname".fa"
}
' "${INPUT_FASTA}"
# Run hhblits on each sequences of the fas directory
#TODO: do not execute hhblits if model had been already built
print_status "building hhblits models"
HHBLITS_DBS=""; for hhdb in "${DBNAMES[@]}"; do HHBLITS_DBS="${HHBLITS_DBS} -d ${hhdb}"; done
for query in "${PVLIB_QUERY}"/*.fa; do
[ -e "${query}" ] || continue
queryBase=${query##*/}
queryName=${queryBase%.fa}
echo "hhblits ${HHBLITS_DBS} -i ${query} -o stdout -oa3m ${PVLIB_PREFIX}/a3m/${queryName}.a3m -ohhm ${PVLIB_PREFIX}/hhm/${queryName}.hhm -M first -qid 60 -cov 70 -id 98 -e 1e-10 -n 3 -cpu ${NTHREADS} -v 0 >/dev/null 2>&1"
done | ${PEXEC_CMD} # left it unquoted!
print_status "formatting a3m files"
for a3m in "${PVLIB_DIR}"/a3m/*.a3m; do
[ -e "${a3m}" ] || continue
a3mBase="${a3m##*/}"
a3mName="${a3mBase%.a3m}"
afaPath="${PVLIB_DIR}/afa/${a3mName}.afa"
fasPath="${PVLIB_DIR}/fas/${a3mName}.fas"
echo "reformat.pl a3m fas ${a3m} ${afaPath} -uc -l 80 -v 0 >/dev/null 2>&1 && reformat.pl a3m fas ${a3m} ${fasPath} -M first -r -uc -l 80 -v 0 >/dev/null 2>&1"
done | ${PEXEC_CMD} # left it unquoted!
print_status "building HMMER models"
for afa in "${PVLIB_DIR}"/afa/*.afa; do
[ -e "${afa}" ] || continue
afaBase="${afa##*/}"
afaName="${afaBase%.afa}"
hmm="${PVLIB_DIR}/hmm/${afaName}.hmm"
echo "hmmbuild ${hmm} ${afa} >/dev/null 2>&1"
done | ${PEXEC_CMD} # left it unquoted!
print_status "generating ProfileView library data"
for hmm in "${PVLIB_DIR}"/hmm/*.hmm; do
[ -e "${hmm}" ] || continue
awk -v domId="${PVLIB_DOMID}" '$1~/^NAME$/{hmm_name=$2;next} $1~/^LENG$/{hmm_len=$2;next} $1~/^NSEQ$/{hmm_nseq=$2;next} $1~/^HMM$/{ OFS=","; print hmm_name,domId,hmm_nseq,hmm_len }' ${hmm}
done > "${PVLIB_PREFIX}"/"${PVLIB_NAME}".models.list
python3 "${SCRIPTS_DIR}"/createHHdict.py --hhm-dir "${PVLIB_DIR}/hhm" --prefix "${PVLIB_DIR}"/${PVLIB_NAME}.hhdict >/dev/null 2>&1
python3 "${SCRIPTS_DIR}"/createHmmerDict.py --hmm-dir "${PVLIB_DIR}/hmm" --prefix "${PVLIB_DIR}"/${PVLIB_NAME}.hmmdict >/dev/null 2>&1
#!/usr/bin/env bash
#
# This file is part of ProfileView.
#
# ProfileView is free software: you can redistribute it and/or modify
# it under the terms of the CeCILL 2.1 Licence
#
# ProfileView is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
#
# You should have received a copy of the Licence CeCILL 2.1 along
# with ProfileView. If not, see <https://cecill.info/>.
#
# Common functions and definitions to be used in ProfileView
PV_NAME="ProfileView"
PV_VERSION='1.0'
PV_DATE='190531'
function abspath() { echo "$(cd "$(dirname "$1")"; pwd -P)/$(basename "$1")"; }
function print_error() { echo "[error] ${@}" >&2; }
function print_warning() { echo "[warning] ${@}" >&2; }
function print_status() { echo "[main] ${@}" >&2; }
function print_version() { echo "${PV_NAME} ${PV_VERSION}-${PV_DATE}"; }
function check_cmds() {
cmds=("$@")
declare -a notfound
for cmd in "${cmds[@]}"; do
if ! command -v "${cmd}" >/dev/null 2>&1; then
notfound+=("${cmd}")
fi
done
if [ ${#notfound[@]} -gt 0 ]; then
print_error "cannot find the following commands: ${notfound[@]}"
print_error "check your PATH environment variable"
exit 1
fi
return 0
}
function check_pymodules(){
check_cmds "python3"
mods=("$@")
declare -a notfound
for mod in "${mods[@]}"; do
if ! python3 -c "import ${mod}" >/dev/null 2>&1; then
notfound+=("${mod}")
fi
done
if [ ${#notfound[@]} -gt 0 ]; then
print_error "cannot find the following python3 modules: ${notfound[@]}"
exit 1
fi
return 0
}
function check_files() {
files=("$@")
declare -a notfound
for f in "${files[@]}"; do
if [ ! -e "${f}" ]; then notfound+=("${f}"); fi
done
if [ ${#notfound[@]} -gt 0 ]; then
print_error "cannot find the following files:"
for f in "${notfound[@]}"; do echo "$f"; done
exit 1
fi
return 0
}
#!/usr/bin/env bash
#
# This file is part of ProfileView.
#
# ProfileView is free software: you can redistribute it and/or modify
# it under the terms of the CeCILL 2.1 Licence
#
# ProfileView is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
#
# You should have received a copy of the Licence CeCILL 2.1 along
# with ProfileView. If not, see <https://cecill.info/>.
#
CMD_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}" )" && pwd)
CMD_NAME=$(basename "${BASH_SOURCE[0]}")
SCRIPTS_DIR="${CMD_DIR}"/scripts
# Include common definitions
source "${CMD_DIR}/profileview-common"
# Pressing CTRL-C will stop the whole execution of the script
trap ctrl_c INT;
function ctrl_c() { exit 5; }
# Definition of functions and global variables specific to this script
MODLIST=""
PV_LIBPATH=""
PV_OUTTREE=""
PV_SEQDESC=""
PV_OUTDIR="out_$(date +%Y%m%d_%H%M%S)"
PV_TMPDIR=""
NJOBS=8
PEXEC_CMD="parallel --halt now,fail=1 -j ${NJOBS}"
if ! command -v parallel >/dev/null 2>&1; then
print_warning "cannot find GNU parallel, all jobs will be run sequentially"
PEXEC_CMD="/usr/bin/env bash --"
fi
function print_usage() {
echo -en "\n USAGE: ${CMD_NAME} -l <profileview_lib> -m <model-list> [options]\n"
echo -en "\n"
echo -en " MANDATORY OPTIONS:\n
-l, --lib <name>\tProfileView library name\n
-m, --model-list <name>\tFile containing a list of models (one identifier per line)\n
" | column -t -s $'\t'
echo -en "\n"
echo -en " OTHER OPTIONS:\n
-o, --out-dir <name>\tPrefix of output files (default: out_<current-date>)\n
--temp-dir <name>\tTemporary result directory\n
-j, --max-jobs <num>\tNumber of parallel jobs (default:8)\n
-h, --help\tPrint this help message and exit\n
-V, --version\tPrint version and exit\n
" | column -t -s $'\t'
echo -en "\n"
}
# retrieve provided arguments
opts="l:m:o:j:hV"
longopts="lib:,models:,out-dir:,temp-dir:,max-jobs:,help,version"
ARGS=$(getopt -o "${opts}" -l "${longopts}" -n "${CMD_NAME}" -- "${@}")
if [ $? -ne 0 ] || [ $# -eq 0 ]; then # the order of this tests is important!
print_usage
exit 2
fi
eval set -- "${ARGS}"
declare -a DBNAMES
while [ -n "${1}" ]; do
case ${1} in
-l|--lib)
shift
PV_LIBPATH=${1}
;;
-m|--models)
shift
MODLIST=${1}
;;
-o|--out-dir)
shift
PV_OUTDIR=${1}
;;
--temp-dir)
shift
PV_TMPDIR=${1}
;;
-j|--max-jobs)
shift
NJOBS=${1}
;;
-h|--help)
print_usage
exit 0
;;
-V|--version)
print_version
exit 0
;;
--)
shift
break
;;
esac
shift
done
# Pre-requisites check
check_cmds "hhalign" "python3"
check_pymodules "Bio" "weblogolib" "BitVector"
# Input parameters validation
if [ -z "${PV_LIBPATH}" ]; then
print_error "-l|--lib parameter is mandatory"
print_usage
exit 1
elif [ ! -d "${PV_LIBPATH}" ]; then
print_error "ProfileView library not found: ${PV_LIBPATH}"
exit 1
fi
PV_LIBNAME=$(basename "${PV_LIBPATH}")
PV_LIBDIR=$(abspath "${PV_LIBPATH}")
if [ -z "${MODLIST}" ]; then
print_error "-m|--model-list parameter is mandatory"
print_usage
exit 1
elif [ ! -f "${MODLIST}" ]; then
print_error "cannot find model list file: ${MODLIST}"
exit 1
fi
# Create temp working directory
if [ -z "${PV_TMPDIR}" ]; then
PV_TMPDIR=$(mktemp -p "${TMPDIR:-.}" -d pvtmp-XXXXX) || {
print_error "cannot create temporary directory"
exit 1
}
fi
PV_TMPDIR=$(abspath "${PV_TMPDIR}")
if [ ! -d "${PV_TMPDIR}" ] || [ -n "$(ls -A "${PV_TMPDIR}")" ]; then
print_error "provided path \"${PV_TMPDIR}\" must be an empty directory"
exit 1
fi
print_status "using temporary directory: ${PV_TMPDIR}"
# Create log file
PV_LOGFILE="${PV_TMPDIR}/profileview-tree_$(date +%Y%m%d_%H%M%S).log"
touch "${PV_LOGFILE}" && print_status "a log file is saved in ${PV_LOGFILE}"
# define model paths
declare -a modlst
while read -r line; do
if [ -z $line ]; then continue; fi # skip empty lines
mod_path="${PV_LIBDIR}/hhm/${line}.hhm"
modlst+=("${mod_path}")
done <<< "$(awk -F, '!/^#/ && NF>0{printf("%s\n",$1)}' "${MODLIST}")"
###################
##### HHALIGN #####
###################
print_status "running hhalign jobs for pairwise comparison of models"
HHALIGN_HHRDIR="${PV_TMPDIR}/hhr"
for query in "${modlst[@]}"; do
[ -e "${query}" ] || continue
queryBase=${query##*/}
queryName=${queryBase%.hhm}
mkdir -p "${HHALIGN_HHRDIR}/${queryName}"
for target in "${modlst[@]}"; do
[ -e "${target}" ] || continue
targetBase=${target##*/}
targetName=${targetBase%.hhm}
[ "${queryName}" = "${targetName}" ] || echo "hhalign -i ${query} -t ${target} -o ${HHALIGN_HHRDIR}/${queryName}/${targetName}.hhr -v 0"
done
done | ${PEXEC_CMD} >>"${PV_LOGFILE}" 2>>"${PV_LOGFILE}"
if [ $? -ne 0 ]; then
print_error "error during hhalign jobs, see log: ${PV_LOGFILE}"
exit 1
fi
# merge hhalign results
print_status "merging hhalign job results"
HHALIGN_RESDIR="${PV_TMPDIR}/hhalign"
mkdir -p "${HHALIGN_RESDIR}"
for query in "${modlst[@]}"; do
[ -e "${query}" ] || continue
queryBase=${query##*/}
queryName=${queryBase%.hhm}
#mkdir -p "${HHALIGN_HHRDIR}/${queryName}"
for target in "${modlst[@]}"; do
[ -e "${target}" ] || continue
targetBase=${target##*/}
targetName=${targetBase%.hhm}
[ "${queryName}" = "${targetName}" ] || cat "${HHALIGN_HHRDIR}/${queryName}/${targetName}.hhr"
done >"${HHALIGN_RESDIR}/${queryName}.hhr"
done
print_status "generating motifs"
HHDICT="${PV_LIBDIR}/${PV_LIBNAME}.hhdict.pgz"
mkdir -p "${PV_OUTDIR}"
if [ $? -ne 0 ]; then
print_error "cannot create output directory"
exit 1
fi
python3 "${SCRIPTS_DIR}/parseHHR.py" --hhm-list "${MODLIST}" --hhm-dict "${HHDICT}" --hhr-dir "${HHALIGN_RESDIR}" --fas-dir "${PV_LIBDIR}/fas" --out-dir "${PV_OUTDIR}" 2>>"${PV_LOGFILE}"
#!/usr/bin/env bash
#
# This file is part of ProfileView.
#
# ProfileView is free software: you can redistribute it and/or modify
# it under the terms of the CeCILL 2.1 Licence
#
# ProfileView is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
#
# You should have received a copy of the Licence CeCILL 2.1 along
# with ProfileView. If not, see <https://cecill.info/>.
#
CMD_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}" )" && pwd)
CMD_NAME=$(basename "${BASH_SOURCE[0]}")
SCRIPTS_DIR="${CMD_DIR}"/scripts
# Include common definitions
source "${CMD_DIR}/profileview-common"
# Pressing CTRL-C will stop the whole execution of the script
trap ctrl_c INT;
function ctrl_c() { exit 5; }
# Definition of functions and global variables specific to this script
INPUT_FASTA=""
PV_LIBPATH=""
PV_OUTTREE=""
PV_SEQDESC=""
PV_OUTPREFIX="out"
PV_TMPDIR=""
NJOBS=8
PEXEC_CMD="parallel --halt now,fail=1 -j ${NJOBS}"
if ! command -v parallel >/dev/null 2>&1; then
print_warning "cannot find GNU parallel, all jobs will be run sequentially"
PEXEC_CMD="/usr/bin/env bash --"
fi
function print_usage() {
echo -en "\n USAGE: ${CMD_NAME} -i <input_fasta> -l <profileview_lib> [options]\n"
echo -en "\n"
echo -en " MANDATORY OPTIONS:\n
-i, --input <name>\tFile containing the sequences to be classified in FASTA format\n
-l, --lib <name>\tProfileView library name\n" | column -t -s $'\t'
echo -en "\n"
echo -en " OTHER OPTIONS:\n
-o, --out-tree <name>\tOutput file which will contain the ProfileView tree\n
\t(if not provided, it will be printed on the standard output)\n
-s, --seq-desc <name>\tInput sequence descriptor file, that is a CSV file containing the follwing fileds:\n
\t<sequence_id>,<function_id>,<family_id>,<sequence_length>\n
-p, --prefix <name>\tPrefix of output files (default:${PV_OUTPREFIX})\n
--temp-dir <name>\tTemporary result directory\n
-j, --max-jobs <num>\tNumber of parallel jobs (default:8)\n
-h, --help\tPrint this help message\n
-V, --version\tPrint version\n" | column -t -s $'\t'
echo -en "\n"
}
# retrieve provided arguments
opts="i:l:o:s:p:j:hV"
longopts="input:,lib:,out-tree:,seq-desc:,prefix:,temp-dir:,max-jobs:,help,version"
ARGS=$(getopt -o "${opts}" -l "${longopts}" -n "${CMD_NAME}" -- "${@}")
if [ $? -ne 0 ] || [ $# -eq 0 ]; then # the order of this tests is important!
print_usage
exit 2
fi
eval set -- "${ARGS}"
declare -a DBNAMES
while [ -n "${1}" ]; do
case ${1} in
-i|--input)
shift
INPUT_FASTA=${1}
;;
-l|--lib)
shift
PV_LIBPATH=${1}
;;
-o|--out-tree)
shift
PV_OUTTREE=${1}
;;
-s|--seq-desc)
shift
PV_SEQDESC=${1}
;;
-p|--prefix)
shift
PV_OUTPREFIX=${1}
;;
--temp-dir)
shift
PV_TMPDIR=${1}
;;
-j|--max-jobs)
shift
NJOBS=${1}
;;
-h|--help)
print_usage
exit 0
;;
-V|--version)
print_version
exit 0
;;
--)
shift
break
;;
esac
shift
done
# Input parameters validation
if [ -z "${INPUT_FASTA}" ]; then
print_error "-i|--input parameter is mandatory"
print_usage
exit 1
elif [ ! -f "${INPUT_FASTA}" ]; then
print_error "Input file does not exist: ${INPUT_FASTA}"
exit 1
fi
if [ -z "${PV_LIBPATH}" ]; then
print_error "-l|--lib parameter is mandatory"
print_usage
exit 1
elif [ ! -d "${PV_LIBPATH}" ]; then
print_error "ProfileView library not found: ${PV_LIBPATH}"
exit 1
fi
PV_LIBNAME=$(basename "${PV_LIBPATH}")
PV_LIBDIR=$(abspath "${PV_LIBPATH}")
if [ ! -z "${PV_SEQDESC}" ] && [ ! -f "${PV_SEQDESC}" ]; then
print_error "Cannot find sequence descriptor file: ${PV_SEQDESC}"
exit 1
fi
if ! [[ "${NJOBS}" =~ ^[0-9]+$ ]] || [ ${NJOBS} -lt 1 ] ; then
print_warning "-j|--max-jobs parameter should be a positive integer; the default value of 1 will be used."
NJOBS=1
fi
check_cmds "hmmsearch" "python3" "Rscript"
check_pymodules "ete3" "numpy"
#check_files "${SCRIPTS_DIR}"/{createHHdict.py,createHmmerDict.py,hh_utils.py,pv_utils.py}
# Create temp working directory
if [ -z "${PV_TMPDIR}" ]; then
PV_TMPDIR=$(mktemp -p "${TMPDIR:-.}" -d pvtmp-XXXXX) || {
print_error "cannot create temporary directory"
exit 1
}
fi
PV_TMPDIR=$(abspath "${PV_TMPDIR}")
if [ ! -d "${PV_TMPDIR}" ] || [ -n "$(ls -A "${PV_TMPDIR}")" ]; then
print_error "provided path \"${PV_TMPDIR}\" must be an empty directory"
exit 1
fi
print_status "using temporary directory: ${PV_TMPDIR}"
# Create log file
PV_LOGFILE="${PV_TMPDIR}/profileview-tree_$(date +%Y%m%d_%H%M%S).log"
touch "${PV_LOGFILE}" && print_status "a log file is saved in ${PV_LOGFILE}"
# Possibly create a sequence descriptor file (if not provided)
if [ -z "${PV_SEQDESC}" ]; then
PV_SEQDESC="${PV_TMPDIR}/sequences.csv"
print_status "creating temporary sequence descriptor file: ${PV_SEQDESC}"
awk '
BEGIN {
sname=""
seqlen=0
OFS=","
}
/^>/ {
if(sname!=""){ print sname,"NA","NA",seqlen }
sname=substr($1,2)
seqlen=0
next
}
{
for(i=1;i<=NF;i++){seqlen+=length($i)}
}
END {
if(sname!=""){ print sname,"NA","NA",seqlen }
}
' "${INPUT_FASTA}" >"${PV_SEQDESC}"
else
cp "${PV_SEQDESC}" "${PV_TMPDIR}/sequences.csv"
PV_SEQDESC="${PV_TMPDIR}/sequences.csv"
fi
# Run hmmsearch for each model of the library against the input sequences
print_status "running hmmsearch against input sequences"
HMMSEARCH_RESDIR="${PV_TMPDIR}"/hmmsearch
mkdir -p "${HMMSEARCH_RESDIR}"
for hmm in "${PV_LIBDIR}"/hmm/*.hmm; do
[ -e "${hmm}" ] || continue
hmmBase=${hmm##*/}
hmmName=${hmmBase%.hmm}
echo "hmmsearch -o ${HMMSEARCH_RESDIR}/${hmmName}.out ${hmm} ${INPUT_FASTA} 2>>${PV_LOGFILE}"
done | ${PEXEC_CMD}
if [ $? -ne 0 ]; then
print_error "error during hmmsearch jobs, see log: ${PV_LOGFILE}"
exit 1
fi
print_status "processing hmmsearch output files"
PV_HMMLIB="${PV_LIBDIR}"/"${PV_LIBNAME}".hmmdict.pgz
PV_SCOREFILE="${PV_TMPDIR}"/hmmsearch.scores
for f in "${HMMSEARCH_RESDIR}"/*.out; do
[ -e "${f}" ] || continue
cat "${f}"
done | python3 "${SCRIPTS_DIR}"/parseHMMER.py --hmmer-dict "${PV_HMMLIB}" --seq-db "${PV_SEQDESC}" >"${PV_SCOREFILE}" 2>>"${PV_LOGFILE}"
if [ $? -ne 0 ]; then
print_error "error during processing of hmmsearch results, see log: ${PV_LOGFILE}"
exit 1
fi
print_status "filtering sequences"
PV_SEQDESC="${PV_TMPDIR}/sequences.filtered.csv"
awk '/^#/{next} !x[$3]++{OFS=",";print $3,$6,$5,$4}' "${PV_SCOREFILE}" >"${PV_SEQDESC}" 2>/dev/null
print_status "building representation space"
python3 "${SCRIPTS_DIR}"/generateFeatures.py --seq-list "${PV_SEQDESC}" --hmm-list "${PV_LIBDIR}/${PV_LIBNAME}.models.list" --scores "${PV_SCOREFILE}" --prefix "${PV_TMPDIR}"/out -n 20 -k 3 2>>"${PV_LOGFILE}"
if [ $? -ne 0 ]; then
print_error "error during feature generation, see log: ${PV_LOGFILE}"
exit 1
fi
print_status "building ProfileView tree"
Rscript --vanilla "${SCRIPTS_DIR}"/clusterSequences.R "${PV_TMPDIR}/out.feat" "${PV_TMPDIR}/out.tree" 2>>"${PV_LOGFILE}"
if [ $? -ne 0 ]; then
print_error "could not create ProfileView tree, see log: ${PV_LOGFILE}"
exit 1
fi
print_status "finding representative models and generating annotated ProfileView tree"
python3 "${SCRIPTS_DIR}/findReprModels.py" -t "${PV_TMPDIR}/out.tree" -s "${PV_SCOREFILE}" -m "${PV_TMPDIR}/out.used_models.list" -o "${PV_OUTPREFIX}" 2>>"${PV_LOGFILE}"
if [ $? -ne 0 ]; then
print_error "error during identification of representative models and the construction of the annotated ProfileView tree, see log: ${PV_LOGFILE}"
exit 1
fi
#!/usr/bin/env Rscript
#
# This file is part of ProfileView.
#
# ProfileView is free software: you can redistribute it and/or modify
# it under the terms of the CeCILL 2.1 Licence
#
# ProfileView is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
#
# You should have received a copy of the Licence CeCILL 2.1 along
# with ProfileView. If not, see <https://cecill.info/>.
#
library(ape)
args = commandArgs(trailingOnly=TRUE)
if (length(args)==0) {
stop("at least one argument must be supplied")
} else if (length(args)==1) {
args[2] <- ''
}
feat <- read.table( args[1], row.names=1, header=TRUE, sep="\t", na.strings=c("") )
pc <- prcomp(feat[,c(-1,-2)], scale=T)
eigs <- pc$sdev^2
cumvar = cumsum(eigs)/sum(eigs) # cumulative explaned variance of PCs
pc_i = min( c(length(cumvar),which(cumvar >= .99)) ) # get the PC that allows to explain at least the 99% of the variance
d <- dist(pc$x[,1:pc_i])
hc <- hclust(d,method="ward.D2")
my_tree <- as.phylo(hc)
if ( args[2] == '' ) {
write.tree(phy=my_tree,file=stdout())
} else {
write.tree(phy=my_tree,file=args[2])
}
#!/usr/bin/env python3
#
# This file is part of ProfileView.
#
# ProfileView is free software: you can redistribute it and/or modify
# it under the terms of the CeCILL 2.1 Licence
#
# ProfileView is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
#
# You should have received a copy of the Licence CeCILL 2.1 along
# with ProfileView. If not, see <https://cecill.info/>.
#
import sys, os, argparse
import time
import re
import math
import pickle, gzip
from pv_utils import *
from hh_utils import hhm, parse_hhm
script_name = re.sub( r'\.py$', '', os.path.basename(__file__) )
def main(argv=None):
start = time.perf_counter()
parser = argparse.ArgumentParser()
parser.add_argument('--hhm-file', dest='hhmFile', type=str, help='hhm file path')
parser.add_argument('--hhm-dir', dest='hhmDir', type=str, help='hhm directory path')
parser.add_argument('--prefix', dest='outPrefix', default="hhdict", type=str, help='output prefix of generated dictionary')
args = parser.parse_args()
# parameters check
validParameters = True;
if args.hhmFile is None and args.hhmDir is None:
validParameters = False
print_error('either one of --hhm-file or --hhm-dir arguments must be set')
if args.hhmFile is not None and args.hhmDir is not None:
validParameters = False
print_error('it is not possible to use both --hhm-file and --hhm-dir arguments')
elif args.hhmFile is not None and not os.path.isfile(args.hhmFile):
validParameters = False
print_error('--hhm-file \"{}\" is not a file'.format(args.hhmFile))
elif args.hhmDir is not None and not os.path.isdir(args.hhmDir):
validParameters = False
print_error('--hhm-dir \"{}\" is not a directory'.format(args.hhmDir))
if not validParameters:
return 1
fileList = []
if args.hhmFile is not None:
fileList = [ args.hhmFile ]
if args.hhmDir is not None:
hhms_dir = os.path.abspath(args.hhmDir)
fileList = [ '{}/{}'.format(hhms_dir,f) for f in os.listdir(hhms_dir) if f.endswith('.hhm') and os.path.isfile('{}/{}'.format(hhms_dir,f)) ]
if len(fileList) == 0:
print_warning('no hhm file found in directory "{}"'.format(args.hhmDir))
print_status("parsing HHM files")
hhmDict = {}
for fname in fileList:
with open(fname,'r') as fh:
hhm = parse_hhm(fh)
hhmDict[ hhm.name ] = hhm
hhmDictFileName = "{}.pgz".format(args.outPrefix)
print_status("writing hhm dictionary to \"{}\"".format(script_name,hhmDictFileName))
with gzip.GzipFile(hhmDictFileName, 'w') as hhmDictFile:
pickle.dump( hhmDict, hhmDictFile )
exec_time = time.perf_counter() - start
print_status('execution time: {:.2f}s'.format(script_name,exec_time));
return 0
# Check if the program is not being imported
if __name__ == "__main__":
sys.exit(main())
#!/usr/bin/env python3
#
# This file is part of ProfileView.
#
# ProfileView is free software: you can redistribute it and/or modify
# it under the terms of the CeCILL 2.1 Licence
#
# ProfileView is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
#
# You should have received a copy of the Licence CeCILL 2.1 along
# with ProfileView. If not, see <https://cecill.info/>.
#
import sys, os, argparse
import time
import gzip, pickle
from pv_utils import *
from hmm_utils import *
# a HMMER3 model is assumed
def parseHmmerModel( fh ):
hmm = hmm_model()
while True:
line = fh.readline()
if not line:
break
tokens = line.split()
if len(tokens) == 0:
continue
elif tokens[0] == "NAME":
hmm.name = tokens[1]
elif tokens[0] == "LENG":
hmm.length = int(tokens[1])
elif tokens[0] == "NSEQ":
hmm.nseq = int(tokens[1])
elif tokens[0] == "HMM":
fh.readline() # skip line after HMM: m->m m->i m->d i->m i->i d->m d->d
stateCount = 1
line = fh.readline()
while line and not line.startswith("//"):
tokens = line.split()
if len(tokens) == 26 and stateCount == int(tokens[0]):
hmm.states.append([float(v) for v in tokens[1:21]])
stateCount += 1
elif len(tokens) == 21 and tokens[0] == "COMPO":
hmm.bg_freq = [float(v) for v in tokens[1:21]]
line = fh.readline()
if line.startswith("//") and hmm.is_valid():
hmm.states = [hmm.bg_freq] + hmm.states
yield hmm
hmm = hmm_model()
def main(argv=None):
start = time.perf_counter()
parser = argparse.ArgumentParser()
parser.add_argument('--hmm-file', dest='hmmFile', type=str, help='hmm file path')
parser.add_argument('--hmm-dir', dest='hmmDir', type=str, help='hmm directory path')
parser.add_argument('--prefix', dest='outPrefix', default="hmmerDict", type=str, help='output prefix of generated dictionary')
args = parser.parse_args()
# parameters check
validParameters = True;
if args.hmmFile is None and args.hmmDir is None:
validParameters = False
print_error('either one of --hmm-file or --hmm-dir arguments must be set')
if args.hmmFile is not None and args.hmmDir is not None:
validParameters = False
print_error('it is not possible to use both --hmm-file and --hmm-dir arguments')
elif args.hmmFile is not None and not os.path.isfile(args.hmmFile):
validParameters = False
print_error('--hmm-file \"{}\" is not a file'.format(args.hmmFile))
elif args.hmmDir is not None and not os.path.isdir(args.hmmDir):
validParameters = False
print_error('--hmm-dir \"{}\" is not a directory'.format(args.hmmDir))
if not validParameters:
return 1
start = time.clock()
fileList = []
if args.hmmFile is not None:
fileList = [ args.hmmFile ]
if args.hmmDir is not None:
hmmdir = args.hmmDir[:-1] if args.hmmDir.endswith("/") else args.hmmDir
fileList = [ '{}/{}'.format(hmmdir,f) for f in os.listdir(hmmdir) if f.endswith('.hmm') and os.path.isfile('{}/{}'.format(hmmdir,f)) ]
print_status("parsing HMMER3 models");
hmmDict = {}
for fname in fileList:
with open(fname,'r') as fh:
for hmm in parseHmmerModel(fh):
hmmDict[ hmm.name ] = hmm
#print(str(hmm))
hmmDictFileName = "{}.pgz".format(args.outPrefix)
print_status("writing ProfileView's HMM dictionary to \"{}\"".format(hmmDictFileName))
with gzip.GzipFile(hmmDictFileName, 'w') as hmmDictFile:
pickle.dump( hmmDict, hmmDictFile )
exec_time = time.perf_counter() - start
print_status('execution time: {:.2f}s'.format(exec_time));
return 0
# Check if the program is not being imported
if __name__ == "__main__":
sys.exit(main())
#!/usr/bin/env python3
#
# This file is part of ProfileView.
#
# ProfileView is free software: you can redistribute it and/or modify
# it under the terms of the CeCILL 2.1 Licence
#
# ProfileView is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
#
# You should have received a copy of the Licence CeCILL 2.1 along
# with ProfileView. If not, see <https://cecill.info/>.
#
import sys, errno, os
import argparse
import math
from collections import defaultdict
import numpy as np
from ete3 import Tree
from pv_utils import *
def getBestSubtreeThreshold( mod_name, seq_scores, seq_names, seq_subtree ):
thr = [0.0,0.0]
points = [ seq_scores[i] for i,s in enumerate(seq_names) if s not in seq_subtree ]
return np.amax(points,axis=0) if len(points) > 0 else thr
def getCentroid(points):
a = np.asarray(points)
num = a.shape[0]
return [ np.sum(a[:,0])/num, np.sum(a[:,1])/num ] if num > 0 else [0.0,0.0]
def getRepresentedSequences( mod_name, seq_scores, seq_names, seq_subtree ):
thr = getBestSubtreeThreshold( mod_name, seq_scores, seq_names, seq_subtree )
indices = np.nonzero(np.any(np.greater(seq_scores,thr),axis=1))[0]
scores = [ seq_scores[i] for i in indices ]
names = uniq( seq_names[i] for i in indices )
ctr = getCentroid(scores)
ctr_norm = math.hypot(*ctr) if ctr is not None else 0.0
return { "model":mod_name, "covered":names, "num_sequences":len(seq_subtree), "covered_score":ctr_norm, "thr":thr }
def getBestRepresentativeModel( best, a ):
best_covered = len(best["covered"])
best_score = best["covered_score"]
a_covered = len(a["covered"])
a_score = a["covered_score"]
if (a_covered > best_covered) or (a_covered == best_covered and a_score > best_score):
best = a
return best
def hasBetterCoverage(a,b):
a_covered = len(a["covered"])
b_covered = len(b["covered"])
return a_covered > b_covered
def main( argv = None ):
# GET PARAMETERS
parser = argparse.ArgumentParser()
parser.add_argument('-t', '--tree', dest='treeFile', required=True, help='Input tree (in newick format) obtained from the hierarchical clustering')
parser.add_argument('-s', '--scores', dest='scoreFile', required=True, help='Score file obtained by the script parseHMMER.py')
parser.add_argument('-m', '--models', dest='modelFile', help='List of model identifiers to consider')
parser.add_argument('--sorted', dest='isScoreSorted', action='store_true', help='Use this flag if score file is sorted by score (descending order)')
parser.add_argument('--min-coverage', dest='minCoverage', type=float, default=0.5)
parser.add_argument('-o', '--out', dest='outFile', default='out.reprTree', help='output prefix')
opts = parser.parse_args()
# VALIDATE PARAMETERS
validParameters = True
if not os.path.isfile(opts.treeFile):
validParameters = False
print_error("input tree file \"" + opts.treeFile + "\" does not exist.")
if not os.path.isfile(opts.scoreFile):
validParameters = False
print_error("score file \"" + opts.scoreFile + "\" does not exist.")
if not validParameters:
return 1
# LOAD MODEL FILE (IF PROVIDED)
modSet = set()
if opts.modelFile is not None:
with open(opts.modelFile,'r') as fh:
for line in fh:
modid = line.strip()
if modid != "":
modSet.add(modid)
# LOAD FUNCTION TREE
funTree = Tree(opts.treeFile)
seqSet = set([ n.name for n in funTree.iter_leaves() ])
# LOAD SCORE FILE
modSeqs = defaultdict(list)
modScores = defaultdict(list)
with open(opts.scoreFile,'r') as fh:
for line in fh:
if not line.startswith('#'):
fields = line.strip().split('\t') # hmm_name, hmm_len, seq_name, seq_len, seq_family, seq_func, bitscore, mean-bitscore, ...
mod_name = fields[0]
seq_name = fields[2]
if opts.modelFile is None: # no model file given, cosider the set of models mentioned in the score file
modSet.add(mod_name)
if seq_name in seqSet:
modSeqs[mod_name].append(seq_name)
modScores[mod_name].append([float(fields[x]) for x in [7,11]]) # [ bitscore, mean-bitscore ]
seqDict = defaultdict(set)
for mod_name in modScores: # in modDict
if mod_name in modSet: # and (len(modDict[mod_name]) > 0):
for i in np.argmax(modScores[mod_name],axis=0):
seq_name = modSeqs[mod_name][i]
seqDict[seq_name].add(mod_name)
# (POST-ORDER) VISIT OF THE FUNCTIONAL TREE TO FIND REPRESENTATIVE MODELS
nameIdx=1
for node in funTree.iter_descendants("postorder"):
if node.is_leaf(): # do not find best representative for leaves
node.add_feature("models",seqDict[node.name])
node.add_feature("sequences",set([node.name]))
node.add_feature("num_covered",0)
node.add_feature("mod_support",0.0)
#print( 'LEAF({}): {}'.format(node.name,len(node.models)) )
continue
# internal node (different from the root)
assert len(node.children) == 2
node.name = 'I{}'.format(nameIdx)
nameIdx += 1
# gather models and sequences from the children
node.add_feature("models", node.children[0].models | node.children[1].models)
node.add_feature("sequences", node.children[0].sequences | node.children[1].sequences)
#print( 'INTERNAL({}): {}'.format(node.name,node.models) )
# test whether any of the model is representative for the sequence set
best = { "model":"", "covered":[], "num_sequences":len(node.sequences), "covered_score":0.0, "thr":[0.0,0.0] }
best_list = []
good_list = []
for mod_name in node.models:
rs = getRepresentedSequences(mod_name, modScores[mod_name], modSeqs[mod_name], node.sequences)
rs_covered = len(rs["covered"])
rs_support = float(rs_covered)/rs["num_sequences"]
best_covered = len(best["covered"])
if rs_covered > best_covered:
best_list = [ (rs["model"],rs["covered_score"]) ]
elif rs_covered == best_covered:
best_list.append( (rs["model"],rs["covered_score"]) )
if rs_support >= .9:
good_list.append( (rs["model"],rs["covered_score"]) )
best = getBestRepresentativeModel(best,rs)
best_list.sort(key=lambda x:float(x[1]),reverse=True)
good_list.sort(key=lambda x:float(x[1]),reverse=True)
node.add_feature("repr_model",best["model"])
node.add_feature("best_models",best_list)
node.add_feature("good_models",good_list)
node.add_feature("covered",best["covered"])
node.add_feature("num_covered",len(node.covered))
node.add_feature("num_sequences",len(node.sequences))
node.add_feature("mod_support",float(node.num_covered)/node.num_sequences)
node.add_feature("covered_score",best["covered_score"])
node.add_feature("thr",best["thr"])
node.support = node.mod_support if (node.mod_support >= opts.minCoverage and node.num_covered != node.children[0].num_covered and node.num_covered != node.children[1].num_covered) else 0.0
node.support = node.support if len(node.sequences) > 2 or (node.num_covered == 2 and node.num_sequences == 2) else 0.0
node.add_feature("conf",node.support)
with open('{}.models.tsv'.format(opts.outFile),'w') as ofh:
fmt = ['{nodeid}','{repr_model}','{mod_support:.2f}','{num_covered}','{num_sequences}','{seq_name}','{is_covered}','{best_models}','{good_models}']
for node in funTree.iter_descendants('preorder'):
if node.is_leaf() or node.num_covered <= 2:
continue
if node.support > 0.0: # has a representative model
covered = set(node.covered)
best_set = set()
good_set = set()
best_list=[]
for p in node.best_models:
if p[0] not in best_set:
best_list.append(p[0])
best_set.add(p[0])
good_list=[]
for p in node.good_models:
if p[0] not in best_set and p[0] not in good_set:
good_list.append(p[0])
good_set.add(p[0])
for seq_name in node.sequences:
ofh.write('\t'.join(fmt).format( nodeid=node.name, repr_model=node.repr_model, mod_support=node.mod_support,
num_covered=node.num_covered, num_sequences=node.num_sequences, seq_name=seq_name,
is_covered=(seq_name in covered), best_models=';'.join(best_list), good_models=';'.join(good_list) ))
ofh.write('\n')
funTree.write(features=['name','conf','repr_model','num_covered'], format=2, outfile='{}.nhx'.format(opts.outFile))
return 0
# Check if the program is not being imported
if __name__ == "__main__":
sys.exit(main())
#
# This file is part of ProfileView.
#
# ProfileView is free software: you can redistribute it and/or modify
# it under the terms of the CeCILL 2.1 Licence
#
# ProfileView is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
#
# You should have received a copy of the Licence CeCILL 2.1 along
# with ProfileView. If not, see <https://cecill.info/>.
#
import math
import operator
from BitVector import BitVector
from pv_utils import *
class hhm():
def __init__(self, hhm_name, aa2idx, neff_hmm, neff_m, consensus, master_seq, match_states):
self.name = hhm_name
self.aa2idx = aa2idx
self.consensus = consensus
self.master_seq = master_seq
self.match_states = match_states
self.neff_hmm = neff_hmm
self.neff_m = neff_m
self.conserved = BitVector( bitstring = ''.join([ '1' if c.isupper() else '0' for c in self.consensus ]) )
def __str__(self):
return "NAME: {}\nCONSENSUS: {}\nCONSERVED: {}\nMASTERSEQ: {}".format(self.name, self.consensus, self.conserved, self.master_seq)
def frequency(self, state, aa):
aaidx = self.aa2idx[aa]
val = self.match_states[state][aaidx]
return 2**(-val/1000.0)
def state_frequencies(self, state):
out = {}
for aa in self.aa2idx:
aai = self.aa2idx[aa]
val = self.match_states[state][aai]
out[aa] = 2**(-val/1000.0)
return out
def size(self):
return len(self.master_seq)
def get_consensus(self):
return self.consensus
def parse_hhm(fh):
hhm_name = ""
aa_list = []
aa2idx = {}
consensus = ""
master_seq = ""
match_states = []
neff_hmm = 0.0
neff_m = [99.999]
while True:
line = fh.readline()
if line == '':
break
tokens = line.split()
if len(tokens) == 0:
continue
elif tokens[0] == "NAME":
hhm_name = tokens[1]
elif tokens[0] == "NEFF":
neff_hmm = float(tokens[1])
elif tokens[0] == "SEQ":
while not line.startswith('>Consensus'):
line = fh.readline()
# found consensus sequence id
line = fh.readline()
while line and line[0] != '>':
consensus += ''.join(line.split())
line = fh.readline()
# found master sequence id
line = fh.readline()
while line and line[0] != '>' and line[0] != '#':
master_seq += ''.join(line.split())
line = fh.readline()
# consensus/master sequences processed, discard remaining sequences until the end of the SEQ block
while line and line[0] != '#':
line = fh.readline()
elif tokens[0] == "NULL":
null_values = [ int(v) if is_int(v) else 99999 for v in tokens[1:21] ]
match_states.append(null_values)
elif tokens[0] == "HMM":
# load AA alphabet
aa_list = tokens[1:21]
for i,aa in enumerate(aa_list):
aa2idx[aa] = i
# skip first two lines (transitions definitions/start probabilities)
fh.readline() # M->M M->I M->D I->M I->I D->M D->D Neff NeffI NeffD
fh.readline() # 0 * * 0 * 0 * * * *
line = fh.readline()
while line and not line.startswith("//"):
tokens = line.split()
if len(tokens) == 23:
state_values = [ int(val) if is_int(val) else 99999 for val in tokens[2:22] ]
match_states.append(state_values)
elif len(tokens) == 10:
val = tokens[7]
neff_m.append( int(val)/1000.0 if is_int(val) else 99.999 )
line = fh.readline()
assert len(match_states) == len(master_seq)+1
return hhm( hhm_name, aa2idx, neff_hmm, neff_m, consensus, master_seq, match_states )
#
# This file is part of ProfileView.
#
# ProfileView is free software: you can redistribute it and/or modify
# it under the terms of the CeCILL 2.1 Licence
#
# ProfileView is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
#
# You should have received a copy of the Licence CeCILL 2.1 along
# with ProfileView. If not, see <https://cecill.info/>.
#
import math
import operator
from pv_utils import *
class hmm_model():
def __init__(self):
self.name = ""
self.length = 0
self.nseq = 0
self.states = []
self.cons = ""
self.aa_list = [ c for c in "ACDEFGHIKLMNPQRSTVWY" ]
self.aa2idx = { c:i for i,c in enumerate(self.aa_list) }
self.bg_freq = [ -math.log(0.05) for c in self.aa_list ]
def __str__(self):
consensus = []
for k,s in enumerate(self.states):
aai, minval = min(enumerate(s), key=operator.itemgetter(1))
f = self.frequency(k, self.aa_list[aai])
c = self.aa_list[aai].upper() if f >= .6 else (self.aa_list[aai].lower() if f >= .4 else 'x')
consensus.append(c)
return "NAME={}\nLENG={} STATES={}\nNSEQ={}\nCONS={}".format(self.name, self.length,len(self.states),self.nseq, ''.join(consensus))
def frequency(self, k, aa):
aaidx = self.aa2idx[aa]
return math.exp(-self.states[k][aaidx])
# def consensus(self):
# if not self.cons:
# consensus = []
# for k,s in enumerate(self.states):
# aai, minval = min(enumerate(s), key=operator.itemgetter(1))
def getStateFreqDict(self, k):
outDict = {}
for aa in aa2idx:
aaidx = self.aa2idx[aa]
outDict[aa] = math.exp(-self.states[k][aaidx])
return outDict
def ln_odds_ratio(self,k,aa):
aai = self.aa2idx[aa]
m_k_a = self.states[k][aai]
m_0_a = self.bg_freq[aai]
return (m_0_a - m_k_a)
def score(self, k, aa):
return self.ln_odds_ratio(k,aa)/math.log(2.0) # convert to base2 logarithm
def is_valid(self):
return len(self.name) > 0 and self.length > 0 and self.length == len(self.states) and self.nseq > 0
class hmmer_hit():
def __init__(self):
self.hmm_name = ""
self.seq_name = ""
self.typ = '?'
self.score = 0.0 # HMMER's bit score
self.mcscore = 0.0 # sum of scores in well-aligned positions (PP[i] \in {8,9,*}) with a match between the sequence and the hmm consensus
self.wcscore = 0.0 # sum of scores in well-aligned positions (PP[i] \in {8,9,*}) with score[i] >= threshold
self.matches = 0 # number of matches between the sequence and the hmm consensus
self.aligned_cols = 0 # number of aligned match states
self.well_aligned_cols = 0 # number of well-aligned match states
self.ali_len = 0 # length of alignment
self.hmm_from = 0
self.hmm_to = 0
self.ali_from = 0
self.ali_to = 0
self.env_from = 0
self.env_to = 0
self.hmm_cons = ""
self.seq_cons = ""
self.pp_cons = ""
self.wc_str = ""
self.cm_str = ""
self.match_str = ""
def canExtendWith(self,b):
if self.hmm_name != b.hmm_name or self.seq_name != b.seq_name or self.typ == '?' or b.typ == '?':
return False
Lm = b.hmm_to-self.hmm_from+1
Ls = b.ali_to-self.ali_from+1
return self.hmm_from <= b.hmm_from and self.ali_from <= b.ali_from and math.fabs(Ls-Lm)/float(min(Ls,Lm)) <= .2
# it is assumed that canExtendWith(b) holds True
def extendWith(self,b):
self.score += b.score
self.mcscore += b.mcscore
self.wcscore += b.wcscore
self.matches += b.matches
self.aligned_cols += b.aligned_cols
self.well_aligned_cols += b.well_aligned_cols
self.ali_len += b.ali_len
self.hmm_from = min(self.hmm_from,b.hmm_from)
self.hmm_to = max(self.hmm_to,b.hmm_to)
self.ali_from = min(self.ali_from,b.ali_from)
self.ali_to = max(self.ali_to,b.ali_to)
self.env_from = min(self.env_from,b.env_from)
self.env_to = max(self.env_to,b.env_to)
self.wc_str += b.wc_str
self.cm_str += b.cm_str
self.match_str += b.match_str
return self
# def __str__(self):
# return "HMM {}\nSEQ {}\nSC {}\nHMM-SEQ ALIGNMENT {}-{} {}-{}\n{}\n{}\n{}".format(
# self.hmm_name, self.seq_name, self.score,
# self.hmm_from, self.hmm_to, self.ali_from, self.ali_to,
# self.hmm_cons, self.seq_cons, self.pp )
#!/ usr/bin/env python
#
# This file is part of ProfileView.
#
# ProfileView is free software: you can redistribute it and/or modify
# it under the terms of the CeCILL 2.1 Licence
#
# ProfileView is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
#
# You should have received a copy of the Licence CeCILL 2.1 along
# with ProfileView. If not, see <https://cecill.info/>.
#
import sys, errno, os
import argparse
import gzip
import re
import pickle
from hmm_utils import *
from pv_utils import *
def parseHMMER(fh,hmmDict):
hmm_name = ""
seq_name = ""
line = fh.readline()
while line:
tokens = line.split()
if len(tokens) == 0 or line.startswith("#"):
pass
elif line.startswith("Query: "):
hmm_name = tokens[1]
elif line.startswith(">> "): # target sequence section
seq_name = tokens[1]
# skip seqName domain table header
fh.readline()
fh.readline()
# process domain table
line = fh.readline()
hitTable = []
while line and not line.lstrip().startswith("== domain "):
tokens = line.split()
## domain table row:
## domNum [!?] score bias c-Evalue i-Evalue hmmfrom hmm-to .. alifrom ali-to .. envfrom env-to .. acc
if len(tokens) == 16:
#eprint('[debug] domain-table: {}'.format(line[:-1]))
hit = hmmer_hit()
hit.hmm_name = hmm_name
hit.seq_name = seq_name
hit.typ = tokens[1]
hit.score = float(tokens[2])
hit.hmm_from = int(tokens[6])
hit.hmm_to = int(tokens[7])
hit.ali_from = int(tokens[9])
hit.ali_to = int(tokens[10])
hit.env_from = int(tokens[12])
hit.env_to = int(tokens[13])
hitTable.append(hit)
line = fh.readline()
# process each domain hit (i.e., hmm-seq alignment)
while line and line.lstrip().startswith("== domain "):
tokens = line.split()
hit = hitTable[ int(tokens[2])-1 ]
hmm_cons = ""
seq_cons = ""
pp_cons = ""
while True:
line = fh.readline()
tokens = line.split()
if len(tokens) == 4 and tokens[0] == hit.hmm_name:
hmm_cons += tokens[2]
#fh.readline() # skip hmm-seq match string
match_cons_i = re.search(r"\s*\S+\s+[0-9]+",line).end()+1
match_line = fh.readline()
hit.match_str += match_line[ match_cons_i : match_cons_i+len(tokens[2]) ]
tokens = fh.readline().split()
seq_cons += tokens[2]
aln_to = int(tokens[3])
tokens = fh.readline().split()
pp_cons += tokens[0]
if aln_to == hit.ali_to:
break
hit.hmm_cons = hmm_cons
hit.seq_cons = seq_cons
hit.pp_cons = pp_cons
if hit.typ == '!': # significant hit
hmm = hmmDict[hit.hmm_name]
# compute scores
hit.aligned_cols = 0
hit.well_aligned_cols = 0
hit.matches = 0
hit.wcscore = 0.0
hit.mcscore = 0.0
hmm_i = hit.hmm_from
for i,hmm_c in enumerate(hmm_cons):
hit.ali_len += 1
if hmm_c == '.': # insertion in sequence w.r.t. the model
continue
seq_c = seq_cons[i]
if seq_c == '-': # deletion in sequence w.r.t. the model
hmm_i += 1
continue
hit.aligned_cols += 1
hit.matches += 1 if hmm_c.upper() == seq_c.upper() else 0
if seq_c in 'ACDEFGHIKLMNPQRSTVWY' and pp_cons[i] in list('89*'):
score_i = hmm.score(hmm_i,seq_c)
hit.well_aligned_cols += 1
if hmm_c.upper() == seq_c.upper():
hit.cm_str += hit.match_str[i]
hit.mcscore += score_i
if score_i >= 3.0: # high-score position
hit.wc_str += hit.match_str[i]
hit.wcscore += score_i
hmm_i += 1
yield hit
# skip lines until the next domain/sequence or the end of file
line = fh.readline()
while line and not line.lstrip().startswith("== domain ") and not line.startswith(">> ") and not line.startswith("Query: "):
line = fh.readline()
# here either EOF is reached or there are other models/sequences to process
continue
line = fh.readline()
def main( argv = None ):
# parameter definition
parser = argparse.ArgumentParser()
parser.add_argument('--hmmer-dict', dest='hmmerDictFile', type=str, required=True, help='HMMER dictionary .pgz file')
parser.add_argument('--seq-db', dest='seqFile', type=str, required=True, help='Sequence database in csv format with [name,function,family,length] fields')
args = parser.parse_args()
# parameter validation
validParameters = True;
if not os.path.isfile(args.hmmerDictFile):
validParameters = False
print_error('HMMER dictionary file "{}" does not exist'.format(args.hmmerDictFile))
if not os.path.isfile(args.seqFile):
validParameters = False
print_error('Sequence database file "{}" does not exist'.format(args.seqFile))
if not validParameters:
return 1
print_status("loading HMMER dictionary...")
hmmerDict = {}
with gzip.open(args.hmmerDictFile,'rb') as f:
hmmerDict = pickle.load(f)
print_status("loading sequence database...")
seqDict = {}
with open( args.seqFile, mode='r', newline=None ) as seqFile:
for line in seqFile:
line = line.strip()
if line:
seq_name, seq_fun, seq_fam, seq_len = [ x.strip() for x in line.split(',') ]
seqDict[ seq_name ] = { 'Function':seq_fun, 'Family':seq_fam, 'Length':int(seq_len) }
print_status("processing hmmsearch output files...")
header = ['#hmm_name','hmm_len','seq_name','seq_len','seq_family','seq_func',
'bitscore', 'mean_score', 'mcscore','mean_mcs', 'wcscore','mean_wcs',
'ident','hmm_cov','hit_type']
print('\t'.join(header))
record = []
prev_hit = hmmer_hit()
for hit in parseHMMER(sys.stdin,hmmerDict):
if hit.seq_name not in seqDict:
continue
hmm = hmmerDict[hit.hmm_name]
if prev_hit.canExtendWith(hit):
hit = prev_hit.extendWith(hit)
elif len(record) > 0:
print('\t'.join(record))
record = []
hmm_len = hmm.length
hmm_reg = hit.hmm_to-hit.hmm_from+1
hmm_loh = hit.hmm_from-1 # left overhang of the hmm hit
hmm_roh = hmm_len-hit.hmm_to # right overhang of the hmm hit
hmm_cov = 100.0 * hmm_reg / hmm_len
seq_name = hit.seq_name
seq_len = seqDict[seq_name]['Length']
seq_loh = hit.env_from-1 # left overhang of the seq hit (envelope considered)
seq_roh = seq_len-hit.env_to # right overhang of the seq hit (envelope considered)
max_oh = max( 0.1 * hmm_len, 10 )
#left_oh = min(hmm_loh,seq_loh)
#right_oh = min(hmm_roh,seq_roh)
hit_type = "FULL" if (hmm_loh <= max_oh and hmm_roh <= max_oh) else ("OVER" if (hmm_loh-seq_loh > max_oh or hmm_roh-seq_roh > max_oh) else "PART")
#hit_type = "FULL" if (hmm_loh <= max_oh and hmm_roh <= max_oh) else ("OVER" if (left_oh <= max_oh and right_oh <= max_oh) else "PART")
mean_score = 100.0 * hit.score / hit.aligned_cols
mean_mcs = 100.0 * hit.mcscore / hit.aligned_cols # hit.well_aligned_cols
mean_wcs = 100.0 * hit.wcscore / hit.aligned_cols # hit.well_aligned_cols
identity = 100.0 * hit.matches / hit.ali_len
record = [ hit.hmm_name, str(hmm_len),
seq_name, str(seq_len), seqDict[seq_name]['Family'], seqDict[seq_name]['Function'],
"{:.2f}".format(hit.score), "{:.4f}".format(mean_score),
"{:.2f}".format(hit.mcscore), "{:.4f}".format(mean_mcs),
"{:.2f}".format(hit.wcscore), "{:.4f}".format(mean_wcs),
"{:.2f}".format(identity), "{:.2f}".format(hmm_cov), hit_type ]
prev_hit = hit
if len(record) > 0:
print('\t'.join([ str(r) for r in record ]))
return 0
# Check if the program is not being imported
if __name__ == "__main__":
sys.exit(main())
#!/usr/bin/env python3
#
# This file is part of ProfileView.
#
# ProfileView is free software: you can redistribute it and/or modify
# it under the terms of the CeCILL 2.1 Licence
#
# ProfileView is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
#
# You should have received a copy of the Licence CeCILL 2.1 along
# with ProfileView. If not, see <https://cecill.info/>.
#
import os, sys
import re
script_name = re.sub( r'\.py$', '', os.path.basename(sys.argv[0]) )
def uniq(l):
return list(set(l))
# stable uniq
def suniq(l):
outlist=[]
procs=set() # processed elements
for e in l:
if e not in procs:
procs.add(e)
outlist.append(e)
return outlist
def eprint(*args, **kwargs):
print(*args, file=sys.stderr, **kwargs)
def print_status(msg):
eprint("[{name}] {m}".format(name=script_name,m=msg))
def print_error(msg):
eprint("[{name}] error: {m}".format(name=script_name,m=msg))
def print_warning(msg):
eprint("[{name}] warning: {m}".format(name=script_name,m=msg))
def is_int(s):
try:
int(s)
return True
except ValueError:
return False
def peekline(f):
pos = f.tell()
line = f.readline()
f.seek(pos)
return line
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment