First commit

ed79a631 · vice1987 · ed79a631 · ed79a631 · ed79a631 · ed79a631
Commit ed79a631 authored May 31, 2019 by vice1987
18 changed files
--- a/AUTHORS
+++ b/AUTHORS
+Riccardo Vicedomini
+Jean-Pierre Bouly
+Soizic Cheminant-Navarro
+Elodie Laine
+Angela Falciatore
+Alessandra Carbone
--- a/LICENSE
+++ b/LICENSE
--- a/profileview-build
+++ b/profileview-build
+#!/usr/bin/env bash
+#
+#  This file is part of ProfileView.
+#
+#  ProfileView is free software: you can redistribute it and/or modify
+#  it under the terms of the CeCILL 2.1 Licence
+#
+#  ProfileView is distributed in the hope that it will be useful,
+#  but WITHOUT ANY WARRANTY; without even the implied warranty of
+#  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+#
+#  You should have received a copy of the Licence CeCILL 2.1 along
+#  with ProfileView. If not, see <https://cecill.info/>.
+#
+
+CMD_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}" )" && pwd)
+CMD_NAME=$(basename "${BASH_SOURCE[0]}")
+SCRIPTS_DIR="${CMD_DIR}"/scripts
+
+# Include common definitions
+source "${CMD_DIR}/profileview-common"
+
+# Pressing CTRL-C will stop the whole execution of the script
+trap ctrl_c INT; function ctrl_c() { exit 5; }
+
+# Definition of functions and global variables specific to this script
+
+PVLIB_FORCE=false
+PVLIB_PATH=""
+PVLIB_DOMID=""
+NTHREADS=2
+NJOBS=8
+
+PEXEC_CMD="parallel --halt now,fail=1 -j ${NJOBS}"
+if ! command -v parallel >/dev/null 2>&1; then
+    print_warning "cannot find GNU parallel, all jobs will be run sequentially"
+    PEXEC_CMD="/usr/bin/env bash --"
+fi
+
+function print_usage() {
+    echo -en "\n  USAGE: ${CMD_NAME} -i <input_fasta> -d <hhblits_db> -n <lib_name> [options]\n"
+    echo -en "\n"
+    echo -en "  MANDATORY OPTIONS:\n
+    -i, --input <name>\tInput file of domain sequences in FASTA format\n
+    -D, --domain-id <name>\tDomain identifier\n
+                          \t(e.g., Pfam accession number PF03441)\n
+    -d, --db <name>\thhblits database name\n
+                   \t(multiple databases may be specified with '-d <name1> -d <name2> ...')\n
+    -n, --lib-name <name>\tName of ProfileView library\n" | column -t -s $'\t'
+    echo -en "\n"
+    echo -en "  OTHER OPTIONS:\n
+    --force\tForce the construction of ProfileView's library even if the directory already exists\n
+           \tpossibly overwriting previously generated files\n
+    -t, --threads <num>\tNumber of threads for each hhblits job (default:2)\n
+    -j, --max-jobs <num>\tNumber of parallel jobs (default:8)\n
+    -h, --help\tPrint this help message\n
+    -V, --version\tPrint version\n" | column -t -s $'\t'
+    echo -en "\n"
+}
+
+# retrieve provided arguments
+opts="i:D:d:n:t:j:hV"
+longopts="input:,domain-id:,db:,lib-name:,force,threads:,max-jobs:,help,version"
+ARGS=$(getopt -o "${opts}" -l "${longopts}" -n "${CMD_NAME}" -- "${@}")
+if [ $? -ne 0 ] || [ $# -eq 0 ]; then # do not change the order of this test!
+    print_usage
+    exit 1
+fi
+eval set -- "${ARGS}"
+
+declare -a DBNAMES
+while [ -n "${1}" ]; do
+    case ${1} in
+        -i|--input)
+            shift
+            INPUT_FASTA=${1}
+            ;;
+        -D|--domain-id)
+            shift
+            PVLIB_DOMID=${1}
+            ;;
+        -d|--db)
+            shift
+            DBNAMES+=(${1})
+            ;;
+        -n|--lib-name)
+            shift
+            PVLIB_PATH=${1}
+            ;;
+        --force)
+            PVLIB_FORCE=true
+            ;;
+        -t|--threads)
+            shift
+            NTHREADS=${1}
+            ;;
+        -j|--max-jobs)
+            shift
+            NJOBS=${1}
+            ;;
+        -h|--help)
+            print_usage
+            exit 0
+            ;;
+        -V|--version)
+            print_version
+            exit 0
+            ;;
+        --)
+            shift
+            break
+            ;;
+    esac
+    shift
+done
+
+# Input parameters validation
+
+if [ ${#DBNAMES[@]} -eq 0 ]; then
+    print_error "no hh-suite database provided with the -d|--db parameter"
+    exit 1
+else
+    for db in "${DBNAMES[@]}"; do
+        db_dir=$(dirname "$db")
+        if [ ! -d "${db_dir}" ]; then
+            print_error "hh-suite database path \"${dbn_dir}\" cannot be found"
+            exit 1
+        fi
+    done
+fi
+
+if [ -z "${INPUT_FASTA}" ] || [ ! -f ${INPUT_FASTA} ]; then
+    print_error "-i|--input file is missing or does not exist"
+    exit 1
+fi
+
+if [ -z "${PVLIB_DOMID}" ]; then
+    print_error "-D|--domain-id parameter is mandatory"
+    exit 1
+fi
+
+if [ -z "${PVLIB_PATH}" ]; then
+    print_error "-n|--name parameter is mandatory"
+    exit 1
+elif [ "${PVLIB_FORCE}" = false ] && [ -d "${PVLIB_PATH}" ]; then
+    print_error "it seems that profileview library \"${PVLIB_PATH}\" already exists; delete its directory or use a different name."
+    exit 1
+fi
+
+if ! [[ "${NTHREADS}" =~ ^[0-9]+$ ]] || [ ${NTHREADS} -lt 1 ] ; then
+    print_warning "-t|--threads parameter should be a positive integer; the default value of 1 will be used."
+    NTHREADS=1
+fi
+if ! [[ "${NJOBS}" =~ ^[0-9]+$ ]] || [ ${NJOBS} -lt 1 ] ; then
+    print_warning "-j|--max-jobs parameter should be a positive integer; the default value of 1 will be used."
+    NJOBS=1
+fi
+
+check_cmds "hhblits" "reformat.pl" "hmmbuild" "python3"
+
+# Create ProfileView library directory
+PVLIB_NAME=$(basename "${PVLIB_PATH}")
+PVLIB_PREFIX="$(dirname "${PVLIB_PATH}")"/"${PVLIB_NAME}"
+mkdir -p ${PVLIB_PREFIX}/{query,a3m,hhm,afa,fas,hmm}
+if [ $? -ne 0 ]; then
+    print_error "cannot create all library directories"
+    exit 1
+fi
+PVLIB_DIR=$(abspath "${PVLIB_PREFIX}")
+PVLIB_QUERY="${PVLIB_DIR}"/query
+
+# First split input fasta file
+print_status "splitting input file: ${INPUT_FASTA}"
+awk -v queryDir="${PVLIB_QUERY}" '
+    /^>/ {
+        idstr=substr($0,2)
+        gsub(/[^A-Za-z0-9._-]/,"_",idstr)
+        split(idstr,ida)
+        sname=ida[1]
+        printf(">%s\n",idstr) >queryDir"/"sname".fa"
+        next
+    }
+    sname!="" {
+        print >queryDir"/"sname".fa"
+    }
+' "${INPUT_FASTA}"
+
+# Run hhblits on each sequences of the fas directory
+#TODO: do not execute hhblits if model had been already built
+print_status "building hhblits models"
+HHBLITS_DBS=""; for hhdb in "${DBNAMES[@]}"; do HHBLITS_DBS="${HHBLITS_DBS} -d ${hhdb}"; done
+for query in "${PVLIB_QUERY}"/*.fa; do
+    [ -e "${query}" ] || continue
+    queryBase=${query##*/}
+    queryName=${queryBase%.fa}
+    echo "hhblits ${HHBLITS_DBS} -i ${query} -o stdout -oa3m ${PVLIB_PREFIX}/a3m/${queryName}.a3m -ohhm ${PVLIB_PREFIX}/hhm/${queryName}.hhm -M first -qid 60 -cov 70 -id 98 -e 1e-10 -n 3 -cpu ${NTHREADS} -v 0 >/dev/null 2>&1"
+done | ${PEXEC_CMD} # left it unquoted!
+
+print_status "formatting a3m files"
+for a3m in "${PVLIB_DIR}"/a3m/*.a3m; do
+    [ -e "${a3m}" ] || continue
+    a3mBase="${a3m##*/}"
+    a3mName="${a3mBase%.a3m}"
+    afaPath="${PVLIB_DIR}/afa/${a3mName}.afa"
+    fasPath="${PVLIB_DIR}/fas/${a3mName}.fas"
+    echo "reformat.pl a3m fas ${a3m} ${afaPath} -uc -l 80 -v 0 >/dev/null 2>&1 && reformat.pl a3m fas ${a3m} ${fasPath} -M first -r -uc -l 80 -v 0 >/dev/null 2>&1"
+done | ${PEXEC_CMD} # left it unquoted!
+
+print_status "building HMMER models"
+for afa in "${PVLIB_DIR}"/afa/*.afa; do
+    [ -e "${afa}" ] || continue
+    afaBase="${afa##*/}"
+    afaName="${afaBase%.afa}"
+    hmm="${PVLIB_DIR}/hmm/${afaName}.hmm"
+    echo "hmmbuild ${hmm} ${afa} >/dev/null 2>&1"
+done | ${PEXEC_CMD} # left it unquoted!
+
+print_status "generating ProfileView library data"
+
+for hmm in "${PVLIB_DIR}"/hmm/*.hmm; do
+    [ -e "${hmm}" ] || continue
+    awk -v domId="${PVLIB_DOMID}" '$1~/^NAME$/{hmm_name=$2;next} $1~/^LENG$/{hmm_len=$2;next} $1~/^NSEQ$/{hmm_nseq=$2;next} $1~/^HMM$/{ OFS=","; print hmm_name,domId,hmm_nseq,hmm_len }' ${hmm}
+done > "${PVLIB_PREFIX}"/"${PVLIB_NAME}".models.list
+
+python3 "${SCRIPTS_DIR}"/createHHdict.py --hhm-dir "${PVLIB_DIR}/hhm" --prefix "${PVLIB_DIR}"/${PVLIB_NAME}.hhdict >/dev/null 2>&1
+python3 "${SCRIPTS_DIR}"/createHmmerDict.py --hmm-dir "${PVLIB_DIR}/hmm" --prefix "${PVLIB_DIR}"/${PVLIB_NAME}.hmmdict >/dev/null 2>&1
+
--- a/profileview-common
+++ b/profileview-common
+#!/usr/bin/env bash
+#
+#  This file is part of ProfileView.
+#
+#  ProfileView is free software: you can redistribute it and/or modify
+#  it under the terms of the CeCILL 2.1 Licence
+#
+#  ProfileView is distributed in the hope that it will be useful,
+#  but WITHOUT ANY WARRANTY; without even the implied warranty of
+#  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+#
+#  You should have received a copy of the Licence CeCILL 2.1 along
+#  with ProfileView. If not, see <https://cecill.info/>.
+#
+
+
+# Common functions and definitions to be used in ProfileView
+PV_NAME="ProfileView"
+PV_VERSION='1.0'
+PV_DATE='190531'
+
+function abspath() { echo "$(cd "$(dirname "$1")"; pwd -P)/$(basename "$1")"; }
+
+function print_error() { echo "[error] ${@}" >&2; }
+function print_warning() { echo "[warning] ${@}" >&2; }
+function print_status() { echo "[main] ${@}" >&2; }
+function print_version() { echo "${PV_NAME} ${PV_VERSION}-${PV_DATE}"; }
+
+function check_cmds() {
+    cmds=("$@")
+    declare -a notfound
+    for cmd in "${cmds[@]}"; do
+        if ! command -v "${cmd}" >/dev/null 2>&1; then 
+            notfound+=("${cmd}")
+        fi
+    done
+    if [ ${#notfound[@]} -gt 0 ]; then
+        print_error "cannot find the following commands: ${notfound[@]}"
+        print_error "check your PATH environment variable"
+        exit 1
+    fi
+    return 0
+}
+
+function check_pymodules(){
+    check_cmds "python3"
+    mods=("$@")
+    declare -a notfound
+    for mod in "${mods[@]}"; do
+        if ! python3 -c "import ${mod}" >/dev/null 2>&1; then
+            notfound+=("${mod}")
+        fi
+    done
+    if [ ${#notfound[@]} -gt 0 ]; then
+        print_error "cannot find the following python3 modules: ${notfound[@]}"
+        exit 1
+    fi
+    return 0
+}
+
+function check_files() {
+    files=("$@")
+    declare -a notfound
+    for f in "${files[@]}"; do
+        if [ ! -e "${f}" ]; then notfound+=("${f}"); fi
+    done
+    if [ ${#notfound[@]} -gt 0 ]; then
+        print_error "cannot find the following files:"
+        for f in "${notfound[@]}"; do echo "$f"; done
+        exit 1
+    fi
+    return 0
+}
+
+
--- a/profileview-motif
+++ b/profileview-motif
+#!/usr/bin/env bash
+#
+#  This file is part of ProfileView.
+#
+#  ProfileView is free software: you can redistribute it and/or modify
+#  it under the terms of the CeCILL 2.1 Licence
+#
+#  ProfileView is distributed in the hope that it will be useful,
+#  but WITHOUT ANY WARRANTY; without even the implied warranty of
+#  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+#
+#  You should have received a copy of the Licence CeCILL 2.1 along
+#  with ProfileView. If not, see <https://cecill.info/>.
+#
+
+
+CMD_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}" )" && pwd)
+CMD_NAME=$(basename "${BASH_SOURCE[0]}")
+SCRIPTS_DIR="${CMD_DIR}"/scripts
+
+# Include common definitions
+source "${CMD_DIR}/profileview-common"
+
+# Pressing CTRL-C will stop the whole execution of the script
+trap ctrl_c INT;
+function ctrl_c() { exit 5; }
+
+# Definition of functions and global variables specific to this script
+
+MODLIST=""
+PV_LIBPATH=""
+PV_OUTTREE=""
+PV_SEQDESC=""
+PV_OUTDIR="out_$(date +%Y%m%d_%H%M%S)"
+PV_TMPDIR=""
+NJOBS=8
+
+PEXEC_CMD="parallel --halt now,fail=1 -j ${NJOBS}"
+if ! command -v parallel >/dev/null 2>&1; then
+    print_warning "cannot find GNU parallel, all jobs will be run sequentially"
+    PEXEC_CMD="/usr/bin/env bash --"
+fi
+
+function print_usage() {
+    echo -en "\n  USAGE: ${CMD_NAME} -l <profileview_lib> -m <model-list> [options]\n"
+    echo -en "\n"
+    echo -en "  MANDATORY OPTIONS:\n
+    -l, --lib        <name>\tProfileView library name\n
+    -m, --model-list <name>\tFile containing a list of models (one identifier per line)\n
+    " | column -t -s $'\t'
+    echo -en "\n"
+    echo -en "  OTHER OPTIONS:\n
+    -o, --out-dir  <name>\tPrefix of output files (default: out_<current-date>)\n
+    --temp-dir     <name>\tTemporary result directory\n
+    -j, --max-jobs  <num>\tNumber of parallel jobs (default:8)\n
+    -h, --help\tPrint this help message and exit\n
+    -V, --version\tPrint version and exit\n
+    " | column -t -s $'\t'
+    echo -en "\n"
+}
+
+# retrieve provided arguments
+
+opts="l:m:o:j:hV"
+longopts="lib:,models:,out-dir:,temp-dir:,max-jobs:,help,version"
+ARGS=$(getopt -o "${opts}" -l "${longopts}" -n "${CMD_NAME}" -- "${@}")
+if [ $? -ne 0 ] || [ $# -eq 0 ]; then # the order of this tests is important!
+    print_usage
+    exit 2
+fi
+eval set -- "${ARGS}"
+
+declare -a DBNAMES
+while [ -n "${1}" ]; do
+    case ${1} in
+        -l|--lib)
+            shift
+            PV_LIBPATH=${1}
+            ;;
+        -m|--models)
+            shift
+            MODLIST=${1}
+            ;;
+        -o|--out-dir)
+            shift
+            PV_OUTDIR=${1}
+            ;;
+        --temp-dir)
+            shift
+            PV_TMPDIR=${1}
+            ;;
+        -j|--max-jobs)
+            shift
+            NJOBS=${1}
+            ;;
+        -h|--help) 
+            print_usage
+            exit 0
+            ;;
+        -V|--version) 
+            print_version
+            exit 0
+            ;;
+        --)
+            shift
+            break
+            ;;
+    esac
+    shift
+done
+
+# Pre-requisites check
+
+check_cmds "hhalign" "python3"
+check_pymodules "Bio" "weblogolib" "BitVector"
+
+# Input parameters validation
+
+if [ -z "${PV_LIBPATH}" ]; then
+    print_error "-l|--lib parameter is mandatory"
+    print_usage
+    exit 1
+elif [ ! -d "${PV_LIBPATH}" ]; then
+    print_error "ProfileView library not found: ${PV_LIBPATH}"
+    exit 1
+fi
+PV_LIBNAME=$(basename "${PV_LIBPATH}")
+PV_LIBDIR=$(abspath "${PV_LIBPATH}")
+
+if [ -z "${MODLIST}" ]; then
+    print_error "-m|--model-list parameter is mandatory"
+    print_usage
+    exit 1
+elif [ ! -f "${MODLIST}" ]; then
+    print_error "cannot find model list file: ${MODLIST}"
+    exit 1
+fi
+
+# Create temp working directory
+if [ -z "${PV_TMPDIR}" ]; then
+    PV_TMPDIR=$(mktemp -p "${TMPDIR:-.}" -d pvtmp-XXXXX) || {
+        print_error "cannot create temporary directory"
+        exit 1
+    }
+fi
+PV_TMPDIR=$(abspath "${PV_TMPDIR}")
+if [ ! -d "${PV_TMPDIR}" ] || [ -n "$(ls -A "${PV_TMPDIR}")" ]; then
+    print_error "provided path \"${PV_TMPDIR}\" must be an empty directory"
+    exit 1
+fi
+print_status "using temporary directory: ${PV_TMPDIR}"
+
+# Create log file
+PV_LOGFILE="${PV_TMPDIR}/profileview-tree_$(date +%Y%m%d_%H%M%S).log"
+touch "${PV_LOGFILE}" && print_status "a log file is saved in ${PV_LOGFILE}"
+
+# define model paths
+declare -a modlst
+while read -r line; do
+    if [ -z $line ]; then continue; fi # skip empty lines
+    mod_path="${PV_LIBDIR}/hhm/${line}.hhm"
+    modlst+=("${mod_path}")
+done <<< "$(awk -F, '!/^#/ && NF>0{printf("%s\n",$1)}' "${MODLIST}")"
+
+###################
+##### HHALIGN #####
+###################
+
+print_status "running hhalign jobs for pairwise comparison of models"
+HHALIGN_HHRDIR="${PV_TMPDIR}/hhr"
+for query in "${modlst[@]}"; do
+    [ -e "${query}" ] || continue
+    queryBase=${query##*/}
+    queryName=${queryBase%.hhm}
+    mkdir -p "${HHALIGN_HHRDIR}/${queryName}"
+    for target in "${modlst[@]}"; do
+        [ -e "${target}" ] || continue
+        targetBase=${target##*/}
+        targetName=${targetBase%.hhm}
+        [ "${queryName}" = "${targetName}" ] || echo "hhalign -i ${query} -t ${target} -o ${HHALIGN_HHRDIR}/${queryName}/${targetName}.hhr -v 0"
+    done
+done | ${PEXEC_CMD} >>"${PV_LOGFILE}" 2>>"${PV_LOGFILE}"
+if [ $? -ne 0 ]; then
+    print_error "error during hhalign jobs, see log: ${PV_LOGFILE}"
+    exit 1
+fi
+
+# merge hhalign results
+print_status "merging hhalign job results"
+HHALIGN_RESDIR="${PV_TMPDIR}/hhalign"
+mkdir -p "${HHALIGN_RESDIR}"
+for query in "${modlst[@]}"; do
+    [ -e "${query}" ] || continue
+    queryBase=${query##*/}
+    queryName=${queryBase%.hhm}
+    #mkdir -p "${HHALIGN_HHRDIR}/${queryName}"
+    for target in "${modlst[@]}"; do
+        [ -e "${target}" ] || continue
+        targetBase=${target##*/}
+        targetName=${targetBase%.hhm}
+        [ "${queryName}" = "${targetName}" ] || cat "${HHALIGN_HHRDIR}/${queryName}/${targetName}.hhr"
+    done >"${HHALIGN_RESDIR}/${queryName}.hhr"
+done
+
+print_status "generating motifs"
+HHDICT="${PV_LIBDIR}/${PV_LIBNAME}.hhdict.pgz"
+mkdir -p "${PV_OUTDIR}"
+if [ $? -ne 0 ]; then
+    print_error "cannot create output directory"
+    exit 1
+fi
+python3 "${SCRIPTS_DIR}/parseHHR.py" --hhm-list "${MODLIST}" --hhm-dict "${HHDICT}" --hhr-dir "${HHALIGN_RESDIR}" --fas-dir "${PV_LIBDIR}/fas" --out-dir "${PV_OUTDIR}" 2>>"${PV_LOGFILE}"
+
--- a/profileview-tree
+++ b/profileview-tree
+#!/usr/bin/env bash
+#
+#  This file is part of ProfileView.
+#
+#  ProfileView is free software: you can redistribute it and/or modify
+#  it under the terms of the CeCILL 2.1 Licence
+#
+#  ProfileView is distributed in the hope that it will be useful,
+#  but WITHOUT ANY WARRANTY; without even the implied warranty of
+#  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+#
+#  You should have received a copy of the Licence CeCILL 2.1 along
+#  with ProfileView. If not, see <https://cecill.info/>.
+#
+
+
+CMD_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}" )" && pwd)
+CMD_NAME=$(basename "${BASH_SOURCE[0]}")
+SCRIPTS_DIR="${CMD_DIR}"/scripts
+
+# Include common definitions
+source "${CMD_DIR}/profileview-common"
+
+# Pressing CTRL-C will stop the whole execution of the script
+trap ctrl_c INT; 
+function ctrl_c() { exit 5; }
+
+# Definition of functions and global variables specific to this script
+
+INPUT_FASTA=""
+PV_LIBPATH=""
+PV_OUTTREE=""
+PV_SEQDESC=""
+PV_OUTPREFIX="out"
+PV_TMPDIR=""
+NJOBS=8
+
+PEXEC_CMD="parallel --halt now,fail=1 -j ${NJOBS}"
+if ! command -v parallel >/dev/null 2>&1; then
+    print_warning "cannot find GNU parallel, all jobs will be run sequentially"
+    PEXEC_CMD="/usr/bin/env bash --"
+fi
+
+function print_usage() {
+    echo -en "\n  USAGE: ${CMD_NAME} -i <input_fasta> -l <profileview_lib> [options]\n"
+    echo -en "\n"
+    echo -en "  MANDATORY OPTIONS:\n
+    -i, --input <name>\tFile containing the sequences to be classified in FASTA format\n
+    -l, --lib <name>\tProfileView library name\n" | column -t -s $'\t'
+    echo -en "\n"
+    echo -en "  OTHER OPTIONS:\n
+    -o, --out-tree <name>\tOutput file which will contain the ProfileView tree\n
+                         \t(if not provided, it will be printed on the standard output)\n
+    -s, --seq-desc <name>\tInput sequence descriptor file, that is a CSV file containing the follwing fileds:\n
+                         \t<sequence_id>,<function_id>,<family_id>,<sequence_length>\n
+    -p, --prefix <name>\tPrefix of output files (default:${PV_OUTPREFIX})\n
+    --temp-dir <name>\tTemporary result directory\n
+    -j, --max-jobs <num>\tNumber of parallel jobs (default:8)\n
+    -h, --help\tPrint this help message\n
+    -V, --version\tPrint version\n" | column -t -s $'\t'
+    echo -en "\n"
+}
+
+# retrieve provided arguments
+
+opts="i:l:o:s:p:j:hV"
+longopts="input:,lib:,out-tree:,seq-desc:,prefix:,temp-dir:,max-jobs:,help,version"
+ARGS=$(getopt -o "${opts}" -l "${longopts}" -n "${CMD_NAME}" -- "${@}")
+if [ $? -ne 0 ] || [ $# -eq 0 ]; then # the order of this tests is important!
+    print_usage
+    exit 2
+fi
+eval set -- "${ARGS}"
+
+declare -a DBNAMES
+while [ -n "${1}" ]; do
+    case ${1} in
+        -i|--input)
+            shift
+            INPUT_FASTA=${1}
+            ;;
+        -l|--lib)
+            shift
+            PV_LIBPATH=${1}
+            ;;
+        -o|--out-tree)
+            shift
+            PV_OUTTREE=${1}
+            ;;
+        -s|--seq-desc)
+            shift
+            PV_SEQDESC=${1}
+            ;;
+        -p|--prefix)
+            shift
+            PV_OUTPREFIX=${1}
+            ;;
+        --temp-dir)
+            shift
+            PV_TMPDIR=${1}
+            ;;
+        -j|--max-jobs)
+            shift
+            NJOBS=${1}
+            ;;
+        -h|--help) 
+            print_usage
+            exit 0
+            ;;
+        -V|--version) 
+            print_version
+            exit 0
+            ;;
+        --)
+            shift
+            break
+            ;;
+    esac
+    shift
+done
+
+# Input parameters validation
+
+if [ -z "${INPUT_FASTA}" ]; then
+    print_error "-i|--input parameter is mandatory"
+    print_usage
+    exit 1
+elif [ ! -f "${INPUT_FASTA}" ]; then
+    print_error "Input file does not exist: ${INPUT_FASTA}"
+    exit 1
+fi
+
+if [ -z "${PV_LIBPATH}" ]; then
+    print_error "-l|--lib parameter is mandatory"
+    print_usage
+    exit 1
+elif [ ! -d "${PV_LIBPATH}" ]; then
+    print_error "ProfileView library not found: ${PV_LIBPATH}"
+    exit 1
+fi
+
+PV_LIBNAME=$(basename "${PV_LIBPATH}")
+PV_LIBDIR=$(abspath "${PV_LIBPATH}")
+
+if [ ! -z "${PV_SEQDESC}" ] && [ ! -f "${PV_SEQDESC}" ]; then
+    print_error "Cannot find sequence descriptor file: ${PV_SEQDESC}"
+    exit 1
+fi
+
+if ! [[ "${NJOBS}" =~ ^[0-9]+$ ]] || [ ${NJOBS} -lt 1 ] ; then
+    print_warning "-j|--max-jobs parameter should be a positive integer; the default value of 1 will be used."
+    NJOBS=1
+fi
+
+check_cmds "hmmsearch" "python3" "Rscript"
+check_pymodules "ete3" "numpy"
+#check_files "${SCRIPTS_DIR}"/{createHHdict.py,createHmmerDict.py,hh_utils.py,pv_utils.py}
+
+# Create temp working directory
+if [ -z "${PV_TMPDIR}" ]; then
+    PV_TMPDIR=$(mktemp -p "${TMPDIR:-.}" -d pvtmp-XXXXX) || {
+        print_error "cannot create temporary directory"
+        exit 1
+    }
+fi
+PV_TMPDIR=$(abspath "${PV_TMPDIR}")
+if [ ! -d "${PV_TMPDIR}" ] || [ -n "$(ls -A "${PV_TMPDIR}")" ]; then
+    print_error "provided path \"${PV_TMPDIR}\" must be an empty directory"
+    exit 1
+fi
+print_status "using temporary directory: ${PV_TMPDIR}"
+
+# Create log file
+PV_LOGFILE="${PV_TMPDIR}/profileview-tree_$(date +%Y%m%d_%H%M%S).log"
+touch "${PV_LOGFILE}" && print_status "a log file is saved in ${PV_LOGFILE}"
+
+# Possibly create a sequence descriptor file (if not provided)
+if [ -z "${PV_SEQDESC}" ]; then
+    PV_SEQDESC="${PV_TMPDIR}/sequences.csv"
+    print_status "creating temporary sequence descriptor file: ${PV_SEQDESC}"
+    awk '
+        BEGIN {
+            sname=""
+            seqlen=0
+            OFS=","
+        }
+        /^>/ {
+            if(sname!=""){ print sname,"NA","NA",seqlen }
+            sname=substr($1,2)
+            seqlen=0
+            next
+        }
+        {
+            for(i=1;i<=NF;i++){seqlen+=length($i)}
+        }
+        END {
+            if(sname!=""){ print sname,"NA","NA",seqlen }
+        }
+    ' "${INPUT_FASTA}" >"${PV_SEQDESC}"
+else
+    cp "${PV_SEQDESC}" "${PV_TMPDIR}/sequences.csv"
+    PV_SEQDESC="${PV_TMPDIR}/sequences.csv"
+fi
+
+# Run hmmsearch for each model of the library against the input sequences
+print_status "running hmmsearch against input sequences"
+HMMSEARCH_RESDIR="${PV_TMPDIR}"/hmmsearch
+mkdir -p "${HMMSEARCH_RESDIR}"
+for hmm in "${PV_LIBDIR}"/hmm/*.hmm; do
+    [ -e "${hmm}" ] || continue
+    hmmBase=${hmm##*/}
+    hmmName=${hmmBase%.hmm}
+    echo "hmmsearch -o ${HMMSEARCH_RESDIR}/${hmmName}.out ${hmm} ${INPUT_FASTA} 2>>${PV_LOGFILE}"
+done | ${PEXEC_CMD}
+if [ $? -ne 0 ]; then
+    print_error "error during hmmsearch jobs, see log: ${PV_LOGFILE}"
+    exit 1
+fi
+
+print_status "processing hmmsearch output files"
+PV_HMMLIB="${PV_LIBDIR}"/"${PV_LIBNAME}".hmmdict.pgz
+PV_SCOREFILE="${PV_TMPDIR}"/hmmsearch.scores
+for f in "${HMMSEARCH_RESDIR}"/*.out; do 
+    [ -e "${f}" ] || continue
+    cat "${f}"
+done | python3 "${SCRIPTS_DIR}"/parseHMMER.py --hmmer-dict "${PV_HMMLIB}" --seq-db "${PV_SEQDESC}" >"${PV_SCOREFILE}" 2>>"${PV_LOGFILE}"
+if [ $? -ne 0 ]; then
+    print_error "error during processing of hmmsearch results, see log: ${PV_LOGFILE}"
+    exit 1
+fi
+
+
+print_status "filtering sequences"
+PV_SEQDESC="${PV_TMPDIR}/sequences.filtered.csv"
+awk '/^#/{next} !x[$3]++{OFS=",";print $3,$6,$5,$4}' "${PV_SCOREFILE}" >"${PV_SEQDESC}" 2>/dev/null
+
+print_status "building representation space"
+python3 "${SCRIPTS_DIR}"/generateFeatures.py --seq-list "${PV_SEQDESC}" --hmm-list "${PV_LIBDIR}/${PV_LIBNAME}.models.list" --scores "${PV_SCOREFILE}" --prefix "${PV_TMPDIR}"/out -n 20 -k 3 2>>"${PV_LOGFILE}"
+if [ $? -ne 0 ]; then
+    print_error "error during feature generation, see log: ${PV_LOGFILE}"
+    exit 1
+fi
+
+print_status "building ProfileView tree"
+Rscript --vanilla "${SCRIPTS_DIR}"/clusterSequences.R "${PV_TMPDIR}/out.feat" "${PV_TMPDIR}/out.tree" 2>>"${PV_LOGFILE}"
+if [ $? -ne 0 ]; then
+    print_error "could not create ProfileView tree, see log: ${PV_LOGFILE}"
+    exit 1
+fi
+
+print_status "finding representative models and generating annotated ProfileView tree"
+python3 "${SCRIPTS_DIR}/findReprModels.py" -t "${PV_TMPDIR}/out.tree" -s "${PV_SCOREFILE}" -m "${PV_TMPDIR}/out.used_models.list" -o "${PV_OUTPREFIX}" 2>>"${PV_LOGFILE}"
+if [ $? -ne 0 ]; then
+    print_error "error during identification of representative models and the construction of the annotated ProfileView tree, see log: ${PV_LOGFILE}"
+    exit 1
+fi
+
+
--- a/scripts/__pycache__/hh_utils.cpython-36.pyc
+++ b/scripts/__pycache__/hh_utils.cpython-36.pyc
--- a/scripts/__pycache__/pv_utils.cpython-36.pyc
+++ b/scripts/__pycache__/pv_utils.cpython-36.pyc
--- a/scripts/clusterSequences.R
+++ b/scripts/clusterSequences.R
+#!/usr/bin/env Rscript
+#
+#  This file is part of ProfileView.
+#
+#  ProfileView is free software: you can redistribute it and/or modify
+#  it under the terms of the CeCILL 2.1 Licence
+#
+#  ProfileView is distributed in the hope that it will be useful,
+#  but WITHOUT ANY WARRANTY; without even the implied warranty of
+#  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+#
+#  You should have received a copy of the Licence CeCILL 2.1 along
+#  with ProfileView. If not, see <https://cecill.info/>.
+#
+
+
+library(ape)
+
+args = commandArgs(trailingOnly=TRUE)
+if (length(args)==0) { 
+    stop("at least one argument must be supplied") 
+} else if (length(args)==1) {
+    args[2] <- ''
+}
+
+feat <- read.table( args[1], row.names=1, header=TRUE, sep="\t", na.strings=c("") )
+pc <- prcomp(feat[,c(-1,-2)], scale=T)
+
+eigs <- pc$sdev^2
+cumvar = cumsum(eigs)/sum(eigs) # cumulative explaned variance of PCs
+pc_i = min( c(length(cumvar),which(cumvar >= .99)) ) # get the PC that allows to explain at least the 99% of the variance
+
+d <- dist(pc$x[,1:pc_i])
+hc <- hclust(d,method="ward.D2")
+
+my_tree <- as.phylo(hc) 
+if ( args[2] == '' ) {
+    write.tree(phy=my_tree,file=stdout())
+} else {
+    write.tree(phy=my_tree,file=args[2])
+}
+
--- a/scripts/createHHdict.py
+++ b/scripts/createHHdict.py
+#!/usr/bin/env python3
+#
+#  This file is part of ProfileView.
+#
+#  ProfileView is free software: you can redistribute it and/or modify
+#  it under the terms of the CeCILL 2.1 Licence
+#
+#  ProfileView is distributed in the hope that it will be useful,
+#  but WITHOUT ANY WARRANTY; without even the implied warranty of
+#  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+#
+#  You should have received a copy of the Licence CeCILL 2.1 along
+#  with ProfileView. If not, see <https://cecill.info/>.
+#
+
+import sys, os, argparse
+import time
+import re
+import math
+import pickle, gzip
+
+from pv_utils import *
+from hh_utils import hhm, parse_hhm
+
+script_name = re.sub( r'\.py$', '', os.path.basename(__file__) )
+
+def main(argv=None):
+    start = time.perf_counter()
+
+    parser = argparse.ArgumentParser()
+    parser.add_argument('--hhm-file', dest='hhmFile', type=str, help='hhm file path')
+    parser.add_argument('--hhm-dir', dest='hhmDir', type=str, help='hhm directory path')
+    parser.add_argument('--prefix', dest='outPrefix', default="hhdict", type=str, help='output prefix of generated dictionary')
+    args = parser.parse_args()
+
+    # parameters check
+    validParameters = True;
+    if args.hhmFile is None and args.hhmDir is None:
+        validParameters = False
+        print_error('either one of --hhm-file or --hhm-dir arguments must be set')
+    if args.hhmFile is not None and args.hhmDir is not None:
+        validParameters = False
+        print_error('it is not possible to use both --hhm-file and --hhm-dir arguments')
+    elif args.hhmFile is not None and not os.path.isfile(args.hhmFile):
+        validParameters = False
+        print_error('--hhm-file \"{}\" is not a file'.format(args.hhmFile))
+    elif args.hhmDir is not None and not os.path.isdir(args.hhmDir):
+        validParameters = False
+        print_error('--hhm-dir \"{}\" is not a directory'.format(args.hhmDir))
+    if not validParameters:
+        return 1
+
+    fileList = []
+    if args.hhmFile is not None:
+        fileList = [ args.hhmFile ]
+    if args.hhmDir is not None:
+        hhms_dir = os.path.abspath(args.hhmDir)
+        fileList = [ '{}/{}'.format(hhms_dir,f) for f in os.listdir(hhms_dir) if f.endswith('.hhm') and os.path.isfile('{}/{}'.format(hhms_dir,f)) ]
+
+    if len(fileList) == 0:
+        print_warning('no hhm file found in directory "{}"'.format(args.hhmDir))
+
+    print_status("parsing HHM files")
+    hhmDict = {}
+    for fname in fileList:
+        with open(fname,'r') as fh:
+            hhm = parse_hhm(fh)
+            hhmDict[ hhm.name ] = hhm
+
+    hhmDictFileName = "{}.pgz".format(args.outPrefix)
+    print_status("writing hhm dictionary to \"{}\"".format(script_name,hhmDictFileName))
+    with gzip.GzipFile(hhmDictFileName, 'w') as hhmDictFile:
+        pickle.dump( hhmDict, hhmDictFile )
+
+    exec_time = time.perf_counter() - start
+    print_status('execution time: {:.2f}s'.format(script_name,exec_time));
+
+    return 0
+
+# Check if the program is not being imported
+if __name__ == "__main__":
+    sys.exit(main())
--- a/scripts/createHmmerDict.py
+++ b/scripts/createHmmerDict.py
+#!/usr/bin/env python3
+#
+#  This file is part of ProfileView.
+#
+#  ProfileView is free software: you can redistribute it and/or modify
+#  it under the terms of the CeCILL 2.1 Licence
+#
+#  ProfileView is distributed in the hope that it will be useful,
+#  but WITHOUT ANY WARRANTY; without even the implied warranty of
+#  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+#
+#  You should have received a copy of the Licence CeCILL 2.1 along
+#  with ProfileView. If not, see <https://cecill.info/>.
+#
+
+import sys, os, argparse
+import time
+import gzip, pickle
+
+from pv_utils import *
+from hmm_utils import *
+
+# a HMMER3 model is assumed
+def parseHmmerModel( fh ):
+    hmm = hmm_model()
+    while True:
+        line = fh.readline()
+        if not line:
+            break
+        tokens = line.split()
+        if len(tokens) == 0:
+            continue
+        elif tokens[0] == "NAME":
+            hmm.name = tokens[1]
+        elif tokens[0] == "LENG":
+            hmm.length = int(tokens[1])
+        elif tokens[0] == "NSEQ":
+            hmm.nseq = int(tokens[1])
+        elif tokens[0] == "HMM":
+            fh.readline() # skip line after HMM: m->m  m->i  m->d  i->m  i->i  d->m  d->d
+            stateCount = 1
+            line = fh.readline()
+            while line and not line.startswith("//"):
+                tokens = line.split()
+                if len(tokens) == 26 and stateCount == int(tokens[0]):
+                    hmm.states.append([float(v) for v in tokens[1:21]])
+                    stateCount += 1
+                elif len(tokens) == 21 and tokens[0] == "COMPO":
+                    hmm.bg_freq = [float(v) for v in tokens[1:21]]
+                line = fh.readline()
+            if line.startswith("//") and hmm.is_valid():
+                hmm.states = [hmm.bg_freq] + hmm.states
+                yield hmm
+            hmm = hmm_model()
+
+
+def main(argv=None):
+    start = time.perf_counter()
+
+    parser = argparse.ArgumentParser()
+    parser.add_argument('--hmm-file', dest='hmmFile', type=str, help='hmm file path')
+    parser.add_argument('--hmm-dir', dest='hmmDir', type=str, help='hmm directory path')
+    parser.add_argument('--prefix', dest='outPrefix', default="hmmerDict", type=str, help='output prefix of generated dictionary')
+    args = parser.parse_args()
+
+    # parameters check
+    validParameters = True;
+    if args.hmmFile is None and args.hmmDir is None:
+        validParameters = False
+        print_error('either one of --hmm-file or --hmm-dir arguments must be set')
+    if args.hmmFile is not None and args.hmmDir is not None:
+        validParameters = False
+        print_error('it is not possible to use both --hmm-file and --hmm-dir arguments')
+    elif args.hmmFile is not None and not os.path.isfile(args.hmmFile):
+        validParameters = False
+        print_error('--hmm-file \"{}\" is not a file'.format(args.hmmFile))
+    elif args.hmmDir is not None and not os.path.isdir(args.hmmDir):
+        validParameters = False
+        print_error('--hmm-dir \"{}\" is not a directory'.format(args.hmmDir))
+    if not validParameters:
+        return 1
+
+    start = time.clock()
+
+    fileList = []
+    if args.hmmFile is not None:
+        fileList = [ args.hmmFile ]
+    if args.hmmDir is not None:
+        hmmdir = args.hmmDir[:-1] if args.hmmDir.endswith("/") else args.hmmDir
+        fileList = [ '{}/{}'.format(hmmdir,f) for f in os.listdir(hmmdir) if f.endswith('.hmm') and os.path.isfile('{}/{}'.format(hmmdir,f)) ]
+
+    print_status("parsing HMMER3 models");
+    hmmDict = {}
+    for fname in fileList:
+        with open(fname,'r') as fh:
+            for hmm in parseHmmerModel(fh):
+                hmmDict[ hmm.name ] = hmm
+                #print(str(hmm))
+
+    hmmDictFileName = "{}.pgz".format(args.outPrefix)
+    print_status("writing ProfileView's HMM dictionary to \"{}\"".format(hmmDictFileName))
+    with gzip.GzipFile(hmmDictFileName, 'w') as hmmDictFile:
+        pickle.dump( hmmDict, hmmDictFile )
+
+    exec_time = time.perf_counter() - start
+    print_status('execution time: {:.2f}s'.format(exec_time));
+
+    return 0
+
+# Check if the program is not being imported
+if __name__ == "__main__":
+    sys.exit(main())
+
--- a/scripts/findReprModels.py
+++ b/scripts/findReprModels.py
+#!/usr/bin/env python3
+#
+#  This file is part of ProfileView.
+#
+#  ProfileView is free software: you can redistribute it and/or modify
+#  it under the terms of the CeCILL 2.1 Licence
+#
+#  ProfileView is distributed in the hope that it will be useful,
+#  but WITHOUT ANY WARRANTY; without even the implied warranty of
+#  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+#
+#  You should have received a copy of the Licence CeCILL 2.1 along
+#  with ProfileView. If not, see <https://cecill.info/>.
+#
+
+import sys, errno, os
+import argparse
+import math
+from collections import defaultdict
+
+import numpy as np
+from ete3 import Tree
+
+from pv_utils import *
+
+
+def getBestSubtreeThreshold( mod_name, seq_scores, seq_names, seq_subtree ):
+    thr = [0.0,0.0]
+    points = [ seq_scores[i] for i,s in enumerate(seq_names) if s not in seq_subtree ]
+    return np.amax(points,axis=0) if len(points) > 0 else thr
+
+
+def getCentroid(points):
+    a = np.asarray(points)
+    num = a.shape[0]
+    return [ np.sum(a[:,0])/num, np.sum(a[:,1])/num ] if num > 0 else [0.0,0.0]
+
+
+def getRepresentedSequences( mod_name, seq_scores, seq_names, seq_subtree ):
+    thr = getBestSubtreeThreshold( mod_name, seq_scores, seq_names, seq_subtree )
+    indices = np.nonzero(np.any(np.greater(seq_scores,thr),axis=1))[0]
+    scores = [ seq_scores[i] for i in indices ]
+    names = uniq( seq_names[i] for i in indices )
+    ctr = getCentroid(scores)
+    ctr_norm = math.hypot(*ctr) if ctr is not None else 0.0
+    return { "model":mod_name, "covered":names, "num_sequences":len(seq_subtree), "covered_score":ctr_norm, "thr":thr }
+
+
+def getBestRepresentativeModel( best, a ):
+    best_covered = len(best["covered"])
+    best_score = best["covered_score"]
+    a_covered = len(a["covered"])
+    a_score = a["covered_score"]
+    if (a_covered > best_covered) or (a_covered == best_covered and a_score > best_score):
+        best = a
+    return best
+
+
+def hasBetterCoverage(a,b):
+    a_covered = len(a["covered"])
+    b_covered = len(b["covered"])
+    return a_covered > b_covered
+
+
+
+def main( argv = None ):
+
+    # GET PARAMETERS
+    parser = argparse.ArgumentParser()
+    parser.add_argument('-t', '--tree',   dest='treeFile', required=True, help='Input tree (in newick format) obtained from the hierarchical clustering')
+    parser.add_argument('-s', '--scores', dest='scoreFile', required=True, help='Score file obtained by the script parseHMMER.py')
+    parser.add_argument('-m', '--models', dest='modelFile', help='List of model identifiers to consider')
+    parser.add_argument('--sorted',       dest='isScoreSorted', action='store_true', help='Use this flag if score file is sorted by score (descending order)')
+    parser.add_argument('--min-coverage', dest='minCoverage', type=float, default=0.5)
+    parser.add_argument('-o', '--out',    dest='outFile',   default='out.reprTree', help='output prefix')
+    opts = parser.parse_args()
+
+    # VALIDATE PARAMETERS
+    validParameters = True
+    if not os.path.isfile(opts.treeFile):
+        validParameters = False
+        print_error("input tree file \"" + opts.treeFile + "\" does not exist.")
+    if not os.path.isfile(opts.scoreFile):
+        validParameters = False
+        print_error("score file \"" + opts.scoreFile + "\" does not exist.")
+    if not validParameters:
+        return 1
+
+    # LOAD MODEL FILE (IF PROVIDED)
+    modSet = set()
+    if opts.modelFile is not None:
+        with open(opts.modelFile,'r') as fh:
+            for line in fh:
+                modid = line.strip()
+                if modid != "":
+                    modSet.add(modid)
+
+    # LOAD FUNCTION TREE
+    funTree = Tree(opts.treeFile)
+    seqSet = set([ n.name for n in funTree.iter_leaves() ])
+
+    # LOAD SCORE FILE
+    modSeqs = defaultdict(list)
+    modScores = defaultdict(list)
+    with open(opts.scoreFile,'r') as fh:
+        for line in fh:
+            if not line.startswith('#'):
+                fields = line.strip().split('\t') # hmm_name, hmm_len, seq_name, seq_len, seq_family, seq_func, bitscore, mean-bitscore, ...
+                mod_name = fields[0]
+                seq_name = fields[2]
+                if opts.modelFile is None: # no model file given, cosider the set of models mentioned in the score file
+                    modSet.add(mod_name)
+                if seq_name in seqSet:
+                    modSeqs[mod_name].append(seq_name)
+                    modScores[mod_name].append([float(fields[x]) for x in [7,11]]) # [ bitscore, mean-bitscore ]
+
+    seqDict = defaultdict(set)
+    for mod_name in modScores: # in modDict
+        if mod_name in modSet: # and (len(modDict[mod_name]) > 0):
+            for i in np.argmax(modScores[mod_name],axis=0):
+                seq_name = modSeqs[mod_name][i]
+                seqDict[seq_name].add(mod_name)
+
+    # (POST-ORDER) VISIT OF THE FUNCTIONAL TREE TO FIND REPRESENTATIVE MODELS
+    nameIdx=1
+    for node in funTree.iter_descendants("postorder"):
+        if node.is_leaf(): # do not find best representative for leaves
+            node.add_feature("models",seqDict[node.name])
+            node.add_feature("sequences",set([node.name]))
+            node.add_feature("num_covered",0)
+            node.add_feature("mod_support",0.0)
+            #print( 'LEAF({}): {}'.format(node.name,len(node.models)) )
+            continue
+        # internal node (different from the root)
+        assert len(node.children) == 2
+        node.name = 'I{}'.format(nameIdx)
+        nameIdx += 1
+        # gather models and sequences from the children
+        node.add_feature("models", node.children[0].models | node.children[1].models)
+        node.add_feature("sequences", node.children[0].sequences | node.children[1].sequences)
+        #print( 'INTERNAL({}): {}'.format(node.name,node.models) )
+        # test whether any of the model is representative for the sequence set
+        best = { "model":"", "covered":[], "num_sequences":len(node.sequences), "covered_score":0.0, "thr":[0.0,0.0] }
+        best_list = []
+        good_list  = []
+        for mod_name in node.models:
+            rs = getRepresentedSequences(mod_name, modScores[mod_name], modSeqs[mod_name], node.sequences)
+            rs_covered = len(rs["covered"])
+            rs_support = float(rs_covered)/rs["num_sequences"]
+            best_covered = len(best["covered"])
+            if rs_covered > best_covered:
+                best_list = [ (rs["model"],rs["covered_score"]) ]
+            elif rs_covered == best_covered:
+                best_list.append( (rs["model"],rs["covered_score"]) )
+            if rs_support >= .9:
+                good_list.append( (rs["model"],rs["covered_score"]) )
+            best = getBestRepresentativeModel(best,rs)
+        best_list.sort(key=lambda x:float(x[1]),reverse=True)
+        good_list.sort(key=lambda x:float(x[1]),reverse=True)
+        node.add_feature("repr_model",best["model"])
+        node.add_feature("best_models",best_list)
+        node.add_feature("good_models",good_list)
+        node.add_feature("covered",best["covered"])
+        node.add_feature("num_covered",len(node.covered))
+        node.add_feature("num_sequences",len(node.sequences))
+        node.add_feature("mod_support",float(node.num_covered)/node.num_sequences)
+        node.add_feature("covered_score",best["covered_score"])
+        node.add_feature("thr",best["thr"])
+        node.support = node.mod_support if (node.mod_support >= opts.minCoverage and node.num_covered != node.children[0].num_covered and node.num_covered != node.children[1].num_covered) else 0.0
+        node.support = node.support if len(node.sequences) > 2 or (node.num_covered == 2 and node.num_sequences == 2) else 0.0
+        node.add_feature("conf",node.support)
+
+    with open('{}.models.tsv'.format(opts.outFile),'w') as ofh:
+        fmt = ['{nodeid}','{repr_model}','{mod_support:.2f}','{num_covered}','{num_sequences}','{seq_name}','{is_covered}','{best_models}','{good_models}']
+        for node in funTree.iter_descendants('preorder'):
+            if node.is_leaf() or node.num_covered <= 2:
+                continue
+            if node.support > 0.0: # has a representative model
+                covered = set(node.covered)
+                best_set = set()
+                good_set = set()
+                best_list=[]
+                for p in node.best_models:
+                    if p[0] not in best_set:
+                        best_list.append(p[0])
+                        best_set.add(p[0])
+                good_list=[]
+                for p in node.good_models:
+                    if p[0] not in best_set and p[0] not in good_set:
+                        good_list.append(p[0])
+                        good_set.add(p[0])
+                for seq_name in node.sequences:
+                    ofh.write('\t'.join(fmt).format( nodeid=node.name, repr_model=node.repr_model, mod_support=node.mod_support,
+                        num_covered=node.num_covered, num_sequences=node.num_sequences, seq_name=seq_name,
+                        is_covered=(seq_name in covered), best_models=';'.join(best_list), good_models=';'.join(good_list) ))
+                    ofh.write('\n')
+
+
+    funTree.write(features=['name','conf','repr_model','num_covered'], format=2, outfile='{}.nhx'.format(opts.outFile))
+    return 0
+
+
+# Check if the program is not being imported
+if __name__ == "__main__":
+    sys.exit(main())
+
--- a/scripts/generateFeatures.py
+++ b/scripts/generateFeatures.py
--- a/scripts/hh_utils.py
+++ b/scripts/hh_utils.py
+#
+#  This file is part of ProfileView.
+#
+#  ProfileView is free software: you can redistribute it and/or modify
+#  it under the terms of the CeCILL 2.1 Licence
+#
+#  ProfileView is distributed in the hope that it will be useful,
+#  but WITHOUT ANY WARRANTY; without even the implied warranty of
+#  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+#
+#  You should have received a copy of the Licence CeCILL 2.1 along
+#  with ProfileView. If not, see <https://cecill.info/>.
+#
+
+import math
+import operator
+
+from BitVector import BitVector
+from pv_utils  import *
+
+class hhm():
+
+    def __init__(self, hhm_name, aa2idx, neff_hmm, neff_m, consensus, master_seq, match_states):
+        self.name         = hhm_name
+        self.aa2idx       = aa2idx
+        self.consensus    = consensus
+        self.master_seq   = master_seq
+        self.match_states = match_states
+        self.neff_hmm     = neff_hmm
+        self.neff_m       = neff_m
+        self.conserved    = BitVector( bitstring = ''.join([ '1' if c.isupper() else '0' for c in self.consensus ]) )
+
+
+    def __str__(self):
+        return "NAME: {}\nCONSENSUS: {}\nCONSERVED: {}\nMASTERSEQ: {}".format(self.name, self.consensus, self.conserved, self.master_seq)
+
+    def frequency(self, state, aa):
+        aaidx = self.aa2idx[aa]
+        val = self.match_states[state][aaidx]
+        return 2**(-val/1000.0)
+
+    def state_frequencies(self, state):
+        out = {}
+        for aa in self.aa2idx:
+            aai = self.aa2idx[aa]
+            val = self.match_states[state][aai]
+            out[aa] = 2**(-val/1000.0)
+        return out
+
+    def size(self):
+        return len(self.master_seq)
+    
+    def get_consensus(self):
+        return self.consensus
+
+
+def parse_hhm(fh):
+    hhm_name     = ""
+    aa_list      = []
+    aa2idx       = {}
+    consensus    = ""
+    master_seq   = ""
+    match_states = []
+    neff_hmm     = 0.0
+    neff_m       = [99.999]
+    while True:
+        line = fh.readline()
+        if line == '':
+            break
+        tokens = line.split()
+        if len(tokens) == 0:
+            continue
+        elif tokens[0] == "NAME":
+            hhm_name = tokens[1]
+        elif tokens[0] == "NEFF":
+            neff_hmm = float(tokens[1])
+        elif tokens[0] == "SEQ":
+            while not line.startswith('>Consensus'):
+                line = fh.readline()
+            # found consensus sequence id
+            line = fh.readline()
+            while line and line[0] != '>':
+                consensus += ''.join(line.split())
+                line = fh.readline()
+            # found master sequence id
+            line = fh.readline()
+            while line and line[0] != '>' and line[0] != '#':
+                master_seq += ''.join(line.split())
+                line = fh.readline()
+            # consensus/master sequences processed, discard remaining sequences until the end of the SEQ block
+            while line and line[0] != '#':
+                line = fh.readline()
+        elif tokens[0] == "NULL":
+            null_values = [ int(v) if is_int(v) else 99999 for v in tokens[1:21] ]
+            match_states.append(null_values)
+        elif tokens[0] == "HMM":
+            # load AA alphabet
+            aa_list = tokens[1:21]
+            for i,aa in enumerate(aa_list):
+                aa2idx[aa] = i
+            # skip first two lines (transitions definitions/start probabilities)
+            fh.readline() # M->M M->I M->D I->M I->I D->M D->D Neff NeffI NeffD
+            fh.readline() #    0    *    *    0    *    0    *    *     *     *
+            line = fh.readline()
+            while line and not line.startswith("//"):
+                tokens = line.split()
+                if len(tokens) == 23:
+                    state_values = [ int(val) if is_int(val) else 99999 for val in tokens[2:22] ]
+                    match_states.append(state_values)
+                elif len(tokens) == 10:
+                    val = tokens[7]
+                    neff_m.append( int(val)/1000.0 if is_int(val) else 99.999 )
+                line = fh.readline()
+    assert len(match_states) == len(master_seq)+1
+    return hhm( hhm_name, aa2idx, neff_hmm, neff_m, consensus, master_seq, match_states )
+
+
--- a/scripts/hmm_utils.py
+++ b/scripts/hmm_utils.py
+#
+#  This file is part of ProfileView.
+#
+#  ProfileView is free software: you can redistribute it and/or modify
+#  it under the terms of the CeCILL 2.1 Licence
+#
+#  ProfileView is distributed in the hope that it will be useful,
+#  but WITHOUT ANY WARRANTY; without even the implied warranty of
+#  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+#
+#  You should have received a copy of the Licence CeCILL 2.1 along
+#  with ProfileView. If not, see <https://cecill.info/>.
+#
+
+import math
+import operator
+
+from pv_utils import *
+
+
+class hmm_model():
+
+    def __init__(self):
+        self.name    = ""
+        self.length  = 0
+        self.nseq    = 0
+        self.states  = []
+        self.cons    = ""
+        self.aa_list = [ c for c in "ACDEFGHIKLMNPQRSTVWY" ]
+        self.aa2idx  = { c:i for i,c in enumerate(self.aa_list) }
+        self.bg_freq = [ -math.log(0.05) for c in self.aa_list ]
+
+    def __str__(self):
+        consensus = []
+        for k,s in enumerate(self.states):
+            aai, minval = min(enumerate(s), key=operator.itemgetter(1))
+            f = self.frequency(k, self.aa_list[aai])
+            c = self.aa_list[aai].upper() if f >= .6 else (self.aa_list[aai].lower() if f >= .4 else 'x')
+            consensus.append(c)
+        return "NAME={}\nLENG={} STATES={}\nNSEQ={}\nCONS={}".format(self.name, self.length,len(self.states),self.nseq, ''.join(consensus))
+
+    def frequency(self, k, aa):
+        aaidx = self.aa2idx[aa]
+        return math.exp(-self.states[k][aaidx])
+
+#    def consensus(self):
+#        if not self.cons:
+#            consensus = []
+#            for k,s in enumerate(self.states):
+#                aai, minval = min(enumerate(s), key=operator.itemgetter(1))
+
+    def getStateFreqDict(self, k):
+        outDict = {}
+        for aa in aa2idx:
+            aaidx = self.aa2idx[aa]
+            outDict[aa] = math.exp(-self.states[k][aaidx])
+        return outDict
+
+    def ln_odds_ratio(self,k,aa):
+        aai   = self.aa2idx[aa]
+        m_k_a = self.states[k][aai]
+        m_0_a = self.bg_freq[aai]
+        return (m_0_a - m_k_a)
+
+    def score(self, k, aa):
+        return self.ln_odds_ratio(k,aa)/math.log(2.0) # convert to base2 logarithm
+
+    def is_valid(self):
+        return len(self.name) > 0 and self.length > 0 and self.length == len(self.states) and self.nseq > 0
+
+
+class hmmer_hit():
+
+    def __init__(self):
+        self.hmm_name = ""
+        self.seq_name = ""
+        self.typ      = '?'
+        self.score    = 0.0 # HMMER's bit score
+        self.mcscore  = 0.0 # sum of scores in well-aligned positions (PP[i] \in {8,9,*}) with a match between the sequence and the hmm consensus
+        self.wcscore  = 0.0 # sum of scores in well-aligned positions (PP[i] \in {8,9,*}) with score[i] >= threshold
+        self.matches  = 0 # number of matches between the sequence and the hmm consensus
+        self.aligned_cols = 0 # number of aligned match states
+        self.well_aligned_cols = 0 # number of well-aligned match states
+        self.ali_len  = 0 # length of alignment
+        self.hmm_from = 0
+        self.hmm_to   = 0
+        self.ali_from = 0
+        self.ali_to   = 0
+        self.env_from = 0
+        self.env_to   = 0
+        self.hmm_cons = ""
+        self.seq_cons = ""
+        self.pp_cons  = ""
+        self.wc_str   = ""
+        self.cm_str   = ""
+        self.match_str = ""
+
+    def canExtendWith(self,b):
+        if self.hmm_name != b.hmm_name or self.seq_name != b.seq_name or self.typ == '?' or b.typ == '?':
+            return False
+        Lm = b.hmm_to-self.hmm_from+1
+        Ls = b.ali_to-self.ali_from+1
+        return self.hmm_from <= b.hmm_from and self.ali_from <= b.ali_from and math.fabs(Ls-Lm)/float(min(Ls,Lm)) <= .2
+
+    # it is assumed that canExtendWith(b) holds True
+    def extendWith(self,b):
+        self.score   += b.score
+        self.mcscore += b.mcscore
+        self.wcscore += b.wcscore
+        self.matches += b.matches
+        self.aligned_cols += b.aligned_cols
+        self.well_aligned_cols += b.well_aligned_cols
+        self.ali_len += b.ali_len
+        self.hmm_from = min(self.hmm_from,b.hmm_from)
+        self.hmm_to   = max(self.hmm_to,b.hmm_to)
+        self.ali_from = min(self.ali_from,b.ali_from)
+        self.ali_to   = max(self.ali_to,b.ali_to)
+        self.env_from = min(self.env_from,b.env_from)
+        self.env_to   = max(self.env_to,b.env_to)
+        self.wc_str  += b.wc_str
+        self.cm_str  += b.cm_str
+        self.match_str += b.match_str
+        return self
+
+#    def __str__(self):
+#        return "HMM {}\nSEQ {}\nSC  {}\nHMM-SEQ  ALIGNMENT  {}-{}  {}-{}\n{}\n{}\n{}".format(
+#                self.hmm_name, self.seq_name, self.score,
+#                self.hmm_from, self.hmm_to, self.ali_from, self.ali_to,
+#                self.hmm_cons, self.seq_cons, self.pp )
+
+
--- a/scripts/parseHHR.py
+++ b/scripts/parseHHR.py
--- a/scripts/parseHMMER.py
+++ b/scripts/parseHMMER.py
+#!/ usr/bin/env python
+#
+#  This file is part of ProfileView.
+#
+#  ProfileView is free software: you can redistribute it and/or modify
+#  it under the terms of the CeCILL 2.1 Licence
+#
+#  ProfileView is distributed in the hope that it will be useful,
+#  but WITHOUT ANY WARRANTY; without even the implied warranty of
+#  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+#
+#  You should have received a copy of the Licence CeCILL 2.1 along
+#  with ProfileView. If not, see <https://cecill.info/>.
+#
+
+import sys, errno, os
+import argparse
+import gzip
+import re
+import pickle
+
+from hmm_utils import *
+from pv_utils import *
+
+
+def parseHMMER(fh,hmmDict):
+    hmm_name = ""
+    seq_name = ""
+    line = fh.readline()
+    while line:
+        tokens = line.split()
+        if len(tokens) == 0 or line.startswith("#"):
+            pass
+        elif line.startswith("Query: "):
+            hmm_name = tokens[1]
+        elif line.startswith(">> "): # target sequence section
+            seq_name = tokens[1]
+            # skip seqName domain table header
+            fh.readline()
+            fh.readline()
+            # process domain table
+            line = fh.readline()
+            hitTable = []
+            while line and not line.lstrip().startswith("== domain "):
+                tokens = line.split()
+                ## domain table row:
+                ## domNum  [!?]  score  bias  c-Evalue  i-Evalue  hmmfrom  hmm-to  .. alifrom  ali-to  ..  envfrom  env-to  ..  acc
+                if len(tokens) == 16:
+                    #eprint('[debug] domain-table: {}'.format(line[:-1]))
+                    hit = hmmer_hit()
+                    hit.hmm_name = hmm_name
+                    hit.seq_name = seq_name
+                    hit.typ = tokens[1]
+                    hit.score = float(tokens[2])
+                    hit.hmm_from = int(tokens[6])
+                    hit.hmm_to   = int(tokens[7])
+                    hit.ali_from = int(tokens[9])
+                    hit.ali_to   = int(tokens[10])
+                    hit.env_from = int(tokens[12])
+                    hit.env_to   = int(tokens[13])
+                    hitTable.append(hit)
+                line = fh.readline()
+            # process each domain hit (i.e., hmm-seq alignment)
+            while line and line.lstrip().startswith("== domain "):
+                tokens = line.split()
+                hit = hitTable[ int(tokens[2])-1 ]
+                hmm_cons = ""
+                seq_cons = ""
+                pp_cons  = ""
+                while True:
+                    line = fh.readline()
+                    tokens = line.split()
+                    if len(tokens) == 4 and tokens[0] == hit.hmm_name:
+                        hmm_cons += tokens[2]
+                        #fh.readline() # skip hmm-seq match string
+                        match_cons_i = re.search(r"\s*\S+\s+[0-9]+",line).end()+1
+                        match_line   = fh.readline()
+                        hit.match_str += match_line[ match_cons_i : match_cons_i+len(tokens[2]) ]
+                        tokens = fh.readline().split()
+                        seq_cons += tokens[2]
+                        aln_to = int(tokens[3])
+                        tokens = fh.readline().split()
+                        pp_cons += tokens[0]
+                        if aln_to == hit.ali_to:
+                            break
+                hit.hmm_cons = hmm_cons
+                hit.seq_cons = seq_cons
+                hit.pp_cons = pp_cons
+                if hit.typ == '!': # significant hit
+                    hmm = hmmDict[hit.hmm_name]
+                    # compute scores
+                    hit.aligned_cols = 0
+                    hit.well_aligned_cols = 0
+                    hit.matches = 0
+                    hit.wcscore = 0.0
+                    hit.mcscore = 0.0
+                    hmm_i = hit.hmm_from
+                    for i,hmm_c in enumerate(hmm_cons):
+                        hit.ali_len += 1
+                        if hmm_c == '.':  # insertion in sequence w.r.t. the model
+                            continue
+                        seq_c = seq_cons[i]
+                        if seq_c == '-':  # deletion in sequence w.r.t. the model
+                            hmm_i += 1
+                            continue
+                        hit.aligned_cols += 1
+                        hit.matches += 1 if hmm_c.upper() == seq_c.upper() else 0
+                        if seq_c in 'ACDEFGHIKLMNPQRSTVWY' and pp_cons[i] in list('89*'):
+                            score_i = hmm.score(hmm_i,seq_c)
+                            hit.well_aligned_cols += 1
+                            if hmm_c.upper() == seq_c.upper():
+                                hit.cm_str += hit.match_str[i]
+                                hit.mcscore += score_i
+                            if score_i >= 3.0: # high-score position
+                                hit.wc_str += hit.match_str[i]
+                                hit.wcscore += score_i
+                        hmm_i += 1
+                    yield hit
+                # skip lines until the next domain/sequence or the end of file
+                line = fh.readline()
+                while line and not line.lstrip().startswith("== domain ") and not line.startswith(">> ") and not line.startswith("Query: "):
+                    line = fh.readline()
+            # here either EOF is reached or there are other models/sequences to process
+            continue
+        line = fh.readline()
+
+
+def main( argv = None ):
+
+    # parameter definition
+    parser = argparse.ArgumentParser()
+    parser.add_argument('--hmmer-dict', dest='hmmerDictFile', type=str, required=True, help='HMMER dictionary .pgz file')
+    parser.add_argument('--seq-db', dest='seqFile', type=str, required=True, help='Sequence database in csv format with [name,function,family,length] fields')
+    args = parser.parse_args()
+
+    # parameter validation
+    validParameters = True;
+    if not os.path.isfile(args.hmmerDictFile):
+        validParameters = False
+        print_error('HMMER dictionary file "{}" does not exist'.format(args.hmmerDictFile))
+    if not os.path.isfile(args.seqFile):
+        validParameters = False
+        print_error('Sequence database file "{}" does not exist'.format(args.seqFile))
+    if not validParameters:
+        return 1
+
+    print_status("loading HMMER dictionary...")
+    hmmerDict = {}
+    with gzip.open(args.hmmerDictFile,'rb') as f:
+        hmmerDict = pickle.load(f)
+
+    print_status("loading sequence database...")
+    seqDict = {}
+    with open( args.seqFile, mode='r', newline=None ) as seqFile:
+        for line in seqFile:
+            line = line.strip()
+            if line:
+                seq_name, seq_fun, seq_fam, seq_len = [ x.strip() for x in line.split(',') ]
+                seqDict[ seq_name ] = { 'Function':seq_fun, 'Family':seq_fam, 'Length':int(seq_len) }
+
+    print_status("processing hmmsearch output files...")
+
+    header = ['#hmm_name','hmm_len','seq_name','seq_len','seq_family','seq_func',
+            'bitscore', 'mean_score', 'mcscore','mean_mcs', 'wcscore','mean_wcs',
+            'ident','hmm_cov','hit_type']
+    print('\t'.join(header))
+    record = []
+    prev_hit = hmmer_hit()
+    for hit in parseHMMER(sys.stdin,hmmerDict):
+        if hit.seq_name not in seqDict:
+            continue
+        hmm = hmmerDict[hit.hmm_name]
+        if prev_hit.canExtendWith(hit):
+            hit = prev_hit.extendWith(hit)
+        elif len(record) > 0:
+            print('\t'.join(record))
+            record = []
+
+        hmm_len = hmm.length
+        hmm_reg = hit.hmm_to-hit.hmm_from+1
+        hmm_loh = hit.hmm_from-1 # left overhang of the hmm hit
+        hmm_roh = hmm_len-hit.hmm_to # right overhang of the hmm hit
+        hmm_cov = 100.0 * hmm_reg / hmm_len
+
+        seq_name = hit.seq_name
+        seq_len  = seqDict[seq_name]['Length']
+        seq_loh  = hit.env_from-1 # left overhang of the seq hit (envelope considered)
+        seq_roh  = seq_len-hit.env_to # right overhang of the seq hit (envelope considered)
+
+        max_oh   = max( 0.1 * hmm_len, 10 )
+        #left_oh  = min(hmm_loh,seq_loh)
+        #right_oh = min(hmm_roh,seq_roh)
+        hit_type = "FULL" if (hmm_loh <= max_oh and hmm_roh <= max_oh) else ("OVER" if (hmm_loh-seq_loh > max_oh or hmm_roh-seq_roh > max_oh) else "PART")
+        #hit_type = "FULL" if (hmm_loh <= max_oh and hmm_roh <= max_oh) else ("OVER" if (left_oh <= max_oh and right_oh <= max_oh) else "PART")
+
+        mean_score = 100.0 * hit.score / hit.aligned_cols
+        mean_mcs   = 100.0 * hit.mcscore / hit.aligned_cols # hit.well_aligned_cols
+        mean_wcs   = 100.0 * hit.wcscore / hit.aligned_cols # hit.well_aligned_cols
+        identity   = 100.0 * hit.matches / hit.ali_len
+
+        record = [ hit.hmm_name, str(hmm_len),
+                   seq_name, str(seq_len), seqDict[seq_name]['Family'], seqDict[seq_name]['Function'],
+                   "{:.2f}".format(hit.score),   "{:.4f}".format(mean_score),
+                   "{:.2f}".format(hit.mcscore), "{:.4f}".format(mean_mcs),
+                   "{:.2f}".format(hit.wcscore), "{:.4f}".format(mean_wcs),
+                   "{:.2f}".format(identity),    "{:.2f}".format(hmm_cov), hit_type ]
+
+        prev_hit = hit
+
+    if len(record) > 0:
+        print('\t'.join([ str(r) for r in record ]))
+
+    return 0
+
+
+
+
+# Check if the program is not being imported
+if __name__ == "__main__":
+    sys.exit(main())
+
--- a/scripts/pv_utils.py
+++ b/scripts/pv_utils.py
+#!/usr/bin/env python3
+#
+#  This file is part of ProfileView.
+#
+#  ProfileView is free software: you can redistribute it and/or modify
+#  it under the terms of the CeCILL 2.1 Licence
+#
+#  ProfileView is distributed in the hope that it will be useful,
+#  but WITHOUT ANY WARRANTY; without even the implied warranty of
+#  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+#
+#  You should have received a copy of the Licence CeCILL 2.1 along
+#  with ProfileView. If not, see <https://cecill.info/>.
+#
+
+import os, sys
+import re
+
+script_name = re.sub( r'\.py$', '', os.path.basename(sys.argv[0]) )
+
+def uniq(l):
+    return list(set(l))
+
+# stable uniq
+def suniq(l):
+    outlist=[]
+    procs=set() # processed elements
+    for e in l:
+        if e not in procs:
+            procs.add(e)
+            outlist.append(e)
+    return outlist
+
+
+def eprint(*args, **kwargs):
+    print(*args, file=sys.stderr, **kwargs)
+
+
+def print_status(msg):
+    eprint("[{name}] {m}".format(name=script_name,m=msg))
+
+
+def print_error(msg):
+    eprint("[{name}] error: {m}".format(name=script_name,m=msg))
+
+
+def print_warning(msg):
+    eprint("[{name}] warning: {m}".format(name=script_name,m=msg))
+
+
+def is_int(s):
+    try:
+        int(s)
+        return True
+    except ValueError:
+        return False
+
+def peekline(f):
+    pos = f.tell()
+    line = f.readline()
+    f.seek(pos)
+    return line
+