Added two new options (minimum explained variance for PC clustering and number of best matching models for library reduction)

Added two new options (minimum explained variance for PC clustering and number of best matching models for library reduction)

Added two new options (minimum explained variance for PC clustering and number of best matching models for library reduction)
Added two new options (minimum explained variance for PC clustering and number of best matching models for library reduction)
e40a6415 · Riccardo Vicedomini · f01e2972 · e40a6415
Commit e40a6415 authored Jul 26, 2019 by Riccardo Vicedomini
Hide whitespace changes
Inline Side-by-side

Showing with 16 additions and 6 deletions

profileview-tree profileview-tree +16 -6

No files found.
--- a/profileview-tree
+++ b/profileview-tree
@@ -33,6 +33,7 @@ PV_SEQDESC=""
 PV_OUTPREFIX="out"
 PV_TMPDIR=""
 PV_CVAR=0.99
+PV_KBEST=3
 NJOBS=8

 PEXEC_CMD="parallel --halt now,fail=1 -j ${NJOBS}"
@@ -46,14 +47,17 @@ function print_usage() {
    echo -en "\n"
    echo -en "  MANDATORY OPTIONS:\n
    -i, --input <name>\tFile containing the sequences to be classified in FASTA format\n
-    -l, --lib <name>\tProfileView library name\n" | column -t -s $'\t'
+    -l, --lib <name>\tProfileView library name/path\n" | column -t -s $'\t'
    echo -en "\n"
    echo -en "  OTHER OPTIONS:\n
+    -k, --k-best <num>\tNumber of best-match models to retain for each sequence (default: ${PV_KBEST})\n
+    --cvar <num>\tMinimum explained variance threshold for selecting principal components before the clustering (default: ${PV_CVAR})
+                \tThe provided value must be a number in the interval [0,1]\n
    -s, --seq-desc <name>\tInput sequence descriptor file, that is a CSV file containing the follwing fileds:\n
                         \t<sequence_id>,<function_id>,<family_id>,<sequence_length>\n
-    -p, --prefix <name>\tPrefix of output files (default:${PV_OUTPREFIX})\n
+    -p, --prefix <name>\tPrefix of output files (default: ${PV_OUTPREFIX})\n
    --temp-dir <name>\tTemporary result directory\n
-    -j, --max-jobs <num>\tNumber of parallel jobs (default:8)\n
+    -j, --max-jobs <num>\tNumber of parallel jobs (default: ${NJOBS})\n
    -h, --help\tPrint this help message\n
    -V, --version\tPrint version\n" | column -t -s $'\t'
    echo -en "\n"
@@ -61,8 +65,8 @@ function print_usage() {

 # retrieve provided arguments

-opts="i:l:s:p:j:hV"
-longopts="input:,lib:,seq-desc:,prefix:,temp-dir:,max-jobs:,cvar:,help,version"
+opts="i:l:k:s:p:j:hV"
+longopts="input:,lib:,k-best:,seq-desc:,prefix:,temp-dir:,max-jobs:,cvar:,help,version"
 ARGS=$(getopt -o "${opts}" -l "${longopts}" -n "${CMD_NAME}" -- "${@}")
 if [ $? -ne 0 ] || [ $# -eq 0 ]; then # the order of this tests is important!
    print_usage
@@ -150,6 +154,12 @@ if ! [[ "${NJOBS}" =~ ^[0-9]+$ ]] || [ ${NJOBS} -lt 1 ] ; then
    NJOBS=1
 fi

+if ! [[ "${PV_KBEST}" =~ ^[0-9]+$ ]] || [ ${PV_KBEST} -lt 1 ] ; then
+    print_warning "-k|--k-best parameter should be a positive integer; the default value of 3 will be used."
+    PV_KBEST=3
+fi
+
+
 if ! [[ "${PV_CVAR}" =~ ^(0(\.[0-9]+)?|\.[0-9]+|1(\.0+)?)$ ]] ; then
    print_warning "--cum-var parameter must be a real number in the interval [0,1]; the default value of 0.99 will be used."
    PV_CVAR=0.99
@@ -235,7 +245,7 @@ PV_SEQDESC="${PV_TMPDIR}/sequences.filtered.csv"
 awk '/^#/{next} !x[$3]++{OFS=",";print $3,$6,$5,$4}' "${PV_SCOREFILE}" >"${PV_SEQDESC}" 2>/dev/null

 print_status "building representation space"
-python3 "${SCRIPTS_DIR}"/generateFeatures.py --seq-list "${PV_SEQDESC}" --hmm-list "${PV_LIBDIR}/${PV_LIBNAME}.models.list" --scores "${PV_SCOREFILE}" --prefix "${PV_TMPDIR}"/out -n 20 -k 3 2>>"${PV_LOGFILE}"
+python3 "${SCRIPTS_DIR}"/generateFeatures.py --seq-list "${PV_SEQDESC}" --hmm-list "${PV_LIBDIR}/${PV_LIBNAME}.models.list" --scores "${PV_SCOREFILE}" --prefix "${PV_TMPDIR}"/out -n 20 -k "${PV_KBEST}" 2>>"${PV_LOGFILE}"
 if [ $? -ne 0 ]; then
    print_error "error during feature generation, see log: ${PV_LOGFILE}"
    exit 1