Commit e40a6415 by Riccardo Vicedomini

Added two new options (minimum explained variance for PC clustering and number…

Added two new options (minimum explained variance for PC clustering and number of best matching models for library reduction)
parent f01e2972
......@@ -33,6 +33,7 @@ PV_SEQDESC=""
PV_OUTPREFIX="out"
PV_TMPDIR=""
PV_CVAR=0.99
PV_KBEST=3
NJOBS=8
PEXEC_CMD="parallel --halt now,fail=1 -j ${NJOBS}"
......@@ -46,14 +47,17 @@ function print_usage() {
echo -en "\n"
echo -en " MANDATORY OPTIONS:\n
-i, --input <name>\tFile containing the sequences to be classified in FASTA format\n
-l, --lib <name>\tProfileView library name\n" | column -t -s $'\t'
-l, --lib <name>\tProfileView library name/path\n" | column -t -s $'\t'
echo -en "\n"
echo -en " OTHER OPTIONS:\n
-k, --k-best <num>\tNumber of best-match models to retain for each sequence (default: ${PV_KBEST})\n
--cvar <num>\tMinimum explained variance threshold for selecting principal components before the clustering (default: ${PV_CVAR})
\tThe provided value must be a number in the interval [0,1]\n
-s, --seq-desc <name>\tInput sequence descriptor file, that is a CSV file containing the follwing fileds:\n
\t<sequence_id>,<function_id>,<family_id>,<sequence_length>\n
-p, --prefix <name>\tPrefix of output files (default:${PV_OUTPREFIX})\n
-p, --prefix <name>\tPrefix of output files (default: ${PV_OUTPREFIX})\n
--temp-dir <name>\tTemporary result directory\n
-j, --max-jobs <num>\tNumber of parallel jobs (default:8)\n
-j, --max-jobs <num>\tNumber of parallel jobs (default: ${NJOBS})\n
-h, --help\tPrint this help message\n
-V, --version\tPrint version\n" | column -t -s $'\t'
echo -en "\n"
......@@ -61,8 +65,8 @@ function print_usage() {
# retrieve provided arguments
opts="i:l:s:p:j:hV"
longopts="input:,lib:,seq-desc:,prefix:,temp-dir:,max-jobs:,cvar:,help,version"
opts="i:l:k:s:p:j:hV"
longopts="input:,lib:,k-best:,seq-desc:,prefix:,temp-dir:,max-jobs:,cvar:,help,version"
ARGS=$(getopt -o "${opts}" -l "${longopts}" -n "${CMD_NAME}" -- "${@}")
if [ $? -ne 0 ] || [ $# -eq 0 ]; then # the order of this tests is important!
print_usage
......@@ -150,6 +154,12 @@ if ! [[ "${NJOBS}" =~ ^[0-9]+$ ]] || [ ${NJOBS} -lt 1 ] ; then
NJOBS=1
fi
# Sanity-check the -k|--k-best value: it is accepted only when it consists of
# digits and is not smaller than 1. Any other value triggers a warning and a
# fallback to 3.
if [[ "${PV_KBEST}" =~ ^[0-9]+$ ]] && ! [ "${PV_KBEST}" -lt 1 ]; then
  : # value accepted as provided
else
  print_warning "-k|--k-best parameter should be a positive integer; the default value of 3 will be used."
  PV_KBEST=3
fi
if ! [[ "${PV_CVAR}" =~ ^(0(\.[0-9]+)?|\.[0-9]+|1(\.0+)?)$ ]] ; then
print_warning "--cum-var parameter must be a real number in the interval [0,1]; the default value of 0.99 will be used."
PV_CVAR=0.99
......@@ -235,7 +245,7 @@ PV_SEQDESC="${PV_TMPDIR}/sequences.filtered.csv"
awk '/^#/{next} !x[$3]++{OFS=",";print $3,$6,$5,$4}' "${PV_SCOREFILE}" >"${PV_SEQDESC}" 2>/dev/null
print_status "building representation space"
python3 "${SCRIPTS_DIR}"/generateFeatures.py --seq-list "${PV_SEQDESC}" --hmm-list "${PV_LIBDIR}/${PV_LIBNAME}.models.list" --scores "${PV_SCOREFILE}" --prefix "${PV_TMPDIR}"/out -n 20 -k 3 2>>"${PV_LOGFILE}"
python3 "${SCRIPTS_DIR}"/generateFeatures.py --seq-list "${PV_SEQDESC}" --hmm-list "${PV_LIBDIR}/${PV_LIBNAME}.models.list" --scores "${PV_SCOREFILE}" --prefix "${PV_TMPDIR}"/out -n 20 -k "${PV_KBEST}" 2>>"${PV_LOGFILE}"
if [ $? -ne 0 ]; then
print_error "error during feature generation, see log: ${PV_LOGFILE}"
exit 1
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment