Commit e40a6415 by Riccardo Vicedomini

Added two new options (minimum explained variance for PC clustering and number…

Added two new options: the minimum explained variance for PC clustering and the number of best-matching models for library reduction.
parent f01e2972
...@@ -33,6 +33,7 @@ PV_SEQDESC="" ...@@ -33,6 +33,7 @@ PV_SEQDESC=""
PV_OUTPREFIX="out" PV_OUTPREFIX="out"
PV_TMPDIR="" PV_TMPDIR=""
PV_CVAR=0.99 PV_CVAR=0.99
PV_KBEST=3
NJOBS=8 NJOBS=8
PEXEC_CMD="parallel --halt now,fail=1 -j ${NJOBS}" PEXEC_CMD="parallel --halt now,fail=1 -j ${NJOBS}"
...@@ -46,14 +47,17 @@ function print_usage() { ...@@ -46,14 +47,17 @@ function print_usage() {
echo -en "\n" echo -en "\n"
echo -en " MANDATORY OPTIONS:\n echo -en " MANDATORY OPTIONS:\n
-i, --input <name>\tFile containing the sequences to be classified in FASTA format\n -i, --input <name>\tFile containing the sequences to be classified in FASTA format\n
-l, --lib <name>\tProfileView library name\n" | column -t -s $'\t' -l, --lib <name>\tProfileView library name/path\n" | column -t -s $'\t'
echo -en "\n" echo -en "\n"
echo -en " OTHER OPTIONS:\n echo -en " OTHER OPTIONS:\n
-k, --k-best <num>\tNumber of best-match models to retain for each sequence (default: ${PV_KBEST})\n
--cvar <num>\tMinimum explained variance threshold for selecting principal components before the clustering (default: ${PV_CVAR})
\tThe provided value must be a number in the interval [0,1]\n
-s, --seq-desc <name>\tInput sequence descriptor file, that is a CSV file containing the follwing fileds:\n -s, --seq-desc <name>\tInput sequence descriptor file, that is a CSV file containing the follwing fileds:\n
\t<sequence_id>,<function_id>,<family_id>,<sequence_length>\n \t<sequence_id>,<function_id>,<family_id>,<sequence_length>\n
-p, --prefix <name>\tPrefix of output files (default:${PV_OUTPREFIX})\n -p, --prefix <name>\tPrefix of output files (default: ${PV_OUTPREFIX})\n
--temp-dir <name>\tTemporary result directory\n --temp-dir <name>\tTemporary result directory\n
-j, --max-jobs <num>\tNumber of parallel jobs (default:8)\n -j, --max-jobs <num>\tNumber of parallel jobs (default: ${NJOBS})\n
-h, --help\tPrint this help message\n -h, --help\tPrint this help message\n
-V, --version\tPrint version\n" | column -t -s $'\t' -V, --version\tPrint version\n" | column -t -s $'\t'
echo -en "\n" echo -en "\n"
...@@ -61,8 +65,8 @@ function print_usage() { ...@@ -61,8 +65,8 @@ function print_usage() {
# retrieve provided arguments # retrieve provided arguments
opts="i:l:s:p:j:hV" opts="i:l:k:s:p:j:hV"
longopts="input:,lib:,seq-desc:,prefix:,temp-dir:,max-jobs:,cvar:,help,version" longopts="input:,lib:,k-best:,seq-desc:,prefix:,temp-dir:,max-jobs:,cvar:,help,version"
ARGS=$(getopt -o "${opts}" -l "${longopts}" -n "${CMD_NAME}" -- "${@}") ARGS=$(getopt -o "${opts}" -l "${longopts}" -n "${CMD_NAME}" -- "${@}")
if [ $? -ne 0 ] || [ $# -eq 0 ]; then # the order of this tests is important! if [ $? -ne 0 ] || [ $# -eq 0 ]; then # the order of this tests is important!
print_usage print_usage
...@@ -150,6 +154,12 @@ if ! [[ "${NJOBS}" =~ ^[0-9]+$ ]] || [ ${NJOBS} -lt 1 ] ; then ...@@ -150,6 +154,12 @@ if ! [[ "${NJOBS}" =~ ^[0-9]+$ ]] || [ ${NJOBS} -lt 1 ] ; then
NJOBS=1 NJOBS=1
fi fi
if ! [[ "${PV_KBEST}" =~ ^[0-9]+$ ]] || [ ${PV_KBEST} -lt 1 ] ; then
print_warning "-k|--k-best parameter should be a positive integer; the default value of 3 will be used."
PV_KBEST=3
fi
if ! [[ "${PV_CVAR}" =~ ^(0(\.[0-9]+)?|\.[0-9]+|1(\.0+)?)$ ]] ; then if ! [[ "${PV_CVAR}" =~ ^(0(\.[0-9]+)?|\.[0-9]+|1(\.0+)?)$ ]] ; then
print_warning "--cum-var parameter must be a real number in the interval [0,1]; the default value of 0.99 will be used." print_warning "--cum-var parameter must be a real number in the interval [0,1]; the default value of 0.99 will be used."
PV_CVAR=0.99 PV_CVAR=0.99
...@@ -235,7 +245,7 @@ PV_SEQDESC="${PV_TMPDIR}/sequences.filtered.csv" ...@@ -235,7 +245,7 @@ PV_SEQDESC="${PV_TMPDIR}/sequences.filtered.csv"
awk '/^#/{next} !x[$3]++{OFS=",";print $3,$6,$5,$4}' "${PV_SCOREFILE}" >"${PV_SEQDESC}" 2>/dev/null awk '/^#/{next} !x[$3]++{OFS=",";print $3,$6,$5,$4}' "${PV_SCOREFILE}" >"${PV_SEQDESC}" 2>/dev/null
print_status "building representation space" print_status "building representation space"
python3 "${SCRIPTS_DIR}"/generateFeatures.py --seq-list "${PV_SEQDESC}" --hmm-list "${PV_LIBDIR}/${PV_LIBNAME}.models.list" --scores "${PV_SCOREFILE}" --prefix "${PV_TMPDIR}"/out -n 20 -k 3 2>>"${PV_LOGFILE}" python3 "${SCRIPTS_DIR}"/generateFeatures.py --seq-list "${PV_SEQDESC}" --hmm-list "${PV_LIBDIR}/${PV_LIBNAME}.models.list" --scores "${PV_SCOREFILE}" --prefix "${PV_TMPDIR}"/out -n 20 -k "${PV_KBEST}" 2>>"${PV_LOGFILE}"
if [ $? -ne 0 ]; then if [ $? -ne 0 ]; then
print_error "error during feature generation, see log: ${PV_LOGFILE}" print_error "error during feature generation, see log: ${PV_LOGFILE}"
exit 1 exit 1
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment