Commit 3d16265e by Riccardo Vicedomini

added (hidden) parameter that allows to specify minimum cumulative variance of…

added (hidden) parameter that allows to specify minimum cumulative variance of principal components, before the hierarchical clustering
parent b13148d3
...@@ -32,6 +32,7 @@ PV_LIBPATH="" ...@@ -32,6 +32,7 @@ PV_LIBPATH=""
PV_SEQDESC="" PV_SEQDESC=""
PV_OUTPREFIX="out" PV_OUTPREFIX="out"
PV_TMPDIR="" PV_TMPDIR=""
PV_CVAR=0.99
NJOBS=8 NJOBS=8
PEXEC_CMD="parallel --halt now,fail=1 -j ${NJOBS}" PEXEC_CMD="parallel --halt now,fail=1 -j ${NJOBS}"
...@@ -61,7 +62,7 @@ function print_usage() { ...@@ -61,7 +62,7 @@ function print_usage() {
# retrieve provided arguments # retrieve provided arguments
opts="i:l:s:p:j:hV" opts="i:l:s:p:j:hV"
longopts="input:,lib:,seq-desc:,prefix:,temp-dir:,max-jobs:,help,version" longopts="input:,lib:,seq-desc:,prefix:,temp-dir:,max-jobs:,cvar:,help,version"
ARGS=$(getopt -o "${opts}" -l "${longopts}" -n "${CMD_NAME}" -- "${@}") ARGS=$(getopt -o "${opts}" -l "${longopts}" -n "${CMD_NAME}" -- "${@}")
if [ $? -ne 0 ] || [ $# -eq 0 ]; then # the order of this tests is important! if [ $? -ne 0 ] || [ $# -eq 0 ]; then # the order of this tests is important!
print_usage print_usage
...@@ -96,6 +97,10 @@ while [ -n "${1}" ]; do ...@@ -96,6 +97,10 @@ while [ -n "${1}" ]; do
shift shift
NJOBS=${1} NJOBS=${1}
;; ;;
--cvar)
shift
PV_CVAR=${1}
;;
-h|--help) -h|--help)
print_usage print_usage
exit 0 exit 0
...@@ -145,6 +150,12 @@ if ! [[ "${NJOBS}" =~ ^[0-9]+$ ]] || [ ${NJOBS} -lt 1 ] ; then ...@@ -145,6 +150,12 @@ if ! [[ "${NJOBS}" =~ ^[0-9]+$ ]] || [ ${NJOBS} -lt 1 ] ; then
NJOBS=1 NJOBS=1
fi fi
if ! [[ "${PV_CVAR}" =~ ^(0(\.[0-9]+)?|1(\.0+)?)$ ]] ; then
print_warning "--cum-var parameter must be a real number in the interval [0,1]; the default value of 0.99 will be used."
PV_CVAR=0.99
fi
check_cmds "hmmsearch" "python3" "Rscript" check_cmds "hmmsearch" "python3" "Rscript"
check_pymodules "ete3" "numpy" check_pymodules "ete3" "numpy"
#check_files "${SCRIPTS_DIR}"/{createHHdict.py,createHmmerDict.py,hh_utils.py,pv_utils.py} #check_files "${SCRIPTS_DIR}"/{createHHdict.py,createHmmerDict.py,hh_utils.py,pv_utils.py}
...@@ -231,7 +242,7 @@ if [ $? -ne 0 ]; then ...@@ -231,7 +242,7 @@ if [ $? -ne 0 ]; then
fi fi
print_status "building ProfileView tree" print_status "building ProfileView tree"
Rscript --vanilla "${SCRIPTS_DIR}"/clusterSequences.R "${PV_TMPDIR}/out.feat" "${PV_TMPDIR}/out.tree" 2>>"${PV_LOGFILE}" Rscript --vanilla "${SCRIPTS_DIR}"/clusterSequences.R "${PV_TMPDIR}/out.feat" "${PV_CVAR}" "${PV_TMPDIR}/out.tree" 2>>"${PV_LOGFILE}"
if [ $? -ne 0 ]; then if [ $? -ne 0 ]; then
print_error "could not create ProfileView tree, see log: ${PV_LOGFILE}" print_error "could not create ProfileView tree, see log: ${PV_LOGFILE}"
exit 1 exit 1
......
...@@ -20,7 +20,10 @@ args = commandArgs(trailingOnly=TRUE) ...@@ -20,7 +20,10 @@ args = commandArgs(trailingOnly=TRUE)
if (length(args)==0) { if (length(args)==0) {
stop("at least one argument must be supplied") stop("at least one argument must be supplied")
} else if (length(args)==1) { } else if (length(args)==1) {
args[2] <- '' args[2] <- .99
args[3] <- ''
} else if (length(args)==2) {
args[3] <- ''
} }
feat <- read.table( args[1], row.names=1, header=TRUE, sep="\t", na.strings=c("") ) feat <- read.table( args[1], row.names=1, header=TRUE, sep="\t", na.strings=c("") )
...@@ -28,15 +31,15 @@ pc <- prcomp(feat[,c(-1,-2)], scale=T) ...@@ -28,15 +31,15 @@ pc <- prcomp(feat[,c(-1,-2)], scale=T)
eigs <- pc$sdev^2 eigs <- pc$sdev^2
cumvar = cumsum(eigs)/sum(eigs) # cumulative explaned variance of PCs cumvar = cumsum(eigs)/sum(eigs) # cumulative explaned variance of PCs
pc_i = min( c(length(cumvar),which(cumvar >= .99)) ) # get the PC that allows to explain at least the 99% of the variance pc_i = min( c(length(cumvar),which(cumvar >= as.double(args[2])) ) ) # get the PC that allows to explain at least the 99% of the variance
d <- dist(pc$x[,1:pc_i]) d <- dist(pc$x[,1:pc_i])
hc <- hclust(d,method="ward.D2") hc <- hclust(d,method="ward.D2")
my_tree <- as.phylo(hc) my_tree <- as.phylo(hc)
if ( args[2] == '' ) { if ( args[3] == '' ) {
write.tree(phy=my_tree,file=stdout()) write.tree(phy=my_tree,file=stdout())
} else { } else {
write.tree(phy=my_tree,file=args[2]) write.tree(phy=my_tree,file=args[3])
} }
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment