Commit 3d16265e by Riccardo Vicedomini

Added a (hidden) parameter to specify the minimum cumulative variance of…

Added a (hidden) parameter that specifies the minimum cumulative variance the selected principal components must explain before the hierarchical clustering is performed.
parent b13148d3
......@@ -32,6 +32,7 @@ PV_LIBPATH=""
PV_SEQDESC=""
PV_OUTPREFIX="out"
PV_TMPDIR=""
PV_CVAR=0.99
NJOBS=8
PEXEC_CMD="parallel --halt now,fail=1 -j ${NJOBS}"
......@@ -61,7 +62,7 @@ function print_usage() {
# retrieve provided arguments
opts="i:l:s:p:j:hV"
longopts="input:,lib:,seq-desc:,prefix:,temp-dir:,max-jobs:,help,version"
longopts="input:,lib:,seq-desc:,prefix:,temp-dir:,max-jobs:,cvar:,help,version"
ARGS=$(getopt -o "${opts}" -l "${longopts}" -n "${CMD_NAME}" -- "${@}")
if [ $? -ne 0 ] || [ $# -eq 0 ]; then # the order of this tests is important!
print_usage
......@@ -96,6 +97,10 @@ while [ -n "${1}" ]; do
shift
NJOBS=${1}
;;
--cvar)
shift
PV_CVAR=${1}
;;
-h|--help)
print_usage
exit 0
......@@ -145,6 +150,12 @@ if ! [[ "${NJOBS}" =~ ^[0-9]+$ ]] || [ ${NJOBS} -lt 1 ] ; then
NJOBS=1
fi
# Validate the (hidden) --cvar option: the minimum cumulative variance that the
# selected principal components must explain. The pattern accepts "0", "0.<digits>",
# "1" and "1.0…" only; anything else (e.g. ".5", "1e-2", "1.5", negative values)
# is rejected and replaced by the default of 0.99.
if ! [[ "${PV_CVAR}" =~ ^(0(\.[0-9]+)?|1(\.0+)?)$ ]] ; then
    # Name the option exactly as it is registered in longopts (--cvar) so the
    # warning points the user at the flag they actually passed.
    print_warning "--cvar parameter must be a real number in the interval [0,1]; the default value of 0.99 will be used."
    PV_CVAR=0.99
fi
check_cmds "hmmsearch" "python3" "Rscript"
check_pymodules "ete3" "numpy"
#check_files "${SCRIPTS_DIR}"/{createHHdict.py,createHmmerDict.py,hh_utils.py,pv_utils.py}
......@@ -231,7 +242,7 @@ if [ $? -ne 0 ]; then
fi
print_status "building ProfileView tree"
Rscript --vanilla "${SCRIPTS_DIR}"/clusterSequences.R "${PV_TMPDIR}/out.feat" "${PV_TMPDIR}/out.tree" 2>>"${PV_LOGFILE}"
Rscript --vanilla "${SCRIPTS_DIR}"/clusterSequences.R "${PV_TMPDIR}/out.feat" "${PV_CVAR}" "${PV_TMPDIR}/out.tree" 2>>"${PV_LOGFILE}"
if [ $? -ne 0 ]; then
print_error "could not create ProfileView tree, see log: ${PV_LOGFILE}"
exit 1
......
......@@ -20,7 +20,10 @@ args = commandArgs(trailingOnly=TRUE)
# Normalize the positional arguments so that the first three slots are defined:
#   args[1] = input feature table (required),
#   args[2] = minimum cumulative variance threshold,
#   args[3] = output tree file ('' means: write the tree to stdout).
if (length(args)==0) {
stop("at least one argument must be supplied")
} else if (length(args)==1) {
args[2] <- ''
# only the input was supplied: use the default threshold and write to stdout;
# the value is converted with as.double() where it is used
args[2] <- .99
args[3] <- ''
} else if (length(args)==2) {
# input and threshold were supplied: no output file, so write to stdout
args[3] <- ''
}
feat <- read.table( args[1], row.names=1, header=TRUE, sep="\t", na.strings=c("") )
......@@ -28,15 +31,15 @@ pc <- prcomp(feat[,c(-1,-2)], scale=T)
# variances of the principal components (squared standard deviations)
eigs <- pc$sdev^2
cumvar = cumsum(eigs)/sum(eigs) # cumulative explained variance of PCs
pc_i = min( c(length(cumvar),which(cumvar >= .99)) ) # index of the first PC at which the cumulative explained variance reaches 99% (or the last PC if none does)
pc_i = min( c(length(cumvar),which(cumvar >= as.double(args[2])) ) ) # index of the first PC at which the cumulative explained variance reaches the threshold given in args[2] (or the last PC if none does)
d <- dist(pc$x[,1:pc_i])
hc <- hclust(d,method="ward.D2")
my_tree <- as.phylo(hc)
if ( args[2] == '' ) {
if ( args[3] == '' ) {
write.tree(phy=my_tree,file=stdout())
} else {
write.tree(phy=my_tree,file=args[2])
write.tree(phy=my_tree,file=args[3])
}
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment