Commit c8465df8 by Riccardo Vicedomini

first fully operational version to test on SGE

parent 67af8343
......@@ -22,6 +22,7 @@ search_hmm = %(scripts_path)s/search_hmm.py
search_ccm = %(scripts_path)s/search_ccms_all.py
filter_bd = %(scripts_path)s/filter_hits.py
simple_arch = %(scripts_path)s/simple_arch.py
dama_arch = %(scripts_path)s/dama_arch.py
[programs]
;hmmer_path =
......
......@@ -190,6 +190,11 @@ if [ ! -z "${MCLADE_DOMFILE}" ]; then
fi
fi
MCLADE_DAMAARG=""
if [ "${MCLADE_USEDAMA}" = true ]; then
MCLADE_DAMAARG="--arch"
fi
if ! [[ "${NTHREADS}" =~ ^[0-9]+$ ]] || [ ${NTHREADS} -lt 1 ] ; then
print_error "-t|--threads parameter must be a positive integer"
exit 1
......@@ -227,7 +232,7 @@ print_status "MetaCLADE working directory: ${MCLADE_WORKDIR}"
# Create MetaCLADE scripts
print_status "creating MetaCLADE script/job files"
python3 "${SCRIPTS_DIR}/mclade_create_jobs.py" -i "${INPUT_FASTA}" -N "${MCLADE_JOBNAME}" ${MCLADE_DOMARG} -W "${MCLADE_WORKDIR}" -e "${MCLADE_EVALUECUTOFF}" -E "${MCLADE_EVALUECUTCONF}" -j "${NJOBS}"
python3 "${SCRIPTS_DIR}/mclade_create_jobs.py" -i "${INPUT_FASTA}" -N "${MCLADE_JOBNAME}" ${MCLADE_DOMARG} -W "${MCLADE_WORKDIR}" -e "${MCLADE_EVALUECUTOFF}" -E "${MCLADE_EVALUECUTCONF}" -j "${NJOBS}" ${MCLADE_DAMAARG}
# Possibly run MetaCLADE scripts in a SGE environment
if [ "${MCLADE_USESGE}" = true ] ; then
......@@ -254,13 +259,13 @@ if [ "${MCLADE_USESGE}" = true ] ; then
exit 1
fi
done
echo "search jobs finished successfully"
print_status "search jobs finished successfully"
# submit filter jobs
print_status "submitting filter ijobs"
unset pidarr; pidarr=()
for i in $(seq 1 ${NJOBS}); do
f="${MCLADE_WORKDIR}/jobs/1_filter/${MCLADE_JOBNAME}_${i}.sh"
f="${MCLADE_WORKDIR}/jobs/2_filter/${MCLADE_JOBNAME}_${i}.sh"
# run a qsub job for each non-empty script
if [ -f "${f}" ] && [ -s "${f}" ]; then
qsub $SGE_QUEUEARG $SGE_TIMELIMARG -e "${MCLADE_WORKDIR}/log/filter_${i}.err" -o "${MCLADE_WORKDIR}/log/filter_${i}.out" -cwd -sync yes -N ${MCLADE_JOBNAME} -pe ${SGE_PENAME} ${NTHREADS} -b y ${PEXEC_CMD} ${f} &
......@@ -279,5 +284,16 @@ if [ "${MCLADE_USESGE}" = true ] ; then
exit 1
fi
done
echo "search jobs finished successfully"
print_status "search jobs finished successfully"
# create architecture
print_status "computing architecture"
f="${MCLADE_WORKDIR}/jobs/3_arch/${MCLADE_JOBNAME}.sh"
qsub $SGE_QUEUEARG $SGE_TIMELIMARG -e "${MCLADE_WORKDIR}/log/arch.err" -o "${MCLADE_WORKDIR}/log/arch.out" -cwd -sync yes -N ${MCLADE_JOBNAME} -pe ${SGE_PENAME} 1 -b y ${PEXEC_CMD} ${f}
qret=$?
if ((qret != 0)); then
echo "error during MetaCLADE architecture step ($qret)"
exit 1
fi
print_status "architecture job finished successfully"
fi
"""
Author: Riccardo Vicedomini
"""
import sys, os, argparse
import subprocess
import glob
import time
from mclade_utils import *
script_name = os.path.splitext( os.path.basename(__file__) )[0]
def main():
#TODO: set all arguments as MANDATORY
parser = argparse.ArgumentParser(description="Create DAMA architecture from MetaCLADE filter step")
parser.add_argument('--damaCmd', dest='damaCmd', help='absolute path to dama executable', type=str)
parser.add_argument('--domainsInfoFile', dest='domainsInfoFile', help='.domains file of MetaCLADE library', type=str)
parser.add_argument('--knownArchFile', dest='knownArchFile', help='.knownArch file of MetaCLADE library', type=str)
parser.add_argument('--overlappingDomainFile', dest='overlappingDomainFile', help='.overlapping file of MetaCLADE library', type=str)
parser.add_argument('--evalueCutOff', dest='evalueCutOff', help='e-value cutoff for DAMA', type=str)
parser.add_argument('--evalueCutOffConf', dest='evalueCutOffConf', help='e-value confidence threshold for DAMA', type=str)
parser.add_argument('--filterResDir', dest='filterResDir', help='results directory of MetaCLADE filter step', type=str)
parser.add_argument('--tempDir', dest='tempDir', help='temporary directory for intermediate files', type=str)
parser.add_argument('--outputFile', dest='outputFile', help='output file of DAMA architecture', type=str)
args = parser.parse_args()
temp_dir = f'{args.tempDir}'
if not os.path.exists(temp_dir):
try:
os.makedirs(temp_dir)
except OSError as error:
pass
start_time = time.time()
print(f'[{script_name}] formatting DAMA input file')
domainsHitDict = {}
domains_hit_file = f'{temp_dir}/domainsHitFile.best.res'
with open(domains_hit_file,'w') as dhf:
for bdfile in glob.glob(f'{args.filterResDir}/*.best.res'):
domid = os.path.basename(bdfile).split('.')[0]
with open(bdfile,'r') as bdf:
for line in bdf:
cols = line.rstrip().split('\t')
mid,mbeg,mend,mlen,sid,sbeg,send,slen,evalue,score,acc = cols
hit_key = (sid,sbeg,send,domid)
domainsHitDict[hit_key]=cols
dhf.write(f'{evalue}\t{sbeg}\t{send}\t{sid}\t{domid}\t{acc}\t{mid}\t{mbeg}\t{mend}\t{domid}\n')
print(f'[{script_name}] running DAMA')
dama_temp_out = f'{temp_dir}/dama.arch.txt'
dama_args = [ args.damaCmd,
'-domainsInfoFile', args.domainsInfoFile,
'-knownArchFile', args.knownArchFile,
'-overlappingDomainFile', args.overlappingDomainFile,
'-evalueCutOff', args.evalueCutOff,
'-evalueCutOffConf', args.evalueCutOffConf,
'-domainsHitFile', domains_hit_file,
'-outputFile', dama_temp_out ]
dama_cmd = ' '.join(dama_args)
dama_process = subprocess.call(dama_args)
if dama_process != 0: # if dama failed
eprint(f'[{script_name}] error running: {hmmsearch_cmd}')
return 1
# fix dama output
print(f'[{script_name}] creating architecture file')
with open(args.outputFile,'w') as farch, open(dama_temp_out,'r') as fdama:
for line in fdama:
evalue,sbeg,send,sid,domid,_ = line.rstrip().split('\t')
hit_key = (sid,sbeg,send,domid)
mid,mbeg,mend,mlen,sid,sbeg,send,slen,evalue,score,acc = domainsHitDict[hit_key]
record = [sid,sbeg,send,slen,domid,mid,mbeg,mend,mlen,evalue,score,acc]
farch.write('\t'.join(record) + '\n')
os.remove(domains_hit_file)
os.remove(dama_temp_out)
runtime = time.time()-start_time
print(f'[{script_name}] runtime: {runtime:.2f}')
return 0
# Check if the program is not being imported
if __name__ == "__main__":
sys.exit(main())
......@@ -27,6 +27,7 @@ SCRIPTS_OPT_SEARCH_HMM = "search_hmm"
SCRIPTS_OPT_SEARCH_CCM = "search_ccm"
SCRIPTS_OPT_FILTER_BD = "filter_bd"
SCRIPTS_OPT_SIMPLE_ARCH = "simple_arch"
SCRIPTS_OPT_DAMA_ARCH = "dama_arch"
# [program] external software used in CLADE's pipeline
PROGRAM_SECTION = "programs"
......
......@@ -81,6 +81,7 @@ class CladeController:
self.search_ccm = self.cladeCfgParser.get(C.SCRIPTS_SECTION,C.SCRIPTS_OPT_SEARCH_CCM)
self.filter_bd = self.cladeCfgParser.get(C.SCRIPTS_SECTION,C.SCRIPTS_OPT_FILTER_BD)
self.simple_arch = self.cladeCfgParser.get(C.SCRIPTS_SECTION,C.SCRIPTS_OPT_SIMPLE_ARCH)
self.dama_arch = self.cladeCfgParser.get(C.SCRIPTS_SECTION,C.SCRIPTS_OPT_DAMA_ARCH)
# process [program] section
self.hmmer_path = self.cladeCfgParser.get(C.PROGRAM_SECTION,C.PROGRAM_OPT_HMMER_PATH,fallback='')
self.hmmsearch_cmd = "hmmsearch" if not self.hmmer_path else self.hmmer_path + '/hmmsearch'
......@@ -263,9 +264,11 @@ class CladeController:
with open(f'{jobspath}/{outprefix}.sh','w') as fh:
bd_dir = f'{self.filter_results_path}'
out_file = f'{self.arch_results_path}/{self.name}.arch.txt'
arch_cmd = f'{self.dama_cmd} {self.simple_arch} -e {self.evalue_cutoff} {bd_dir} {out_file}'
arch_cmd = f'{self.python_cmd} {self.dama_arch} --damaCmd {self.dama_cmd} --domainsInfoFile {self.domain_list} --knownArchFile {self.knownarchs} --overlappingDomainFile {self.overlapping} --evalueCutOff {self.evalue_cutoff} --evalueCutOffConf {self.evalue_cutconf} --filterResDir {bd_dir} --tempDir {self.temp_out_path} --outputFile {out_file}'
fh.write(f'{arch_cmd}\n')
# ~/software/DAMA/Release/DAMA -domainsHitFile /home/vicedomini/projects/metaclade2/test/pippo/results/domainsHitFile.best.res -knownArchFile /home/vicedomini/projects/metaclade2/data/pfamLists/pfam32/pfam32.knownArch -domainsInfoFile /home/vicedomini/projects/metaclade2/data/pfamLists/pfam32/pfam32.domains -outputFile /home/vicedomini/projects/metaclade2/test/pippo/results/3_arch/pippo.arch.txt -overlappingDomainFile /home/vicedomini/projects/metaclade2/data/pfamLists/pfam32/pfam32.overlapping
# def createOverLappingFilterJobs(self, jobDir):
# searchResultsDir = "%s%s/%s/" % (self.RESULTS_DIR, self.DATASET_NAME, C.SEARCH_OUTPUT_DIR_NAME)
# arffFilesDir = "%s%s/%s/" % (self.RESULTS_DIR, self.DATASET_NAME, C.ARFF_RESULT_DIR_NAME)
......
......@@ -111,7 +111,7 @@ def main():
os.remove(f'{temp_dir}/{args[ACC_ID]}.ccms.domtblout')
runtime = time.time() - start_time
eprint(f'[{script_name}] runtime: {args[ACC_ID]} CCMs {runtime:.2f}')
print(f'[{script_name}] runtime: {args[ACC_ID]} CCMs {runtime:.2f}')
return 0
if __name__ == "__main__":
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment