parallelization modif

parent 42e60f66
...@@ -175,24 +175,24 @@ check_pymodules "ete3" "numpy" ...@@ -175,24 +175,24 @@ check_pymodules "ete3" "numpy"
PEXEC_CMD="parallel --halt now,fail=1 -j ${NJOBS}" PEXEC_CMD="parallel --halt now,fail=1 -j ${NJOBS}"
if ! command -v parallel >/dev/null 2>&1; then if ! command -v parallel >/dev/null 2>&1; then
print_warning "cannot find GNU parallel, all jobs will be run sequentially" print_warning "cannot find GNU parallel, all jobs will be run sequentially"
PEXEC_CMD="/usr/bin/env bash --" PEXEC_CMD="/usr/bin/env bash --"
fi fi
# Create temp working directory # Create temp working directory
if [ -z "${PV_TMPDIR}" ]; then if [ -z "${PV_TMPDIR}" ]; then
if [ $(check_system) = "Linux" ]; then if [ $(check_system) = "Linux" ]; then
PV_TMPDIR=$(mktemp -p "${PWD}" -d pvtmp-XXXXX) || { print_error "cannot create temporary directory"; exit 1; } PV_TMPDIR=$(mktemp -p "${PWD}" -d pvtmp-XXXXX) || { print_error "cannot create temporary directory"; exit 1; }
else else
PV_TMPDIR=$(mktemp -d "${PWD}"/pvtmp-XXXXX) || { print_error "cannot create temporary directory"; exit 1; } PV_TMPDIR=$(mktemp -d "${PWD}"/pvtmp-XXXXX) || { print_error "cannot create temporary directory"; exit 1; }
fi fi
else else
mkdir -p "${PV_TMPDIR}" || { print_error "cannot create temporary directory"; exit 1; } mkdir -p "${PV_TMPDIR}" || { print_error "cannot create temporary directory"; exit 1; }
fi fi
PV_TMPDIR=$(abspath "${PV_TMPDIR}") PV_TMPDIR=$(abspath "${PV_TMPDIR}")
if [ ! -d "${PV_TMPDIR}" ] || [ -n "$(ls -A "${PV_TMPDIR}")" ]; then if [ ! -d "${PV_TMPDIR}" ] || [ -n "$(ls -A "${PV_TMPDIR}")" ]; then
print_error "provided path \"${PV_TMPDIR}\" must be an empty directory" print_error "provided path \"${PV_TMPDIR}\" must be an empty directory"
exit 1 exit 1
fi fi
print_status "using temporary directory: ${PV_TMPDIR}" print_status "using temporary directory: ${PV_TMPDIR}"
...@@ -202,44 +202,44 @@ touch "${PV_LOGFILE}" && print_status "a log file is saved in ${PV_LOGFILE}" ...@@ -202,44 +202,44 @@ touch "${PV_LOGFILE}" && print_status "a log file is saved in ${PV_LOGFILE}"
# Possibly create a sequence descriptor file (if not provided) # Possibly create a sequence descriptor file (if not provided)
if [ -z "${PV_SEQDESC}" ]; then if [ -z "${PV_SEQDESC}" ]; then
PV_SEQDESC="${PV_TMPDIR}/sequences.csv" PV_SEQDESC="${PV_TMPDIR}/sequences.csv"
print_status "creating temporary sequence descriptor file: ${PV_SEQDESC}" print_status "creating temporary sequence descriptor file: ${PV_SEQDESC}"
awk ' awk '
BEGIN { BEGIN {
sname="" sname=""
seqlen=0 seqlen=0
OFS="," OFS=","
} }
/^>/ { /^>/ {
if(sname!=""){ print sname,"NA","NA",seqlen } if(sname!=""){ print sname,"NA","NA",seqlen }
sname=substr($1,2) sname=substr($1,2)
seqlen=0 seqlen=0
next next
} }
{ {
for(i=1;i<=NF;i++){seqlen+=length($i)} for(i=1;i<=NF;i++){seqlen+=length($i)}
} }
END { END {
if(sname!=""){ print sname,"NA","NA",seqlen } if(sname!=""){ print sname,"NA","NA",seqlen }
} }
' "${INPUT_FASTA}" >"${PV_SEQDESC}" ' "${INPUT_FASTA}" >"${PV_SEQDESC}"
else else
cp "${PV_SEQDESC}" "${PV_TMPDIR}/sequences.csv" cp "${PV_SEQDESC}" "${PV_TMPDIR}/sequences.csv"
PV_SEQDESC="${PV_TMPDIR}/sequences.csv" PV_SEQDESC="${PV_TMPDIR}/sequences.csv"
fi fi
# Run hmmsearch for each model of the library against the input sequences # Run hmmsearch for each model of the library against the input sequences
print_status "running hmmsearch against input sequences" print_status "running hmmsearch against input sequences"
HMMSEARCH_RESFILE="${PV_TMPDIR}"/hmmsearch.out.gz HMMSEARCH_RESFILE="${PV_TMPDIR}"/hmmsearch.out.gz
for hmm in "${PV_LIBDIR}"/hmm/*.hmm; do for hmm in "${PV_LIBDIR}"/hmm/*.hmm; do
[ -e "${hmm}" ] || continue [ -e "${hmm}" ] || continue
hmmBase=${hmm##*/} hmmBase=${hmm##*/}
hmmName=${hmmBase%.hmm} hmmName=${hmmBase%.hmm}
echo "hmmsearch ${hmm} ${INPUT_FASTA} 2>>${PV_LOGFILE}" echo "hmmsearch ${hmm} ${INPUT_FASTA} 2>>${PV_LOGFILE}"
done | ${PEXEC_CMD} | gzip -c >"${HMMSEARCH_RESFILE}" done | ${PEXEC_CMD} | gzip -c >"${HMMSEARCH_RESFILE}"
if [ $? -ne 0 ]; then if [ $? -ne 0 ]; then
print_error "error during hmmsearch jobs, see log: ${PV_LOGFILE}" print_error "error during hmmsearch jobs, see log: ${PV_LOGFILE}"
exit 1 exit 1
fi fi
print_status "processing hmmsearch output files" print_status "processing hmmsearch output files"
...@@ -257,6 +257,8 @@ PV_SEQDESC="${PV_TMPDIR}/sequences.filtered.csv" ...@@ -257,6 +257,8 @@ PV_SEQDESC="${PV_TMPDIR}/sequences.filtered.csv"
awk '/^#/{next} !x[$3]++{OFS=",";print $3,$6,$5,$4}' "${PV_SCOREFILE}" >"${PV_SEQDESC}" 2>/dev/null awk '/^#/{next} !x[$3]++{OFS=",";print $3,$6,$5,$4}' "${PV_SCOREFILE}" >"${PV_SEQDESC}" 2>/dev/null
print_status "building representation space (using k=${PV_KBEST})" print_status "building representation space (using k=${PV_KBEST})"
#for PV_KBEST in `seq 1 20 `
#do
python3 "${SCRIPTS_DIR}"/generateFeatures.py --seq-list "${PV_SEQDESC}" --hmm-list "${PV_LIBDIR}/${PV_LIBNAME}.models.list" --scores "${PV_SCOREFILE}" --prefix "${PV_TMPDIR}"/out -n 20 -k "${PV_KBEST}" 2>>"${PV_LOGFILE}" python3 "${SCRIPTS_DIR}"/generateFeatures.py --seq-list "${PV_SEQDESC}" --hmm-list "${PV_LIBDIR}/${PV_LIBNAME}.models.list" --scores "${PV_SCOREFILE}" --prefix "${PV_TMPDIR}"/out -n 20 -k "${PV_KBEST}" 2>>"${PV_LOGFILE}"
if [ $? -ne 0 ]; then if [ $? -ne 0 ]; then
print_error "error during feature generation, see log: ${PV_LOGFILE}" print_error "error during feature generation, see log: ${PV_LOGFILE}"
...@@ -271,10 +273,9 @@ if [ $? -ne 0 ]; then ...@@ -271,10 +273,9 @@ if [ $? -ne 0 ]; then
fi fi
print_status "finding representative models and generating annotated ProfileView tree" print_status "finding representative models and generating annotated ProfileView tree"
python3 "${SCRIPTS_DIR}/findReprModels.py" -t "${PV_TMPDIR}/out.tree" -s "${PV_SCOREFILE}" -m "${PV_TMPDIR}/out.used_models.list" -o "${PV_OUTPREFIX}" 2>>"${PV_LOGFILE}" python3 "${SCRIPTS_DIR}/findReprModels.py" -t "${PV_TMPDIR}/out.tree" -s "${PV_SCOREFILE}" -m "${PV_TMPDIR}/out.used_models.list" -o "${PV_OUTPREFIX}_${PV_KBEST}" 2>>"${PV_LOGFILE}"
if [ $? -ne 0 ]; then if [ $? -ne 0 ]; then
print_error "error during identification of representative models and the construction of the annotated ProfileView tree, see log: ${PV_LOGFILE}" print_error "error during identification of representative models and the construction of the annotated ProfileView tree, see log: ${PV_LOGFILE}"
exit 1 exit 1
fi fi
#done
...@@ -133,6 +133,8 @@ def main( argv = None ): ...@@ -133,6 +133,8 @@ def main( argv = None ):
part = float(stats['PART'])/tot if tot > 0 else 0.0 part = float(stats['PART'])/tot if tot > 0 else 0.0
over = float(stats['OVER'])/tot if tot > 0 else 0.0 over = float(stats['OVER'])/tot if tot > 0 else 0.0
if (full < .5 and part < .5) or over >= .3: # significant number of sequence-model "overlapping" hits (i.e., the sequence is likely to be incomplete) if (full < .5 and part < .5) or over >= .3: # significant number of sequence-model "overlapping" hits (i.e., the sequence is likely to be incomplete)
print_status(f"{seq_name},{full},{part},{over}")
partialSequences.append(seq_name) partialSequences.append(seq_name)
outFile.write("{}\t{:.4f}\t{:.4f}\t{:.4f}\n".format(seq_name,full,over,part)) outFile.write("{}\t{:.4f}\t{:.4f}\t{:.4f}\n".format(seq_name,full,over,part))
for seq_name in partialSequences: for seq_name in partialSequences:
......
...@@ -27,11 +27,13 @@ def parseHMMER(fh,hmmDict): ...@@ -27,11 +27,13 @@ def parseHMMER(fh,hmmDict):
hmm_name = "" hmm_name = ""
seq_name = "" seq_name = ""
line = fh.readline() line = fh.readline()
print_status("c'est la ligne")
print_status(line)
while line: while line:
tokens = line.split() tokens = line.split()
if len(tokens) == 0 or line.startswith("#"): if len(tokens) == 0 or line.startswith("#"):
pass pass
elif line.startswith("Query: "): elif line.startswith("Query: "):
hmm_name = tokens[1] hmm_name = tokens[1]
elif line.startswith(">> "): # target sequence section elif line.startswith(">> "): # target sequence section
seq_name = tokens[1] seq_name = tokens[1]
...@@ -129,7 +131,7 @@ def parseHMMER(fh,hmmDict): ...@@ -129,7 +131,7 @@ def parseHMMER(fh,hmmDict):
def main( argv = None ): def main( argv = None ):
# parameter definition # parameter definition
parser = argparse.ArgumentParser() parser = argparse.ArgumentParser()
parser.add_argument('--hmmer-dict', dest='hmmerDictFile', type=str, required=True, help='HMMER dictionary .pgz file') parser.add_argument('--hmmer-dict', dest='hmmerDictFile', type=str, required=True, help='HMMER dictionary .pgz file')
...@@ -151,7 +153,7 @@ def main( argv = None ): ...@@ -151,7 +153,7 @@ def main( argv = None ):
hmmerDict = {} hmmerDict = {}
with gzip.open(args.hmmerDictFile,'rb') as f: with gzip.open(args.hmmerDictFile,'rb') as f:
hmmerDict = pickle.load(f) hmmerDict = pickle.load(f)
print_status("loading sequence database...") print_status("loading sequence database...")
seqDict = {} seqDict = {}
with open( args.seqFile, mode='r', newline=None ) as seqFile: with open( args.seqFile, mode='r', newline=None ) as seqFile:
...@@ -160,19 +162,27 @@ def main( argv = None ): ...@@ -160,19 +162,27 @@ def main( argv = None ):
if line: if line:
seq_name, seq_fun, seq_fam, seq_len = [ x.strip() for x in line.split(',') ] seq_name, seq_fun, seq_fam, seq_len = [ x.strip() for x in line.split(',') ]
seqDict[ seq_name ] = { 'Function':seq_fun, 'Family':seq_fam, 'Length':int(seq_len) } seqDict[ seq_name ] = { 'Function':seq_fun, 'Family':seq_fam, 'Length':int(seq_len) }
print_status("processing hmmsearch output files...") print_status("processing hmmsearch output files...")
#print(seqDict)
header = ['#hmm_name','hmm_len','seq_name','seq_len','seq_family','seq_func', header = ['#hmm_name','hmm_len','seq_name','seq_len','seq_family','seq_func',
'bitscore', 'mean_score', 'mcscore','mean_mcs', 'wcscore','mean_wcs', 'bitscore', 'mean_score', 'mcscore','mean_mcs', 'wcscore','mean_wcs',
'ident','hmm_cov','hit_type'] 'ident','hmm_cov','hit_type']
print('\t'.join(header)) print('\t'.join(header))
record = [] record = []
prev_hit = hmmer_hit() prev_hit = hmmer_hit()
print_status("ça marche")
for hit in parseHMMER(sys.stdin,hmmerDict): for hit in parseHMMER(sys.stdin,hmmerDict):
#print(hit.hmm_name)
print_status("hit dans la boucle")
print_status(hit.seq_name)
if hit.seq_name not in seqDict: if hit.seq_name not in seqDict:
print_status("rate")
continue continue
hmm = hmmerDict[hit.hmm_name] hmm = hmmerDict[hit.hmm_name]
print_status("hmm")
print_status( hmmerDict[hit.hmm_name])
if prev_hit.canExtendWith(hit): if prev_hit.canExtendWith(hit):
hit = prev_hit.extendWith(hit) hit = prev_hit.extendWith(hit)
elif len(record) > 0: elif len(record) > 0:
...@@ -212,7 +222,7 @@ def main( argv = None ): ...@@ -212,7 +222,7 @@ def main( argv = None ):
if len(record) > 0: if len(record) > 0:
print('\t'.join([ str(r) for r in record ])) print('\t'.join([ str(r) for r in record ]))
print_status("on a passe la boucle")
return 0 return 0
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment