parallelization modif

parent 42e60f66
......@@ -175,24 +175,24 @@ check_pymodules "ete3" "numpy"
PEXEC_CMD="parallel --halt now,fail=1 -j ${NJOBS}"
if ! command -v parallel >/dev/null 2>&1; then
print_warning "cannot find GNU parallel, all jobs will be run sequentially"
PEXEC_CMD="/usr/bin/env bash --"
print_warning "cannot find GNU parallel, all jobs will be run sequentially"
PEXEC_CMD="/usr/bin/env bash --"
fi
# Create temp working directory
if [ -z "${PV_TMPDIR}" ]; then
if [ $(check_system) = "Linux" ]; then
PV_TMPDIR=$(mktemp -p "${PWD}" -d pvtmp-XXXXX) || { print_error "cannot create temporary directory"; exit 1; }
else
PV_TMPDIR=$(mktemp -d "${PWD}"/pvtmp-XXXXX) || { print_error "cannot create temporary directory"; exit 1; }
fi
if [ $(check_system) = "Linux" ]; then
PV_TMPDIR=$(mktemp -p "${PWD}" -d pvtmp-XXXXX) || { print_error "cannot create temporary directory"; exit 1; }
else
PV_TMPDIR=$(mktemp -d "${PWD}"/pvtmp-XXXXX) || { print_error "cannot create temporary directory"; exit 1; }
fi
else
mkdir -p "${PV_TMPDIR}" || { print_error "cannot create temporary directory"; exit 1; }
mkdir -p "${PV_TMPDIR}" || { print_error "cannot create temporary directory"; exit 1; }
fi
PV_TMPDIR=$(abspath "${PV_TMPDIR}")
if [ ! -d "${PV_TMPDIR}" ] || [ -n "$(ls -A "${PV_TMPDIR}")" ]; then
print_error "provided path \"${PV_TMPDIR}\" must be an empty directory"
exit 1
print_error "provided path \"${PV_TMPDIR}\" must be an empty directory"
exit 1
fi
print_status "using temporary directory: ${PV_TMPDIR}"
......@@ -202,44 +202,44 @@ touch "${PV_LOGFILE}" && print_status "a log file is saved in ${PV_LOGFILE}"
# Possibly create a sequence descriptor file (if not provided)
if [ -z "${PV_SEQDESC}" ]; then
PV_SEQDESC="${PV_TMPDIR}/sequences.csv"
print_status "creating temporary sequence descriptor file: ${PV_SEQDESC}"
awk '
BEGIN {
sname=""
seqlen=0
OFS=","
}
/^>/ {
if(sname!=""){ print sname,"NA","NA",seqlen }
sname=substr($1,2)
seqlen=0
next
}
{
for(i=1;i<=NF;i++){seqlen+=length($i)}
}
END {
if(sname!=""){ print sname,"NA","NA",seqlen }
}
' "${INPUT_FASTA}" >"${PV_SEQDESC}"
PV_SEQDESC="${PV_TMPDIR}/sequences.csv"
print_status "creating temporary sequence descriptor file: ${PV_SEQDESC}"
awk '
BEGIN {
sname=""
seqlen=0
OFS=","
}
/^>/ {
if(sname!=""){ print sname,"NA","NA",seqlen }
sname=substr($1,2)
seqlen=0
next
}
{
for(i=1;i<=NF;i++){seqlen+=length($i)}
}
END {
if(sname!=""){ print sname,"NA","NA",seqlen }
}
' "${INPUT_FASTA}" >"${PV_SEQDESC}"
else
cp "${PV_SEQDESC}" "${PV_TMPDIR}/sequences.csv"
PV_SEQDESC="${PV_TMPDIR}/sequences.csv"
cp "${PV_SEQDESC}" "${PV_TMPDIR}/sequences.csv"
PV_SEQDESC="${PV_TMPDIR}/sequences.csv"
fi
# Run hmmsearch for each model of the library against the input sequences
print_status "running hmmsearch against input sequences"
HMMSEARCH_RESFILE="${PV_TMPDIR}"/hmmsearch.out.gz
for hmm in "${PV_LIBDIR}"/hmm/*.hmm; do
[ -e "${hmm}" ] || continue
hmmBase=${hmm##*/}
hmmName=${hmmBase%.hmm}
echo "hmmsearch ${hmm} ${INPUT_FASTA} 2>>${PV_LOGFILE}"
[ -e "${hmm}" ] || continue
hmmBase=${hmm##*/}
hmmName=${hmmBase%.hmm}
echo "hmmsearch ${hmm} ${INPUT_FASTA} 2>>${PV_LOGFILE}"
done | ${PEXEC_CMD} | gzip -c >"${HMMSEARCH_RESFILE}"
if [ $? -ne 0 ]; then
print_error "error during hmmsearch jobs, see log: ${PV_LOGFILE}"
exit 1
print_error "error during hmmsearch jobs, see log: ${PV_LOGFILE}"
exit 1
fi
print_status "processing hmmsearch output files"
......@@ -257,6 +257,8 @@ PV_SEQDESC="${PV_TMPDIR}/sequences.filtered.csv"
awk '/^#/{next} !x[$3]++{OFS=",";print $3,$6,$5,$4}' "${PV_SCOREFILE}" >"${PV_SEQDESC}" 2>/dev/null
print_status "building representation space (using k=${PV_KBEST})"
#for PV_KBEST in `seq 1 20 `
#do
python3 "${SCRIPTS_DIR}"/generateFeatures.py --seq-list "${PV_SEQDESC}" --hmm-list "${PV_LIBDIR}/${PV_LIBNAME}.models.list" --scores "${PV_SCOREFILE}" --prefix "${PV_TMPDIR}"/out -n 20 -k "${PV_KBEST}" 2>>"${PV_LOGFILE}"
if [ $? -ne 0 ]; then
print_error "error during feature generation, see log: ${PV_LOGFILE}"
......@@ -271,10 +273,9 @@ if [ $? -ne 0 ]; then
fi
print_status "finding representative models and generating annotated ProfileView tree"
python3 "${SCRIPTS_DIR}/findReprModels.py" -t "${PV_TMPDIR}/out.tree" -s "${PV_SCOREFILE}" -m "${PV_TMPDIR}/out.used_models.list" -o "${PV_OUTPREFIX}" 2>>"${PV_LOGFILE}"
python3 "${SCRIPTS_DIR}/findReprModels.py" -t "${PV_TMPDIR}/out.tree" -s "${PV_SCOREFILE}" -m "${PV_TMPDIR}/out.used_models.list" -o "${PV_OUTPREFIX}_${PV_KBEST}" 2>>"${PV_LOGFILE}"
if [ $? -ne 0 ]; then
print_error "error during identification of representative models and the construction of the annotated ProfileView tree, see log: ${PV_LOGFILE}"
exit 1
fi
#done
......@@ -133,6 +133,8 @@ def main( argv = None ):
part = float(stats['PART'])/tot if tot > 0 else 0.0
over = float(stats['OVER'])/tot if tot > 0 else 0.0
if (full < .5 and part < .5) or over >= .3: # significant number of sequence-model "overlapping" hits (i.e., the sequence is likely to be incomplete)
print_status(f"{seq_name},{full},{part},{over}")
partialSequences.append(seq_name)
outFile.write("{}\t{:.4f}\t{:.4f}\t{:.4f}\n".format(seq_name,full,over,part))
for seq_name in partialSequences:
......
......@@ -27,11 +27,13 @@ def parseHMMER(fh,hmmDict):
hmm_name = ""
seq_name = ""
line = fh.readline()
print_status("c'est la ligne")
print_status(line)
while line:
tokens = line.split()
if len(tokens) == 0 or line.startswith("#"):
pass
elif line.startswith("Query: "):
elif line.startswith("Query: "):
hmm_name = tokens[1]
elif line.startswith(">> "): # target sequence section
seq_name = tokens[1]
......@@ -129,7 +131,7 @@ def parseHMMER(fh,hmmDict):
def main( argv = None ):
# parameter definition
parser = argparse.ArgumentParser()
parser.add_argument('--hmmer-dict', dest='hmmerDictFile', type=str, required=True, help='HMMER dictionary .pgz file')
......@@ -151,7 +153,7 @@ def main( argv = None ):
hmmerDict = {}
with gzip.open(args.hmmerDictFile,'rb') as f:
hmmerDict = pickle.load(f)
print_status("loading sequence database...")
seqDict = {}
with open( args.seqFile, mode='r', newline=None ) as seqFile:
......@@ -160,19 +162,27 @@ def main( argv = None ):
if line:
seq_name, seq_fun, seq_fam, seq_len = [ x.strip() for x in line.split(',') ]
seqDict[ seq_name ] = { 'Function':seq_fun, 'Family':seq_fam, 'Length':int(seq_len) }
print_status("processing hmmsearch output files...")
#print(seqDict)
header = ['#hmm_name','hmm_len','seq_name','seq_len','seq_family','seq_func',
'bitscore', 'mean_score', 'mcscore','mean_mcs', 'wcscore','mean_wcs',
'ident','hmm_cov','hit_type']
print('\t'.join(header))
record = []
prev_hit = hmmer_hit()
print_status("ça marche")
for hit in parseHMMER(sys.stdin,hmmerDict):
#print(hit.hmm_name)
print_status("hit dans la boucle")
print_status(hit.seq_name)
if hit.seq_name not in seqDict:
print_status("rate")
continue
hmm = hmmerDict[hit.hmm_name]
print_status("hmm")
print_status( hmmerDict[hit.hmm_name])
if prev_hit.canExtendWith(hit):
hit = prev_hit.extendWith(hit)
elif len(record) > 0:
......@@ -212,7 +222,7 @@ def main( argv = None ):
if len(record) > 0:
print('\t'.join([ str(r) for r in record ]))
print_status("on a passe la boucle")
return 0
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment