parallelization modif

53490d34 · Djeser Kordon (edoardo Carbone) · 42e60f66 · 53490d34 · 53490d34 · 53490d34
Commit 53490d34 authored May 31, 2022 by Djeser Kordon (edoardo Carbone)
Hide whitespace changes
Inline Side-by-side

Showing with 61 additions and 48 deletions

profileview-tree profileview-tree +43 -42

generateFeatures.py scripts/generateFeatures.py +2 -0

parseHMMER.py scripts/parseHMMER.py +16 -6

No files found.
--- a/profileview-tree
+++ b/profileview-tree
@@ -175,24 +175,24 @@ check_pymodules "ete3" "numpy"
 PEXEC_CMD="parallel --halt now,fail=1 -j ${NJOBS}"
 if ! command -v parallel >/dev/null 2>&1; then
-    print_warning "cannot find GNU parallel, all jobs will be run sequentially"
+print_warning "cannot find GNU parallel, all jobs will be run sequentially"
-    PEXEC_CMD="/usr/bin/env bash --"
+PEXEC_CMD="/usr/bin/env bash --"
 fi
 # Create temp working directory
 if [ -z "${PV_TMPDIR}" ]; then
-    if [ $(check_system) = "Linux" ]; then
+if [ $(check_system) = "Linux" ]; then
-        PV_TMPDIR=$(mktemp -p "${PWD}" -d pvtmp-XXXXX) || { print_error "cannot create temporary directory"; exit 1; }
+PV_TMPDIR=$(mktemp -p "${PWD}" -d pvtmp-XXXXX) || { print_error "cannot create temporary directory"; exit 1; }
-    else
+else
-        PV_TMPDIR=$(mktemp -d "${PWD}"/pvtmp-XXXXX) || { print_error "cannot create temporary directory"; exit 1; }
+PV_TMPDIR=$(mktemp -d "${PWD}"/pvtmp-XXXXX) || { print_error "cannot create temporary directory"; exit 1; }
-    fi
+fi
 else
-    mkdir -p "${PV_TMPDIR}" || { print_error "cannot create temporary directory"; exit 1; }
+mkdir -p "${PV_TMPDIR}" || { print_error "cannot create temporary directory"; exit 1; }
 fi
 PV_TMPDIR=$(abspath "${PV_TMPDIR}")
 if [ ! -d "${PV_TMPDIR}" ] || [ -n "$(ls -A "${PV_TMPDIR}")" ]; then
-    print_error "provided path \"${PV_TMPDIR}\" must be an empty directory"
+print_error "provided path \"${PV_TMPDIR}\" must be an empty directory"
-    exit 1
+exit 1
 fi
 print_status "using temporary directory: ${PV_TMPDIR}"
@@ -202,44 +202,44 @@ touch "${PV_LOGFILE}" && print_status "a log file is saved in ${PV_LOGFILE}"
 # Possibly create a sequence descriptor file (if not provided)
 if [ -z "${PV_SEQDESC}" ]; then
-    PV_SEQDESC="${PV_TMPDIR}/sequences.csv"
+PV_SEQDESC="${PV_TMPDIR}/sequences.csv"
-    print_status "creating temporary sequence descriptor file: ${PV_SEQDESC}"
+print_status "creating temporary sequence descriptor file: ${PV_SEQDESC}"
-    awk '
+awk '
-        BEGIN {
+BEGIN {
-            sname=""
+    sname=""
-            seqlen=0
+    seqlen=0
-            OFS=","
+    OFS=","
-        }
+}
-        /^>/ {
+/^>/ {
-            if(sname!=""){ print sname,"NA","NA",seqlen }
+    if(sname!=""){ print sname,"NA","NA",seqlen }
-            sname=substr($1,2)
+    sname=substr($1,2)
-            seqlen=0
+    seqlen=0
-            next
+    next
-        }
+}
-        {
+{
-            for(i=1;i<=NF;i++){seqlen+=length($i)}
+    for(i=1;i<=NF;i++){seqlen+=length($i)}
-        }
+}
-        END {
+END {
-            if(sname!=""){ print sname,"NA","NA",seqlen }
+    if(sname!=""){ print sname,"NA","NA",seqlen }
-        }
+}
-    ' "${INPUT_FASTA}" >"${PV_SEQDESC}"
+' "${INPUT_FASTA}" >"${PV_SEQDESC}"
 else
-    cp "${PV_SEQDESC}" "${PV_TMPDIR}/sequences.csv"
+cp "${PV_SEQDESC}" "${PV_TMPDIR}/sequences.csv"
-    PV_SEQDESC="${PV_TMPDIR}/sequences.csv"
+PV_SEQDESC="${PV_TMPDIR}/sequences.csv"
 fi
 # Run hmmsearch for each model of the library against the input sequences
 print_status "running hmmsearch against input sequences"
 HMMSEARCH_RESFILE="${PV_TMPDIR}"/hmmsearch.out.gz
 for hmm in "${PV_LIBDIR}"/hmm/*.hmm; do
-    [ -e "${hmm}" ] || continue
+[ -e "${hmm}" ] || continue
-    hmmBase=${hmm##*/}
+hmmBase=${hmm##*/}
-    hmmName=${hmmBase%.hmm}
+hmmName=${hmmBase%.hmm}
-    echo "hmmsearch ${hmm} ${INPUT_FASTA} 2>>${PV_LOGFILE}"
+echo "hmmsearch ${hmm} ${INPUT_FASTA} 2>>${PV_LOGFILE}"
 done | ${PEXEC_CMD} | gzip -c >"${HMMSEARCH_RESFILE}"
 if [ $? -ne 0 ]; then
-    print_error "error during hmmsearch jobs, see log: ${PV_LOGFILE}"
+print_error "error during hmmsearch jobs, see log: ${PV_LOGFILE}"
-    exit 1
+exit 1
 fi
 print_status "processing hmmsearch output files"
@@ -257,6 +257,8 @@ PV_SEQDESC="${PV_TMPDIR}/sequences.filtered.csv"
 awk '/^#/{next} !x[$3]++{OFS=",";print $3,$6,$5,$4}' "${PV_SCOREFILE}" >"${PV_SEQDESC}" 2>/dev/null
 print_status "building representation space (using k=${PV_KBEST})"
+#for PV_KBEST in `seq 1 20 `
+#do
 python3 "${SCRIPTS_DIR}"/generateFeatures.py --seq-list "${PV_SEQDESC}" --hmm-list "${PV_LIBDIR}/${PV_LIBNAME}.models.list" --scores "${PV_SCOREFILE}" --prefix "${PV_TMPDIR}"/out -n 20 -k "${PV_KBEST}" 2>>"${PV_LOGFILE}"
 if [ $? -ne 0 ]; then
    print_error "error during feature generation, see log: ${PV_LOGFILE}"
@@ -271,10 +273,9 @@ if [ $? -ne 0 ]; then
 fi
 print_status "finding representative models and generating annotated ProfileView tree"
-python3 "${SCRIPTS_DIR}/findReprModels.py" -t "${PV_TMPDIR}/out.tree" -s "${PV_SCOREFILE}" -m "${PV_TMPDIR}/out.used_models.list" -o "${PV_OUTPREFIX}" 2>>"${PV_LOGFILE}"
+python3 "${SCRIPTS_DIR}/findReprModels.py" -t "${PV_TMPDIR}/out.tree" -s "${PV_SCOREFILE}" -m "${PV_TMPDIR}/out.used_models.list" -o "${PV_OUTPREFIX}_${PV_KBEST}" 2>>"${PV_LOGFILE}"
 if [ $? -ne 0 ]; then
    print_error "error during identification of representative models and the construction of the annotated ProfileView tree, see log: ${PV_LOGFILE}"
    exit 1
 fi
+#done
--- a/scripts/generateFeatures.py
+++ b/scripts/generateFeatures.py
@@ -133,6 +133,8 @@ def main( argv = None ):
            part = float(stats['PART'])/tot if tot > 0 else 0.0
            over = float(stats['OVER'])/tot if tot > 0 else 0.0
            if (full < .5 and part < .5) or over >= .3: # significant number of sequence-model "overlapping" hits (i.e., the sequence is likely to be incomplete)
+                print_status(f"{seq_name},{full},{part},{over}")
                partialSequences.append(seq_name)
                outFile.write("{}\t{:.4f}\t{:.4f}\t{:.4f}\n".format(seq_name,full,over,part))
        for seq_name in partialSequences:

--- a/scripts/parseHMMER.py
+++ b/scripts/parseHMMER.py
@@ -27,11 +27,13 @@ def parseHMMER(fh,hmmDict):
    hmm_name = ""
    seq_name = ""
    line = fh.readline()
+    print_status("c'est la ligne")
+    print_status(line)
    while line:
        tokens = line.split()
        if len(tokens) == 0 or line.startswith("#"):
            pass
-        elif line.startswith("Query: "):
+        elif line.startswith("Query: "): 
            hmm_name = tokens[1]
        elif line.startswith(">> "): # target sequence section
            seq_name = tokens[1]
@@ -129,7 +131,7 @@ def parseHMMER(fh,hmmDict):
 def main( argv = None ):
    # parameter definition
    parser = argparse.ArgumentParser()
    parser.add_argument('--hmmer-dict', dest='hmmerDictFile', type=str, required=True, help='HMMER dictionary .pgz file')
@@ -151,7 +153,7 @@ def main( argv = None ):
    hmmerDict = {}
    with gzip.open(args.hmmerDictFile,'rb') as f:
        hmmerDict = pickle.load(f)
    print_status("loading sequence database...")
    seqDict = {}
    with open( args.seqFile, mode='r', newline=None ) as seqFile:
@@ -160,19 +162,27 @@ def main( argv = None ):
            if line:
                seq_name, seq_fun, seq_fam, seq_len = [ x.strip() for x in line.split(',') ]
                seqDict[ seq_name ] = { 'Function':seq_fun, 'Family':seq_fam, 'Length':int(seq_len) }
    print_status("processing hmmsearch output files...")
+    #print(seqDict)
    header = ['#hmm_name','hmm_len','seq_name','seq_len','seq_family','seq_func',
            'bitscore', 'mean_score', 'mcscore','mean_mcs', 'wcscore','mean_wcs',
            'ident','hmm_cov','hit_type']
    print('\t'.join(header))
    record = []
    prev_hit = hmmer_hit()
+    print_status("ça marche")
    for hit in parseHMMER(sys.stdin,hmmerDict):
+        #print(hit.hmm_name)
+        print_status("hit dans la boucle")
+        print_status(hit.seq_name)
        if hit.seq_name not in seqDict:
+            print_status("rate")
            continue
        hmm = hmmerDict[hit.hmm_name]
+        print_status("hmm")
+        print_status( hmmerDict[hit.hmm_name])
        if prev_hit.canExtendWith(hit):
            hit = prev_hit.extendWith(hit)
        elif len(record) > 0:
@@ -212,7 +222,7 @@ def main( argv = None ):
    if len(record) > 0:
        print('\t'.join([ str(r) for r in record ]))
+    print_status("on a passe la boucle")
    return 0