Skip to content
Projects
Groups
Snippets
Help
This project
Loading...
Sign in / Register
Toggle navigation
P
profileview
Overview
Overview
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
Edoardo SARTI
profileview
Commits
53490d34
Commit
53490d34
authored
May 31, 2022
by
Djeser Kordon (edoardo Carbone)
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
parallelization modif
parent
42e60f66
Hide whitespace changes
Inline
Side-by-side
Showing
3 changed files
with
61 additions
and
48 deletions
+61
-48
profileview-tree
profileview-tree
+43
-42
generateFeatures.py
scripts/generateFeatures.py
+2
-0
parseHMMER.py
scripts/parseHMMER.py
+16
-6
No files found.
profileview-tree
View file @
53490d34
...
@@ -175,24 +175,24 @@ check_pymodules "ete3" "numpy"
...
@@ -175,24 +175,24 @@ check_pymodules "ete3" "numpy"
PEXEC_CMD
=
"parallel --halt now,fail=1 -j
${
NJOBS
}
"
PEXEC_CMD
=
"parallel --halt now,fail=1 -j
${
NJOBS
}
"
if
!
command
-v
parallel
>
/dev/null 2>&1
;
then
if
!
command
-v
parallel
>
/dev/null 2>&1
;
then
print_warning
"cannot find GNU parallel, all jobs will be run sequentially"
print_warning
"cannot find GNU parallel, all jobs will be run sequentially"
PEXEC_CMD
=
"/usr/bin/env bash --"
PEXEC_CMD
=
"/usr/bin/env bash --"
fi
fi
# Create temp working directory
# Create temp working directory
if
[
-z
"
${
PV_TMPDIR
}
"
]
;
then
if
[
-z
"
${
PV_TMPDIR
}
"
]
;
then
if
[
$(
check_system
)
=
"Linux"
]
;
then
if
[
$(
check_system
)
=
"Linux"
]
;
then
PV_TMPDIR
=
$(
mktemp
-p
"
${
PWD
}
"
-d
pvtmp-XXXXX
)
||
{
print_error
"cannot create temporary directory"
;
exit
1
;
}
PV_TMPDIR
=
$(
mktemp
-p
"
${
PWD
}
"
-d
pvtmp-XXXXX
)
||
{
print_error
"cannot create temporary directory"
;
exit
1
;
}
else
else
PV_TMPDIR
=
$(
mktemp
-d
"
${
PWD
}
"
/pvtmp-XXXXX
)
||
{
print_error
"cannot create temporary directory"
;
exit
1
;
}
PV_TMPDIR
=
$(
mktemp
-d
"
${
PWD
}
"
/pvtmp-XXXXX
)
||
{
print_error
"cannot create temporary directory"
;
exit
1
;
}
fi
fi
else
else
mkdir
-p
"
${
PV_TMPDIR
}
"
||
{
print_error
"cannot create temporary directory"
;
exit
1
;
}
mkdir
-p
"
${
PV_TMPDIR
}
"
||
{
print_error
"cannot create temporary directory"
;
exit
1
;
}
fi
fi
PV_TMPDIR
=
$(
abspath
"
${
PV_TMPDIR
}
"
)
PV_TMPDIR
=
$(
abspath
"
${
PV_TMPDIR
}
"
)
if
[
!
-d
"
${
PV_TMPDIR
}
"
]
||
[
-n
"
$(
ls
-A
"
${
PV_TMPDIR
}
"
)
"
]
;
then
if
[
!
-d
"
${
PV_TMPDIR
}
"
]
||
[
-n
"
$(
ls
-A
"
${
PV_TMPDIR
}
"
)
"
]
;
then
print_error
"provided path
\"
${
PV_TMPDIR
}
\"
must be an empty directory"
print_error
"provided path
\"
${
PV_TMPDIR
}
\"
must be an empty directory"
exit
1
exit
1
fi
fi
print_status
"using temporary directory:
${
PV_TMPDIR
}
"
print_status
"using temporary directory:
${
PV_TMPDIR
}
"
...
@@ -202,44 +202,44 @@ touch "${PV_LOGFILE}" && print_status "a log file is saved in ${PV_LOGFILE}"
...
@@ -202,44 +202,44 @@ touch "${PV_LOGFILE}" && print_status "a log file is saved in ${PV_LOGFILE}"
# Possibly create a sequence descriptor file (if not provided)
# Possibly create a sequence descriptor file (if not provided)
if
[
-z
"
${
PV_SEQDESC
}
"
]
;
then
if
[
-z
"
${
PV_SEQDESC
}
"
]
;
then
PV_SEQDESC
=
"
${
PV_TMPDIR
}
/sequences.csv"
PV_SEQDESC
=
"
${
PV_TMPDIR
}
/sequences.csv"
print_status
"creating temporary sequence descriptor file:
${
PV_SEQDESC
}
"
print_status
"creating temporary sequence descriptor file:
${
PV_SEQDESC
}
"
awk
'
awk
'
BEGIN {
BEGIN {
sname=""
sname=""
seqlen=0
seqlen=0
OFS=","
OFS=","
}
}
/^>/ {
/^>/ {
if(sname!=""){ print sname,"NA","NA",seqlen }
if(sname!=""){ print sname,"NA","NA",seqlen }
sname=substr($1,2)
sname=substr($1,2)
seqlen=0
seqlen=0
next
next
}
}
{
{
for(i=1;i<=NF;i++){seqlen+=length($i)}
for(i=1;i<=NF;i++){seqlen+=length($i)}
}
}
END {
END {
if(sname!=""){ print sname,"NA","NA",seqlen }
if(sname!=""){ print sname,"NA","NA",seqlen }
}
}
'
"
${
INPUT_FASTA
}
"
>
"
${
PV_SEQDESC
}
"
'
"
${
INPUT_FASTA
}
"
>
"
${
PV_SEQDESC
}
"
else
else
cp
"
${
PV_SEQDESC
}
"
"
${
PV_TMPDIR
}
/sequences.csv"
cp
"
${
PV_SEQDESC
}
"
"
${
PV_TMPDIR
}
/sequences.csv"
PV_SEQDESC
=
"
${
PV_TMPDIR
}
/sequences.csv"
PV_SEQDESC
=
"
${
PV_TMPDIR
}
/sequences.csv"
fi
fi
# Run hmmsearch for each model of the library against the input sequences
# Run hmmsearch for each model of the library against the input sequences
print_status
"running hmmsearch against input sequences"
print_status
"running hmmsearch against input sequences"
HMMSEARCH_RESFILE
=
"
${
PV_TMPDIR
}
"
/hmmsearch.out.gz
HMMSEARCH_RESFILE
=
"
${
PV_TMPDIR
}
"
/hmmsearch.out.gz
for
hmm
in
"
${
PV_LIBDIR
}
"
/hmm/
*
.hmm
;
do
for
hmm
in
"
${
PV_LIBDIR
}
"
/hmm/
*
.hmm
;
do
[
-e
"
${
hmm
}
"
]
||
continue
[
-e
"
${
hmm
}
"
]
||
continue
hmmBase
=
${
hmm
##*/
}
hmmBase
=
${
hmm
##*/
}
hmmName
=
${
hmmBase
%.hmm
}
hmmName
=
${
hmmBase
%.hmm
}
echo
"hmmsearch
${
hmm
}
${
INPUT_FASTA
}
2>>
${
PV_LOGFILE
}
"
echo
"hmmsearch
${
hmm
}
${
INPUT_FASTA
}
2>>
${
PV_LOGFILE
}
"
done
|
${
PEXEC_CMD
}
| gzip
-c
>
"
${
HMMSEARCH_RESFILE
}
"
done
|
${
PEXEC_CMD
}
| gzip
-c
>
"
${
HMMSEARCH_RESFILE
}
"
if
[
$?
-ne
0
]
;
then
if
[
$?
-ne
0
]
;
then
print_error
"error during hmmsearch jobs, see log:
${
PV_LOGFILE
}
"
print_error
"error during hmmsearch jobs, see log:
${
PV_LOGFILE
}
"
exit
1
exit
1
fi
fi
print_status
"processing hmmsearch output files"
print_status
"processing hmmsearch output files"
...
@@ -257,6 +257,8 @@ PV_SEQDESC="${PV_TMPDIR}/sequences.filtered.csv"
...
@@ -257,6 +257,8 @@ PV_SEQDESC="${PV_TMPDIR}/sequences.filtered.csv"
awk
'/^#/{next} !x[$3]++{OFS=",";print $3,$6,$5,$4}'
"
${
PV_SCOREFILE
}
"
>
"
${
PV_SEQDESC
}
"
2>/dev/null
awk
'/^#/{next} !x[$3]++{OFS=",";print $3,$6,$5,$4}'
"
${
PV_SCOREFILE
}
"
>
"
${
PV_SEQDESC
}
"
2>/dev/null
print_status
"building representation space (using k=
${
PV_KBEST
}
)"
print_status
"building representation space (using k=
${
PV_KBEST
}
)"
#for PV_KBEST in `seq 1 20 `
#do
python3
"
${
SCRIPTS_DIR
}
"
/generateFeatures.py
--seq-list
"
${
PV_SEQDESC
}
"
--hmm-list
"
${
PV_LIBDIR
}
/
${
PV_LIBNAME
}
.models.list"
--scores
"
${
PV_SCOREFILE
}
"
--prefix
"
${
PV_TMPDIR
}
"
/out
-n
20
-k
"
${
PV_KBEST
}
"
2>>
"
${
PV_LOGFILE
}
"
python3
"
${
SCRIPTS_DIR
}
"
/generateFeatures.py
--seq-list
"
${
PV_SEQDESC
}
"
--hmm-list
"
${
PV_LIBDIR
}
/
${
PV_LIBNAME
}
.models.list"
--scores
"
${
PV_SCOREFILE
}
"
--prefix
"
${
PV_TMPDIR
}
"
/out
-n
20
-k
"
${
PV_KBEST
}
"
2>>
"
${
PV_LOGFILE
}
"
if
[
$?
-ne
0
]
;
then
if
[
$?
-ne
0
]
;
then
print_error
"error during feature generation, see log:
${
PV_LOGFILE
}
"
print_error
"error during feature generation, see log:
${
PV_LOGFILE
}
"
...
@@ -271,10 +273,9 @@ if [ $? -ne 0 ]; then
...
@@ -271,10 +273,9 @@ if [ $? -ne 0 ]; then
fi
fi
print_status
"finding representative models and generating annotated ProfileView tree"
print_status
"finding representative models and generating annotated ProfileView tree"
python3
"
${
SCRIPTS_DIR
}
/findReprModels.py"
-t
"
${
PV_TMPDIR
}
/out.tree"
-s
"
${
PV_SCOREFILE
}
"
-m
"
${
PV_TMPDIR
}
/out.used_models.list"
-o
"
${
PV_OUTPREFIX
}
"
2>>
"
${
PV_LOGFILE
}
"
python3
"
${
SCRIPTS_DIR
}
/findReprModels.py"
-t
"
${
PV_TMPDIR
}
/out.tree"
-s
"
${
PV_SCOREFILE
}
"
-m
"
${
PV_TMPDIR
}
/out.used_models.list"
-o
"
${
PV_OUTPREFIX
}
_
${
PV_KBEST
}
"
2>>
"
${
PV_LOGFILE
}
"
if
[
$?
-ne
0
]
;
then
if
[
$?
-ne
0
]
;
then
print_error
"error during identification of representative models and the construction of the annotated ProfileView tree, see log:
${
PV_LOGFILE
}
"
print_error
"error during identification of representative models and the construction of the annotated ProfileView tree, see log:
${
PV_LOGFILE
}
"
exit
1
exit
1
fi
fi
#done
scripts/generateFeatures.py
View file @
53490d34
...
@@ -133,6 +133,8 @@ def main( argv = None ):
...
@@ -133,6 +133,8 @@ def main( argv = None ):
part
=
float
(
stats
[
'PART'
])
/
tot
if
tot
>
0
else
0.0
part
=
float
(
stats
[
'PART'
])
/
tot
if
tot
>
0
else
0.0
over
=
float
(
stats
[
'OVER'
])
/
tot
if
tot
>
0
else
0.0
over
=
float
(
stats
[
'OVER'
])
/
tot
if
tot
>
0
else
0.0
if
(
full
<
.
5
and
part
<
.
5
)
or
over
>=
.
3
:
# significant number of sequence-model "overlapping" hits (i.e., the sequence is likely to be incomplete)
if
(
full
<
.
5
and
part
<
.
5
)
or
over
>=
.
3
:
# significant number of sequence-model "overlapping" hits (i.e., the sequence is likely to be incomplete)
print_status
(
f
"{seq_name},{full},{part},{over}"
)
partialSequences
.
append
(
seq_name
)
partialSequences
.
append
(
seq_name
)
outFile
.
write
(
"{}
\t
{:.4f}
\t
{:.4f}
\t
{:.4f}
\n
"
.
format
(
seq_name
,
full
,
over
,
part
))
outFile
.
write
(
"{}
\t
{:.4f}
\t
{:.4f}
\t
{:.4f}
\n
"
.
format
(
seq_name
,
full
,
over
,
part
))
for
seq_name
in
partialSequences
:
for
seq_name
in
partialSequences
:
...
...
scripts/parseHMMER.py
View file @
53490d34
...
@@ -27,11 +27,13 @@ def parseHMMER(fh,hmmDict):
...
@@ -27,11 +27,13 @@ def parseHMMER(fh,hmmDict):
hmm_name
=
""
hmm_name
=
""
seq_name
=
""
seq_name
=
""
line
=
fh
.
readline
()
line
=
fh
.
readline
()
print_status
(
"c'est la ligne"
)
print_status
(
line
)
while
line
:
while
line
:
tokens
=
line
.
split
()
tokens
=
line
.
split
()
if
len
(
tokens
)
==
0
or
line
.
startswith
(
"#"
):
if
len
(
tokens
)
==
0
or
line
.
startswith
(
"#"
):
pass
pass
elif
line
.
startswith
(
"Query: "
):
elif
line
.
startswith
(
"Query: "
):
hmm_name
=
tokens
[
1
]
hmm_name
=
tokens
[
1
]
elif
line
.
startswith
(
">> "
):
# target sequence section
elif
line
.
startswith
(
">> "
):
# target sequence section
seq_name
=
tokens
[
1
]
seq_name
=
tokens
[
1
]
...
@@ -129,7 +131,7 @@ def parseHMMER(fh,hmmDict):
...
@@ -129,7 +131,7 @@ def parseHMMER(fh,hmmDict):
def
main
(
argv
=
None
):
def
main
(
argv
=
None
):
# parameter definition
# parameter definition
parser
=
argparse
.
ArgumentParser
()
parser
=
argparse
.
ArgumentParser
()
parser
.
add_argument
(
'--hmmer-dict'
,
dest
=
'hmmerDictFile'
,
type
=
str
,
required
=
True
,
help
=
'HMMER dictionary .pgz file'
)
parser
.
add_argument
(
'--hmmer-dict'
,
dest
=
'hmmerDictFile'
,
type
=
str
,
required
=
True
,
help
=
'HMMER dictionary .pgz file'
)
...
@@ -151,7 +153,7 @@ def main( argv = None ):
...
@@ -151,7 +153,7 @@ def main( argv = None ):
hmmerDict
=
{}
hmmerDict
=
{}
with
gzip
.
open
(
args
.
hmmerDictFile
,
'rb'
)
as
f
:
with
gzip
.
open
(
args
.
hmmerDictFile
,
'rb'
)
as
f
:
hmmerDict
=
pickle
.
load
(
f
)
hmmerDict
=
pickle
.
load
(
f
)
print_status
(
"loading sequence database..."
)
print_status
(
"loading sequence database..."
)
seqDict
=
{}
seqDict
=
{}
with
open
(
args
.
seqFile
,
mode
=
'r'
,
newline
=
None
)
as
seqFile
:
with
open
(
args
.
seqFile
,
mode
=
'r'
,
newline
=
None
)
as
seqFile
:
...
@@ -160,19 +162,27 @@ def main( argv = None ):
...
@@ -160,19 +162,27 @@ def main( argv = None ):
if
line
:
if
line
:
seq_name
,
seq_fun
,
seq_fam
,
seq_len
=
[
x
.
strip
()
for
x
in
line
.
split
(
','
)
]
seq_name
,
seq_fun
,
seq_fam
,
seq_len
=
[
x
.
strip
()
for
x
in
line
.
split
(
','
)
]
seqDict
[
seq_name
]
=
{
'Function'
:
seq_fun
,
'Family'
:
seq_fam
,
'Length'
:
int
(
seq_len
)
}
seqDict
[
seq_name
]
=
{
'Function'
:
seq_fun
,
'Family'
:
seq_fam
,
'Length'
:
int
(
seq_len
)
}
print_status
(
"processing hmmsearch output files..."
)
print_status
(
"processing hmmsearch output files..."
)
#print(seqDict)
header
=
[
'#hmm_name'
,
'hmm_len'
,
'seq_name'
,
'seq_len'
,
'seq_family'
,
'seq_func'
,
header
=
[
'#hmm_name'
,
'hmm_len'
,
'seq_name'
,
'seq_len'
,
'seq_family'
,
'seq_func'
,
'bitscore'
,
'mean_score'
,
'mcscore'
,
'mean_mcs'
,
'wcscore'
,
'mean_wcs'
,
'bitscore'
,
'mean_score'
,
'mcscore'
,
'mean_mcs'
,
'wcscore'
,
'mean_wcs'
,
'ident'
,
'hmm_cov'
,
'hit_type'
]
'ident'
,
'hmm_cov'
,
'hit_type'
]
print
(
'
\t
'
.
join
(
header
))
print
(
'
\t
'
.
join
(
header
))
record
=
[]
record
=
[]
prev_hit
=
hmmer_hit
()
prev_hit
=
hmmer_hit
()
print_status
(
"ça marche"
)
for
hit
in
parseHMMER
(
sys
.
stdin
,
hmmerDict
):
for
hit
in
parseHMMER
(
sys
.
stdin
,
hmmerDict
):
#print(hit.hmm_name)
print_status
(
"hit dans la boucle"
)
print_status
(
hit
.
seq_name
)
if
hit
.
seq_name
not
in
seqDict
:
if
hit
.
seq_name
not
in
seqDict
:
print_status
(
"rate"
)
continue
continue
hmm
=
hmmerDict
[
hit
.
hmm_name
]
hmm
=
hmmerDict
[
hit
.
hmm_name
]
print_status
(
"hmm"
)
print_status
(
hmmerDict
[
hit
.
hmm_name
])
if
prev_hit
.
canExtendWith
(
hit
):
if
prev_hit
.
canExtendWith
(
hit
):
hit
=
prev_hit
.
extendWith
(
hit
)
hit
=
prev_hit
.
extendWith
(
hit
)
elif
len
(
record
)
>
0
:
elif
len
(
record
)
>
0
:
...
@@ -212,7 +222,7 @@ def main( argv = None ):
...
@@ -212,7 +222,7 @@ def main( argv = None ):
if
len
(
record
)
>
0
:
if
len
(
record
)
>
0
:
print
(
'
\t
'
.
join
([
str
(
r
)
for
r
in
record
]))
print
(
'
\t
'
.
join
([
str
(
r
)
for
r
in
record
]))
print_status
(
"on a passe la boucle"
)
return
0
return
0
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment