Skip to content
Projects
Groups
Snippets
Help
This project
Loading...
Sign in / Register
Toggle navigation
P
profileview
Overview
Overview
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
Edoardo SARTI
profileview
Commits
53490d34
Commit
53490d34
authored
May 31, 2022
by
Djeser Kordon (edoardo Carbone)
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
parallelization modif
parent
42e60f66
Hide whitespace changes
Inline
Side-by-side
Showing
3 changed files
with
61 additions
and
48 deletions
+61
-48
profileview-tree
profileview-tree
+43
-42
generateFeatures.py
scripts/generateFeatures.py
+2
-0
parseHMMER.py
scripts/parseHMMER.py
+16
-6
No files found.
profileview-tree
View file @
53490d34
...
...
@@ -175,24 +175,24 @@ check_pymodules "ete3" "numpy"
PEXEC_CMD
=
"parallel --halt now,fail=1 -j
${
NJOBS
}
"
if
!
command
-v
parallel
>
/dev/null 2>&1
;
then
print_warning
"cannot find GNU parallel, all jobs will be run sequentially"
PEXEC_CMD
=
"/usr/bin/env bash --"
print_warning
"cannot find GNU parallel, all jobs will be run sequentially"
PEXEC_CMD
=
"/usr/bin/env bash --"
fi
# Create temp working directory
if
[
-z
"
${
PV_TMPDIR
}
"
]
;
then
if
[
$(
check_system
)
=
"Linux"
]
;
then
PV_TMPDIR
=
$(
mktemp
-p
"
${
PWD
}
"
-d
pvtmp-XXXXX
)
||
{
print_error
"cannot create temporary directory"
;
exit
1
;
}
else
PV_TMPDIR
=
$(
mktemp
-d
"
${
PWD
}
"
/pvtmp-XXXXX
)
||
{
print_error
"cannot create temporary directory"
;
exit
1
;
}
fi
if
[
$(
check_system
)
=
"Linux"
]
;
then
PV_TMPDIR
=
$(
mktemp
-p
"
${
PWD
}
"
-d
pvtmp-XXXXX
)
||
{
print_error
"cannot create temporary directory"
;
exit
1
;
}
else
PV_TMPDIR
=
$(
mktemp
-d
"
${
PWD
}
"
/pvtmp-XXXXX
)
||
{
print_error
"cannot create temporary directory"
;
exit
1
;
}
fi
else
mkdir
-p
"
${
PV_TMPDIR
}
"
||
{
print_error
"cannot create temporary directory"
;
exit
1
;
}
mkdir
-p
"
${
PV_TMPDIR
}
"
||
{
print_error
"cannot create temporary directory"
;
exit
1
;
}
fi
PV_TMPDIR
=
$(
abspath
"
${
PV_TMPDIR
}
"
)
if
[
!
-d
"
${
PV_TMPDIR
}
"
]
||
[
-n
"
$(
ls
-A
"
${
PV_TMPDIR
}
"
)
"
]
;
then
print_error
"provided path
\"
${
PV_TMPDIR
}
\"
must be an empty directory"
exit
1
print_error
"provided path
\"
${
PV_TMPDIR
}
\"
must be an empty directory"
exit
1
fi
print_status
"using temporary directory:
${
PV_TMPDIR
}
"
...
...
@@ -202,44 +202,44 @@ touch "${PV_LOGFILE}" && print_status "a log file is saved in ${PV_LOGFILE}"
# Possibly create a sequence descriptor file (if not provided)
if
[
-z
"
${
PV_SEQDESC
}
"
]
;
then
PV_SEQDESC
=
"
${
PV_TMPDIR
}
/sequences.csv"
print_status
"creating temporary sequence descriptor file:
${
PV_SEQDESC
}
"
awk
'
BEGIN {
sname=""
seqlen=0
OFS=","
}
/^>/ {
if(sname!=""){ print sname,"NA","NA",seqlen }
sname=substr($1,2)
seqlen=0
next
}
{
for(i=1;i<=NF;i++){seqlen+=length($i)}
}
END {
if(sname!=""){ print sname,"NA","NA",seqlen }
}
'
"
${
INPUT_FASTA
}
"
>
"
${
PV_SEQDESC
}
"
PV_SEQDESC
=
"
${
PV_TMPDIR
}
/sequences.csv"
print_status
"creating temporary sequence descriptor file:
${
PV_SEQDESC
}
"
awk
'
BEGIN {
sname=""
seqlen=0
OFS=","
}
/^>/ {
if(sname!=""){ print sname,"NA","NA",seqlen }
sname=substr($1,2)
seqlen=0
next
}
{
for(i=1;i<=NF;i++){seqlen+=length($i)}
}
END {
if(sname!=""){ print sname,"NA","NA",seqlen }
}
'
"
${
INPUT_FASTA
}
"
>
"
${
PV_SEQDESC
}
"
else
cp
"
${
PV_SEQDESC
}
"
"
${
PV_TMPDIR
}
/sequences.csv"
PV_SEQDESC
=
"
${
PV_TMPDIR
}
/sequences.csv"
cp
"
${
PV_SEQDESC
}
"
"
${
PV_TMPDIR
}
/sequences.csv"
PV_SEQDESC
=
"
${
PV_TMPDIR
}
/sequences.csv"
fi
# Run hmmsearch for each model of the library against the input sequences
print_status
"running hmmsearch against input sequences"
HMMSEARCH_RESFILE
=
"
${
PV_TMPDIR
}
"
/hmmsearch.out.gz
for
hmm
in
"
${
PV_LIBDIR
}
"
/hmm/
*
.hmm
;
do
[
-e
"
${
hmm
}
"
]
||
continue
hmmBase
=
${
hmm
##*/
}
hmmName
=
${
hmmBase
%.hmm
}
echo
"hmmsearch
${
hmm
}
${
INPUT_FASTA
}
2>>
${
PV_LOGFILE
}
"
[
-e
"
${
hmm
}
"
]
||
continue
hmmBase
=
${
hmm
##*/
}
hmmName
=
${
hmmBase
%.hmm
}
echo
"hmmsearch
${
hmm
}
${
INPUT_FASTA
}
2>>
${
PV_LOGFILE
}
"
done
|
${
PEXEC_CMD
}
| gzip
-c
>
"
${
HMMSEARCH_RESFILE
}
"
if
[
$?
-ne
0
]
;
then
print_error
"error during hmmsearch jobs, see log:
${
PV_LOGFILE
}
"
exit
1
print_error
"error during hmmsearch jobs, see log:
${
PV_LOGFILE
}
"
exit
1
fi
print_status
"processing hmmsearch output files"
...
...
@@ -257,6 +257,8 @@ PV_SEQDESC="${PV_TMPDIR}/sequences.filtered.csv"
awk
'/^#/{next} !x[$3]++{OFS=",";print $3,$6,$5,$4}'
"
${
PV_SCOREFILE
}
"
>
"
${
PV_SEQDESC
}
"
2>/dev/null
print_status
"building representation space (using k=
${
PV_KBEST
}
)"
#for PV_KBEST in `seq 1 20 `
#do
python3
"
${
SCRIPTS_DIR
}
"
/generateFeatures.py
--seq-list
"
${
PV_SEQDESC
}
"
--hmm-list
"
${
PV_LIBDIR
}
/
${
PV_LIBNAME
}
.models.list"
--scores
"
${
PV_SCOREFILE
}
"
--prefix
"
${
PV_TMPDIR
}
"
/out
-n
20
-k
"
${
PV_KBEST
}
"
2>>
"
${
PV_LOGFILE
}
"
if
[
$?
-ne
0
]
;
then
print_error
"error during feature generation, see log:
${
PV_LOGFILE
}
"
...
...
@@ -271,10 +273,9 @@ if [ $? -ne 0 ]; then
fi
print_status
"finding representative models and generating annotated ProfileView tree"
python3
"
${
SCRIPTS_DIR
}
/findReprModels.py"
-t
"
${
PV_TMPDIR
}
/out.tree"
-s
"
${
PV_SCOREFILE
}
"
-m
"
${
PV_TMPDIR
}
/out.used_models.list"
-o
"
${
PV_OUTPREFIX
}
"
2>>
"
${
PV_LOGFILE
}
"
python3
"
${
SCRIPTS_DIR
}
/findReprModels.py"
-t
"
${
PV_TMPDIR
}
/out.tree"
-s
"
${
PV_SCOREFILE
}
"
-m
"
${
PV_TMPDIR
}
/out.used_models.list"
-o
"
${
PV_OUTPREFIX
}
_
${
PV_KBEST
}
"
2>>
"
${
PV_LOGFILE
}
"
if
[
$?
-ne
0
]
;
then
print_error
"error during identification of representative models and the construction of the annotated ProfileView tree, see log:
${
PV_LOGFILE
}
"
exit
1
fi
#done
scripts/generateFeatures.py
View file @
53490d34
...
...
@@ -133,6 +133,8 @@ def main( argv = None ):
part
=
float
(
stats
[
'PART'
])
/
tot
if
tot
>
0
else
0.0
over
=
float
(
stats
[
'OVER'
])
/
tot
if
tot
>
0
else
0.0
if
(
full
<
.
5
and
part
<
.
5
)
or
over
>=
.
3
:
# significant number of sequence-model "overlapping" hits (i.e., the sequence is likely to be incomplete)
print_status
(
f
"{seq_name},{full},{part},{over}"
)
partialSequences
.
append
(
seq_name
)
outFile
.
write
(
"{}
\t
{:.4f}
\t
{:.4f}
\t
{:.4f}
\n
"
.
format
(
seq_name
,
full
,
over
,
part
))
for
seq_name
in
partialSequences
:
...
...
scripts/parseHMMER.py
View file @
53490d34
...
...
@@ -27,11 +27,13 @@ def parseHMMER(fh,hmmDict):
hmm_name
=
""
seq_name
=
""
line
=
fh
.
readline
()
print_status
(
"c'est la ligne"
)
print_status
(
line
)
while
line
:
tokens
=
line
.
split
()
if
len
(
tokens
)
==
0
or
line
.
startswith
(
"#"
):
pass
elif
line
.
startswith
(
"Query: "
):
elif
line
.
startswith
(
"Query: "
):
hmm_name
=
tokens
[
1
]
elif
line
.
startswith
(
">> "
):
# target sequence section
seq_name
=
tokens
[
1
]
...
...
@@ -129,7 +131,7 @@ def parseHMMER(fh,hmmDict):
def
main
(
argv
=
None
):
# parameter definition
parser
=
argparse
.
ArgumentParser
()
parser
.
add_argument
(
'--hmmer-dict'
,
dest
=
'hmmerDictFile'
,
type
=
str
,
required
=
True
,
help
=
'HMMER dictionary .pgz file'
)
...
...
@@ -151,7 +153,7 @@ def main( argv = None ):
hmmerDict
=
{}
with
gzip
.
open
(
args
.
hmmerDictFile
,
'rb'
)
as
f
:
hmmerDict
=
pickle
.
load
(
f
)
print_status
(
"loading sequence database..."
)
seqDict
=
{}
with
open
(
args
.
seqFile
,
mode
=
'r'
,
newline
=
None
)
as
seqFile
:
...
...
@@ -160,19 +162,27 @@ def main( argv = None ):
if
line
:
seq_name
,
seq_fun
,
seq_fam
,
seq_len
=
[
x
.
strip
()
for
x
in
line
.
split
(
','
)
]
seqDict
[
seq_name
]
=
{
'Function'
:
seq_fun
,
'Family'
:
seq_fam
,
'Length'
:
int
(
seq_len
)
}
print_status
(
"processing hmmsearch output files..."
)
#print(seqDict)
header
=
[
'#hmm_name'
,
'hmm_len'
,
'seq_name'
,
'seq_len'
,
'seq_family'
,
'seq_func'
,
'bitscore'
,
'mean_score'
,
'mcscore'
,
'mean_mcs'
,
'wcscore'
,
'mean_wcs'
,
'ident'
,
'hmm_cov'
,
'hit_type'
]
print
(
'
\t
'
.
join
(
header
))
record
=
[]
prev_hit
=
hmmer_hit
()
print_status
(
"ça marche"
)
for
hit
in
parseHMMER
(
sys
.
stdin
,
hmmerDict
):
#print(hit.hmm_name)
print_status
(
"hit dans la boucle"
)
print_status
(
hit
.
seq_name
)
if
hit
.
seq_name
not
in
seqDict
:
print_status
(
"rate"
)
continue
hmm
=
hmmerDict
[
hit
.
hmm_name
]
print_status
(
"hmm"
)
print_status
(
hmmerDict
[
hit
.
hmm_name
])
if
prev_hit
.
canExtendWith
(
hit
):
hit
=
prev_hit
.
extendWith
(
hit
)
elif
len
(
record
)
>
0
:
...
...
@@ -212,7 +222,7 @@ def main( argv = None ):
if
len
(
record
)
>
0
:
print
(
'
\t
'
.
join
([
str
(
r
)
for
r
in
record
]))
print_status
(
"on a passe la boucle"
)
return
0
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment