Skip to content
Projects
Groups
Snippets
Help
This project
Loading...
Sign in / Register
Toggle navigation
S
SENSE-PPI
Overview
Overview
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
Konstantin Volzhenin
SENSE-PPI
Commits
5eab0f2b
Commit
5eab0f2b
authored
Sep 01, 2023
by
Konstantin Volzhenin
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
0.3.2 bug_fixes for automatic download in create_dataset
parent
d51c12e8
Hide whitespace changes
Inline
Side-by-side
Showing
1 changed file
with
34 additions
and
30 deletions
+34
-30
create_dataset.py
senseppi/commands/create_dataset.py
+34
-30
No files found.
senseppi/commands/create_dataset.py
View file @
5eab0f2b
...
...
@@ -230,8 +230,6 @@ class STRINGDatasetCreation:
os
.
remove
(
self
.
intermediate_file
)
os
.
remove
(
"clusters_preprocessed.tsv"
)
os
.
remove
(
"clusters.tsv"
)
os
.
remove
(
self
.
interactions_file
)
os
.
remove
(
self
.
sequences_file
)
# A method to remove sequences of inappropriate length from a fasta file
def
process_fasta_file
(
self
):
...
...
@@ -282,44 +280,50 @@ def add_args(parser):
def
main
(
params
):
downloaded_flag
=
False
if
params
.
interactions
is
None
or
params
.
sequences
is
None
:
downloaded_flag
=
True
logging
.
info
(
'One or both of the files are not specified (interactions or sequences). '
'Downloading from STRING...'
)
_
,
version
=
get_string_url
()
logging
.
info
(
'STRING version: {}'
.
format
(
version
))
try
:
url
=
"{0}protein.physical.links.full.v{1}/{2}.protein.physical.links.full.v{1}.txt.gz"
.
format
(
DOWNLOAD_LINK_STRING
,
version
,
params
.
species
)
string_file_name_links
=
"{1}.protein.physical.links.full.v{0}.txt"
.
format
(
version
,
params
.
species
)
wget
.
download
(
url
,
out
=
string_file_name_links
+
'.gz'
)
with
gzip
.
open
(
string_file_name_links
+
'.gz'
,
'rb'
)
as
f_in
:
with
open
(
string_file_name_links
,
'wb'
)
as
f_out
:
shutil
.
copyfileobj
(
f_in
,
f_out
)
url
=
"{0}protein.sequences.v{1}/{2}.protein.sequences.v{1}.fa.gz"
.
format
(
DOWNLOAD_LINK_STRING
,
version
,
params
.
species
)
string_file_name_seqs
=
"{1}.protein.sequences.v{0}.fa"
.
format
(
version
,
params
.
species
)
wget
.
download
(
url
,
out
=
string_file_name_seqs
+
'.gz'
)
with
gzip
.
open
(
string_file_name_seqs
+
'.gz'
,
'rb'
)
as
f_in
:
with
open
(
string_file_name_seqs
,
'wb'
)
as
f_out
:
shutil
.
copyfileobj
(
f_in
,
f_out
)
except
HTTPError
:
raise
Exception
(
'The files are not available for the specified species. '
'There might be two reasons for that:
\n
'
'1) the species is not available in STRING. Please check the STRING species list to verify.
\n
'
'2) the download link has changed. Please raise an issue in the repository. '
)
os
.
remove
(
string_file_name_seqs
+
'.gz'
)
os
.
remove
(
string_file_name_links
+
'.gz'
)
params
.
interactions
=
string_file_name_links
params
.
sequences
=
string_file_name_seqs
_
,
version
=
get_string_url
()
logging
.
info
(
'STRING version: {}'
.
format
(
version
))
try
:
url
=
"{0}protein.physical.links.full.v{1}/{2}.protein.physical.links.full.v{1}.txt.gz"
.
format
(
DOWNLOAD_LINK_STRING
,
version
,
params
.
species
)
string_file_name_links
=
"{1}.protein.physical.links.full.v{0}.txt"
.
format
(
version
,
params
.
species
)
wget
.
download
(
url
,
out
=
string_file_name_links
+
'.gz'
)
with
gzip
.
open
(
string_file_name_links
+
'.gz'
,
'rb'
)
as
f_in
:
with
open
(
string_file_name_links
,
'wb'
)
as
f_out
:
shutil
.
copyfileobj
(
f_in
,
f_out
)
url
=
"{0}protein.sequences.v{1}/{2}.protein.sequences.v{1}.fa.gz"
.
format
(
DOWNLOAD_LINK_STRING
,
version
,
params
.
species
)
string_file_name_seqs
=
"{1}.protein.sequences.v{0}.fa"
.
format
(
version
,
params
.
species
)
wget
.
download
(
url
,
out
=
string_file_name_seqs
+
'.gz'
)
with
gzip
.
open
(
string_file_name_seqs
+
'.gz'
,
'rb'
)
as
f_in
:
with
open
(
string_file_name_seqs
,
'wb'
)
as
f_out
:
shutil
.
copyfileobj
(
f_in
,
f_out
)
except
HTTPError
:
raise
Exception
(
'The files are not available for the specified species. '
'There might be two reasons for that:
\n
'
'1) the species is not available in STRING. Please check the STRING species list to verify.
\n
'
'2) the download link has changed. Please raise an issue in the repository. '
)
os
.
remove
(
string_file_name_seqs
+
'.gz'
)
os
.
remove
(
string_file_name_links
+
'.gz'
)
params
.
interactions
=
string_file_name_links
params
.
sequences
=
string_file_name_seqs
data
=
STRINGDatasetCreation
(
params
)
data
.
final_preprocessing_positives
()
data
.
create_negatives
()
if
downloaded_flag
:
os
.
remove
(
params
.
interactions
)
os
.
remove
(
params
.
sequences
)
if
__name__
==
'__main__'
:
parser
=
argparse
.
ArgumentParser
()
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment