0.7.0 changes for create_dataset to process custom datasets from files

parent 385ebc83
......@@ -239,29 +239,29 @@ Create_dataset
.. code-block:: bash
usage: senseppi <command> [<args>] create_dataset [-h] [--interactions INTERACTIONS] [--sequences SEQUENCES] [--not_remove_long_short_proteins] [--min_length MIN_LENGTH] [--max_length MAX_LENGTH]
[--max_positive_pairs MAX_POSITIVE_PAIRS] [--combined_score COMBINED_SCORE] [--experimental_score EXPERIMENTAL_SCORE]
species
positional arguments:
species The Taxon identifier of the organism of interest.
usage: senseppi <command> [<args>] create_dataset [-h] (-s SPECIES | --int_seq interactions sequences) [--not_remove_long_short_proteins] [--min_length MIN_LENGTH]
[--max_length MAX_LENGTH] [--max_positive_pairs MAX_POSITIVE_PAIRS] [--combined_score COMBINED_SCORE]
[--experimental_score EXPERIMENTAL_SCORE]
options:
-h, --help show this help message and exit
--interactions INTERACTIONS
The physical links (full) file from STRING for the organism of interest. (Default: None)
--sequences SEQUENCES
The sequences file downloaded from the same page of STRING. For both files see https://string-db.org/cgi/download (Default: None)
-s SPECIES, --species SPECIES
The Taxon identifier of the organism of interest.
--int_seq interactions sequences
The physical links (full) file from STRING and the fasta file with sequences. Two paths should be separated by a whitespace. If not provided,
they will be downloaded from STRING. For both files see https://string-db.org/cgi/download
--not_remove_long_short_proteins
If specified, does not remove proteins shorter than --min_length and longer than --max_length. By default, long and short proteins are removed.
If specified, does not remove proteins shorter than --min_length and longer than --max_length. By default, long and short proteins are
removed.
--min_length MIN_LENGTH
The minimum length of a protein to be included in the dataset. (Default: 50)
--max_length MAX_LENGTH
The maximum length of a protein to be included in the dataset. (Default: 800)
--max_positive_pairs MAX_POSITIVE_PAIRS
The maximum number of positive pairs to be included in the dataset. If None, all pairs are included. If specified, the pairs are selected based on the combined score in STRING. (Default: None)
The maximum number of positive pairs to be included in the dataset. If None, all pairs are included. If specified, the pairs are selected
based on the combined score in STRING. (Default: None)
--combined_score COMBINED_SCORE
The combined score threshold for the pairs extracted from STRING. Ranges from 0 to 1000. (Default: 500)
--experimental_score EXPERIMENTAL_SCORE
The experimental score threshold for the pairs extracted from STRING. Ranges from 0 to 1000. Default is None, which means that the experimental score is not used. (Default: None)
The experimental score threshold for the pairs extracted from STRING. Ranges from 0 to 1000. Default is None, which means that the
experimental score is not used. (Default: None)
\ No newline at end of file
__version__ = "0.6.9"
__version__ = "0.7.0"
__author__ = "Konstantin Volzhenin"
from . import model, commands, esm2_model, dataset, utils, network_utils
......
......@@ -27,7 +27,7 @@ class STRINGDatasetCreation:
self.sequences_file = params.sequences
self.min_length = params.min_length
self.max_length = params.max_length
self.species = params.species
self.species = 'custom' if params.species is None else params.species
self.max_positive_pairs = params.max_positive_pairs
self.combined_score = params.combined_score
self.experimental_score = params.experimental_score
......@@ -44,7 +44,7 @@ class STRINGDatasetCreation:
# Creating a new intermediate file with only the interactions that are suitable for training
# Such interactions happen only between proteins of appropriate lengths
#
# And either have a high combined score of > 700
# And have a high combined score of > self.combined_score
#
# Further on, redundant interactions are removed as well as sequences with inappropriate
# length and interactions based on homology
......@@ -97,7 +97,7 @@ class STRINGDatasetCreation:
for line in tqdm(f2, total=n_interactions):
line = line.strip().split(' ')
if self.species is not None:
if self.species != 'custom':
if not line[0].startswith(self.species) or not line[1].startswith(self.species):
continue
......@@ -249,14 +249,14 @@ class STRINGDatasetCreation:
def add_args(parser):
parser.add_argument("species", type=str,
group = parser.add_mutually_exclusive_group(required=True)
group.add_argument("-s", "--species", type=str, default=None,
help="The Taxon identifier of the organism of interest.")
parser.add_argument("--interactions", type=str, default=None,
help="The physical links (full) file from STRING for the "
"organism of interest. If not provided, the file will be downloaded from STRING.")
parser.add_argument("--sequences", type=str, default=None,
help="The fasta file with sequences. If not provided, it will be downloaded from "
"STRING. For both files see https://string-db.org/cgi/download")
group.add_argument("--int_seq", nargs=2, metavar=("interactions", "sequences"), default=None,
help="The physical links (full) file from STRING and the fasta file with sequences. "
"Two paths should be separated by a whitespace. "
"If not provided, they will be downloaded from STRING. For both files see "
"https://string-db.org/cgi/download")
parser.add_argument("--not_remove_long_short_proteins", action='store_true',
help="If specified, does not remove proteins "
"shorter than --min_length and longer than --max_length. "
......@@ -282,7 +282,9 @@ def add_args(parser):
def main(params):
downloaded_flag = False
if params.interactions is None or params.sequences is None:
params.interactions = params.int_seq[0] if params.int_seq is not None else None
params.sequences = params.int_seq[1] if params.int_seq is not None else None
if params.species is not None:
downloaded_flag = True
logging.info('One or both of the files are not specified (interactions or sequences). '
'Downloading from STRING...')
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment