0.7.0 changes for create_dataset to process custom datasets from files

parent 385ebc83
...@@ -239,29 +239,29 @@ Create_dataset ...@@ -239,29 +239,29 @@ Create_dataset
.. code-block:: bash .. code-block:: bash
usage: senseppi <command> [<args>] create_dataset [-h] [--interactions INTERACTIONS] [--sequences SEQUENCES] [--not_remove_long_short_proteins] [--min_length MIN_LENGTH] [--max_length MAX_LENGTH] usage: senseppi <command> [<args>] create_dataset [-h] (-s SPECIES | --int_seq interactions sequences) [--not_remove_long_short_proteins] [--min_length MIN_LENGTH]
[--max_positive_pairs MAX_POSITIVE_PAIRS] [--combined_score COMBINED_SCORE] [--experimental_score EXPERIMENTAL_SCORE] [--max_length MAX_LENGTH] [--max_positive_pairs MAX_POSITIVE_PAIRS] [--combined_score COMBINED_SCORE]
species [--experimental_score EXPERIMENTAL_SCORE]
positional arguments:
species The Taxon identifier of the organism of interest.
options: options:
-h, --help show this help message and exit -h, --help show this help message and exit
--interactions INTERACTIONS -s SPECIES, --species SPECIES
The physical links (full) file from STRING for the organism of interest. (Default: None) The Taxon identifier of the organism of interest.
--sequences SEQUENCES --int_seq interactions sequences
The sequences file downloaded from the same page of STRING. For both files see https://string-db.org/cgi/download (Default: None) The physical links (full) file from STRING and the fasta file with sequences. Two paths should be separated by a whitespace. If not provided,
they will be downloaded from STRING. For both files see https://string-db.org/cgi/download
--not_remove_long_short_proteins --not_remove_long_short_proteins
If specified, does not remove proteins shorter than --min_length and longer than --max_length. By default, long and short proteins are removed. If specified, does not remove proteins shorter than --min_length and longer than --max_length. By default, long and short proteins are
removed.
--min_length MIN_LENGTH --min_length MIN_LENGTH
The minimum length of a protein to be included in the dataset. (Default: 50) The minimum length of a protein to be included in the dataset. (Default: 50)
--max_length MAX_LENGTH --max_length MAX_LENGTH
The maximum length of a protein to be included in the dataset. (Default: 800) The maximum length of a protein to be included in the dataset. (Default: 800)
--max_positive_pairs MAX_POSITIVE_PAIRS --max_positive_pairs MAX_POSITIVE_PAIRS
The maximum number of positive pairs to be included in the dataset. If None, all pairs are included. If specified, the pairs are selected based on the combined score in STRING. (Default: None) The maximum number of positive pairs to be included in the dataset. If None, all pairs are included. If specified, the pairs are selected
based on the combined score in STRING. (Default: None)
--combined_score COMBINED_SCORE --combined_score COMBINED_SCORE
The combined score threshold for the pairs extracted from STRING. Ranges from 0 to 1000. (Default: 500) The combined score threshold for the pairs extracted from STRING. Ranges from 0 to 1000. (Default: 500)
--experimental_score EXPERIMENTAL_SCORE --experimental_score EXPERIMENTAL_SCORE
The experimental score threshold for the pairs extracted from STRING. Ranges from 0 to 1000. Default is None, which means that the experimental score is not used. (Default: None) The experimental score threshold for the pairs extracted from STRING. Ranges from 0 to 1000. Default is None, which means that the
experimental score is not used. (Default: None)
\ No newline at end of file
__version__ = "0.6.9" __version__ = "0.7.0"
__author__ = "Konstantin Volzhenin" __author__ = "Konstantin Volzhenin"
from . import model, commands, esm2_model, dataset, utils, network_utils from . import model, commands, esm2_model, dataset, utils, network_utils
......
...@@ -27,7 +27,7 @@ class STRINGDatasetCreation: ...@@ -27,7 +27,7 @@ class STRINGDatasetCreation:
self.sequences_file = params.sequences self.sequences_file = params.sequences
self.min_length = params.min_length self.min_length = params.min_length
self.max_length = params.max_length self.max_length = params.max_length
self.species = params.species self.species = 'custom' if params.species is None else params.species
self.max_positive_pairs = params.max_positive_pairs self.max_positive_pairs = params.max_positive_pairs
self.combined_score = params.combined_score self.combined_score = params.combined_score
self.experimental_score = params.experimental_score self.experimental_score = params.experimental_score
...@@ -44,7 +44,7 @@ class STRINGDatasetCreation: ...@@ -44,7 +44,7 @@ class STRINGDatasetCreation:
# Creating a new intermediate file with only the interactions that are suitable for training # Creating a new intermediate file with only the interactions that are suitable for training
# Such interactions happen only between proteins of appropriate lengths # Such interactions happen only between proteins of appropriate lengths
# #
# And either have a high combined score of > 700 # And have a high combined score of > self.combined_score
# #
# Further on, redundant interactions are removed as well as sequences with inappropriate # Further on, redundant interactions are removed as well as sequences with inappropriate
# length and interactions based on homology # length and interactions based on homology
...@@ -97,7 +97,7 @@ class STRINGDatasetCreation: ...@@ -97,7 +97,7 @@ class STRINGDatasetCreation:
for line in tqdm(f2, total=n_interactions): for line in tqdm(f2, total=n_interactions):
line = line.strip().split(' ') line = line.strip().split(' ')
if self.species is not None: if self.species != 'custom':
if not line[0].startswith(self.species) or not line[1].startswith(self.species): if not line[0].startswith(self.species) or not line[1].startswith(self.species):
continue continue
...@@ -249,14 +249,14 @@ class STRINGDatasetCreation: ...@@ -249,14 +249,14 @@ class STRINGDatasetCreation:
def add_args(parser): def add_args(parser):
parser.add_argument("species", type=str, group = parser.add_mutually_exclusive_group(required=True)
group.add_argument("-s", "--species", type=str, default=None,
help="The Taxon identifier of the organism of interest.") help="The Taxon identifier of the organism of interest.")
parser.add_argument("--interactions", type=str, default=None, group.add_argument("--int_seq", nargs=2, metavar=("interactions", "sequences"), default=None,
help="The physical links (full) file from STRING for the " help="The physical links (full) file from STRING and the fasta file with sequences. "
"organism of interest. If not provided, the file will be downloaded from STRING.") "Two paths should be separated by a whitespace. "
parser.add_argument("--sequences", type=str, default=None, "If not provided, they will be downloaded from STRING. For both files see "
help="The fasta file with sequences. If not provided, it will be downloaded from " "https://string-db.org/cgi/download")
"STRING. For both files see https://string-db.org/cgi/download")
parser.add_argument("--not_remove_long_short_proteins", action='store_true', parser.add_argument("--not_remove_long_short_proteins", action='store_true',
help="If specified, does not remove proteins " help="If specified, does not remove proteins "
"shorter than --min_length and longer than --max_length. " "shorter than --min_length and longer than --max_length. "
...@@ -282,7 +282,9 @@ def add_args(parser): ...@@ -282,7 +282,9 @@ def add_args(parser):
def main(params): def main(params):
downloaded_flag = False downloaded_flag = False
if params.interactions is None or params.sequences is None: params.interactions = params.int_seq[0] if params.int_seq is not None else None
params.sequences = params.int_seq[1] if params.int_seq is not None else None
if params.species is not None:
downloaded_flag = True downloaded_flag = True
logging.info('One or both of the files are not specified (interactions or sequences). ' logging.info('One or both of the files are not specified (interactions or sequences). '
'Downloading from STRING...') 'Downloading from STRING...')
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment