0.7.0 changes for create_dataset to process custom datasets from files

c6347479 · Konstantin Volzhenin · 385ebc83 · c6347479 · c6347479 · c6347479
Commit c6347479 authored Mar 18, 2024 by Konstantin Volzhenin
Hide whitespace changes
Inline Side-by-side

Showing with 29 additions and 26 deletions

usage.rst docs/source/usage.rst +15 -14

__init__.py senseppi/__init__.py +1 -1

create_dataset.py senseppi/commands/create_dataset.py +13 -11

No files found.
--- a/docs/source/usage.rst
+++ b/docs/source/usage.rst
@@ -239,29 +239,29 @@ Create_dataset

 .. code-block:: bash

-    usage: senseppi <command> [<args>] create_dataset [-h] [--interactions INTERACTIONS] [--sequences SEQUENCES] [--not_remove_long_short_proteins] [--min_length MIN_LENGTH] [--max_length MAX_LENGTH]
-                                                      [--max_positive_pairs MAX_POSITIVE_PAIRS] [--combined_score COMBINED_SCORE] [--experimental_score EXPERIMENTAL_SCORE]
-                                                      species
-
-    positional arguments:
-      species               The Taxon identifier of the organism of interest.
+    usage: senseppi <command> [<args>] create_dataset [-h] (-s SPECIES | --int_seq interactions sequences) [--not_remove_long_short_proteins] [--min_length MIN_LENGTH]
+                                                      [--max_length MAX_LENGTH] [--max_positive_pairs MAX_POSITIVE_PAIRS] [--combined_score COMBINED_SCORE]
+                                                      [--experimental_score EXPERIMENTAL_SCORE]

    options:
      -h, --help            show this help message and exit
-      --interactions INTERACTIONS
-                            The physical links (full) file from STRING for the organism of interest. (Default: None)
-      --sequences SEQUENCES
-                            The sequences file downloaded from the same page of STRING. For both files see https://string-db.org/cgi/download (Default: None)
+      -s SPECIES, --species SPECIES
+                            The Taxon identifier of the organism of interest.
+      --int_seq interactions sequences
+                            The physical links (full) file from STRING and the fasta file with sequences. Two paths should be separated by a whitespace. If not provided,
+                            they will be downloaded from STRING. For both files see https://string-db.org/cgi/download
      --not_remove_long_short_proteins
-                            If specified, does not remove proteins shorter than --min_length and longer than --max_length. By default, long and short proteins are removed.
+                            If specified, does not remove proteins shorter than --min_length and longer than --max_length. By default, long and short proteins are
+                            removed.
      --min_length MIN_LENGTH
                            The minimum length of a protein to be included in the dataset. (Default: 50)
      --max_length MAX_LENGTH
                            The maximum length of a protein to be included in the dataset. (Default: 800)
      --max_positive_pairs MAX_POSITIVE_PAIRS
-                            The maximum number of positive pairs to be included in the dataset. If None, all pairs are included. If specified, the pairs are selected based on the combined score in STRING. (Default: None)
+                            The maximum number of positive pairs to be included in the dataset. If None, all pairs are included. If specified, the pairs are selected
+                            based on the combined score in STRING. (Default: None)
      --combined_score COMBINED_SCORE
                            The combined score threshold for the pairs extracted from STRING. Ranges from 0 to 1000. (Default: 500)
      --experimental_score EXPERIMENTAL_SCORE
-                            The experimental score threshold for the pairs extracted from STRING. Ranges from 0 to 1000. Default is None, which means that the experimental score is not used. (Default: None)
-
+                            The experimental score threshold for the pairs extracted from STRING. Ranges from 0 to 1000. Default is None, which means that the
+                            experimental score is not used. (Default: None)
\ No newline at end of file
--- a/senseppi/__init__.py
+++ b/senseppi/__init__.py
-__version__ = "0.6.9"
+__version__ = "0.7.0"
 __author__ = "Konstantin Volzhenin"

 from . import model, commands, esm2_model, dataset, utils, network_utils

--- a/senseppi/commands/create_dataset.py
+++ b/senseppi/commands/create_dataset.py
@@ -27,7 +27,7 @@ class STRINGDatasetCreation:
        self.sequences_file = params.sequences
        self.min_length = params.min_length
        self.max_length = params.max_length
-        self.species = params.species
+        self.species = 'custom' if params.species is None else params.species
        self.max_positive_pairs = params.max_positive_pairs
        self.combined_score = params.combined_score
        self.experimental_score = params.experimental_score
@@ -44,7 +44,7 @@ class STRINGDatasetCreation:
        # Creating a new intermediate file with only the interactions that are suitable for training
        # Such interactions happen only between proteins of appropriate lengths
        #
-        # And either have a high combined score of > 700
+        # And have a high combined score of > self.combined_score
        #
        # Further on, redundant interactions are removed as well as sequences with inappropriate
        # length and interactions based on homology
@@ -97,7 +97,7 @@ class STRINGDatasetCreation:
                    for line in tqdm(f2, total=n_interactions):
                        line = line.strip().split(' ')

-                        if self.species is not None:
+                        if self.species != 'custom':
                            if not line[0].startswith(self.species) or not line[1].startswith(self.species):
                                continue

@@ -249,14 +249,14 @@ class STRINGDatasetCreation:


 def add_args(parser):
-    parser.add_argument("species", type=str,
+    group = parser.add_mutually_exclusive_group(required=True)
+    group.add_argument("-s", "--species", type=str, default=None,
                        help="The Taxon identifier of the organism of interest.")
-    parser.add_argument("--interactions", type=str, default=None,
-                        help="The physical links (full) file from STRING for the "
-                             "organism of interest. If not provided, the file will be downloaded from STRING.")
-    parser.add_argument("--sequences", type=str, default=None,
-                        help="The fasta file with sequences. If not provided, it will be downloaded from "
-                             "STRING. For both files see https://string-db.org/cgi/download")
+    group.add_argument("--int_seq", nargs=2, metavar=("interactions", "sequences"), default=None,
+                       help="The physical links (full) file from STRING and the fasta file with sequences. "
+                            "Two paths should be separated by a whitespace. "
+                            "If not provided, they will be downloaded from STRING. For both files see "
+                            "https://string-db.org/cgi/download")
    parser.add_argument("--not_remove_long_short_proteins", action='store_true',
                        help="If specified, does not remove proteins "
                             "shorter than --min_length and longer than --max_length. "
@@ -282,7 +282,9 @@ def add_args(parser):

 def main(params):
    downloaded_flag = False
-    if params.interactions is None or params.sequences is None:
+    params.interactions = params.int_seq[0] if params.int_seq is not None else None
+    params.sequences = params.int_seq[1] if params.int_seq is not None else None
+    if params.species is not None:
        downloaded_flag = True
        logging.info('One or both of the files are not specified (interactions or sequences). '
                     'Downloading from STRING...')