0.6.9 create_dataset improved with pandarallel

aa04f16f · Konstantin Volzhenin · 5b35f1b1 · aa04f16f · aa04f16f · aa04f16f
Commit aa04f16f authored Feb 26, 2024 by Konstantin Volzhenin
Showing 4 changed files with 16 additions and 10 deletions

installation.rst docs/source/installation.rst +2 -0

requirements.txt requirements.txt +2 -0

create_dataset.py senseppi/commands/create_dataset.py +9 -9

setup.py setup.py +3 -1

No files found.
--- a/docs/source/installation.rst
+++ b/docs/source/installation.rst
@@ -34,3 +34,4 @@ Additionally, required packages include:
 - torchmetrics
 - biopython
 - fair-esm
+- pandarallel
\ No newline at end of file
--- a/requirements.txt
+++ b/requirements.txt
@@ -12,3 +12,4 @@ pytorch-lightning==1.9.0
 torchmetrics
 biopython
 fair-esm
+pandarallel
\ No newline at end of file
--- a/senseppi/commands/create_dataset.py
+++ b/senseppi/commands/create_dataset.py
@@ -11,7 +11,7 @@ import gzip
 import shutil
 import random
 from ..network_utils import get_string_url, DOWNLOAD_LINK_STRING
-
+from pandarallel import pandarallel

 def _count_generator(reader):
    b = reader(1024 * 1024)
@@ -182,7 +182,7 @@ class STRINGDatasetCreation:
        interactions = pd.read_csv("protein.pairs_{}.tsv.tmp".format(self.species), sep='\t')

        logging.info('Generating negative pairs.')
-        tqdm.pandas()
+        pandarallel.initialize(progress_bar=True)

        proteins1 = random.choices(proteins, k=len(interactions) * 12)
        proteins2 = random.choices(proteins, k=len(interactions) * 12)
@@ -191,7 +191,7 @@ class STRINGDatasetCreation:
        logging.info('Negative pairs generated. Filtering out duplicates.')

        # Make protein1 and protein2 in alphabetical order
-        negative_pairs['protein1'], negative_pairs['protein2'] = zip(*negative_pairs.progress_apply(
+        negative_pairs['protein1'], negative_pairs['protein2'] = zip(*negative_pairs.parallel_apply(
            lambda x: (x['protein1'], x['protein2']) if x['protein1'] < x['protein2'] else (
                x['protein2'], x['protein1']), axis=1))
        negative_pairs = negative_pairs.drop_duplicates()
@@ -199,17 +199,17 @@ class STRINGDatasetCreation:
        logging.info('Duplicates filtered out. Filtering out pairs that are already in the positive interactions file.')

        negative_pairs = negative_pairs[
-            ~negative_pairs.progress_apply(lambda x: len(interactions[(interactions['protein1'] == x[
+            ~negative_pairs.parallel_apply(lambda x: len(interactions[(interactions['protein1'] == x[
                'protein1']) & (interactions['protein2'] == x['protein2'])]) > 0, axis=1)]
        negative_pairs = negative_pairs[
-            ~negative_pairs.progress_apply(lambda x: len(interactions[(interactions['protein1'] == x[
+            ~negative_pairs.parallel_apply(lambda x: len(interactions[(interactions['protein1'] == x[
                'protein2']) & (interactions['protein2'] == x['protein1'])]) > 0, axis=1)]

        logging.info(
            'Pairs that are already in the positive interactions file filtered out. Filtering out pairs that are in '
            'the same cluster with proteins interacting with a given one already.')

-        negative_pairs = negative_pairs[~negative_pairs.progress_apply(
+        negative_pairs = negative_pairs[~negative_pairs.parallel_apply(
            lambda x: clusters_preprocessed[x['protein2']] in [clusters_preprocessed[i] for i in
                                                               interactions[interactions['protein1'] == x['protein1']][
                                                                   'protein2'].unique()], axis=1)]
@@ -253,10 +253,10 @@ def add_args(parser):
                        help="The Taxon identifier of the organism of interest.")
    parser.add_argument("--interactions", type=str, default=None,
                        help="The physical links (full) file from STRING for the "
-                             "organism of interest.")
+                             "organism of interest. If not provided, the file will be downloaded from STRING.")
    parser.add_argument("--sequences", type=str, default=None,
-                        help="The sequences file downloaded from the same page of STRING. "
-                             "For both files see https://string-db.org/cgi/download")
+                        help="The fasta file with sequences. If not provided, it will be downloaded from "
+                             "STRING. For both files see https://string-db.org/cgi/download")
    parser.add_argument("--not_remove_long_short_proteins", action='store_true',
                        help="If specified, does not remove proteins "
                             "shorter than --min_length and longer than --max_length. "

--- a/setup.py
+++ b/setup.py
@@ -42,6 +42,7 @@ setup(
        "pytorch-lightning==1.9.0",
        "torchmetrics",
        "biopython",
-        "fair-esm"
+        "fair-esm",
+        "pandarallel",
    ],
 )
\ No newline at end of file