0.6.9 create_dataset improved with pandarallel

parent 5b35f1b1
...@@ -33,4 +33,5 @@ Additionally, required packages include: ...@@ -33,4 +33,5 @@ Additionally, required packages include:
- pytorch-lightning==1.9.0 - pytorch-lightning==1.9.0
- torchmetrics - torchmetrics
- biopython - biopython
- fair-esm - fair-esm
\ No newline at end of file - pandarallel
\ No newline at end of file
...@@ -11,4 +11,5 @@ scikit-learn ...@@ -11,4 +11,5 @@ scikit-learn
pytorch-lightning==1.9.0 pytorch-lightning==1.9.0
torchmetrics torchmetrics
biopython biopython
fair-esm fair-esm
\ No newline at end of file pandarallel
\ No newline at end of file
...@@ -11,7 +11,7 @@ import gzip ...@@ -11,7 +11,7 @@ import gzip
import shutil import shutil
import random import random
from ..network_utils import get_string_url, DOWNLOAD_LINK_STRING from ..network_utils import get_string_url, DOWNLOAD_LINK_STRING
from pandarallel import pandarallel
def _count_generator(reader): def _count_generator(reader):
b = reader(1024 * 1024) b = reader(1024 * 1024)
...@@ -182,7 +182,7 @@ class STRINGDatasetCreation: ...@@ -182,7 +182,7 @@ class STRINGDatasetCreation:
interactions = pd.read_csv("protein.pairs_{}.tsv.tmp".format(self.species), sep='\t') interactions = pd.read_csv("protein.pairs_{}.tsv.tmp".format(self.species), sep='\t')
logging.info('Generating negative pairs.') logging.info('Generating negative pairs.')
tqdm.pandas() pandarallel.initialize(progress_bar=True)
proteins1 = random.choices(proteins, k=len(interactions) * 12) proteins1 = random.choices(proteins, k=len(interactions) * 12)
proteins2 = random.choices(proteins, k=len(interactions) * 12) proteins2 = random.choices(proteins, k=len(interactions) * 12)
...@@ -191,7 +191,7 @@ class STRINGDatasetCreation: ...@@ -191,7 +191,7 @@ class STRINGDatasetCreation:
logging.info('Negative pairs generated. Filtering out duplicates.') logging.info('Negative pairs generated. Filtering out duplicates.')
# Make protein1 and protein2 in alphabetical order # Make protein1 and protein2 in alphabetical order
negative_pairs['protein1'], negative_pairs['protein2'] = zip(*negative_pairs.progress_apply( negative_pairs['protein1'], negative_pairs['protein2'] = zip(*negative_pairs.parallel_apply(
lambda x: (x['protein1'], x['protein2']) if x['protein1'] < x['protein2'] else ( lambda x: (x['protein1'], x['protein2']) if x['protein1'] < x['protein2'] else (
x['protein2'], x['protein1']), axis=1)) x['protein2'], x['protein1']), axis=1))
negative_pairs = negative_pairs.drop_duplicates() negative_pairs = negative_pairs.drop_duplicates()
...@@ -199,17 +199,17 @@ class STRINGDatasetCreation: ...@@ -199,17 +199,17 @@ class STRINGDatasetCreation:
logging.info('Duplicates filtered out. Filtering out pairs that are already in the positive interactions file.') logging.info('Duplicates filtered out. Filtering out pairs that are already in the positive interactions file.')
negative_pairs = negative_pairs[ negative_pairs = negative_pairs[
~negative_pairs.progress_apply(lambda x: len(interactions[(interactions['protein1'] == x[ ~negative_pairs.parallel_apply(lambda x: len(interactions[(interactions['protein1'] == x[
'protein1']) & (interactions['protein2'] == x['protein2'])]) > 0, axis=1)] 'protein1']) & (interactions['protein2'] == x['protein2'])]) > 0, axis=1)]
negative_pairs = negative_pairs[ negative_pairs = negative_pairs[
~negative_pairs.progress_apply(lambda x: len(interactions[(interactions['protein1'] == x[ ~negative_pairs.parallel_apply(lambda x: len(interactions[(interactions['protein1'] == x[
'protein2']) & (interactions['protein2'] == x['protein1'])]) > 0, axis=1)] 'protein2']) & (interactions['protein2'] == x['protein1'])]) > 0, axis=1)]
logging.info( logging.info(
'Pairs that are already in the positive interactions file filtered out. Filtering out pairs that are in ' 'Pairs that are already in the positive interactions file filtered out. Filtering out pairs that are in '
'the same cluster with proteins interacting with a given one already.') 'the same cluster with proteins interacting with a given one already.')
negative_pairs = negative_pairs[~negative_pairs.progress_apply( negative_pairs = negative_pairs[~negative_pairs.parallel_apply(
lambda x: clusters_preprocessed[x['protein2']] in [clusters_preprocessed[i] for i in lambda x: clusters_preprocessed[x['protein2']] in [clusters_preprocessed[i] for i in
interactions[interactions['protein1'] == x['protein1']][ interactions[interactions['protein1'] == x['protein1']][
'protein2'].unique()], axis=1)] 'protein2'].unique()], axis=1)]
...@@ -253,10 +253,10 @@ def add_args(parser): ...@@ -253,10 +253,10 @@ def add_args(parser):
help="The Taxon identifier of the organism of interest.") help="The Taxon identifier of the organism of interest.")
parser.add_argument("--interactions", type=str, default=None, parser.add_argument("--interactions", type=str, default=None,
help="The physical links (full) file from STRING for the " help="The physical links (full) file from STRING for the "
"organism of interest.") "organism of interest. If not provided, the file will be downloaded from STRING.")
parser.add_argument("--sequences", type=str, default=None, parser.add_argument("--sequences", type=str, default=None,
help="The sequences file downloaded from the same page of STRING. " help="The fasta file with sequences. If not provided, it will be downloaded from "
"For both files see https://string-db.org/cgi/download") "STRING. For both files see https://string-db.org/cgi/download")
parser.add_argument("--not_remove_long_short_proteins", action='store_true', parser.add_argument("--not_remove_long_short_proteins", action='store_true',
help="If specified, does not remove proteins " help="If specified, does not remove proteins "
"shorter than --min_length and longer than --max_length. " "shorter than --min_length and longer than --max_length. "
......
...@@ -42,6 +42,7 @@ setup( ...@@ -42,6 +42,7 @@ setup(
"pytorch-lightning==1.9.0", "pytorch-lightning==1.9.0",
"torchmetrics", "torchmetrics",
"biopython", "biopython",
"fair-esm" "fair-esm",
"pandarallel",
], ],
) )
\ No newline at end of file
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment