0.6.9 create_dataset improved with pandarallel

parent 5b35f1b1
......@@ -33,4 +33,5 @@ Additionally, required packages include:
- pytorch-lightning==1.9.0
- torchmetrics
- biopython
- fair-esm
\ No newline at end of file
- fair-esm
- pandarallel
\ No newline at end of file
......@@ -11,4 +11,5 @@ scikit-learn
pytorch-lightning==1.9.0
torchmetrics
biopython
fair-esm
\ No newline at end of file
fair-esm
pandarallel
\ No newline at end of file
......@@ -11,7 +11,7 @@ import gzip
import shutil
import random
from ..network_utils import get_string_url, DOWNLOAD_LINK_STRING
from pandarallel import pandarallel
def _count_generator(reader):
b = reader(1024 * 1024)
......@@ -182,7 +182,7 @@ class STRINGDatasetCreation:
interactions = pd.read_csv("protein.pairs_{}.tsv.tmp".format(self.species), sep='\t')
logging.info('Generating negative pairs.')
tqdm.pandas()
pandarallel.initialize(progress_bar=True)
proteins1 = random.choices(proteins, k=len(interactions) * 12)
proteins2 = random.choices(proteins, k=len(interactions) * 12)
......@@ -191,7 +191,7 @@ class STRINGDatasetCreation:
logging.info('Negative pairs generated. Filtering out duplicates.')
# Make protein1 and protein2 in alphabetical order
negative_pairs['protein1'], negative_pairs['protein2'] = zip(*negative_pairs.progress_apply(
negative_pairs['protein1'], negative_pairs['protein2'] = zip(*negative_pairs.parallel_apply(
lambda x: (x['protein1'], x['protein2']) if x['protein1'] < x['protein2'] else (
x['protein2'], x['protein1']), axis=1))
negative_pairs = negative_pairs.drop_duplicates()
......@@ -199,17 +199,17 @@ class STRINGDatasetCreation:
logging.info('Duplicates filtered out. Filtering out pairs that are already in the positive interactions file.')
negative_pairs = negative_pairs[
~negative_pairs.progress_apply(lambda x: len(interactions[(interactions['protein1'] == x[
~negative_pairs.parallel_apply(lambda x: len(interactions[(interactions['protein1'] == x[
'protein1']) & (interactions['protein2'] == x['protein2'])]) > 0, axis=1)]
negative_pairs = negative_pairs[
~negative_pairs.progress_apply(lambda x: len(interactions[(interactions['protein1'] == x[
~negative_pairs.parallel_apply(lambda x: len(interactions[(interactions['protein1'] == x[
'protein2']) & (interactions['protein2'] == x['protein1'])]) > 0, axis=1)]
logging.info(
'Pairs that are already in the positive interactions file filtered out. Filtering out pairs that are in '
'the same cluster with proteins interacting with a given one already.')
negative_pairs = negative_pairs[~negative_pairs.progress_apply(
negative_pairs = negative_pairs[~negative_pairs.parallel_apply(
lambda x: clusters_preprocessed[x['protein2']] in [clusters_preprocessed[i] for i in
interactions[interactions['protein1'] == x['protein1']][
'protein2'].unique()], axis=1)]
......@@ -253,10 +253,10 @@ def add_args(parser):
help="The Taxon identifier of the organism of interest.")
parser.add_argument("--interactions", type=str, default=None,
help="The physical links (full) file from STRING for the "
"organism of interest.")
"organism of interest. If not provided, the file will be downloaded from STRING.")
parser.add_argument("--sequences", type=str, default=None,
help="The sequences file downloaded from the same page of STRING. "
"For both files see https://string-db.org/cgi/download")
help="The fasta file with sequences. If not provided, it will be downloaded from "
"STRING. For both files see https://string-db.org/cgi/download")
parser.add_argument("--not_remove_long_short_proteins", action='store_true',
help="If specified, does not remove proteins "
"shorter than --min_length and longer than --max_length. "
......
......@@ -42,6 +42,7 @@ setup(
"pytorch-lightning==1.9.0",
"torchmetrics",
"biopython",
"fair-esm"
"fair-esm",
"pandarallel",
],
)
\ No newline at end of file
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment