0.1.4

Datasets are now taken from STRING v12.0. model.py: the batch_size and lr defaults are now 32 and 1e-4, respectively. Commands are updated for the new version of STRING.

0.1.4
Datasets are now taken from STRING v12.0. model.py: the batch_size and lr defaults are now 32 and 1e-4, respectively. Commands are updated for the new version of STRING.
a3873ec7 · Konstantin Volzhenin · 390ebdd7 · 390ebdd7 · 390ebdd7 · a3873ec7
Commit a3873ec7 authored Jul 26, 2023 by Konstantin Volzhenin
20 changed files
--- a/data/string_species/protein.pairs_176946.tsv
+++ b/data/string_species/protein.pairs_176946.tsv
--- a/data/string_species/protein.pairs_7029.tsv
+++ b/data/string_species/protein.pairs_7029.tsv
--- a/data/string_species/protein.pairs_7029_aphid.tsv
+++ b/data/string_species/protein.pairs_7029_aphid.tsv
--- a/data/string_species/protein.pairs_8663_snake.tsv
+++ b/data/string_species/protein.pairs_8663_snake.tsv
--- a/data/string_species/protein.pairs_9031_chicken.tsv
+++ b/data/string_species/protein.pairs_9031_chicken.tsv
--- a/data/string_species/protein.pairs_9796.tsv
+++ b/data/string_species/protein.pairs_9796.tsv
--- a/data/string_species/protein.pairs_9796_horse.tsv
+++ b/data/string_species/protein.pairs_9796_horse.tsv
--- a/data/string_species/protein.pairs_9913_cow.tsv
+++ b/data/string_species/protein.pairs_9913_cow.tsv
--- a/data/string_species/sequences_176946.fasta
+++ b/data/string_species/sequences_176946.fasta
--- a/data/string_species/sequences_7029.fasta
+++ b/data/string_species/sequences_7029.fasta
--- a/data/string_species/sequences_7029_aphid.fasta
+++ b/data/string_species/sequences_7029_aphid.fasta
--- a/data/string_species/sequences_8663_snake.fasta
+++ b/data/string_species/sequences_8663_snake.fasta
--- a/data/string_species/sequences_9031_chicken.fasta
+++ b/data/string_species/sequences_9031_chicken.fasta
--- a/data/string_species/sequences_9796.fasta
+++ b/data/string_species/sequences_9796.fasta
--- a/data/string_species/sequences_9796_horse.fasta
+++ b/data/string_species/sequences_9796_horse.fasta
--- a/data/string_species/sequences_9913_cow.fasta
+++ b/data/string_species/sequences_9913_cow.fasta
--- a/senseppi/__init__.py
+++ b/senseppi/__init__.py
-__version__ = "0.1.3"
+__version__ = "0.1.4"
 __author__ = "Konstantin Volzhenin"

 from . import model, commands, esm2_model, dataset, utils, network_utils

--- a/senseppi/commands/string_dataset_create.py
+++ b/senseppi/commands/string_dataset_create.py
@@ -5,11 +5,12 @@ from Bio import SeqIO
 import logging
 import argparse
 import subprocess
+from urllib.error import HTTPError
 import wget
 import gzip
 import shutil
 import random
-from ..network_utils import get_string_url
+from ..network_utils import get_string_url, DOWNLOAD_LINK_STRING


 def _count_generator(reader):
@@ -284,21 +285,29 @@ def main(params):
    if params.interactions is None or params.sequences is None:
        logging.info('One or both of the files are not specified (interactions or sequences). '
                     'Downloading from STRING...')
-    _, version = get_string_url()

-    url = "https://stringdb-static.org/download/protein.physical.links.full.v{0}/{1}.protein.physical.links.full.v{0}.txt.gz".format(version, params.species)
-    string_file_name_links = "{1}.protein.physical.links.full.v{0}.txt".format(version, params.species)
-    wget.download(url, out=string_file_name_links+'.gz')
-    with gzip.open(string_file_name_links+'.gz', 'rb') as f_in:
-        with open(string_file_name_links, 'wb') as f_out:
-            shutil.copyfileobj(f_in, f_out)
-
-    url = "https://stringdb-static.org/download/protein.sequences.v{0}/{1}.protein.sequences.v{0}.fa.gz".format(version, params.species)
-    string_file_name_seqs = "{1}.protein.sequences.v{0}.fa".format(version, params.species)
-    wget.download(url, out=string_file_name_seqs+'.gz')
-    with gzip.open(string_file_name_seqs+'.gz', 'rb') as f_in:
-        with open(string_file_name_seqs, 'wb') as f_out:
-            shutil.copyfileobj(f_in, f_out)
+    _, version = get_string_url()
+    logging.info('STRING version: {}'.format(version))
+
+    try:
+        url = "{0}protein.physical.links.full.v{1}/{2}.protein.physical.links.full.v{1}.txt.gz".format(DOWNLOAD_LINK_STRING, version, params.species)
+        string_file_name_links = "{1}.protein.physical.links.full.v{0}.txt".format(version, params.species)
+        wget.download(url, out=string_file_name_links+'.gz')
+        with gzip.open(string_file_name_links+'.gz', 'rb') as f_in:
+            with open(string_file_name_links, 'wb') as f_out:
+                shutil.copyfileobj(f_in, f_out)
+
+        url = "{0}protein.sequences.v{1}/{2}.protein.sequences.v{1}.fa.gz".format(DOWNLOAD_LINK_STRING, version, params.species)
+        string_file_name_seqs = "{1}.protein.sequences.v{0}.fa".format(version, params.species)
+        wget.download(url, out=string_file_name_seqs+'.gz')
+        with gzip.open(string_file_name_seqs+'.gz', 'rb') as f_in:
+            with open(string_file_name_seqs, 'wb') as f_out:
+                shutil.copyfileobj(f_in, f_out)
+    except HTTPError:
+        raise Exception('The files are not available for the specified species. '
+                     'There might be two reasons for that: \n '
+                     '1) the species is not available in STRING. Please check the STRING species list to verify. \n'
+                     '2) the download link has changed. Please raise an issue in the repository. ')

    os.remove(string_file_name_seqs+'.gz')
    os.remove(string_file_name_links+'.gz')

--- a/senseppi/model.py
+++ b/senseppi/model.py
@@ -202,8 +202,9 @@ class BaselineModel(pl.LightningModule):
    @staticmethod
    def add_model_specific_args(parent_parser):
        parser = parent_parser.add_argument_group("Args_model")
-        parser.add_argument("--lr", type=float, default=1e-3, help="Learning rate for training.")
-        parser.add_argument("--batch_size", type=int, default=64, help="Batch size for training/testing.")
+        parser.add_argument("--lr", type=float, default=1e-4, help="Learning rate for training. "
+                                                                   "Cosine warmup will be applied.")
+        parser.add_argument("--batch_size", type=int, default=32, help="Batch size for training/testing.")
        parser.add_argument("--encoder_features", type=int, default=2560,
                            help="Number of features in the encoder "
                                 "(Corresponds to the dimentionality of per-token embedding of ESM2 model.) "

--- a/senseppi/network_utils.py
+++ b/senseppi/network_utils.py
@@ -12,6 +12,7 @@ import requests
 import gzip
 import shutil

+DOWNLOAD_LINK_STRING = "https://stringdb-downloads.org/download/"

 def generate_pairs_string(fasta_file, output_file, with_self=False, delete_proteins=None):
    ids = []
@@ -53,46 +54,6 @@ def generate_pairs_string(fasta_file, output_file, with_self=False, delete_prote
    pairs.to_csv(output_file, sep='\t', index=False, header=False)


-def generate_dscript_gene_names(file_path,
-                                only_positives=True,
-                                species='9606'):
-    data = pd.read_csv(file_path, delimiter='\t', names=['seq1', 'seq2', 'label'])
-
-    if only_positives:
-        train_ids = set(data['seq1'][data['label'] == 1].values).union(set(data['seq2'][data['label'] == 1].values))
-    else:
-        train_ids = set(data['seq1'].values).union(set(data['seq2'].values))
-    # train_ids = [train_id.split('.')[1] for train_id in train_ids]
-    train_ids = [train_id for train_id in train_ids if train_id.startswith(species)]
-
-    if len(train_ids) == 0:
-        return None
-    # Write a request to STRING API to get the gene names for the ids in train_ids
-    # Split the request into chunks of 100 ids and make a pause of 1 second between each chunk
-    chunk_size = 300
-    genes_string = pd.DataFrame()
-    for i in tqdm(range(0, len(train_ids), chunk_size)):
-        chunk = deepcopy(train_ids[i:i + chunk_size])
-        url = 'https://string-db.org/api/tsv/get_string_ids?identifiers=%s&species={}'.format(species) % '%0d'.join(
-            [c.split('.')[-1] for c in chunk])
-        response = urllib.request.urlopen(url)
-        data = response.read()
-        text = data.decode('utf-8')
-        text = text.split('\n')
-        # text = [t for t in text if t]
-        text = [t.split('\t') for t in text]
-        df = pd.DataFrame(text,
-                          columns=['queryIndex', 'stringId', 'ncbiTaxonId', 'taxonName', 'preferredName', 'annotation'])
-        # Remove line if queryIndex is not int
-        df = df[df['queryIndex'].apply(lambda x: x.isdigit())]
-        df['QueryString'] = df['queryIndex'].apply(lambda x: chunk[int(x)])
-        # add stringId and preferredName to genes_string
-        genes_string = pd.concat([genes_string, df[['QueryString', 'preferredName']]])
-        # time.sleep(0.2)
-
-    return genes_string
-
-
 def get_names_from_string(ids, species):
    string_api_url, _ = get_string_url()
    params = {
@@ -124,8 +85,7 @@ def get_interactions_from_string(gene_names, species=9606, add_nodes=10, require
    # Download protein sequences for given species if not downloaded yet
    if not os.path.isfile('{}.protein.sequences.v{}.fa'.format(species, version)):
        print('Downloading protein sequences')
-        url = 'https://stringdb-static.org/download/protein.sequences.v{}/{}.protein.sequences.v{}.fa.gz'.format(
-            version, species, version)
+        url = '{0}protein.sequences.v{1}/{2}.protein.sequences.v{1}.fa.gz'.format(DOWNLOAD_LINK_STRING, version, species)
        urllib.request.urlretrieve(url, '{}.protein.sequences.v{}.fa.gz'.format(species, version))
        print('Unzipping protein sequences')
        with gzip.open('{}.protein.sequences.v{}.fa.gz'.format(species, version), 'rb') as f_in:
@@ -185,9 +145,5 @@ def get_interactions_from_string(gene_names, species=9606, add_nodes=10, require
                SeqIO.write(record, f, "fasta")
    string_interactions.to_csv('string_interactions.tsv', sep='\t', index=False)

-
 if __name__ == '__main__':
-    print(generate_dscript_gene_names(
-        file_path=os.path.join('..', 'STRING_full', 'preprocessed', 'protein.actions_full.tsv'),
-        only_positives=True,
-        species='362663'))
\ No newline at end of file
+    get_interactions_from_string('RFC5')
\ No newline at end of file