0.1.4

Datasets are now taken from STRING v12.0
model.py - batch_size and lr defaults are 32 and 1e-4 respectively
commands are updated to work with the new version of STRING
parent 390ebdd7
This source diff could not be displayed because it is too large. You can view the blob instead.
This source diff could not be displayed because it is too large. You can view the blob instead.
This source diff could not be displayed because it is too large. You can view the blob instead.
This source diff could not be displayed because it is too large. You can view the blob instead.
This source diff could not be displayed because it is too large. You can view the blob instead.
This source diff could not be displayed because it is too large. You can view the blob instead.
This source diff could not be displayed because it is too large. You can view the blob instead.
This source diff could not be displayed because it is too large. You can view the blob instead.
This source diff could not be displayed because it is too large. You can view the blob instead.
This source diff could not be displayed because it is too large. You can view the blob instead.
This source diff could not be displayed because it is too large. You can view the blob instead.
This source diff could not be displayed because it is too large. You can view the blob instead.
This source diff could not be displayed because it is too large. You can view the blob instead.
This source diff could not be displayed because it is too large. You can view the blob instead.
This source diff could not be displayed because it is too large. You can view the blob instead.
This source diff could not be displayed because it is too large. You can view the blob instead.
__version__ = "0.1.3" __version__ = "0.1.4"
__author__ = "Konstantin Volzhenin" __author__ = "Konstantin Volzhenin"
from . import model, commands, esm2_model, dataset, utils, network_utils from . import model, commands, esm2_model, dataset, utils, network_utils
......
...@@ -5,11 +5,12 @@ from Bio import SeqIO ...@@ -5,11 +5,12 @@ from Bio import SeqIO
import logging import logging
import argparse import argparse
import subprocess import subprocess
from urllib.error import HTTPError
import wget import wget
import gzip import gzip
import shutil import shutil
import random import random
from ..network_utils import get_string_url from ..network_utils import get_string_url, DOWNLOAD_LINK_STRING
def _count_generator(reader): def _count_generator(reader):
...@@ -284,21 +285,29 @@ def main(params): ...@@ -284,21 +285,29 @@ def main(params):
if params.interactions is None or params.sequences is None: if params.interactions is None or params.sequences is None:
logging.info('One or both of the files are not specified (interactions or sequences). ' logging.info('One or both of the files are not specified (interactions or sequences). '
'Downloading from STRING...') 'Downloading from STRING...')
_, version = get_string_url() _, version = get_string_url()
logging.info('STRING version: {}'.format(version))
url = "https://stringdb-static.org/download/protein.physical.links.full.v{0}/{1}.protein.physical.links.full.v{0}.txt.gz".format(version, params.species) try:
url = "{0}protein.physical.links.full.v{1}/{2}.protein.physical.links.full.v{1}.txt.gz".format(DOWNLOAD_LINK_STRING, version, params.species)
string_file_name_links = "{1}.protein.physical.links.full.v{0}.txt".format(version, params.species) string_file_name_links = "{1}.protein.physical.links.full.v{0}.txt".format(version, params.species)
wget.download(url, out=string_file_name_links+'.gz') wget.download(url, out=string_file_name_links+'.gz')
with gzip.open(string_file_name_links+'.gz', 'rb') as f_in: with gzip.open(string_file_name_links+'.gz', 'rb') as f_in:
with open(string_file_name_links, 'wb') as f_out: with open(string_file_name_links, 'wb') as f_out:
shutil.copyfileobj(f_in, f_out) shutil.copyfileobj(f_in, f_out)
url = "https://stringdb-static.org/download/protein.sequences.v{0}/{1}.protein.sequences.v{0}.fa.gz".format(version, params.species) url = "{0}protein.sequences.v{1}/{2}.protein.sequences.v{1}.fa.gz".format(DOWNLOAD_LINK_STRING, version, params.species)
string_file_name_seqs = "{1}.protein.sequences.v{0}.fa".format(version, params.species) string_file_name_seqs = "{1}.protein.sequences.v{0}.fa".format(version, params.species)
wget.download(url, out=string_file_name_seqs+'.gz') wget.download(url, out=string_file_name_seqs+'.gz')
with gzip.open(string_file_name_seqs+'.gz', 'rb') as f_in: with gzip.open(string_file_name_seqs+'.gz', 'rb') as f_in:
with open(string_file_name_seqs, 'wb') as f_out: with open(string_file_name_seqs, 'wb') as f_out:
shutil.copyfileobj(f_in, f_out) shutil.copyfileobj(f_in, f_out)
except HTTPError:
raise Exception('The files are not available for the specified species. '
'There might be two reasons for that: \n '
'1) the species is not available in STRING. Please check the STRING species list to verify. \n'
'2) the download link has changed. Please raise an issue in the repository. ')
os.remove(string_file_name_seqs+'.gz') os.remove(string_file_name_seqs+'.gz')
os.remove(string_file_name_links+'.gz') os.remove(string_file_name_links+'.gz')
......
...@@ -202,8 +202,9 @@ class BaselineModel(pl.LightningModule): ...@@ -202,8 +202,9 @@ class BaselineModel(pl.LightningModule):
@staticmethod @staticmethod
def add_model_specific_args(parent_parser): def add_model_specific_args(parent_parser):
parser = parent_parser.add_argument_group("Args_model") parser = parent_parser.add_argument_group("Args_model")
parser.add_argument("--lr", type=float, default=1e-3, help="Learning rate for training.") parser.add_argument("--lr", type=float, default=1e-4, help="Learning rate for training. "
parser.add_argument("--batch_size", type=int, default=64, help="Batch size for training/testing.") "Cosine warmup will be applied.")
parser.add_argument("--batch_size", type=int, default=32, help="Batch size for training/testing.")
parser.add_argument("--encoder_features", type=int, default=2560, parser.add_argument("--encoder_features", type=int, default=2560,
help="Number of features in the encoder " help="Number of features in the encoder "
"(Corresponds to the dimentionality of per-token embedding of ESM2 model.) " "(Corresponds to the dimentionality of per-token embedding of ESM2 model.) "
......
...@@ -12,6 +12,7 @@ import requests ...@@ -12,6 +12,7 @@ import requests
import gzip import gzip
import shutil import shutil
DOWNLOAD_LINK_STRING = "https://stringdb-downloads.org/download/"
def generate_pairs_string(fasta_file, output_file, with_self=False, delete_proteins=None): def generate_pairs_string(fasta_file, output_file, with_self=False, delete_proteins=None):
ids = [] ids = []
...@@ -53,46 +54,6 @@ def generate_pairs_string(fasta_file, output_file, with_self=False, delete_prote ...@@ -53,46 +54,6 @@ def generate_pairs_string(fasta_file, output_file, with_self=False, delete_prote
pairs.to_csv(output_file, sep='\t', index=False, header=False) pairs.to_csv(output_file, sep='\t', index=False, header=False)
def generate_dscript_gene_names(file_path,
                                only_positives=True,
                                species='9606'):
    """Look up STRING preferred gene names for the protein ids in a pairs file.

    The file at ``file_path`` is read as a headerless tab-separated table with
    three columns (``seq1``, ``seq2``, ``label``). Ids belonging to ``species``
    are collected from both sequence columns and sent to the STRING
    ``get_string_ids`` endpoint in chunks.

    Args:
        file_path: Path to the tab-separated pairs file.
        only_positives: If True, only rows whose ``label`` equals 1 contribute
            ids; otherwise every row does.
        species: Taxon id used both to select ids (by prefix) and as the
            ``species`` query parameter of the request.

    Returns:
        A DataFrame with the columns ``QueryString`` (the id as it appears in
        the file) and ``preferredName`` (as returned by STRING), or ``None``
        when no id in the file belongs to ``species``.

    Raises:
        urllib.error.URLError: If a request to the STRING API fails.
    """
    pairs = pd.read_csv(file_path, delimiter='\t', names=['seq1', 'seq2', 'label'])
    if only_positives:
        pairs = pairs[pairs['label'] == 1]

    # Ids are assumed to look like '<taxon>.<protein>' (the request below keeps
    # only the part after the last '.'). Matching on '<taxon>.' instead of the
    # bare taxon keeps e.g. '96061.X' from being selected for species '9606'.
    species = str(species)
    species_prefix = species + '.'
    unique_ids = set(pairs['seq1'].values).union(pairs['seq2'].values)
    train_ids = [train_id for train_id in unique_ids if train_id.startswith(species_prefix)]
    if not train_ids:
        return None

    # Query the STRING API in chunks of `chunk_size` ids to keep URLs short.
    chunk_size = 300
    columns = ['queryIndex', 'stringId', 'ncbiTaxonId', 'taxonName', 'preferredName', 'annotation']
    frames = []
    for start in tqdm(range(0, len(train_ids), chunk_size)):
        chunk = train_ids[start:start + chunk_size]
        # The endpoint expects the identifiers separated by '%0d'.
        identifiers = '%0d'.join(c.split('.')[-1] for c in chunk)
        url = 'https://string-db.org/api/tsv/get_string_ids?identifiers={}&species={}'.format(
            identifiers, species)
        # Close the connection as soon as the payload has been read.
        with urllib.request.urlopen(url) as response:
            text = response.read().decode('utf-8')

        rows = [line.split('\t') for line in text.split('\n')]
        df = pd.DataFrame(rows, columns=columns)
        # Drop the header line and blank lines: their queryIndex is not a number.
        # The copy avoids assigning a new column on a view of the parsed frame.
        df = df[df['queryIndex'].apply(lambda x: x.isdigit())].copy()
        # queryIndex is the position of the id within the submitted chunk.
        df['QueryString'] = df['queryIndex'].apply(lambda x: chunk[int(x)])
        frames.append(df[['QueryString', 'preferredName']])

    # Concatenate once instead of growing a DataFrame inside the loop.
    return pd.concat(frames)
def get_names_from_string(ids, species): def get_names_from_string(ids, species):
string_api_url, _ = get_string_url() string_api_url, _ = get_string_url()
params = { params = {
...@@ -124,8 +85,7 @@ def get_interactions_from_string(gene_names, species=9606, add_nodes=10, require ...@@ -124,8 +85,7 @@ def get_interactions_from_string(gene_names, species=9606, add_nodes=10, require
# Download protein sequences for given species if not downloaded yet # Download protein sequences for given species if not downloaded yet
if not os.path.isfile('{}.protein.sequences.v{}.fa'.format(species, version)): if not os.path.isfile('{}.protein.sequences.v{}.fa'.format(species, version)):
print('Downloading protein sequences') print('Downloading protein sequences')
url = 'https://stringdb-static.org/download/protein.sequences.v{}/{}.protein.sequences.v{}.fa.gz'.format( url = '{0}protein.sequences.v{1}/{2}.protein.sequences.v{1}.fa.gz'.format(DOWNLOAD_LINK_STRING, version, species)
version, species, version)
urllib.request.urlretrieve(url, '{}.protein.sequences.v{}.fa.gz'.format(species, version)) urllib.request.urlretrieve(url, '{}.protein.sequences.v{}.fa.gz'.format(species, version))
print('Unzipping protein sequences') print('Unzipping protein sequences')
with gzip.open('{}.protein.sequences.v{}.fa.gz'.format(species, version), 'rb') as f_in: with gzip.open('{}.protein.sequences.v{}.fa.gz'.format(species, version), 'rb') as f_in:
...@@ -185,9 +145,5 @@ def get_interactions_from_string(gene_names, species=9606, add_nodes=10, require ...@@ -185,9 +145,5 @@ def get_interactions_from_string(gene_names, species=9606, add_nodes=10, require
SeqIO.write(record, f, "fasta") SeqIO.write(record, f, "fasta")
string_interactions.to_csv('string_interactions.tsv', sep='\t', index=False) string_interactions.to_csv('string_interactions.tsv', sep='\t', index=False)
if __name__ == '__main__': if __name__ == '__main__':
print(generate_dscript_gene_names( get_interactions_from_string('RFC5')
file_path=os.path.join('..', 'STRING_full', 'preprocessed', 'protein.actions_full.tsv'), \ No newline at end of file
only_positives=True,
species='362663'))
\ No newline at end of file
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment