First fully assembled version 0.1.0

Comments, arguments, and outdated code still need to be cleaned up; this version also needs further testing.
parent 59a6e327
This source diff could not be displayed because it is too large. You can view the blob instead.
This source diff could not be displayed because it is too large. You can view the blob instead.
This source diff could not be displayed because it is too large. You can view the blob instead.
This source diff could not be displayed because it is too large. You can view the blob instead.
This source diff could not be displayed because it is too large. You can view the blob instead.
This source diff could not be displayed because it is too large. You can view the blob instead.
...@@ -37,7 +37,7 @@ def test(params): ...@@ -37,7 +37,7 @@ def test(params):
def add_args(parser): def add_args(parser):
parser = add_general_args(parser) parser = add_general_args(parser)
predict_args = parser.add_argument_group(title="Predict args") test_args = parser.add_argument_group(title="Predict args")
parser._action_groups[0].add_argument("model_path", type=str, parser._action_groups[0].add_argument("model_path", type=str,
help="A path to .ckpt file that contains weights to a pretrained model.") help="A path to .ckpt file that contains weights to a pretrained model.")
parser._action_groups[0].add_argument("pairs_file", type=str, default=None, parser._action_groups[0].add_argument("pairs_file", type=str, default=None,
...@@ -47,10 +47,10 @@ def add_args(parser): ...@@ -47,10 +47,10 @@ def add_args(parser):
help="FASTA file on which to extract the ESM2 " help="FASTA file on which to extract the ESM2 "
"representations and then evaluate.", "representations and then evaluate.",
) )
predict_args.add_argument("-o", "--output", type=str, default="test_metrics", test_args.add_argument("-o", "--output", type=str, default="test_metrics",
help="A path to a file where the test metrics will be saved. " help="A path to a file where the test metrics will be saved. "
"(.tsv format will be added automatically)") "(.tsv format will be added automatically)")
predict_args.add_argument("--crop_data_to_model_lims", action="store_true", test_args.add_argument("--crop_data_to_model_lims", action="store_true",
help="If set, the data will be cropped to the limits of the model: " help="If set, the data will be cropped to the limits of the model: "
"evaluations will be done only for proteins >50aa and <800aa.") "evaluations will be done only for proteins >50aa and <800aa.")
......
import pytorch_lightning as pl import pytorch_lightning as pl
from pytorch_lightning.callbacks import ModelCheckpoint from pytorch_lightning.callbacks import ModelCheckpoint
import pathlib import pathlib
import argparse
from ..utils import add_general_args from ..utils import add_general_args
from ..model import SensePPIModel from ..model import SensePPIModel
from ..dataset import PairSequenceData from ..dataset import PairSequenceData
......
...@@ -13,45 +13,44 @@ import gzip ...@@ -13,45 +13,44 @@ import gzip
import shutil import shutil
def generate_pairs_string(fasta_file, output_file, with_self=False, delete_proteins=None):
    """Generate all unordered pairs of protein IDs from a FASTA file and
    label them with STRING interaction scores.

    Reads ``string_interactions.tsv`` from the current working directory
    (expected columns include ``stringId_A``/``stringId_B``,
    ``preferredName_A``/``preferredName_B`` and ``score``), left-joins the
    scores onto the generated pairs (pairs absent from the table get label 0),
    and writes the result to ``output_file`` as a header-less TSV with
    columns ``seq1``, ``seq2``, ``label``.

    Args:
        fasta_file: Path to a FASTA file; record IDs are used as protein IDs.
        output_file: Destination path for the labeled pairs TSV.
        with_self: If True, include self-pairs ``(id, id)`` as well.
        delete_proteins: Optional list of preferred gene names whose STRING
            IDs are removed from both columns of the pair list. Raises
            ``KeyError`` if a name is not present in the interactions table.

    Returns:
        None. The labeled pairs are written to ``output_file``.
    """
    from itertools import combinations, combinations_with_replacement

    ids = [record.id for record in SeqIO.parse(fasta_file, "fasta")]

    # Generate unordered pairs directly. This replaces the previous approach
    # of producing every ordered pair (product/permutations) and then
    # filtering mirrored duplicates with an O(P^2) list-membership scan,
    # while yielding exactly the same pairs in the same order.
    if with_self:
        pair_list = list(combinations_with_replacement(ids, 2))
    else:
        pair_list = list(combinations(ids, 2))

    pairs = pd.DataFrame(pair_list, columns=['seq1', 'seq2'])

    data = pd.read_csv('string_interactions.tsv', delimiter='\t')

    # Map preferred gene names -> STRING ids; used only to resolve the
    # delete_proteins names below.
    ids_dict = dict(zip(data['preferredName_A'], data['stringId_A']))
    ids_dict.update(dict(zip(data['preferredName_B'], data['stringId_B'])))

    data = data[['stringId_A', 'stringId_B', 'score']]
    data.columns = ['seq1', 'seq2', 'label']

    # Pairs with no entry in the STRING table are treated as non-interacting.
    pairs = pairs.merge(data, on=['seq1', 'seq2'], how='left').fillna(0)

    if delete_proteins is not None:
        print('Labels removed: ', delete_proteins)
        string_ids_to_delete = [ids_dict[label] for label in delete_proteins]
        print('String ids to delete: ', string_ids_to_delete)
        pairs = pairs[~pairs['seq1'].isin(string_ids_to_delete)]
        pairs = pairs[~pairs['seq2'].isin(string_ids_to_delete)]

    pairs.to_csv(output_file, sep='\t', index=False, header=False)
def generate_dscript_gene_names(file_path, def generate_dscript_gene_names(file_path,
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment