First fully assembled version 0.1.0

Comments, arguments and outdated code have to be cleaned, the version still needs some testing
parent 59a6e327
This source diff could not be displayed because it is too large. You can view the blob instead.
This source diff could not be displayed because it is too large. You can view the blob instead.
This source diff could not be displayed because it is too large. You can view the blob instead.
This source diff could not be displayed because it is too large. You can view the blob instead.
This source diff could not be displayed because it is too large. You can view the blob instead.
This source diff could not be displayed because it is too large. You can view the blob instead.
......@@ -37,7 +37,7 @@ def test(params):
def add_args(parser):
parser = add_general_args(parser)
predict_args = parser.add_argument_group(title="Predict args")
test_args = parser.add_argument_group(title="Predict args")
parser._action_groups[0].add_argument("model_path", type=str,
help="A path to .ckpt file that contains weights to a pretrained model.")
parser._action_groups[0].add_argument("pairs_file", type=str, default=None,
......@@ -47,10 +47,10 @@ def add_args(parser):
help="FASTA file on which to extract the ESM2 "
"representations and then evaluate.",
)
predict_args.add_argument("-o", "--output", type=str, default="test_metrics",
test_args.add_argument("-o", "--output", type=str, default="test_metrics",
help="A path to a file where the test metrics will be saved. "
"(.tsv format will be added automatically)")
predict_args.add_argument("--crop_data_to_model_lims", action="store_true",
test_args.add_argument("--crop_data_to_model_lims", action="store_true",
help="If set, the data will be cropped to the limits of the model: "
"evaluations will be done only for proteins >50aa and <800aa.")
......
import pytorch_lightning as pl
from pytorch_lightning.callbacks import ModelCheckpoint
import pathlib
import argparse
from ..utils import add_general_args
from ..model import SensePPIModel
from ..dataset import PairSequenceData
......
......@@ -13,45 +13,44 @@ import gzip
import shutil
def generate_pairs(fasta_file, mode='all_to_all', with_self=False, delete_proteins=None):
def generate_pairs_string(fasta_file, output_file, with_self=False, delete_proteins=None):
ids = []
for record in SeqIO.parse(fasta_file, "fasta"):
ids.append(record.id)
if mode == 'all_to_all':
if with_self:
all_pairs = [p for p in product(ids, repeat=2)]
else:
all_pairs = [p for p in permutations(ids, 2)]
if with_self:
all_pairs = [p for p in product(ids, repeat=2)]
else:
all_pairs = [p for p in permutations(ids, 2)]
pairs = []
for p in all_pairs:
if (p[1], p[0]) not in pairs and (p[0], p[1]) not in pairs:
pairs.append(p)
pairs = []
for p in all_pairs:
if (p[1], p[0]) not in pairs and (p[0], p[1]) not in pairs:
pairs.append(p)
pairs = pd.DataFrame(pairs, columns=['seq1', 'seq2'])
pairs = pd.DataFrame(pairs, columns=['seq1', 'seq2'])
data = pd.read_csv('string_interactions.tsv', delimiter='\t')
data = pd.read_csv('string_interactions.tsv', delimiter='\t')
# Creating a dictionary of string ids and gene names
ids_dict = dict(zip(data['preferredName_A'], data['stringId_A']))
ids_dict.update(dict(zip(data['preferredName_B'], data['stringId_B'])))
# Creating a dictionary of string ids and gene names
ids_dict = dict(zip(data['preferredName_A'], data['stringId_A']))
ids_dict.update(dict(zip(data['preferredName_B'], data['stringId_B'])))
data = data[['stringId_A', 'stringId_B', 'score']]
data.columns = ['seq1', 'seq2', 'label']
data = data[['stringId_A', 'stringId_B', 'score']]
data.columns = ['seq1', 'seq2', 'label']
pairs = pairs.merge(data, on=['seq1', 'seq2'], how='left').fillna(0)
pairs = pairs.merge(data, on=['seq1', 'seq2'], how='left').fillna(0)
if delete_proteins is not None:
print('Labels removed: ', delete_proteins)
string_ids_to_delete = []
for label in delete_proteins:
string_ids_to_delete.append(ids_dict[label])
print('String ids to delete: ', string_ids_to_delete)
pairs = pairs[~pairs['seq1'].isin(string_ids_to_delete)]
pairs = pairs[~pairs['seq2'].isin(string_ids_to_delete)]
if delete_proteins is not None:
print('Labels removed: ', delete_proteins)
string_ids_to_delete = []
for label in delete_proteins:
string_ids_to_delete.append(ids_dict[label])
print('String ids to delete: ', string_ids_to_delete)
pairs = pairs[~pairs['seq1'].isin(string_ids_to_delete)]
pairs = pairs[~pairs['seq2'].isin(string_ids_to_delete)]
pairs.to_csv('protein.actions.tsv', sep='\t', index=False, header=False)
pairs.to_csv(output_file, sep='\t', index=False, header=False)
def generate_dscript_gene_names(file_path,
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment