First fully assembled version 0.1.0

Comments, arguments, and outdated code still need to be cleaned up; this version also needs further testing.
parent 59a6e327
This source diff could not be displayed because it is too large. You can view the blob instead.
This source diff could not be displayed because it is too large. You can view the blob instead.
This source diff could not be displayed because it is too large. You can view the blob instead.
This source diff could not be displayed because it is too large. You can view the blob instead.
This source diff could not be displayed because it is too large. You can view the blob instead.
This source diff could not be displayed because it is too large. You can view the blob instead.
...@@ -37,7 +37,7 @@ def test(params): ...@@ -37,7 +37,7 @@ def test(params):
def add_args(parser): def add_args(parser):
parser = add_general_args(parser) parser = add_general_args(parser)
predict_args = parser.add_argument_group(title="Predict args") test_args = parser.add_argument_group(title="Predict args")
parser._action_groups[0].add_argument("model_path", type=str, parser._action_groups[0].add_argument("model_path", type=str,
help="A path to .ckpt file that contains weights to a pretrained model.") help="A path to .ckpt file that contains weights to a pretrained model.")
parser._action_groups[0].add_argument("pairs_file", type=str, default=None, parser._action_groups[0].add_argument("pairs_file", type=str, default=None,
...@@ -47,10 +47,10 @@ def add_args(parser): ...@@ -47,10 +47,10 @@ def add_args(parser):
help="FASTA file on which to extract the ESM2 " help="FASTA file on which to extract the ESM2 "
"representations and then evaluate.", "representations and then evaluate.",
) )
predict_args.add_argument("-o", "--output", type=str, default="test_metrics", test_args.add_argument("-o", "--output", type=str, default="test_metrics",
help="A path to a file where the test metrics will be saved. " help="A path to a file where the test metrics will be saved. "
"(.tsv format will be added automatically)") "(.tsv format will be added automatically)")
predict_args.add_argument("--crop_data_to_model_lims", action="store_true", test_args.add_argument("--crop_data_to_model_lims", action="store_true",
help="If set, the data will be cropped to the limits of the model: " help="If set, the data will be cropped to the limits of the model: "
"evaluations will be done only for proteins >50aa and <800aa.") "evaluations will be done only for proteins >50aa and <800aa.")
......
import pytorch_lightning as pl import pytorch_lightning as pl
from pytorch_lightning.callbacks import ModelCheckpoint from pytorch_lightning.callbacks import ModelCheckpoint
import pathlib import pathlib
import argparse
from ..utils import add_general_args from ..utils import add_general_args
from ..model import SensePPIModel from ..model import SensePPIModel
from ..dataset import PairSequenceData from ..dataset import PairSequenceData
......
...@@ -13,45 +13,44 @@ import gzip ...@@ -13,45 +13,44 @@ import gzip
import shutil import shutil
def generate_pairs_string(fasta_file, output_file, with_self=False, delete_proteins=None):
    """Generate all unordered pairs of protein IDs from a FASTA file and
    label them with STRING interaction scores.

    Reads ``string_interactions.tsv`` from the current working directory
    (expected columns include ``stringId_A``/``stringId_B``,
    ``preferredName_A``/``preferredName_B`` and ``score``), left-joins the
    scores onto the generated pairs (pairs absent from the table get label 0),
    and writes the result to ``output_file`` as a header-less TSV with
    columns ``seq1``, ``seq2``, ``label``.

    Args:
        fasta_file: Path to a FASTA file; record IDs are used as protein IDs.
        output_file: Destination path for the labeled pairs TSV.
        with_self: If True, include self-pairs ``(id, id)`` as well.
        delete_proteins: Optional list of preferred gene names whose STRING
            IDs are removed from both columns of the pair list. Raises
            ``KeyError`` if a name is not present in the interactions table.

    Returns:
        None. The labeled pairs are written to ``output_file``.
    """
    from itertools import combinations, combinations_with_replacement

    ids = [record.id for record in SeqIO.parse(fasta_file, "fasta")]

    # Generate unordered pairs directly. This replaces the previous approach
    # of producing every ordered pair (product/permutations) and then
    # filtering mirrored duplicates with an O(P^2) list-membership scan,
    # while yielding exactly the same pairs in the same order.
    if with_self:
        pair_list = list(combinations_with_replacement(ids, 2))
    else:
        pair_list = list(combinations(ids, 2))

    pairs = pd.DataFrame(pair_list, columns=['seq1', 'seq2'])

    data = pd.read_csv('string_interactions.tsv', delimiter='\t')

    # Map preferred gene names -> STRING ids; used only to resolve the
    # delete_proteins names below.
    ids_dict = dict(zip(data['preferredName_A'], data['stringId_A']))
    ids_dict.update(dict(zip(data['preferredName_B'], data['stringId_B'])))

    data = data[['stringId_A', 'stringId_B', 'score']]
    data.columns = ['seq1', 'seq2', 'label']

    # Pairs with no entry in the STRING table are treated as non-interacting.
    pairs = pairs.merge(data, on=['seq1', 'seq2'], how='left').fillna(0)

    if delete_proteins is not None:
        print('Labels removed: ', delete_proteins)
        string_ids_to_delete = [ids_dict[label] for label in delete_proteins]
        print('String ids to delete: ', string_ids_to_delete)
        pairs = pairs[~pairs['seq1'].isin(string_ids_to_delete)]
        pairs = pairs[~pairs['seq2'].isin(string_ids_to_delete)]

    pairs.to_csv(output_file, sep='\t', index=False, header=False)
def generate_dscript_gene_names(file_path, def generate_dscript_gene_names(file_path,
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment