1. MPS still does not work (torch incompatibility)

2. Train function transferred (needs to be checked) 3. Minor changes

1. MPS still does not work (torch incompatibility)
2. Train function transferred (needs to be checked) 3. Minor changes
a701b09f · Konstantin Volzhenin · 4db6ea76 · a701b09f · a701b09f · a701b09f
Commit a701b09f authored Jul 25, 2023 by Konstantin Volzhenin
8 changed files
--- a/.gitignore
+++ b/.gitignore
@@ -2,3 +2,4 @@
 /pretrained_models
 *.tsv
 *.fasta
+*.sh
\ No newline at end of file
--- a/senseppi/__main__.py
+++ b/senseppi/__main__.py
 import argparse
+import logging
 from .commands import *
 from senseppi import __version__

@@ -22,6 +23,13 @@ def main():
        sp.set_defaults(func=module.main)

    params = parser.parse_args()
+
+    #WARNING: due to some internal issues of torch, the mps backend is temporarily disabled
+    if params.device == 'mps':
+        logging.warning('WARNING: due to some internal issues of torch, the mps backend is temporarily disabled.'
+                        'The cpu backend will be used instead.')
+        params.device = 'cpu'
+
    params.func(params)



--- a/senseppi/commands/predict.py
+++ b/senseppi/commands/predict.py
@@ -11,7 +11,7 @@ from ..esm2_model import add_esm_args, compute_embeddings


 def predict(params):
-    test_data = PairSequenceData(emb_dir=params.output_dir_esm, actions_file=params.pairs,
+    test_data = PairSequenceData(emb_dir=params.output_dir_esm, actions_file=params.pairs_file,
                                 max_len=params.max_len, labels=False)

    pretrained_model = SensePPIModel(params)
@@ -34,7 +34,7 @@ def predict(params):
    preds = [pred for batch in trainer.predict(pretrained_model, test_loader) for pred in batch.squeeze().tolist()]
    preds = np.asarray(preds)

-    data = pd.read_csv(params.pairs, delimiter='\t', names=["seq1", "seq2"])
+    data = pd.read_csv(params.pairs_file, delimiter='\t', names=["seq1", "seq2"])
    data['preds'] = preds

    return data
@@ -66,16 +66,19 @@ def add_args(parser):
    predict_args = parser.add_argument_group(title="Predict args")
    parser._action_groups[0].add_argument("model_path", type=str,
                              help="A path to .ckpt file that contains weights to a pretrained model.")
-    predict_args.add_argument("--pairs", type=str, default=None,
-                              help="A path to a .tsv file with pairs of proteins to test (Optional). If not provided, all-to-all pairs will be generated.")
+    predict_args.add_argument("--pairs_file", type=str, default=None,
+                              help="A path to a .tsv file with pairs of proteins to test (Optional). If not provided, "
+                                   "all-to-all pairs will be generated.")
    predict_args.add_argument("-o", "--output", type=str, default="predictions",
-                              help="A path to a file where the predictions will be saved. (.tsv format will be added automatically)")
+                              help="A path to a file where the predictions will be saved. "
+                                   "(.tsv format will be added automatically)")
    predict_args.add_argument("--with_self", action='store_true',
                              help="Include self-interactions in the predictions."
                                   "By default they are not included since they were not part of training but"
                                   "they can be included by setting this flag to True.")
    predict_args.add_argument("-p", "--pred_threshold", type=float, default=0.5,
-                              help="Prediction threshold to determine interacting pairs that will be written to a separate file. Range: (0, 1).")
+                              help="Prediction threshold to determine interacting pairs that "
+                                   "will be written to a separate file. Range: (0, 1).")

    parser = SensePPIModel.add_model_specific_args(parser)
    remove_argument(parser, "--lr")
@@ -88,9 +91,9 @@ def main(params):
    logging.info("Device used: ", params.device)

    process_string_fasta(params.fasta_file, min_len=params.min_len, max_len=params.max_len)
-    if params.pairs is None:
+    if params.pairs_file is None:
        generate_pairs(params.fasta_file, 'protein.pairs.tsv', with_self=params.with_self)
-        params.pairs = 'protein.pairs.tsv'
+        params.pairs_file = 'protein.pairs.tsv'

    compute_embeddings(params)

@@ -104,7 +107,8 @@ def main(params):


 if __name__ == '__main__':
-    parser = add_args()
+    parser = argparse.ArgumentParser()
+    parser = add_args(parser)
    params = parser.parse_args()

    main(params)
--- a/senseppi/commands/train.py
+++ b/senseppi/commands/train.py
-import os
-from pathlib import Path
-import torch
 import pytorch_lightning as pl
-import sys
 from pytorch_lightning.callbacks import TQDMProgressBar, ModelCheckpoint
 from ..model import SensePPIModel
 from ..dataset import PairSequenceData
@@ -10,72 +6,63 @@ from ..utils import *
 from ..esm2_model import add_esm_args, compute_embeddings


-
-# training_th.py
 def main(params):
    if params.seed is not None:
        pl.seed_everything(params.seed, workers=True)

-    dataset = PairSequenceData(emb_dir=params.output_dir_esm, actions_file=params.pairs,
-                               max_len=params.max_len, labels=False)
+    compute_embeddings(params)

-    model = SensePPIModel(params)
+    dataset = PairSequenceData(emb_dir=params.output_dir_esm, actions_file=params.pairs_file,
+                               max_len=params.max_len, labels=True)

-    model.load_data(dataset=dataset, valid_size=0.1)
+    model = SensePPIModel(params)

+    model.load_data(dataset=dataset, valid_size=params.valid_size)
    train_set = model.train_dataloader()
    val_set = model.val_dataloader()

-    logger = pl.loggers.TensorBoardLogger("logs", name='SENSE-PPI')
+    # logger = pl.loggers.TensorBoardLogger("logs", name='SENSE-PPI')
+    logger = None

    callbacks = [
-        TQDMProgressBar(refresh_rate=250),
+        # TQDMProgressBar(refresh_rate=250),
        ModelCheckpoint(filename='chkpt_loss_based_{epoch}-{val_loss:.3f}-{val_BinaryF1Score:.3f}', verbose=True,
                        monitor='val_loss', mode='min', save_top_k=1)
    ]

-    trainer = pl.Trainer(accelerator="gpu" if torch.cuda.is_available() else "cpu", devices=params.devices, num_nodes=params.num_nodes, max_epochs=100,
-                         logger=logger, callbacks=callbacks, strategy=params.strategy)
+    trainer = pl.Trainer(accelerator=params.device, devices=params.num_devices, num_nodes=params.num_nodes,
+                         max_epochs=params.num_epochs, logger=logger, callbacks=callbacks)

    trainer.fit(model, train_set, val_set)


-def esm_check(fasta_file, output_dir, params):
-    params.model_location = 'esm2_t36_3B_UR50D'
-    params.fasta_file = fasta_file
-    params.output_dir = output_dir
-
-    with open(params.fasta_file, 'r') as f:
-        seq_ids = [line.strip().split(' ')[0].replace('>', '') for line in f.readlines() if line.startswith('>')]
-
-    if not os.path.exists(params.output_dir):
-        print('Computing ESM embeddings...')
-        esm2_model.run(params)
-    else:
-        for seq_id in seq_ids:
-            if not os.path.exists(os.path.join(params.output_dir, seq_id + '.pt')):
-                print('Computing ESM embeddings...')
-                esm2_model.run(params)
-                break
-
 def add_args(parser):
    parser = add_general_args(parser)

-    predict_args = parser.add_argument_group(title="Training args")
+    train_args = parser.add_argument_group(title="Training args")
+    parser._action_groups[0].add_argument("pairs_file", type=str,
+                                          help="A path to a .tsv file containing training pairs. "
+                                               "Required format: 3 tab separated columns: first protein, "
+                                               "second protein (protein names have to be present in fasta_file), "
+                                               "label (0 or 1).")
+    train_args.add_argument("--valid_size", type=float, default=0.1,
+                            help="Fraction of the training data to use for validation.")
+    train_args.add_argument("--seed", type=int, default=None, help="Global training seed.")
+    train_args.add_argument("--num_epochs", type=int, default=100, help="Number of training epochs.")
+    train_args.add_argument("--num_devices", type=int, default=1,
+                            help="Number of devices to use for multi GPU training.")
+    train_args.add_argument("--num_nodes", type=int, default=1,
+                            help="Number of nodes to use for training on a cluster.")

    parser = SensePPIModel.add_model_specific_args(parser)
-    remove_argument(parser, "--lr")

    add_esm_args(parser)
    return parser

+
 if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser = add_args(parser)
    params = parser.parse_args()

-    esm_check(Path(os.path.join('Data', 'Dscript', 'human.fasta')),
-                Path(os.path.join('Data', 'Dscript', 'esm_emb_3B_human')),
-                params)
-
    main(params)
\ No newline at end of file
--- a/senseppi/dataset.py
+++ b/senseppi/dataset.py
@@ -10,7 +10,7 @@ class PairSequenceData(Dataset):
    def __init__(self,
                 actions_file,
                 emb_dir,
-                 max_len=800,
+                 max_len,
                 pad_inputs=True,
                 labels=True):

@@ -34,9 +34,8 @@ class PairSequenceData(Dataset):
        try:
            emb = torch.load(f)
        except FileNotFoundError as _:
-            raise Exception(
-                'Embedding file {} not found. Check your fasta file and make sure it contains all the sequences used in training/testing.'.format(
-                    f))
+            raise Exception('Embedding file {} not found. Check your fasta file and make sure it contains '
+                            'all the sequences used in training/testing.'.format(f))

        tensor_emb = emb['representations'][36]  # [33]
        tensor_len = tensor_emb.size(0)

--- a/senseppi/model.py
+++ b/senseppi/model.py
@@ -47,7 +47,6 @@ class DynamicLSTM(pl.LightningModule):

    def forward(self, x, seq_lens):
        # sort input by descending length
-
        _, idx_sort = torch.sort(seq_lens, dim=0, descending=True)
        _, idx_unsort = torch.sort(idx_sort, dim=0)
        x_sort = torch.index_select(x, dim=0, index=idx_sort)
@@ -163,12 +162,12 @@ class BaselineModel(pl.LightningModule):
        self.valid_metrics.reset()
        self.log_dict(result, on_epoch=True, sync_dist=self.hparams.sync_dist)

-    def load_data(self, dataset, valid_size=0.2, indices=None):
+    def load_data(self, dataset, valid_size=0.1, indices=None):
        if indices is None:
            dataset_length = len(dataset)
            valid_length = int(valid_size * dataset_length)
            train_length = dataset_length - valid_length
-            self.train_set, self.val_set = data.random_split(dataset, [train_length, valid_length])  # , test_size])
+            self.train_set, self.val_set = data.random_split(dataset, [train_length, valid_length])
            print('Data has been randomly divided into train/val sets with sizes {} and {}'.format(len(self.train_set),
                                                                                                   len(self.val_set)))
        else:
@@ -216,7 +215,7 @@ class SensePPIModel(BaselineModel):
    def __init__(self, params):
        super(SensePPIModel, self).__init__(params)

-        self.encoder_features = self.hparams.encoder_features  # 2560
+        self.encoder_features = self.hparams.encoder_features
        self.hidden_dim = 256

        self.lstm = DynamicLSTM(self.encoder_features, hidden_size=128, num_layers=3, dropout=0.5, bidirectional=True)

--- a/senseppi/utils.py
+++ b/senseppi/utils.py
@@ -15,10 +15,10 @@ def add_general_args(parser):
    )
    parser.add_argument("--min_len", type=int, default=50,
                        help="Minimum length of the protein sequence. "
-                             "The sequences with smaller length will not be considered. Default: 50")
+                             "The sequences with smaller length will not be considered.")
    parser.add_argument("--max_len", type=int, default=800,
                        help="Maximum length of the protein sequence. "
-                             "The sequences with larger length will not be considered. Default: 800")
+                             "The sequences with larger length will not be considered.")
    parser.add_argument("--device", type=str, default=determine_device(), choices=['cpu', 'gpu', 'mps'],
                        help="Device to used for computations. Options include: cpu, gpu, mps (for MacOS)."
                             "If not selected the device is set by torch automatically.")

--- a/setup.py
+++ b/setup.py
@@ -23,7 +23,7 @@ setup(
        "matplotlib",
        "tqdm",
        "scikit-learn",
-        "pytorch-lightning",
+        "pytorch-lightning==1.9.0",
        "torchmetrics",
        "biopython",
        "fair-esm"