0.5.5 docs updated

b8fdcb75 · Konstantin Volzhenin · 5b93c030 · b8fdcb75 · b8fdcb75 · b8fdcb75
Commit b8fdcb75 authored Sep 22, 2023 by Konstantin Volzhenin
7 changed files
--- a/README.md
+++ b/README.md
 SENSE-PPI
-=======================================
+========================================
+[![DOI - 10.1101/2023.09.19.558413](https://img.shields.io/badge/DOI-10.1101%2F2023.09.19.558413-blue)](https://doi.org/10.1101/2023.09.19.558413)
+[![PyPI](https://img.shields.io/pypi/v/senseppi?logo=PyPi)](https://pypi.org/project/senseppi/)
+[![Licence - MIT](https://img.shields.io/badge/Licence-MIT-2ea44f)](http://gitlab.lcqb.upmc.fr/Konstvv/SENSE-PPI/blob/master/LICENSE)
 SENSE-PPI is a Deep Learning model for predicting physical protein-protein interactions based on amino acid sequences. 
 It is based on embeddings generated by ESM2 and uses a Siamese RNN architecture to perform binary classification.
@@ -36,4 +40,6 @@ The original SENSE-PPI repository contains two models (checkpoints with weights)
 The package already comes with a preinstalled model, `senseppi.ckpt`, which is used by default if no model path is specified.
 **N.B.**: Both pretrained models were made to work with proteins in the range of 50-800 amino acids.
\ No newline at end of file
+In order to cite the original SENSE-PPI paper, please use the following link: https://doi.org/10.1101/2023.09.19.558413  
\ No newline at end of file
--- a/senseppi/__init__.py
+++ b/senseppi/__init__.py
-__version__ = "0.5.4"
+__version__ = "0.5.5"
 __author__ = "Konstantin Volzhenin"
 from . import model, commands, esm2_model, dataset, utils, network_utils

--- a/senseppi/__main__.py
+++ b/senseppi/__main__.py
@@ -3,21 +3,21 @@ import logging
 import torch
 from .commands import *
 from senseppi import __version__
-from senseppi.utils import block_mps
+from senseppi.utils import ArgumentParserWithDefaults, block_mps, determine_device
 def main():
    logging.basicConfig(level=logging.INFO)
-    parser = argparse.ArgumentParser(
+    parser = ArgumentParserWithDefaults(
        description="SENSE_PPI: Sequence-based EvolutIoNary ScalE Protein-Protein Interaction prediction",
        usage="senseppi <command> [<args>]",
-        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
+        formatter_class=argparse.RawTextHelpFormatter)
    parser.add_argument(
        "-v", "--version", action="version", version="SENSE-PPI v{} ".format(__version__))
-    subparsers = parser.add_subparsers(title="The list of SEINE-PPI commands:", required=True, dest="cmd")
+    subparsers = parser.add_subparsers(title="The list of SEINE-PPI commands", required=True, dest="cmd")
    modules = {'train': train,
               'predict': predict,
@@ -34,6 +34,9 @@ def main():
    params = parser.parse_args()
    if hasattr(params, 'device'):
+        if params.device == 'auto':
+            params.device = determine_device()
        if params.device == 'gpu':
            torch.set_float32_matmul_precision('high')

--- a/senseppi/commands/test.py
+++ b/senseppi/commands/test.py
@@ -52,7 +52,8 @@ def add_args(parser):
                                "(.tsv format will be added automatically)")
    test_args.add_argument("--crop_data_to_model_lims", action="store_true",
                           help="If set, the data will be cropped to the limits of the model: "
-                                "evaluations will be done only for proteins >50aa and <800aa.")
+                                "evaluations will be done only for proteins >50aa and <800aa. WARNING: "
+                                "this will modify the original input files.")
    parser = SensePPIModel.add_model_specific_args(parser)
    remove_argument(parser, "--lr")

--- a/senseppi/commands/train.py
+++ b/senseppi/commands/train.py
@@ -45,7 +45,7 @@ def main(params):
 def add_args(parser):
    parser = add_general_args(parser)
-    train_args = parser.add_argument_group(title="Training args")
+    train_args = parser.add_argument_group(title="Training args", description="Arguments for training the model.")
    parser._action_groups[0].add_argument("pairs_file", type=str,
                                          help="A path to a .tsv file containing training pairs. "
                                               "Required format: 3 tab separated columns: first protein, "

--- a/senseppi/network_utils.py
+++ b/senseppi/network_utils.py
@@ -111,14 +111,13 @@ def get_interactions_from_string(gene_names, species=9606, add_nodes=10, require
    if len(string_interactions) == 0:
        raise Exception('No interactions found. Please revise your input parameters.')
-    # Remove duplicated interactions
+    # Removing duplicated interactions
    string_interactions.drop_duplicates(inplace=True)
-    # Make the interactions symmetric: add the interactions where the first and second columns are swapped
+    # Making the interactions symmetric: adding the interactions where the first and second columns are swapped
    string_interactions = pd.concat([string_interactions, string_interactions.rename(
        columns={'stringId_A': 'stringId_B', 'stringId_B': 'stringId_A', 'preferredName_A': 'preferredName_B',
                 'preferredName_B': 'preferredName_A'})])
-    # Getting the sequences for hparams.genes in case there are proteins with no connections and add ghost self_connections to keep gene names in the file
    string_names_input_genes = get_names_from_string(gene_names, species)
    string_names_input_genes['stringId_A'] = string_names_input_genes['stringId']
    string_names_input_genes['preferredName_A'] = string_names_input_genes['preferredName']
@@ -128,10 +127,11 @@ def get_interactions_from_string(gene_names, species=9606, add_nodes=10, require
        ['stringId_A', 'preferredName_A', 'stringId_B', 'preferredName_B']]])
    string_interactions.fillna(0, inplace=True)
-    # For all the proteins in the first ans second columns extract their sequences from 9606.protein.sequences.v11.5.fasta and write them to sequences.fasta
+    ids = list(string_interactions['stringId_A'].values) + \
-    ids = list(string_interactions['stringId_A'].values) + list(string_interactions['stringId_B'].values) + \
+          list(string_interactions['stringId_B'].values) + \
          string_names_input_genes['stringId'].to_list()
    ids = set(ids)
    with open('sequences.fasta', 'w') as f:
        for record in SeqIO.parse('{}.protein.sequences.v{}.fa'.format(species, version), "fasta"):
            if record.id in ids:

--- a/senseppi/utils.py
+++ b/senseppi/utils.py
@@ -3,6 +3,18 @@ import os
 from senseppi import __version__
 import torch
 import logging
+import argparse
+class ArgumentParserWithDefaults(argparse.ArgumentParser):
+    def add_argument(self, *args, help=None, default=None, **kwargs):
+        if help is not None:
+            kwargs['help'] = help
+        if default is not None and args[0] != '-h':
+            kwargs['default'] = default
+            if help is not None:
+                kwargs['help'] += ' Default: {}'.format(default)
+        super().add_argument(*args, **kwargs)
 def add_general_args(parser):
@@ -13,8 +25,8 @@ def add_general_args(parser):
    parser.add_argument("--max_len", type=int, default=800,
                        help="Maximum length of the protein sequence. The sequences with larger length will not be "
                             "considered and will be deleted from the fasta file.")
-    parser.add_argument("--device", type=str, default=determine_device(), choices=['cpu', 'gpu', 'mps'],
+    parser.add_argument("--device", type=str, default='auto', choices=['cpu', 'gpu', 'mps', 'auto'],
-                        help="Device to used for computations. Options include: cpu, gpu, mps (for MacOS)."
+                        help="Device to use for computations. Options include: cpu, gpu, mps (for MacOS), and auto."
                             "If not selected the device is set by torch automatically. WARNING: mps is temporarily "
                             "disabled, if it is chosen, cpu will be used instead.")
@@ -23,12 +35,11 @@ def add_general_args(parser):
 def determine_device():
    if torch.cuda.is_available():
-        device = 'gpu'
+        return 'gpu'
    elif torch.backends.mps.is_available() and torch.backends.mps.is_built():
-        device = 'mps'
+        return 'mps'
    else:
-        device = 'cpu'
+        return 'cpu'
-    return device
 def block_mps(params):