Update imports and README

f3163134 · Gianluca LOMBARDI · f812f2cc · f3163134 · f3163134 · f3163134
Commit f3163134 authored Sep 02, 2024 by Gianluca LOMBARDI
10 changed files
--- a/README.md
+++ b/README.md
-# MuLAN: MUtational effects with Light Attention Networks
+# MuLAN: Mutational effects with Light Attention Networks
 ![mulan abstract](./images/visual_abstract.png)
@@ -13,8 +13,8 @@ Attention weights extracted from the model can give insights on protein interfac
 ### Installation
 As a prerequisite, you must have PyTorch installed to use this repository. If not, it can be installed with conda running the following:
 ```bash
-# PyTorch 2.1.0, CUDA 12.1
+# PyTorch 2.4.0, CUDA 12.1
-conda install pytorch==2.1.0 torchvision==0.16.0 torchaudio==2.1.0 pytorch-cuda=12.1 -c pytorch -c nvidia
+conda install pytorch torchvision torchaudio pytorch-cuda=12.1 -c pytorch -c nvidia
 ```
 For other versions or installation methods, please refer to [PyTorch documentation](https://pytorch.org/get-started/locally/).  
 MuLAN and its dependencies can then be installed with
@@ -24,14 +24,35 @@ cd mulan
 pip install .
 ```
 We suggest to do it in a dedicated conda environment.  
+Cloning the repository will also download weights for different model versions trained on SKEMPI dataset. They are available in the folder 
+`models/pretrained`. To be able to load them within the package, the environmental variable must be set:
+```bash
+export MULAN="path/to/mulan/folder"
+```
+Alternatively, if you prefer to move the checkpoint files to a different folder, you can acces them in the new location by setting the
+`MULAN_MODELS_DIR` variable pointing to the corresponding folder. 
 ### Usage
+Available mulan models and pre-trained protein language models can be easily loaded through `mulan` interface. For MuLAN models:
+```python
+import mulan
+print(mulan.get_available_models())
+model = mulan.load_pretrained("mulan-ankh")
+```
+For some supported PLMs, instead:
+```python
+import mulan
+print(mulan.get_available_plms())
+model = mulan.load_pretrained_plm("ankh")
+```
+If the corresponding PLMs are not found on disk, they will be downloaded from HuggingFace Hub and stored by default in `~/.cache/huggingface/hub` folder.
 We provide several command line interfaces for quick usage of MuLAN different applications:
 - `mulan-predict` for $\Delta \Delta G$ prediction of single and multiple-point mutations;
 - `mulan-att` to extract residues weights, related to interface regions;
 - `mulan-landscape` to produce a full mutational landscape for a given complex;
- `mulan-train` to re-train the model on a custom dataset or to run cross validation (not added yet);
+- `mulan-train` to re-train the model on a custom dataset or to run cross validation (not supported yet);
 - `plm-embed` to extract embeddings from protein language models. 
 Since the script uses the `transformers` interface, only models that are saved on HuggingFace 🤗 Hub can be loaded.
@@ -39,5 +60,16 @@ Information about required inputs and usage examples for each command are provid
 ## Citation
+If you find this useful, please cite
+```bibtex
+@article {Lombardi2024.08.24.609515,
+	author = {Lombardi, Gianluca and Carbone, Alessandra},
+	title = {MuLAN: Mutation-driven Light Attention Networks for investigating protein-protein interactions from sequences},
+	year = {2024},
+	doi = {10.1101/2024.08.24.609515},
+	publisher = {Cold Spring Harbor Laboratory},
+	URL = {https://www.biorxiv.org/content/early/2024/08/26/2024.08.24.609515},
+	journal = {bioRxiv}
+}
+```
--- a/mulan/__init__.py
+++ b/mulan/__init__.py
+from .config import MulanConfig   # noqa
+from .modules import LightAttModel  # noqa
+from .utils import (
+    load_pretrained,
+    load_pretrained_plm, 
+    get_available_models, 
+    get_available_plms,
+) # noqa
\ No newline at end of file
--- a/mulan/constants.py
+++ b/mulan/constants.py
@@ -49,9 +49,9 @@ three2one = {v: k for k, v in one2three.items()}
 # Models names and paths
-_BASE_DIR = os.getcwd()  # os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
+_BASE_DIR = os.environ.get("MULAN", os.getcwd())
 _DEFAULT_MODELS_DIR = os.path.join(_BASE_DIR, "models/pretrained")
-MODELS_DIR = os.environ.get("MULAN_MODELS_DIR", _DEFAULT_MODELS_DIR)
+MODELS_DIR = os.environ.get("MULAN_MODELS_PATH", _DEFAULT_MODELS_DIR)
 MODELS = {
    "mulan-esm": f"{MODELS_DIR}/mulan_esm.ckpt",
    "mulan-esm-multiple": f"{MODELS_DIR}/mulan_esm_multiple.ckpt",

--- a/mulan/modules.py
+++ b/mulan/modules.py
@@ -69,9 +69,7 @@ class AttentionMeanK(nn.Module):
            ],
            dim=1,
        )
-        # print(attn_weights.shape, o.shape)
        o1 = torch.sum(o * attn_weights, dim=-1).view(batch_size, -1)
-        # print(o1.shape)
        # max pooling
        o2, _ = torch.max(o, dim=-1)
@@ -80,7 +78,6 @@ class AttentionMeanK(nn.Module):
        # mlp
        o = torch.cat([o1, o2], dim=-1)
        o = self.fc(o)
-        # attn_mean = torch.softmax(attn_weigths.mean(dim=-1)
        output = OutputWithAttention(o, attn_weights)
        return output
@@ -130,7 +127,7 @@ class LightAttModel(nn.Module):
        )
        output = x_mut - x_wt
-        if zs_scores is not None and (self.config.add_scores or self.config.add_columns_scores):
+        if zs_scores is not None and self.config.add_scores:
            output = torch.cat((output, zs_scores.view(batch_size, -1)), dim=-1)
        output = self.linear(output).squeeze(-1)

--- a/mulan/utils.py
+++ b/mulan/utils.py
@@ -9,6 +9,7 @@ import numpy as np
 from scipy.stats import rankdata
 import mulan.constants as C
+from mulan.constants import AAs, aa2idx, idx2aa, one2three, three2one  # noqa
 def mutation_generator(sequence):
@@ -72,28 +73,39 @@ def dict_to_fasta(fasta_dict, fasta_file):
            f.write(f"{value}\n")
-def get_plm_names():
+def get_available_plms():
    """Return the names of the available pretrained language models."""
    return list(C.PLM_ENCODERS.keys())
+def get_available_models():
+    """Return the names of the available Mulan models."""
+    return list(C.MODELS.keys())
 def load_pretrained_plm(model_name, device="cpu"):
    model_id = C.PLM_ENCODERS.get(model_name)
    if model_id is None:
-        raise ValueError(f"Invalid model_name: {model_name}. Must be one of {get_plm_names()}")
+        raise ValueError(
+            f"Invalid model_name: {model_name}. Must be one of {get_available_plms()}"
+        )
    if "t5" in model_id or "ankh" in model_id:
        from transformers import T5EncoderModel
        model = T5EncoderModel.from_pretrained(model_id)
        if "ankh" in model_id:
            from transformers import AutoTokenizer
            tokenizer = AutoTokenizer.from_pretrained(model_id)
        else:
            from transformers import T5Tokenizer
            tokenizer = T5Tokenizer.from_pretrained(model_id)
    else:
        try:
            from transformers import AutoTokenizer, AutoModel
            tokenizer = AutoTokenizer.from_pretrained(model_id)
            model = AutoModel.from_pretrained(model_id)
        except Exception as e:
@@ -103,6 +115,16 @@ def load_pretrained_plm(model_name, device="cpu"):
    return model, tokenizer
+def load_pretrained(pretrained_model_name, device=None, **kwargs):
+    """Load a pretrained model from disk."""
+    model_path = C.MODELS.get(pretrained_model_name)
+    if model_path is None:
+        raise ValueError(f"Invalid model_name: {pretrained_model_name}")
+    from mulan.modules import LightAttModel
+    return LightAttModel.from_pretrained(model_path, device=device, **kwargs)
 @torch.inference_mode()
 def embed_sequence(plm_model, plm_tokenizer, sequence):
    """Embed a sequence using a pretrained model."""

--- a/scripts/compute_landscape.py
+++ b/scripts/compute_landscape.py
@@ -8,8 +8,8 @@ import pandas as pd
 import torch
 import numpy as np
-from mulan import utils, constants as C
+import mulan
-from mulan.modules import LightAttModel
+import mulan.utils as utils
 def get_args():
@@ -24,10 +24,11 @@ def get_args():
        The first is the sequence to be scored, the other is the partner.""",
    )
    parser.add_argument(
-        "-m", "--model-name",
+        "-m",
+        "--model-name",
        type=str,
        default="mulan-ankh",
-        help=f"""Name of the pre-trained model. Must be one of: {C.MODELS.keys()}.
+        help=f"""Name of the pre-trained model. Must be one of: {mulan.get_available_models()}.
        If the model is a version of imulan, a TXT file containing zero-shot scores 
        must be provided, where lines correspond to mutated positions and columns 
        to amino acids, in alphabetic order for the single letter name, separated 
@@ -58,8 +59,8 @@ def get_args():
        help="If not None, directory to store embeddings in PT format.",
    )
    args = parser.parse_args()
-    if args.model_name not in C.MODELS:
+    if args.model_name not in mulan.get_available_models():
-        raise ValueError(f"Model name must be one of: {C.MODELS.keys()}")
+        raise ValueError(f"Invalid model name: {args.model_name}")
    args.sequences = args.sequences.upper()
    if "imulan" in args.model_name and args.scores_file is None:
        raise ValueError("Zero-shot scores file must be provided for imulan models.")
@@ -86,7 +87,7 @@ def score_mutation(
                wildtype_embedding,
                partner_embedding,
                mutation_embedding,
-                partner_embedding
+                partner_embedding,
            ],
            zs_scores=zs_score,
        )
@@ -104,7 +105,7 @@ def run(
    """Compute full mutational landscape of a protein in a given complex."""
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    seq1, seq2 = re.sub(r"[UZOB]", "X", sequences).split(":")
-    output = torch.zeros(len(seq1), len(C.AAs))
+    output = torch.zeros(len(seq1), len(utils.AAs))
    os.makedirs(output_dir, exist_ok=True)
    if embeddings_dir is not None:
        os.makedirs(embeddings_dir, exist_ok=True)
@@ -112,7 +113,7 @@ def run(
    # load models
    plm_name = model_name.split("-")[1]
    plm_model, plm_tokenizer = utils.load_pretrained_plm(plm_name, device=device)
-    model = LightAttModel.from_pretrained(C.MODELS[model_name])
+    model = mulan.load_pretrained(model_name)
    model.eval()
    if scores_file is not None:
@@ -127,12 +128,12 @@ def run(
        utils.save_embedding(partner_embedding, embeddings_dir, "PARTNER")
    # iterate over single-point mutations
-    num_iterations = len(seq1) * (len(C.AAs) - 1)
+    num_iterations = len(seq1) * (len(utils.AAs) - 1)
    pbar = tqdm(initial=0, total=num_iterations, colour="red", dynamic_ncols=True, ascii="-#")
    pbar.set_description("Scoring mutations")
    for mut_name, mutation in utils.mutation_generator(seq1):
        i, aa = int(mut_name[1:-1]) - 1, mut_name[-1]
-        zs_score = None if scores_file is None else zs_scores[i, C.aa2idx[aa]]
+        zs_score = None if scores_file is None else zs_scores[i, utils.aa2idx[aa]]
        score = score_mutation(
            model,
            plm_model,
@@ -144,16 +145,18 @@ def run(
            zs_score,
            embeddings_dir,
        )
-        output[i, C.aa2idx[aa]] = score
+        output[i, utils.aa2idx[aa]] = score
        pbar.update(1)
    # save output
    output = output.cpu().numpy()
    if ranksort_output:
-        wt_index = (range(len(seq1)), tuple([C.aa2idx[aa] for aa in seq1]))
+        wt_index = (range(len(seq1)), tuple([utils.aa2idx[aa] for aa in seq1]))
        output[wt_index] = -10000  # set to very low value to give lowest scores to wt residues
        output = utils.ranksort(output)
-    output = pd.DataFrame(output, columns=C.AAs, index=[f"{i+1}{aa}" for i, aa in enumerate(seq1)])
+    output = pd.DataFrame(
+        output, columns=utils.AAs, index=[f"{i+1}{aa}" for i, aa in enumerate(seq1)]
+    )
    output.to_csv(os.path.join(output_dir, "landscape.csv"), float_format="%.3f")

--- a/scripts/extract_attentions.py
+++ b/scripts/extract_attentions.py
@@ -4,15 +4,14 @@ import os
 from argparse import ArgumentParser
 from tqdm import tqdm
 import torch
-import h5py  # type: ignore
+import h5py  
-from mulan import utils, constants as C
+import mulan
-from mulan.modules import LightAttModel
 def get_args():
    parser = ArgumentParser(
-        prog="extract_attentions",
+        prog="mulan-att",
        description=__doc__,
    )
    parser.add_argument(
@@ -24,9 +23,7 @@ def get_args():
        "--model-name",
        type=str,
        default="mulan-ankh",
-        help=f"""
+        help=f"Name of the pre-trained model. Must be one of: {mulan.get_available_models()}.",
-        Name of the pre-trained model. Must be one of: 
-        {list(C.MODELS.keys())}.""",
    )
    parser.add_argument(
        "-o", "--output-file", type=str, default="attentions.h5", help="Output H5 file"
@@ -39,7 +36,7 @@ def get_args():
        help="If not None, directory to store embeddings in PT format.",
    )
    args = parser.parse_args()
-    if args.model_name not in C.MODELS:
+    if args.model_name not in mulan.get_available_models():
        raise ValueError(f"Invalid model name: {args.model_name}")
    return args
@@ -47,10 +44,10 @@ def get_args():
 @torch.inference_mode
 def run(model_name, fasta_file, output_file, embeddings_dir=None):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
-    dataset = utils.parse_fasta(fasta_file)
+    dataset = mulan.utils.parse_fasta(fasta_file)
    plm_name = model_name.split("-")[1]
-    plm_model, plm_tokenizer = utils.load_pretrained_plm(plm_name, device=device)
+    plm_model, plm_tokenizer = mulan.load_pretrained_plm(plm_name, device=device)
-    model = LightAttModel.from_pretrained(C.MODELS[model_name])
+    model = mulan.load_pretrained(model_name)
    os.makedirs(os.path.dirname(output_file), exist_ok=True)
    if embeddings_dir is not None:
        os.makedirs(embeddings_dir, exist_ok=True)
@@ -59,15 +56,15 @@ def run(model_name, fasta_file, output_file, embeddings_dir=None):
    with h5py.File(output_file, "w") as f:
        for name, sequence in dataset.items():
            # embed sequence
-            embedding = utils.embed_sequence(plm_model, plm_tokenizer, sequence)
+            embedding = mulan.utils.embed_sequence(plm_model, plm_tokenizer, sequence)
            # compute attention weights
            attention = model([embedding] * 4, output_attentions=True).attention[0]
            attention = attention.squeeze(0).cpu().numpy().mean(axis=(-2, -3))
-            attention = utils.minmax_scale(attention)
+            attention = mulan.utils.minmax_scale(attention)
            # save attention weights
            f.create_dataset(name, data=attention)
            if embeddings_dir is not None:
-                utils.save_embedding(embedding, embeddings_dir, name)
+                mulan.utils.save_embedding(embedding, embeddings_dir, name)
            pbar.update(1)

--- a/scripts/generate_embeddings.py
+++ b/scripts/generate_embeddings.py
@@ -6,12 +6,12 @@ from argparse import ArgumentParser
 from tqdm import tqdm
 import torch
-from mulan import utils, constants as C
+import mulan
 def get_args():
    parser = ArgumentParser(
-        prog="generate_embeddings",
+        prog="plm-embed",
        description=__doc__,
    )
    parser.add_argument(
@@ -22,8 +22,8 @@ def get_args():
    parser.add_argument(
        "model_name",
        type=str,
-        default="ankh_large",
+        default="ankh",
-        help=f"PLM name to be loaded. Must be one of {C.PLM_ENCODERS.keys()}",
+        help=f"PLM name to be loaded. Must be one of {mulan.get_available_plms()}",
    )
    parser.add_argument(
        "-o",
@@ -33,21 +33,23 @@ def get_args():
        help="Output directory. Defaults to './embeddings'",
    )
    args = parser.parse_args()
+    if args.model_name not in mulan.get_available_plms():
+        raise ValueError(f"Invalid model name: {args.model_name}")
    return args
 @torch.inference_mode()
 def run(model_name, fasta_file, output_dir):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
-    model, tokenizer = utils.load_pretrained_plm(model_name, device=device)
+    model, tokenizer = mulan.load_pretrained_plm(model_name, device=device)
-    dataset = utils.parse_fasta(fasta_file)
+    dataset = mulan.utils.parse_fasta(fasta_file)
    os.makedirs(output_dir, exist_ok=True)
    pbar = tqdm(initial=0, total=len(dataset), colour="red", dynamic_ncols=True, ascii="-#")
    pbar.set_description("Embedding sequences")
    for name, sequence in dataset.items():
        # embed sequence
-        embedding = utils.embed_sequence(model, tokenizer, sequence)
+        embedding = mulan.utils.embed_sequence(model, tokenizer, sequence)
-        utils.save_embedding(embedding, output_dir, name)
+        mulan.utils.save_embedding(embedding, output_dir, name)
        pbar.update(1)

--- a/scripts/predict.py
+++ b/scripts/predict.py
@@ -7,21 +7,20 @@ from tqdm import tqdm
 import torch
 import pandas as pd
-import mulan.constants as C
+import mulan
-from mulan import utils
+import mulan.utils as utils
-from mulan.modules import LightAttModel
 def get_args():
    parser = ArgumentParser(
-        prog="predict",
+        prog="mulan-predict",
        description=__doc__,
    )
    parser.add_argument(
-        "model_name",
+        "--model-name",
        type=str,
        default="mulan-ankh",
-        help=f"Name of the pre-trained model. Must be one of: {C.MODELS.keys()}.",
+        help=f"Name of the pre-trained model. Must be one of: {mulan.get_available_models()}.",
    )
    parser.add_argument(
        "input_file",
@@ -54,9 +53,11 @@ def get_args():
    parser.add_argument(
        "--store-embeddings",
        action="store_true",
-        help="Store embeddings in PT format. Only wild type embeddings are stored.",
+        help="Store embeddings in PT format. Output directory is the same of 'output_file'.",
    )
    args = parser.parse_args()
+    if args.model_name not in mulan.get_available_models():
+        raise ValueError(f"Invalid model name: {args.model_name}")
    return args
@@ -97,8 +98,9 @@ def run(model_name, input_file, scores_file, output_file, store_embeddings):
    num_iterations = sum(len(d["mutations"]) for d in data)
    scores = []
    plm_name = model_name.split("-")[1]
-    plm_model, plm_tokenizer = utils.load_pretrained_plm(plm_name, device=device)
+    plm_model, plm_tokenizer = mulan.load_pretrained_plm(plm_name, device=device)
-    model = LightAttModel.from_pretrained(C.MODELS[model_name])
+    model = mulan.load_pretrained(model_name)
+    model = model.eval()
    if "imulan" in model_name and scores_file is None:
        raise ValueError("Zero-shot scores file must be provided for imulan models.")
    zs_scores = parse_zs_scores(scores_file) if scores_file is not None else {}
@@ -138,8 +140,6 @@ def run(model_name, input_file, scores_file, output_file, store_embeddings):
            )
            if store_embeddings:
-                utils.save_embedding(seq1_embedding, embeddings_dir, f"{complex_name}_A")
-                utils.save_embedding(seq2_embedding, embeddings_dir, f"{complex_name}_B")
                utils.save_embedding(
                    mut_seq1_embedding, 
                    embeddings_dir, 
@@ -152,8 +152,12 @@ def run(model_name, input_file, scores_file, output_file, store_embeddings):
                )
            pbar.update(1)
+        if store_embeddings:
+            utils.save_embedding(seq1_embedding, embeddings_dir, f"{complex_name}_A")
+            utils.save_embedding(seq2_embedding, embeddings_dir, f"{complex_name}_B")
    df = pd.DataFrame.from_records(scores)
-    df.to_csv(output_file, index=False, sep="\t")
+    df.to_csv(output_file, index=False, sep="\t", float_format="%.3f")
 def main():

--- a/scripts/train.py
+++ b/scripts/train.py
@@ -4,7 +4,7 @@ from argparse import ArgumentParser
 def get_args():
    parser = ArgumentParser(
-        prog="train",
+        prog="mulan-train",
        description=__doc__,
    )
    args = parser.parse_args()