A Homology-Aware but Alignment-Free Protein State Space Model: ProtMamba is a novel protein language model designed to facilitate protein design. Unlike traditional models that rely on multiple sequence alignments (MSAs) to capture evolutionary information, ProtMamba can use unaligned homologous sequences, avoiding the imperfections associated with MSAs.
ProtMamba is based on the Mamba architecture, a state space model that efficiently handles very long sequences. The model uses a fill-in-the-middle (FIM) training objective, combining autoregressive modeling and masked language modeling to predict amino acids conditioned on the given context sequences. This makes ProtMamba particularly well-suited for generating novel protein sequences, filling in specific regions of sequences, and predicting the fitness of protein variants.
configs/
: Configuration files for model training and evaluation.
data/
: Example dataset and full train/test split used to train the model (list of OpenProteinSet cluster names).
nbs/
: Implementation of the ProtMamba model architecture in jupyter
notebooks.
ProtMamba_ssm/
: Implementation of the ProtMamba model architecture.
tests/
: Scripts to sample from ProtMamba and evaluate the model’s
performance.
The model weights are available in the latest release of the repository here
pip install -e .
IMPORTANT: the sequences should be in a3m
files but they do not need
to be aligned.
import pickle
msa_paths = {"name-of-msa" : "../data/example_msa.a3m"}
# path saving directory
filepath = input("What is the path to the folder where you want to save the dataset?") # example: "../data/"
dataset_name = input("How do you want to name the dataset file?") # example: "encoded_MSAs_train.pkl"
dataset_dictionary = {}
for msa_name, msa_path in msa_paths.items():
# Load an a3m file with all the context sequences
msa = load_from_file(msa_path)
# Tokenize the sequences and concatenate them into a single array
tokens = tokenizer(msa, concatenate=True)
tokens = tokens.numpy()[0]
dataset_dictionary[msa_name] = tokens
with open(filepath+dataset_name, "wb") as f:
pickle.dump(dataset_dictionary, f)
import yaml
if int(use_one_gpu) >= 0:
print(f"Using gpu {use_one_gpu}")
print("Number of gpus used: ", torch.cuda.device_count())
# Load the default config file (change it and add the path to the training dataset)
with open("../configs/default_config.yaml", "r") as file:
defaultconfig = yaml.safe_load(file)
namedir = input("Enter name of directory to save results: ")
finetune_path = input("If you want to finetune a model, enter the relative path to the model's checkpoint, otherwise press enter:")
finetune_path = finetune_path if finetune_path else None
# Run the trainer with the selected training configuration
trainer = run(defaultconfig, namedir, finetune_model_path=finetune_path)
use_custom = input("Do you want to use a custom MSA? (y/n): ")
if use_custom == "y":
# Load an a3m file with all the context sequences
msa = load_from_file("../data/example_msa.a3m")
target_sequence = msa[:1]
context_msa = msa[1:]
# Tokenize the sequences and concatenate them into a single array
target = tokenizer(target_sequence, concatenate=True)
tokens = tokenizer(context_msa, concatenate=True)
fim_gen = input("Do you want to generate using FIM? (y/n): ")
if fim_gen=="n":
# AUTOREGRESSIVE, no-FIM generation
# generate the full sequence autoregressively starting from residue 10 in the sequence `target`
gen_dictionary = {"<cls>": 10}
input_seq, targ_pos = prepare_target(target, use_fim={"<cls>": 10})
if fim_gen=="y":
# FIM generation
# mask_dictionary is a dictionary of the positions in the sequence that you want to mask, in this example there will be
# - a mask that covers the residues 4,5,6 and the model will fill it by sampling 10 residues
# - a mask that covers the residues 30,31,32,33,34 and the model will fill it by sampling 3 residues
mask_dictionary = {"<mask-1>": ((4,7),10),"<mask-2>": ((30,35),3)}
input_seq, targ_pos, is_fim_dict = prepare_target(target, use_fim=mask_dictionary)
context_tokens, context_pos_ids = prepare_tokens(tokens,
target_tokens=input_seq,
target_pos_ids=targ_pos,
DatasetClass=Uniclust30_Dataset,
num_sequences=50,
fim_strategy="multiple_span",
mask_fraction=0.2,
max_patches=5,
add_position_ids="1d")
if use_custom == "n":
is_fim = input("Do you want to use FIM? (y/n): ")
filepath = input("What is the path to the folder with the dataset?")
is_fim = True if is_fim == "y" else False
# Load the dataset used for training
dataset_name = input("What is the name of the dataset file?") # example: "encoded_MSAs_subset-100.pkl", "encoded_MSAs_train.pkl"
fim_strategy = "multiple_span" if is_fim else "no-scramble"
dataset = Uniclust30_Dataset(filename=dataset_name,
filepath=filepath,
sample=False,
mask_fraction=0.2,
fim_strategy=fim_strategy,
max_position_embeddings=2048,
add_position_ids="1d")
# Select a sample of the dataset to be the input
data = dataset[1]
tokens = data["input_ids"][None,:].to("cuda")
pos_ids = data["position_ids"][None,:].to("cuda")
model_name = input("What is the path to the folder with the checkpoint of the model?") # example: "results/train_100M_FIM_restart-spikes_merged/checkpoint_131k-750"
# Load pretrained model
model = load_model(model_name,
model_class=MambaLMHeadModelwithPosids,
device="cuda",
dtype=torch.bfloat16,
checkpoint_mixer=False # Must be False when using model for Inference
)
print("Number of tokens in the MSA (target, context): ", len(input_seq[0]), ",", len(tokens[0]))
print("Target:")
print("sequence:", decode_sequence(input_seq[0].numpy()))
print("original sequence:", decode_sequence(target[0].numpy()))
print("pos ids:", list(targ_pos[0].numpy()))
print("Context:")
print("sequence:", decode_sequence(context_tokens[0].numpy()))
print("pos ids:", list(context_pos_ids[0].numpy()))
print("Mask positions:", is_fim_dict)
# Generate the new sequence
output = generate_sequence(model,
context_tokens,
position_ids=context_pos_ids,
is_fim=is_fim_dict,
max_length=20000,
temperature=1.,
top_k=3,
top_p=0.0,
return_dict_in_generate=True,
output_scores=True,
eos_token_id=AA_TO_ID["<cls>"],
device="cuda")
input_seq, output_seq = output["input"], output["generated"]
logits = output["scores"]
print(f"All input context (len = {len(input_seq[0])}):\n", input_seq[0])
print("Last sequence where the masked parts should be predicted:\n", input_seq[0].split("<cls>")[-1])
print(f"Generated (len = {len(output_seq[0])}):\n", output_seq[0])
input_continuation = decode_sequence(target[0].numpy())+"<cls>"
print(f"Continuation of input:\n", input_continuation)
print(f"\nLogits (shape = {logits.shape})")
context_tokens, context_pos_ids, tokens_fim, pos_ids_fim, is_fim_dict = prepare_dataset_for_fim_generation(tokens, pos_ids)
print("Length of context information", context_tokens.shape[1])
print("Masked part of the sequence to predict and its position indices:\n", tokens_fim, "\n", pos_ids_fim)
print("Masked tokens and their positions in the input sequence:", is_fim_dict)
# Generate the new sequence
output = generate_sequence(model,
context_tokens,
position_ids=context_pos_ids,
is_fim=is_fim_dict,
max_length=1570,
temperature=1.,
top_k=3,
top_p=0.0,
return_dict_in_generate=True,
output_scores=True,
eos_token_id=AA_TO_ID["<cls>"],
device="cuda")
input_seq, output_seq = output["input"], output["generated"]
logits = output["scores"]
print(f"All input context (len = {len(input_seq[0])}):\n", input_seq[0])
print("Last sequence where the masked parts should be predicted:\n", input_seq[0].split("<cls>")[-1])
print(f"Generated (len = {len(output_seq[0])}):\n", output_seq[0])
input_continuation = decode_sequence(tokens_fim[0].cpu().numpy())+"<cls>"
print(f"Continuation of input:\n", input_continuation)
print(f"\nLogits (shape = {logits.shape})")
total = len(tokens_fim[0])+1
if total>4:
fig, axs = plt.subplots(total//4,4, figsize=(20,5*total//4))
else:
fig, axs = plt.subplots(1,total, figsize=(20,total,5))
axs = axs[None,:]
for el in range(total):
ax = axs[el//4,el%4]
ax.bar(np.arange(logits.shape[-1]),
torch.softmax(torch.from_numpy(logits[0,el,:]), dim=0))
ax.axvline(output["generated_tokens"][0][el], color="red", label="Prediction: "+ID_TO_AA[output["generated_tokens"][0][el]] + f" ({output['generated_tokens'][0][el]})", linewidth=0.5)
# ax.axvline(tokens_fim[0,el].cpu().numpy(), color="k",label="Original: "+ID_TO_AA[tokens_fim[0,el].cpu().numpy()] +f" ({tokens_fim[0,el].cpu().numpy()})", linewidth=0.5)
ax.legend()
fig.suptitle(f"Real sequence: {input_continuation}\nPred sequence: {output_seq[0]}")
plt.show()
<cls>
# Generate the new sequence
L = 650#628#
output = generate_sequence(model,
tokens[:,:L],
position_ids=pos_ids[:,:L],
is_fim=False,
max_length=1570,
temperature=1.,
top_k=10,
top_p=0.0,
return_dict_in_generate=True,
output_scores=True,
eos_token_id=torch.tensor([AA_TO_ID["<cls>"],AA_TO_ID["<mask-1>"], AA_TO_ID["<mask-2>"], AA_TO_ID["<mask-3>"],
AA_TO_ID["<mask-4>"], AA_TO_ID["<mask-5>"]]).to("cuda"),
device="cuda")
input_seq, output_seq = output["input"], output["generated"]
logits = output["scores"]
print(f"All input context (len = {len(input_seq[0])}):\n", input_seq[0])
print("Last sequence where the masked parts should be predicted:\n", input_seq[0].split("<cls>")[-1])
print(f"Generated (len = {len(output_seq[0])}):\n", output_seq[0])
input_continuation = decode_sequence(tokens[0,L:].cpu().numpy()).split("<cls>")[0]+"<cls>"
print(f"Continuation of input:\n", input_continuation)
print(f"\nLogits (shape = {logits.shape})")
print("\nStops if the model predicts a mask token")
fig, axs = plt.subplots(4,4, figsize=(20,10))
for el in range(16):
ax = axs[el//4,el%4]
ax.bar(np.arange(logits.shape[-1]),
torch.softmax(torch.from_numpy(logits[0,el,:]), dim=0))
ax.axvline(output["generated_tokens"][0][el], color="red", label="Prediction: "+output_seq[0][el] + f" ({AA_TO_ID[output_seq[0][el]]})", linewidth=0.5)
ax.axvline(tokens[0,L+el].cpu().numpy(), color="k",label="Original: "+input_continuation[el] +f" ({AA_TO_ID[input_continuation[el]]})", linewidth=0.5)
ax.legend()
fig.suptitle(f"Real sequence: {input_continuation[:16]}\nPred sequence: {output_seq[0][:16]}")
plt.show()
which_layers = list(range(1,model.config.n_layer+1))
def get_hidden_states(model, tokens, which_layers, position_ids=None, seq_position_ids=None):
hidden_states = model(input_ids=tokens,
save_layer=which_layers,
position_ids=position_ids,
seq_position_ids=seq_position_ids)
return hidden_states
hidden_states = get_hidden_states(model, tokens[:,:10], which_layers, pos_ids[:,:10])
print(f"Saved hidden states for layers: {hidden_states.keys()}")
print(f"Shape of hidden state of layer 1: {hidden_states[1].shape}")
To cite this work, please refer to the following publication:
@article{sgarbossa2024protmamba,
title={{ProtMamba}: -- a homology-aware but alignment-free protein state space model},
author={Damiano Sgarbossa and Cyril Malbranke and Anne-Florence Bitbol},
journal={bioRxiv},
doi = {10.1101/2024.05.24.595730},
year={2024},
url={https://www.biorxiv.org/content/early/2024/05/25/2024.05.24.595730}
}
Project developed using nbdev.