bowang-lab / scGPT

https://scgpt.readthedocs.io/en/latest/
MIT License

Issues with tokenization in embedding generation script #268

Closed xqiu625 closed 1 week ago

xqiu625 commented 1 month ago

I'm trying to generate embeddings from scGPT for my single-cell data but encountering tokenization issues. Here's my scenario and the errors I'm facing:

  1. Initial Setup:

    import json
    from pathlib import Path

    import anndata as ad
    import numpy as np
    import torch
    from scgpt.tokenizer import tokenize_and_pad_batch

    adata = ad.read_h5ad("single_cell_human_pbmc_counts.h5ad")
    model_dir = Path("scGPT_human/")
    vocab_file = model_dir / "vocab.json"
    model_config_file = model_dir / "args.json"
    model_file = model_dir / "best_model.pt"
  2. I've tried different approaches to handle the vocabulary:

Approach 1: Using the raw JSON dictionary

with open(vocab_file) as f:
    vocab = json.load(f)

Approach 2: Using a custom vocabulary class

class CustomVocab:
    def __init__(self):
        self.stoi = {}  # token -> index
        self.itos = []  # index -> token

    @classmethod
    def from_file(cls, vocab_file):
        vocab = cls()
        with open(vocab_file, "r") as f:
            token2idx = json.load(f)
            # If the JSON maps index -> token, invert it to token -> index.
            if isinstance(next(iter(token2idx.values())), str):
                token2idx = {v: int(k) for k, v in token2idx.items()}
        vocab.stoi = token2idx
        vocab.itos = [""] * (max(token2idx.values()) + 1)
        for token, idx in token2idx.items():
            vocab.itos[idx] = token
        return vocab
  3. The error occurs during tokenization:

    Error: TypeError: 'int' object is not subscriptable

    at this line in gene_tokenizer.py:

    cls_id = vocab[cls_token]
  4. I'm using this embedding generation function, based on the example in Tutorial_Reference_Mapping_dataset.ipynb (the gene-to-vocab-id mapping step I run beforehand is sketched after the function):

    def get_batch_cell_embeddings(
        adata,
        cell_embedding_mode: str = "cls",
        model=None,
        vocab=None,
        max_length=1200,
        model_configs=None,
        gene_ids=None,
        use_batch_labels=False,
    ) -> np.ndarray:
        """
        Get the cell embeddings for a batch of cells.

        Args:
            adata (AnnData): The AnnData object, with raw counts in
                adata.layers["counts"] and vocab ids in adata.var["id_in_vocab"].
            cell_embedding_mode (str): How to pool the cell embedding; only
                "cls" is handled here.

        Returns:
            np.ndarray: The cell embeddings, shape (n_cells, d_emb).
        """
        # Densify the count layer if it is stored as a sparse matrix.
        count_matrix = (
            adata.layers["counts"]
            if isinstance(adata.layers["counts"], np.ndarray)
            else adata.layers["counts"].A
        )

        # Gene vocabulary ids; every gene must already be mapped into the vocab.
        if gene_ids is None:
            gene_ids = np.array(adata.var["id_in_vocab"])
            assert np.all(gene_ids >= 0)

        if use_batch_labels:
            batch_ids = np.array(adata.obs["batch_id"].tolist())

        if cell_embedding_mode == "cls":
            tokenized_all = tokenize_and_pad_batch(
                count_matrix,
                gene_ids,
                max_len=max_length,
                vocab=vocab,
                pad_token=model_configs["pad_token"],
                pad_value=model_configs["pad_value"],
                append_cls=True,  # append <cls> token at the beginning
                include_zero_gene=False,
            )
            all_gene_ids, all_values = tokenized_all["genes"], tokenized_all["values"]
            src_key_padding_mask = all_gene_ids.eq(vocab[model_configs["pad_token"]])
            with torch.no_grad(), torch.cuda.amp.autocast(enabled=True):
                cell_embeddings = model.encode_batch(
                    all_gene_ids,
                    all_values.float(),
                    src_key_padding_mask=src_key_padding_mask,
                    batch_size=64,
                    batch_labels=None,
                    time_step=0,
                    return_np=True,
                )
            # L2-normalize each cell embedding.
            cell_embeddings = cell_embeddings / np.linalg.norm(
                cell_embeddings, axis=1, keepdims=True
            )
        else:
            raise ValueError(f"Unknown cell embedding mode: {cell_embedding_mode}")
        return cell_embeddings

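For completeness, this is how I map genes to vocabulary ids before calling the function, adapted from the reference-mapping tutorial. Note the column name "gene_name" in adata.var is an assumption about my own data layout:

    # Map each gene symbol to its id in the model vocabulary; genes that are
    # missing from the vocab get -1 and are dropped before tokenization.
    # Assumes gene symbols live in adata.var["gene_name"] and that `vocab`
    # supports `in` and `[]` lookups (e.g. GeneVocab).
    adata.var["id_in_vocab"] = [
        vocab[gene] if gene in vocab else -1 for gene in adata.var["gene_name"]
    ]
    gene_ids_in_vocab = np.array(adata.var["id_in_vocab"])
    adata = adata[:, gene_ids_in_vocab >= 0]  # keep only genes known to the vocab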
Based on the above, here are my questions:

  1. What is the correct way to handle the vocabulary for tokenization?
  2. Should we be using a specific vocabulary class from scGPT instead of creating a custom one?
  3. Is there an example of the correct vocabulary format and usage for embedding generation?

Here is the environment I am using:

Let me know if you'd like me to provide any additional information or test any specific solutions.

subercui commented 3 weeks ago

Hi, thank you for the question! We recommend using our custom GeneVocab class; it is defined here: https://github.com/bowang-lab/scGPT/blob/7301b51a72f5db321fccebb51bc4dd1380d99023/scgpt/tokenizer/gene_tokenizer.py#L20

One use case can be found here in cell_emb.py; the vocab is also loaded in a similar fashion in the tutorial notebooks:

https://github.com/bowang-lab/scGPT/blob/7301b51a72f5db321fccebb51bc4dd1380d99023/scgpt/tasks/cell_emb.py#L208
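For reference, a minimal loading sketch following the pattern in embed_data from cell_emb.py. The exact special-token list and the keys present in args.json can vary by checkpoint, so treat those as assumptions:

    import json
    from pathlib import Path

    from scgpt.tokenizer.gene_tokenizer import GeneVocab

    model_dir = Path("scGPT_human/")

    # Load the vocabulary with the built-in GeneVocab class rather than a raw dict.
    vocab = GeneVocab.from_file(model_dir / "vocab.json")

    # Ensure the special tokens used during pretraining are present.
    special_tokens = ["<pad>", "<cls>", "<eoc>"]  # assumed for the human checkpoint
    for token in special_tokens:
        if token not in vocab:
            vocab.append_token(token)

    # Unseen tokens fall back to the <pad> index on lookup.
    vocab.set_default_index(vocab["<pad>"])

    # Model hyperparameters saved alongside the checkpoint.
    with open(model_dir / "args.json") as f:
        model_configs = json.load(f)

Passing this vocab object (rather than a plain dict or a hand-rolled class) into tokenize_and_pad_batch should avoid the lookup error above, since the tokenizer expects the GeneVocab interface.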