microsoft / DeepSpeed

DeepSpeed is a deep learning optimization library that makes distributed training and inference easy, efficient, and effective.
https://www.deepspeed.ai/

[BUG] Training Mamba with ZeRO stage 3 and long context lengths - CUDA OOM error #5349

Closed PageTurnIO closed 6 months ago

PageTurnIO commented 7 months ago

Describe the bug

I am experiencing a CUDA out-of-memory error while training a Mamba 2.8b model with DeepSpeed ZeRO 3. The error occurs during the backward pass, and I have tried adjusting the config file many times, but the problem persists.

The training works fine for context lengths up to 48k tokens but fails once the context length exceeds 50k tokens.

CPU memory usage stays relatively low (~100 GB out of 1800 GB) during training, which suggests that the model parameters and optimizer states are not actually being offloaded to the CPU.
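A simple way to check how peak GPU memory scales with the context length is a small Trainer callback that logs torch.cuda's peak-allocation counter each step. This is only an illustrative sketch (the callback name is mine); it relies on the standard transformers TrainerCallback hook and the torch.cuda memory APIs:

import torch
from transformers import TrainerCallback

class CudaMemCallback(TrainerCallback):
    # Rough diagnostic: log peak CUDA memory after every optimizer step.
    def on_step_end(self, args, state, control, **kwargs):
        peak_gb = torch.cuda.max_memory_allocated() / 1e9
        print(f"step {state.global_step}: peak CUDA memory {peak_gb:.1f} GB")
        torch.cuda.reset_peak_memory_stats()

# Usage: Trainer(..., callbacks=[CudaMemCallback()])

Comparing the logged peak at 48k vs. 50k tokens should show whether the growth tracks the sequence length rather than the (supposedly offloaded) model state.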

I am trying to figure out whether this is a Mamba-specific error. Does anyone have experience training state space models with DeepSpeed ZeRO 3?

I would greatly appreciate any guidance or suggestions to resolve this issue.

To Reproduce

from datasets import load_dataset, concatenate_datasets

from pathlib import Path
import transformers
from transformers import (
    MistralForCausalLM, 
    AutoTokenizer,
    Trainer
)
import torch
from typing import Optional
import os
from random import sample as sample_fn
from transformers import AutoModelForCausalLM, get_cosine_schedule_with_warmup, get_linear_schedule_with_warmup, AdamW
from transformers.models.mamba import MambaForCausalLM
#from mamba_ssm.models.mixer_seq_simple import MambaLMHeadModel
from transformers.training_args import OptimizerNames
from modeling.mamba_lm import MambaLMHeadModel

LOCAL_DST_MODEL_DIR = "model_result"
LOCAL_JSONL_DATASET_FILE = "/home/ubuntu/MambaPro/reference_squad_data_v2_final_100k.jsonl"
LOCAL_TRAINING_LOGS_DIR = "logs"
LOCAL_CKPTS_DIR = "ckpts"
TMP_MODEL_DIR = "model"

LOCAL_DIRS = [
    LOCAL_DST_MODEL_DIR, 
    LOCAL_TRAINING_LOGS_DIR,
    LOCAL_CKPTS_DIR,
    TMP_MODEL_DIR,
    str(Path(LOCAL_JSONL_DATASET_FILE).parent)
]

class InstructionDataCollator:
    def __init__(
        self,
        tokenizer,
        max_length: Optional[int] = None
    ):
        dummy_text = " Who is Elon Musk"
        self.dummy_text = dummy_text*20000
        print("length of dummy text: ", len(tokenizer.tokenize(self.dummy_text)))

        self.tokenizer = tokenizer
        self.max_length = max_length

    def __call__(self, batch):
        final_input_samples = []
        final_output_samples = []

        for sample in batch:
            # The actual sample is ignored here; the dummy text forces a fixed, very long context
            tokenized_input = self.tokenizer(self.dummy_text)

            # Truncate to max_length - 1 so there is room to append the EOS token
            input_ids = tokenized_input['input_ids'][: self.max_length - 1]
            # Labels mirror the inputs; the HF model shifts them internally for the LM loss
            labels = list(input_ids)

            input_ids = input_ids + [self.tokenizer.eos_token_id]
            labels = labels + [self.tokenizer.eos_token_id]

            attention_mask = [1] * len(input_ids)

            final_input_samples.append({
                "input_ids": torch.tensor(input_ids).long(),
                "attention_mask": torch.tensor(attention_mask).long()
            })

            final_output_samples.append({
                "input_ids": torch.tensor(labels).long()
            })

        # IMPORTANT: no need to shift the labels, already done by the model (HF)
        preprocessed_batch = self.tokenizer.pad(
            final_input_samples,
            padding=True,
            #max_length=self.max_length,
            return_tensors="pt",
        )

        preprocessed_batch['labels'] = self.tokenizer.pad(
            final_output_samples,
            padding=True,
            #max_length=self.max_length,
            return_tensors="pt",
        )['input_ids']

        return preprocessed_batch

def create_local_dirs_if_not_exist():
    for local_dir in LOCAL_DIRS:
        local_dir_path = Path(local_dir)

        if not local_dir_path.exists():
            local_dir_path.mkdir(parents=True, exist_ok=True)

def do_training():    
    dataset = load_dataset("json", data_files=LOCAL_JSONL_DATASET_FILE)
    dataset = dataset.shuffle(seed=42)   

    # model = AutoModelForCausalLM.from_pretrained(
    #     "google/gemma-7b",
    #     low_cpu_mem_usage=True,
    #     torch_dtype=torch.float16,
    #     attn_implementation="flash_attention_2"
    # )
    model = AutoModelForCausalLM.from_pretrained(
        "/home/ubuntu/llm_finetune/mamba-2.8b-hf",
        torch_dtype=torch.bfloat16,

    )
    # model = MambaLMHeadModel.from_pretrained(
    #     "state-spaces/mamba-2.8b-slimpj",
    # )

    # To allow the option of gradient checkpointing
    #model.enable_input_require_grads()

    #model = model.cuda()

    tokenizer = AutoTokenizer.from_pretrained("EleutherAI/gpt-neox-20b")
    tokenizer.pad_token = tokenizer.eos_token

    data_collator = InstructionDataCollator(
        tokenizer=tokenizer, 
        max_length=64000
    )

    #optimizer=OptimizerNames.ADAMW_HF
    args = transformers.TrainingArguments(
        output_dir="./output_19_03_mamba",
        num_train_epochs=1,
        warmup_steps=20,
        learning_rate=2e-5,
        deepspeed="deepspeed_configs/zero3.yaml",
        #gradient_checkpointing_kwargs={"use_reentrant":True},
        #optim=optimizer,
        fp16="fp16",
        #bf16=True,
        local_rank=-1,
        gradient_checkpointing=True,
        #gradient_accumulation_steps=8,
        per_device_train_batch_size=1,
        weight_decay=0,
        logging_steps=10,
        save_total_limit=1,
        save_strategy="steps",
        save_steps=100,
        remove_unused_columns=False,
        report_to=["wandb"]
    )

    # Create Trainer instance
    trainer = Trainer(
        model=model,
        args=args,
        data_collator=data_collator,
        train_dataset=dataset["train"]
    )

    # Start training
    trainer.train()

    # Save the trained model weights (the full model; no adapters are used here)
    trainer.save_model(TMP_MODEL_DIR)
    tokenizer.save_pretrained(TMP_MODEL_DIR)

    model.save_pretrained(LOCAL_DST_MODEL_DIR, safe_serialization=False)
    tokenizer.save_pretrained(LOCAL_DST_MODEL_DIR)

def main():
    create_local_dirs_if_not_exist()
    # Preprocess the dataset and do the training
    do_training()

if __name__ == "__main__":
    main()

DeepSpeed Config File

{
    "bf16": {
        "enabled": true
    },
    "optimizer": {
        "type": "AdamW",
        "params": {
            "lr": "auto",
            "betas": "auto",
            "eps": "auto",
            "weight_decay": "auto"
        }
    },
    "scheduler": {
        "type": "WarmupDecayLR",
        "params": {
            "warmup_min_lr": "auto",
            "warmup_max_lr": "auto",
            "warmup_num_steps": "auto",
            "total_num_steps": "auto"
        }
    },
    "zero_optimization": {
        "stage": 3,
        "overlap_comm": true,
        "contiguous_gradients": true,
        "sub_group_size": 1e9,
        "reduce_bucket_size": "auto",
        "stage3_prefetch_bucket_size": 1e9,
        "stage3_param_persistence_threshold": "auto",
        "stage3_gather_16bit_weights_on_model_save": true,
        "offload_optimizer": {
            "device": "cpu"
        }
    },
    "activation_checkpointing": {
        "partition_activations": true,
        "cpu_checkpointing": true,
        "contiguous_memory_optimization": false,
        "number_checkpoints": null,
        "synchronize_checkpoint_boundary": false,
        "profile": false
    },
    "offload_param": {
        "device": "cpu",
        "pin_memory": true,
        "buffer_count": 5,
        "buffer_size": 5e9,
        "max_in_cpu": 5e10
    },
    "gradient_accumulation_steps": "auto",
    "gradient_clipping": "auto",
    "steps_per_print": 10,
    "train_batch_size": "auto",
    "train_micro_batch_size_per_gpu": "auto",
    "wall_clock_breakdown": false
}
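One thing worth flagging about the config above: in the DeepSpeed ZeRO-3 config schema, offload_param is documented as a sub-key of zero_optimization, so as a top-level key it presumably has no effect, which might also explain the low CPU memory usage. Similarly, as far as I can tell the activation_checkpointing section only applies to models that call DeepSpeed's own checkpointing API; the Trainer's gradient_checkpointing=True goes through torch.utils.checkpoint, so partition_activations and cpu_checkpointing are likely not in effect here. A hedged sketch of the nesting, written as a Python dict since the HF Trainer also accepts the DeepSpeed config as a dict through the deepspeed argument (only the relevant keys are shown):

# Sketch only: offload_param nested under zero_optimization, values copied from the config above.
ds_config = {
    "zero_optimization": {
        "stage": 3,
        "offload_optimizer": {"device": "cpu"},
        "offload_param": {
            "device": "cpu",
            "pin_memory": True,
            "buffer_count": 5,
            "buffer_size": 5e9,
            "max_in_cpu": 5e10,
        },
        # ... remaining stage-3 keys unchanged ...
    },
    # ... remaining top-level keys unchanged ...
}
# TrainingArguments(deepspeed=ds_config, ...) works the same as passing a file path.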

Expected behavior

The code should use ZeRO 3 to offload to CPU and/or NVMe and allow training with longer context lengths.

ds_report output

2024-04-02 11:11:11,280 [INFO] [real_accelerator.py:191:get_accelerator] Setting ds_accelerator to cuda (auto detect)
--------------------------------------------------
DeepSpeed C++/CUDA extension op report
--------------------------------------------------
NOTE: Ops not installed will be just-in-time (JIT) compiled at
      runtime if needed. Op compatibility means that your system
      meet the required dependencies to JIT install the op.
--------------------------------------------------
JIT compiled ops requires ninja
ninja .................. [OKAY]
--------------------------------------------------
op name ................ installed .. compatible
--------------------------------------------------
async_io ............... [NO] ....... [OKAY]
fused_adam ............. [NO] ....... [OKAY]
cpu_adam ............... [NO] ....... [OKAY]
cpu_adagrad ............ [NO] ....... [OKAY]
cpu_lion ............... [NO] ....... [OKAY]
 [WARNING]  Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
evoformer_attn ......... [NO] ....... [NO]
fused_lamb ............. [NO] ....... [OKAY]
fused_lion ............. [NO] ....... [OKAY]
inference_core_ops ..... [NO] ....... [OKAY]
cutlass_ops ............ [NO] ....... [OKAY]
transformer_inference .. [NO] ....... [OKAY]
quantizer .............. [NO] ....... [OKAY]
ragged_device_ops ...... [NO] ....... [OKAY]
ragged_ops ............. [NO] ....... [OKAY]
random_ltd ............. [NO] ....... [OKAY]
 [WARNING]  sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.1
 [WARNING]  using untested triton version (2.1.0), only 1.0.0 is known to be compatible
sparse_attn ............ [NO] ....... [NO]
spatial_inference ...... [NO] ....... [OKAY]
transformer ............ [NO] ....... [OKAY]
stochastic_transformer . [NO] ....... [OKAY]
--------------------------------------------------
DeepSpeed general environment info:
torch install path ............... ['/home/ubuntu/miniconda3/envs/llm/lib/python3.11/site-packages/torch']
torch version .................... 2.1.2+cu121
deepspeed install path ........... ['/home/ubuntu/miniconda3/envs/llm/lib/python3.11/site-packages/deepspeed']
deepspeed info ................... 0.14.0, unknown, unknown
torch cuda version ............... 12.1
torch hip version ................ None
nvcc version ..................... 12.4
deepspeed wheel compiled w. ...... torch 2.1, cuda 12.1
shared memory (/dev/shm) size .... 885.85 GB


System Information

GPU: 8 x NVIDIA A100 80GB
CPU: 240 vCPUs
RAM: 1800 GiB
Storage: 20 TiB SSD
CUDA Version: 12.2
DeepSpeed Version: 0.14.3

Launcher context

CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 OMP_NUM_THREADS=8 torchrun --nproc_per_node=8 --master_port=$((RANDOM + 10000)) train.py

Additional context

I've also tried NVMe offload and I get the same result.

Training Setup

Model: Mamba-2.8b-slimpj
Dataset: Upsampled SlimPajama dataset with long context lengths (> 50k tokens)
Hardware: 8 x A100 80GB GPUs, 240 vCPUs, 1800 GiB RAM, 20 TiB SSD

tjruwase commented 7 months ago

@PageTurnIO, the underlying issue here is that ZeRO and offloading help optimize the memory footprint of model state (parameters and optimizer states). In this case, however, the OOM is caused by the activation memory footprint of the very long context length. So you will need to explore techniques like sequence parallelism (such as Ulysses) and activation checkpointing to fit the long context in memory.
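To put rough numbers on that (an illustrative back-of-envelope only, assuming mamba-2.8b-like sizes of roughly d_model = 2560, 64 layers, a padded vocabulary around 50k, and bf16 activations; exact figures depend on the implementation):

# Rough, illustrative estimate only; all sizes below are assumptions, not measured values.
seq_len    = 50_000
d_model    = 2_560          # assumed hidden size
n_layer    = 64             # assumed number of layers
vocab_size = 50_280         # approximate padded vocabulary size
bf16_bytes = 2

# Even with gradient checkpointing, roughly one d_model-wide tensor is kept per layer,
# and ZeRO does not partition activations across GPUs.
layer_inputs_gb = seq_len * d_model * n_layer * bf16_bytes / 1e9   # ~16 GB

# The logits alone are seq_len x vocab_size and are typically upcast to fp32 for the loss.
logits_gb = seq_len * vocab_size * 4 / 1e9                         # ~10 GB

print(f"checkpointed layer inputs: {layer_inputs_gb:.1f} GB per GPU")
print(f"fp32 logits for the loss : {logits_gb:.1f} GB per GPU")

Both terms grow linearly with the sequence length and are replicated on every GPU regardless of ZeRO stage, which is why splitting the sequence dimension across GPUs (sequence parallelism) is the lever that helps here.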

PageTurnIO commented 7 months ago

@tjruwase thanks for your comment, very helpful! I fully agree that this is an activation memory problem. Looking over the Ulysses code, it appears to be built around transformer attention and not necessarily compatible with state space models, but the principles are the same.

ZonglinY commented 7 months ago

Getting DeepSpeed to work with Mamba does indeed seem to be an open question. Has anyone succeeded with this?