microsoft / DeepSpeed

DeepSpeed is a deep learning optimization library that makes distributed training and inference easy, efficient, and effective.
https://www.deepspeed.ai/
Apache License 2.0

[BUG] CUDA out of memory error when using a customized model at deepspeed.initialize(). #6737

Open 962086838 opened 2 days ago

962086838 commented 2 days ago

Describe the bug In my own implementation, I combine a large language model with a speculator model. My goal is to train the speculator so that it gets better at predicting the n+2, n+3, ... tokens. I have read the DeepSpeed docs, and my understanding is that any customized model built on top of nn.Module is supported. However, I hit a CUDA OOM error when initializing the customized model with deepspeed.initialize().

To Reproduce Here is my main code

import torch
import torch.backends.cudnn as cudnn
import torch.nn as nn
import torch.utils.data
from time import time
import json
from transformers import AutoTokenizer, AutoModelForCausalLM, PreTrainedTokenizer
from datasets import load_dataset
from torch.nn.utils.rnn import pad_sequence

import deepspeed
import argparse
import random
import numpy as np
import os
from torch.utils.data import DataLoader, Dataset

from speculator import MLPSpeculator

class CombinedModel(nn.Module):
    def __init__(self, base_model, speculator):
        super(CombinedModel, self).__init__()
        self.base_model = base_model
        self.speculator = speculator

        for param in self.base_model.parameters():
            param.requires_grad = False

    def forward_base_model(self, *args, **kwargs):
        return self.base_model(*args, **kwargs)

    def forward_speculator(self, *args, **kwargs):
        return self.speculator(*args, **kwargs)

def print_model_parameters(model):
    for name, param in model.named_parameters():
        print(f"Parameter: {name}, requires_grad: {param.requires_grad}")

def get_argument_parser():
    parser = argparse.ArgumentParser(description="GAN for NLP Task using Alpaca Dataset")

    # Other parameters
    parser.add_argument('--backend', type=str, default='nccl', help='distributed backend')
    parser.add_argument('--batchSize', type=int, default=64, help='input batch size')
    parser.add_argument('--epochs', type=int, default=1, help='number of epochs to train for')
    parser.add_argument('--lr', type=float, default=0.0002, help='learning rate, default=0.0002')
    parser.add_argument('--beta1', type=float, default=0.5, help='beta1 for adam. default=0.5')
    parser.add_argument('--cuda', action='store_true', help='enables cuda')
    parser.add_argument('--ngpu', type=int, default=16, help='number of GPUs to use')
    parser.add_argument('--outf', default='./gan_output', help='folder to output model checkpoints')
    parser.add_argument('--manualSeed', type=int, default=999, help='manual seed')
    # parser.add_argument('--tensorboard_path', default='./runs/deepspeed', help='tensorboard log dir')
    parser.add_argument("--local_rank", type=int, default=-1, help="local_rank for distributed training on gpus")

    return parser

def set_seed(value):
    print("Random Seed: ", value)
    random.seed(value)
    torch.manual_seed(value)
    torch.cuda.manual_seed_all(value)
    np.random.seed(value)

def create_folder(folder):
    try:
        os.makedirs(folder)
    except OSError:
        pass

class AlpacaDataset(Dataset):
    def __init__(self, json_path: str, tokenizer: PreTrainedTokenizer, max_length: int):
        self.data = []
        self.tokenizer = tokenizer
        self.max_length = max_length

        # Load and filter data
        with open(json_path, 'r', encoding='utf-8') as f:
            raw_data = json.load(f)
            for entry in raw_data:
                instruction = entry.get("instruction", "")
                input_text = entry.get("input", "")
                output_text = entry.get("output", "")

                if not output_text:
                    continue  # Skip if output is empty

                # Combine instruction, input, and output
                combined_text = f"Instruction: {instruction} Input: {input_text} Output: {output_text}"
                self.data.append(combined_text)

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        text = self.data[idx]
        encoded = self.tokenizer(
            text,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )
        input_ids = encoded['input_ids'].squeeze(0)  # Shape: (max_length,)
        attention_mask = encoded['attention_mask'].squeeze(0)

        # Create target data by shifting input_ids
        target_ids = input_ids.clone()
        target_ids[:-1] = input_ids[1:]  # Shift left by one
        target_ids[-1] = -100  # Set the last token to be ignored

        return input_ids, target_ids, attention_mask

def get_alpaca_dataloader(json_path: str, tokenizer: PreTrainedTokenizer, batch_size: int, max_length: int):
    dataset = AlpacaDataset(json_path, tokenizer, max_length)
    dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)
    return dataloader

def train_stage1(cfg, combined_model, base_model_input, input_texts, criterion, ddp_stats):
    with torch.no_grad():
        outputs = combined_model.forward_base_model(input_ids=base_model_input, attention_mask=torch.ones_like(base_model_input),
                                                     output_hidden_states=True, use_cache=False)

    embeds = outputs.hidden_states[-1]

    # print(embeds.shape)  # torch.Size([8, 124, 4096])
    preds = combined_model.forward_speculator(embeds.detach(), input_texts)
    # print("preds", preds, preds.shape)   # 3, 8, 124, 128256

    # assert 1==0
    losses = []
    for i in range(preds.size(0)):
        targ = input_texts[:, i + 1: preds.size(2) + i + 1]
        # print(targ)
        loss = criterion(preds[i].reshape(-1, preds.size(3)), targ.long().reshape(-1))
        losses.append(loss)
        ddp_stats[2 + i] += loss.item()
    total_loss = sum(losses)

    return total_loss, ddp_stats, input_texts.numel()

def train_stage2(cfg, combined_model, base_model_input, input_texts, criterion, ddp_stats):
    with torch.no_grad():
        # generate() has to be called on the underlying base model, not on the forward helper method
        outputs = combined_model.base_model.generate(input_ids=base_model_input, return_dict_in_generate=True, output_hidden_states=True)
    targs = outputs.sequences
    embeds = outputs.hidden_states[-1]

    preds = combined_model.forward_speculator(embeds.detach(), targs[:, :-1].detach())
    losses = []
    for i in range(preds.size(0)):
        targ = targs[:, i + 1: preds.size(2) + i + 1]
        loss = criterion(preds[i].reshape(-1, preds.size(3)), targ.long().reshape(-1))
        losses.append(loss)
        ddp_stats[2 + i] += loss.item()
    total_loss = sum(losses)
    return total_loss, ddp_stats, targs.numel()

def train(args):
    # writer = SummaryWriter(log_dir=args.tensorboard_path)
    create_folder(args.outf)
    set_seed(args.manualSeed)
    cudnn.benchmark = True

    torch.cuda.set_device(args.local_rank)
    device = torch.device("cuda", args.local_rank) if args.cuda else torch.device("cpu")

    # tokenizer = AutoTokenizer.from_pretrained("/path_to/LLama3/8B-ins-hf/")
    tokenizer = AutoTokenizer.from_pretrained("/path_to/Mistral-Large-Instruct-2407/")
    # tokenizer = AutoTokenizer.from_pretrained("/path_to/Mistral-Small-Instruct-2409")

    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token
    tokenizer.padding_side = 'left'
    # model = AutoModelForCausalLM.from_pretrained("/path_to/LLama3/8B-ins-hf/")
    model = AutoModelForCausalLM.from_pretrained(
        "/path_to/Mistral-Large-Instruct-2407/",
        torch_dtype=torch.bfloat16,
        # device_map="auto",
        low_cpu_mem_usage=True
        )
    # model = AutoModelForCausalLM.from_pretrained("/path_to/Mistral-Small-Instruct-2409")

    emb_dim = model.config.hidden_size
    vocab_size = model.config.vocab_size

    speculator = MLPSpeculator(
        emb_dim,
        4096,  # speculator_width
        vocab_size,
        3,  # n_speculator_heads
        tie_weights=True,
        scale_input=True,
    )
    speculator.reset_parameters()

    combined_model = CombinedModel(model, speculator)
    del model
    del speculator

    criterion = nn.CrossEntropyLoss()

    model_engine_combined, optimizer, _, _ = deepspeed.initialize(args=args, model=combined_model,
                                                                  model_parameters=combined_model.speculator.parameters(),
                                                                  # optimizer=optimizer
                                                                  )
    deepspeed.init_distributed()

    torch.cuda.synchronize()

    dataloader = get_alpaca_dataloader('/path_to/alpaca_data.json',
                                       tokenizer,
                                       batch_size=8,
                                       max_length=128)
    ddp_stats = torch.zeros(2 + combined_model.speculator.n_predict).to(device)
    start = time()

    for epoch in range(args.epochs):
        for i, (input_ids, target_ids, attention_mask) in enumerate(dataloader, 0):
            input_texts = input_ids.to(device)
            base_model_input = input_texts[:, :-combined_model.speculator.n_predict - 1]
            optimizer.zero_grad()

            if i < len(dataloader) // 2:  # First half of the training: Stage 1
                loss, ddp_stats, _ = train_stage1(args, combined_model, base_model_input, input_texts, criterion,
                                                  ddp_stats)
            else:  # Second half of the training: Stage 2
                assert 1==0
                loss, ddp_stats, _ = train_stage2(args, combined_model, base_model_input, input_texts, criterion,
                                                  ddp_stats)

            # print(loss)

            model_engine_combined.backward(loss)
            # optimizer.step()
            model_engine_combined.step()

            if i % 10 == 0:
                print('EPOCH [%d/%d] ITER [%d/%d] Loss: %.4f' % (epoch, args.epochs, i, len(dataloader), loss.item()))
                # writer.add_scalar("Loss", loss.item(), epoch * len(dataloader) + i)
    torch.cuda.synchronize()
    stop = time()
    print(f"total wall clock time for {args.epochs} epochs is {stop - start} secs")

def main():
    parser = get_argument_parser()
    parser = deepspeed.add_config_arguments(parser)
    args = parser.parse_args()
    train(args)

if __name__ == "__main__":
    main()

The speculator.py is adapted from https://github.com/foundation-model-stack/fms-extras/blob/main/fms_extras/models/speculator.py

import torch.nn as nn
import torch
import math
from typing import Dict, List, Tuple, Set, Any, Optional
import torch.nn.functional as F

class LayerNormParameterized(nn.Module):
    """
    A generalized LayerNorm implementation. With all optional arguments set to True, equivalent to nn.LayerNorm up to epsilon stabilization term
    (this class divides inputs by min(norm, eps), while nn.LayerNorm divides by norm + eps).
    ...
    Args
    ----
    normalized_shape : int
        Dimensionality of input data (size of final tensor axis)
    eps : float
        Safety term to prevent division by zero. Make sure the chosen value fits in the range of your encoding scheme (i.e. fp16 requires eps >= 6e-8).
    elementwise_scale : bool
        Include a learned scaling term after normalization?
    elementwise_shift : bool
        Include a learned bias term after normalization?
    use_mean : bool
        Recenter inputs around zero before normalizing, or just rescale?
    """

    def __init__(
        self,
        normalized_shape,
        eps=1e-06,
        elementwise_scale=True,
        elementwise_shift=False,
        use_mean=False,
        use_high_precision_pow=False,
    ):
        super(LayerNormParameterized, self).__init__()
        self.normalized_shape = normalized_shape
        self.eps = eps
        self.elementwise_scale = elementwise_scale
        self.elementwise_shift = elementwise_shift
        self.use_mean = use_mean
        self.use_high_precision_pow = use_high_precision_pow

        if self.elementwise_scale:
            self.weight = nn.Parameter(torch.empty(self.normalized_shape))
        # else:
        #     self.register_parameter("weight", None)
        if self.elementwise_shift:
            self.bias = nn.Parameter(torch.empty(self.normalized_shape))
        # else:
        #     self.register_parameter("bias", None)

    def reset_parameters(self):
        if self.elementwise_scale:
            self.weight.data.fill_(1)
        if self.elementwise_shift:
            self.bias.data.zero_()

    def forward(self, x):
        if self.use_mean:
            x = x - x.mean(-1, keepdim=True)
        # x = F.normalize(x, dim=-1)*math.sqrt(x.size(-1))
        xf = x
        if self.use_high_precision_pow:
            xf = x.float()
        xf = xf * torch.rsqrt(xf.pow(2).mean(-1, keepdim=True) + self.eps)
        x = xf.type_as(x)
        if self.elementwise_scale:
            x = self.weight * x
        if self.elementwise_shift:
            x = x + self.bias
        return x

class MLPSpeculator(nn.Module):
    """
    This is a simple MLP-based speculator that functions similarly to Medusa
    (https://arxiv.org/abs/2401.10774), ingesting context via the final embedding
    vector from the base model. However, this model also conditions on previously
    predicted tokens, similarly to an RNN, allowing it to generate better-quality n-grams.

    The architecture is as flat and simple as possible: for each prediction head,
    the current state vector is projected into a new latent space and added to the
    previous token's embedding. This sum goes through layernorm and activation, forming
    the new state vector. This state predicts the next token (or set of candidate tokens)
    for the current head, and then is passed on to the next.
    ...
    Args
    ----
    emb_dim : int
        Dimensionality of the input vector from the base model.
    inner_dim : int
        Latent dimensionality of the speculator model.
    vocab_size : int
        Number of entries in the tokenizer associated with the base model.
    n_predict : int
        Number of heads / number of tokens to guess ahead. Model size and speed scale with this value.
    tie_weights : bool
        If true, use a single set of weights for every model head/stage after the first.
        The initial projection from the base model may have a different size, so that stays separate.
    scale_input: bool
        If true, apply an extra layernorm to the initial state vector input.
        Helps training dynamics, particularly when base model output has unusual scale.
    """

    def __init__(
        self,
        emb_dim=4096,
        inner_dim=0,
        vocab_size=32000,
        n_predict=3,
        tie_weights=False,
        scale_input=False,
    ):
        super().__init__()
        self.n_predict = n_predict
        self.emb_dim = emb_dim
        inner_dim = inner_dim if inner_dim != 0 else emb_dim
        self.inner_dim = inner_dim
        self.vsize = vocab_size
        self.scale_input = scale_input
        self.emb = nn.ModuleList(
            [nn.Embedding(vocab_size, inner_dim) for _ in range(n_predict)]
        )
        self.proj = nn.ModuleList(
            [
                nn.Linear((emb_dim if i == 0 else inner_dim), inner_dim, bias=False)
                for i in range(n_predict)
            ]
        )
        self.head = nn.ModuleList(
            [nn.Linear(inner_dim, vocab_size, bias=False) for _ in range(n_predict)]
        )
        self.ln = nn.ModuleList(
            [
                LayerNormParameterized(
                    inner_dim, elementwise_shift=True, elementwise_scale=True
                )
                for _ in range(n_predict)
            ]
        )
        if self.scale_input:
            self.ln0 = LayerNormParameterized(
                emb_dim, elementwise_shift=False, elementwise_scale=False
            )
        # Weights ensure that state_0 accounts for 50% of state magnitude by final head in expectation
        self.state_weight = 0.5 ** (0.5 / n_predict)
        self.emb_weight = math.sqrt((1 - self.state_weight**2) * (self.inner_dim / 2))
        self.activation = nn.GELU()

        # Handle weight tying as specified
        if tie_weights:
            assert (
                n_predict > 1
            ), "You cannot tie weights between stages when only 1 exists"

            for emb in self.emb:
                emb.weight = self.emb[0].weight

            for head in self.head:
                head.weight = self.head[0].weight

            for ln in self.ln:
                ln.weight = self.ln[0].weight
                ln.bias = self.ln[0].bias

            # Since first proj has different size, allow different initial proj from base into model
            for i in range(2, n_predict):
                self.proj[i].weight = self.proj[1].weight

    def reset_parameters(self):
        for m in self.modules():
            if isinstance(m, nn.Embedding) or isinstance(m, nn.Linear):
                nn.init.normal_(m.weight, 0, 1 / math.sqrt(self.inner_dim))
            elif isinstance(m, LayerNormParameterized) and hasattr(m, "weight"):
                m.weight.data.fill_(1)
                m.bias.data.zero_()

    def generate_suffixes(
        self,
        state: torch.Tensor,
        ind: torch.Tensor,
        topk: List[int] = [5, 4, 3],
        n: int = 5,
    ) -> torch.Tensor:
        """
        FOR INFERENCE
        Generate tree of candidate sequences.
        ...
        Args
        ----
        state : torch.Tensor
            Most recent embedding vector from the base model (pre-classification head).
            Expects size [b 1 d] where b is batch size and d is model width.
        ind : torch.Tensor
            Token indices of the base model's most recent predicted token(s).
            Expects size [b 1] where b is batch size.
        topk : List(int)
            Number of tokens to consider from each head when forming the candidate tree.
            For each candidate branch in the tree, head n produces topk[n] additional sub-branches.
        n : int
            Given the final tree of prod(topk) candidates, return only the top n most confident.
        ...
        Output : torch.Tensor
            The tensor of most likely candidate sequences.
            Has size [b n self.n_predict], where b is batch size and n is provided above.
        """
        # k indicates # of candidates
        # h indicates # of generated tokens
        b = state.size(0)
        k = math.prod(topk)
        out = torch.empty(
            b, 1, k, self.n_predict, device=state.device
        ).int()  # b 1 k h -> b k 1 h
        log_probs = torch.zeros(b, 1, k, device=state.device)  # b 1 k -> b k 1
        assert (
            len(topk) == self.n_predict
        ), f"You must provide a topk number for each head ({self.n_predict} heads, {len(topk)} provided)"
        if self.scale_input:
            state = self.ln0(state) / (2**0.5)
        for i in range(self.n_predict):
            # Project and predict
            z = self.emb[i](ind)  # b k d
            state = self.proj[i](state)
            # Weighted add of state_weight*state and emb_weight*z
            # Let subsequent LN take care of denominator
            # state_weight is close to 1, so shouldn't be any precision issues
            state = torch.add(state, z, alpha=self.emb_weight / self.state_weight)
            state = self.activation(self.ln[i](state))  # b k d
            probs = F.log_softmax(self.head[i](state), dim=2)  # b k v
            probs, preds = probs.topk(topk[i], dim=2)  # b k k'

            # Update candidate set with new predictions, repeating shared prefixes as needed
            out = out.view(b, preds.size(1) * preds.size(2), -1, self.n_predict)
            out[:, :, :, i] = preds.view(b, -1, 1)

            # Update state, log_probs and ind for new predictions
            state = state.unsqueeze(2).expand(-1, -1, topk[i], -1)  # b k k' d
            state = state.reshape(b, -1, state.size(3))  # b kk' d
            ind = preds.view(b, -1)  # b kk'
            log_probs = log_probs.view(b, probs.size(1) * probs.size(2), -1)
            log_probs = log_probs.add(probs.view(b, -1, 1))

        # Take only top n best guesses
        out = out.view(b, k, self.n_predict)
        log_probs = log_probs.view(b, k)
        best_guesses = log_probs.topk(n, dim=1)[1]  # b k
        return out.gather(
            1, best_guesses.unsqueeze(2).expand(-1, -1, self.n_predict)
        )  # b n h

    def forward(
        self,
        state: torch.Tensor,
        inds: torch.Tensor,
    ) -> torch.Tensor:
        """
        FOR TRAINING
        A parallel forward pass on pre-existing ground-truth tokens in pretraining contexts.
        Produces self.n_predict predicted tokens for each token embedding in state.
        Inds requires self.n_predict extra tokens on the right to "simulate" recursive
        behavior for end positions.
        ...
        Args
        ----
        state : torch.Tensor
            Embedding vectors from the base model for a given sequence.
            Expects size [b n d] where b is batch size, n is seq len, and d is model width.
        inds : torch.Tensor
            Ground-truth token indices. inds[:,i] is the prediction coming from state[:,i]
            (or the legal fiction ground truth corresponding to that prediction).
            Expects size [b n+self.n_predict].
        ...
        Output : torch.Tensor
            Prediction logits at each position, for each head of the speculator.
            Has size [self.n_predict b n v] where v is vocab size.
        """
        out = []
        if self.scale_input:
            state = self.ln0(state) / (2**0.5)
        for i in range(self.n_predict):
            z = self.emb[i](inds[:, i : i + state.size(1)])  # b n d
            state = self.proj[i](state)
            # Weighted add of state_weight*state and emb_weight*z
            # Let subsequent LN take care of denominator
            # state_weight is close to 1, so shouldn't be any precision issues
            state = torch.add(state, z, alpha=self.emb_weight / self.state_weight)
            state = self.activation(self.ln[i](state))  # b n d
            out.append(self.head[i](state))  # b n v
        return torch.stack(out, dim=0)  # h b n v

My command is

deepspeed --hostfile=hostfile --num_nodes=12 --num_gpus=8 main_new.py --cuda --deepspeed_config deepspeed_config.json
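For completeness, the hostfile passed via --hostfile uses the standard DeepSpeed format of one "<hostname> slots=<num_gpus>" entry per node; the worker names below are placeholders rather than my real node names:

worker-1 slots=8
worker-2 slots=8
# ... one such entry per node, 12 entries in total to match --num_nodes=12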

The error is (from one rank):

worker-5: [rank22]:   File "/nvfile-data/thu/hehaowei/codellama-main/fms_hhw/main_new.py", line 307, in <module>
worker-5: [rank22]:     main()
worker-5: [rank22]:   File "/nvfile-data/thu/hehaowei/codellama-main/fms_hhw/main_new.py", line 303, in main
worker-5: [rank22]:     train(args)
worker-5: [rank22]:   File "/nvfile-data/thu/hehaowei/codellama-main/fms_hhw/main_new.py", line 242, in train
worker-5: [rank22]:     model_engine_combined, optimizer, _, _ = deepspeed.initialize(args=args, model=combined_model,
worker-5: [rank22]:   File "/root/miniconda/envs/hurry_up_hhw_h/lib/python3.10/site-packages/deepspeed/__init__.py", line 181, in initialize
worker-5: [rank22]:     engine = DeepSpeedEngine(args=args,
worker-5: [rank22]:   File "/root/miniconda/envs/hurry_up_hhw_h/lib/python3.10/site-packages/deepspeed/runtime/engine.py", line 262, in __init__
worker-5: [rank22]:     self._configure_distributed_model(model)
worker-5: [rank22]:   File "/root/miniconda/envs/hurry_up_hhw_h/lib/python3.10/site-packages/deepspeed/runtime/engine.py", line 1103, in _configure_distributed_model
worker-5: [rank22]:     self.module.to(self.device)
worker-5: [rank22]:   File "/root/miniconda/envs/hurry_up_hhw_h/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1174, in to
worker-5: [rank22]:     return self._apply(convert)
worker-5: [rank22]:   File "/root/miniconda/envs/hurry_up_hhw_h/lib/python3.10/site-packages/torch/nn/modules/module.py", line 780, in _apply
worker-5: [rank22]:     module._apply(fn)
worker-5: [rank22]:   File "/root/miniconda/envs/hurry_up_hhw_h/lib/python3.10/site-packages/torch/nn/modules/module.py", line 780, in _apply
worker-5: [rank22]:     module._apply(fn)
worker-5: [rank22]:   File "/root/miniconda/envs/hurry_up_hhw_h/lib/python3.10/site-packages/torch/nn/modules/module.py", line 780, in _apply
worker-5: [rank22]:     module._apply(fn)
worker-5: [rank22]:   [Previous line repeated 3 more times]
worker-5: [rank22]:   File "/root/miniconda/envs/hurry_up_hhw_h/lib/python3.10/site-packages/torch/nn/modules/module.py", line 805, in _apply
worker-5: [rank22]:     param_applied = fn(param)
worker-5: [rank22]:   File "/root/miniconda/envs/hurry_up_hhw_h/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1160, in convert
worker-5: [rank22]:     return t.to(
worker-5: [rank22]: torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 672.00 MiB. GPU 6 has a total capacity of 79.33 GiB of which 211.81 MiB is free. Process 1600879 has 79.11 GiB memory in use. Of the allocated memory 78.70 GiB is allocated by PyTorch, and 592.50 KiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

Expected behavior I have other code showing that the full Mistral-Large-2407 (a 123B model) can be trained on 12 nodes of 8x80GB GPUs using DeepSpeed ZeRO stage 3, with plenty of GPU memory to spare. So I would not expect a CUDA OOM error just from adding this speculator, which is a very small customized model.
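The traceback shows the OOM happens inside self.module.to(self.device) during deepspeed.initialize(), i.e. before ZeRO-3 has partitioned anything, so each rank apparently tries to move a full bf16 replica of the 123B base model (about 123B x 2 bytes ≈ 246 GB) onto its own 80 GB GPU. Below is a minimal sketch of one possible mitigation, assuming the HuggingFace ZeRO-3 integration (HfDeepSpeedConfig plus deepspeed.zero.Init) behaves as documented for non-Trainer code; it is only a sketch under those assumptions, not a verified fix:

import json
import torch
import deepspeed
from transformers import AutoModelForCausalLM
from transformers.integrations import HfDeepSpeedConfig
from speculator import MLPSpeculator

with open("deepspeed_config.json") as f:
    ds_config = json.load(f)

# Keeping this object alive before from_pretrained() lets HF allocate the checkpoint
# directly as ZeRO-3 partitioned parameters instead of a full copy on every rank.
dschf = HfDeepSpeedConfig(ds_config)  # must stay referenced

model = AutoModelForCausalLM.from_pretrained(
    "/path_to/Mistral-Large-Instruct-2407/",
    torch_dtype=torch.bfloat16,
)

# Hand-built modules such as the speculator can be constructed under zero.Init
# so that their parameters are also partitioned at creation time.
with deepspeed.zero.Init(config_dict_or_path=ds_config):
    speculator = MLPSpeculator(
        model.config.hidden_size,
        4096,               # speculator_width
        model.config.vocab_size,
        3,                  # n_speculator_heads
        tie_weights=True,
        scale_input=True,
    )

With the parameters already partitioned at construction time, the self.module.to(self.device) step inside deepspeed.initialize() should no longer need to materialize the full model on any single GPU.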

ds_report output

[2024-11-11 14:38:53,778] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
 [WARNING]  async_io requires the dev libaio .so object and headers but these were not found.
 [WARNING]  async_io: please install the libaio-dev package with apt
 [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found.
 [WARNING]  Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
 [WARNING]  sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
 [WARNING]  using untested triton version (3.0.0), only 1.0.0 is known to be compatible
/root/miniconda/envs/hurry_up_hhw_h/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
  def forward(ctx, input, weight, bias=None):
/root/miniconda/envs/hurry_up_hhw_h/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
  def backward(ctx, grad_output):
--------------------------------------------------
DeepSpeed C++/CUDA extension op report
--------------------------------------------------
NOTE: Ops not installed will be just-in-time (JIT) compiled at
      runtime if needed. Op compatibility means that your system
      meet the required dependencies to JIT install the op.
--------------------------------------------------
JIT compiled ops requires ninja
ninja .................. [OKAY]
--------------------------------------------------
op name ................ installed .. compatible
--------------------------------------------------
 [WARNING]  async_io requires the dev libaio .so object and headers but these were not found.
 [WARNING]  async_io: please install the libaio-dev package with apt
 [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found.
async_io ............... [NO] ....... [NO]
fused_adam ............. [NO] ....... [OKAY]
cpu_adam ............... [NO] ....... [OKAY]
cpu_adagrad ............ [NO] ....... [OKAY]
cpu_lion ............... [NO] ....... [OKAY]
 [WARNING]  Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
evoformer_attn ......... [NO] ....... [NO]
fp_quantizer ........... [NO] ....... [OKAY]
fused_lamb ............. [NO] ....... [OKAY]
fused_lion ............. [NO] ....... [OKAY]
inference_core_ops ..... [NO] ....... [OKAY]
cutlass_ops ............ [NO] ....... [OKAY]
transformer_inference .. [NO] ....... [OKAY]
quantizer .............. [NO] ....... [OKAY]
ragged_device_ops ...... [NO] ....... [OKAY]
ragged_ops ............. [NO] ....... [OKAY]
random_ltd ............. [NO] ....... [OKAY]
 [WARNING]  sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
 [WARNING]  using untested triton version (3.0.0), only 1.0.0 is known to be compatible
sparse_attn ............ [NO] ....... [NO]
spatial_inference ...... [NO] ....... [OKAY]
transformer ............ [NO] ....... [OKAY]
stochastic_transformer . [NO] ....... [OKAY]
--------------------------------------------------
DeepSpeed general environment info:
torch install path ............... ['/root/miniconda/envs/hurry_up_hhw_h/lib/python3.10/site-packages/torch']
torch version .................... 2.4.0+cu121
deepspeed install path ........... ['/root/miniconda/envs/hurry_up_hhw_h/lib/python3.10/site-packages/deepspeed']
deepspeed info ................... 0.14.4, unknown, unknown
torch cuda version ............... 12.1
torch hip version ................ None
nvcc version ..................... 12.4
deepspeed wheel compiled w. ...... torch 0.0, cuda 0.0
shared memory (/dev/shm) size .... 1007.84 GB
962086838 commented 2 days ago

I forgot to attach my ds_config.json:

{
  "train_batch_size": 96,
  "optimizer": {
    "type": "Adam",
    "params": {
      "lr": 0.0002,
      "betas": [
        0.5,
        0.999
      ],
      "eps": 1e-8
    }
  },
  "steps_per_print": 10,
  "bf16": {
    "enabled": true
  },
  "zero_optimization": {
    "stage": 3,
    "offload_optimizer": {
      "device": "cpu",
      "pin_memory": true
    },
    "stage3_gather_16bit_weights_on_model_save": true,
    "overlap_comm": true,
    "contiguous_gradients": true,
    "reduce_bucket_size": 2e8,
    "stage3_prefetch_bucket_size": 2e7,
    "stage3_param_persistence_threshold": 1e6
  }
}
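(For reference, DeepSpeed resolves train_batch_size = train_micro_batch_size_per_gpu x gradient_accumulation_steps x number of ranks; with 12 x 8 = 96 GPUs and neither of the other two fields set, "train_batch_size": 96 corresponds to a micro batch of 1 per GPU.)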
962086838 commented 2 days ago

The reason for writing such a customized combined model is that I cannot use two deepspeed.initialize() calls to initialize the two models separately. A very similar issue is discussed at https://github.com/microsoft/DeepSpeed/issues/3472#issuecomment-1574202568, and I tried to implement one of the suggestions there.
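Concretely, the two-engine variant that I could not get to work would look roughly like the sketch below (simplified, not my actual code): one DeepSpeed engine for the frozen base model and a second one for the trainable speculator.

# Simplified sketch of the two separate deepspeed.initialize() calls;
# model and speculator are the same objects constructed in train() above.
base_engine, _, _, _ = deepspeed.initialize(
    args=args,
    model=model,
    model_parameters=model.parameters(),  # all frozen (requires_grad=False)
)
spec_engine, spec_optimizer, _, _ = deepspeed.initialize(
    args=args,
    model=speculator,
    model_parameters=speculator.parameters(),
)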