hpcaitech / ColossalAI

Making large AI models cheaper, faster and more accessible
https://www.colossalai.org
Apache License 2.0

[BUG]: assert grad_chunk.l2_norm is not None #6102

Open liangzz1991 opened 5 days ago

liangzz1991 commented 5 days ago

Is there an existing issue for this bug?

  • [x] I have searched the existing issues
🐛 Describe the bug

I modified my code to adapt it to Qwen2-VL (transformers.Qwen2VLForConditionalGeneration) and found that the loss can be computed, but for some chunks grad_chunk.l2_norm is None (a plain LLM trains fine). (First screenshot: the assertion error.)

I modified the source code to print more information. (Second screenshot: the modified lines; third screenshot: the printed result.)

Environment

No response

Edenzzzz commented 4 days ago

For this kind of issue, it makes more sense to share what modifications you made, since we don't support individual changes.

liangzz1991 commented 4 days ago

> For this kind of issue, it makes more sense to share what modifications you made, since we don't support individual changes.

It's the problem I described: I didn't modify ColossalAI's source code, I simply used ColossalAI to train Qwen2-VL. The only change I made is the two lines shown in the second picture, just to print the error details.

The following is part of the training code. The error occurs at `optimizer.step()`, as shown in the first picture.

```python
for step, batch in enumerate(prefetcher, start=st_step):
    collect_metric("time", "dataloader", time_metric)

    print(batch)

    if step == args.train_steps:
        logger.info(f"rank-{rank} -> max train step reached ({step}/{args.train_steps}), stop training")
        break

    if not args.only_forward and one_rank_done.item() > 0:
        step -= 1  # to avoid inconsistent step between ranks
        break
    if not args.only_forward and prefetcher.next is None:
        logger.info(f"rank-{rank} -> no next batch in prefetcher, stop training")
        one_rank_done.add_(1.0)
    distributed.all_sum(one_rank_done)

    cur_global_tokens = batch["n_tokens"]
    distributed.all_sum(cur_global_tokens)

    batch_token_pass = cur_global_tokens.item() / args.tp_size * args.extra_dp_size
    global_token_pass += batch_token_pass
    cost_track_acc_tokens += batch_token_pass
    cost_track_acc_batches += args.batch_size
    extra_states["global_token_pass"] = global_token_pass

    cur_global_samples = batch["n_samples"]
    distributed.all_sum(cur_global_samples)
    batch_sample_pass = cur_global_samples.item() / args.tp_size * args.extra_dp_size
    global_sample_pass += batch_sample_pass
    extra_states["global_sample_pass"] = global_sample_pass

    # core logic
    outputs = model(**{key: batch[key] for key in forward_keys})
    dataset.track(batch["stats"])
    collect_metric("time", "forward", time_metric)

    reduction = "mean" if not args.only_forward else "sum"
    loss = lm_cross_entropy(outputs.logits, batch["labels"], reduction=reduction)

    if args.z_loss_coef > 0.0:
        loss += args.z_loss_coef * lm_z_loss(outputs.logits).to(loss.device)

    global_aux_loss = 0.0
    if hasattr(outputs, "aux_loss") and outputs.aux_loss is not None:
        aux_loss = outputs.aux_loss.to(loss.device)
        distributed.all_mean(aux_loss)
        global_aux_loss = aux_loss.item()

        if args.add_aux_loss:
            loss += args.router_aux_loss_coef * outputs.aux_loss.to(loss.device)

    loss /= args.grad_accum_steps
    print(loss)
    if not args.only_forward:
        booster.backward(loss, optimizer)
    collect_metric("time", "backward", time_metric)

    if not args.only_forward and step % args.grad_accum_steps == 0:
        optimizer.step()  # fails here: assert grad_chunk.l2_norm is not None
        lr_scheduler.step()
        optimizer.zero_grad()
```
liangzz1991 commented 4 days ago


The third picture in the issue description shows that the loss can be printed.

Edenzzzz commented 2 days ago

Seems that your code is not the newest version. Could you pull the newest main branch and try again?

liangzz1991 commented 2 days ago

> Seems that your code is not the newest version. Could you pull the newest main branch and try again?

With torch 2.4.0 + colossalai 0.4.5 I get the same error:

(screenshots showing the same assertion error)
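For reference, a quick way to double-check which versions are actually picked up in the environment (a minimal sketch; both `torch` and `colossalai` expose `__version__`):

```python
# Print the torch and colossalai versions seen by the Python interpreter.
import torch
import colossalai

print("torch:", torch.__version__)
print("colossalai:", colossalai.__version__)
```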

liangzz1991 commented 2 days ago

This is a simple piece of code that will give the same error:

```python
import os

import torch
import torch.nn as nn
from transformers import GPT2LMHeadModel, GPT2Config
from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor

import colossalai
from colossalai.nn.optimizer import HybridAdam
from colossalai.booster import Booster
from colossalai.booster.plugin import GeminiPlugin
from colossalai.lazy import LazyInitContext

os.environ["RANK"] = "0"
os.environ["LOCAL_RANK"] = "0"


class GPTLMModel(nn.Module):
    def __init__(self,
                 hidden_size=768,
                 num_layers=12,
                 num_attention_heads=12,
                 max_seq_len=1024,
                 vocab_size=50257,
                 checkpoint=False):
        super().__init__()
        self.checkpoint = checkpoint
        self.model = GPT2LMHeadModel(
            GPT2Config(n_embd=hidden_size,
                       n_layer=num_layers,
                       n_head=num_attention_heads,
                       n_positions=max_seq_len,
                       n_ctx=max_seq_len,
                       vocab_size=vocab_size))
        if checkpoint:
            self.model.gradient_checkpointing_enable()

    def forward(self, input_ids, attention_mask):
        return self.model(input_ids=input_ids, attention_mask=attention_mask, use_cache=not self.checkpoint)[0]


def gpt2_medium(checkpoint=False):
    return GPTLMModel(hidden_size=1024, num_layers=24, num_attention_heads=16, checkpoint=checkpoint)


class GPTLMLoss(nn.Module):
    def __init__(self):
        super().__init__()
        self.loss_fn = nn.CrossEntropyLoss()

    def forward(self, logits, labels):
        shift_logits = logits[..., :-1, :].contiguous()
        shift_labels = labels[..., 1:].contiguous()
        return self.loss_fn(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1))


def get_data(batch_size, seq_len, vocab_size):
    input_ids = torch.randint(0, vocab_size, (batch_size, seq_len), device=torch.cuda.current_device())
    attention_mask = torch.ones_like(input_ids)
    return input_ids, attention_mask


def main():
    # args = parse_args()  # parse_args() is not defined in this snippet and is not used below

    BATCH_SIZE = 8
    SEQ_LEN = 1024
    VOCAB_SIZE = 50257
    NUM_STEPS = 10
    colossalai.launch_from_torch()

    # build model
    with LazyInitContext(default_device=torch.device('cuda')):
        # model = gpt2_medium(checkpoint=True)  # a plain LLM works fine
        model = Qwen2VLForConditionalGeneration.from_pretrained(
            "/data/liuxiaoyu/liuxiaoyu/models/Qwen2-VL-2B-Instruct", device_map="auto"
        )

    # build criterion
    criterion = GPTLMLoss()
    optimizer = HybridAdam(model.parameters(), lr=0.001)

    torch.manual_seed(123)

    # Gemini + ZeRO DP
    plugin = GeminiPlugin(max_norm=1.0, initial_scale=2**5)
    booster = Booster(plugin=plugin)
    model, optimizer, criterion, _, _ = booster.boost(model, optimizer, criterion)

    torch.cuda.synchronize()
    model.train()
    for n in range(NUM_STEPS):
        print(n)
        # we just use randomly generated data here
        input_ids, attn_mask = get_data(BATCH_SIZE, SEQ_LEN, VOCAB_SIZE)
        optimizer.zero_grad()
        outputs = model(input_ids, attn_mask)
        # print(outputs)
        loss = criterion(outputs.logits, input_ids)
        booster.backward(loss, optimizer)
        optimizer.step()  # fails here: assert grad_chunk.l2_norm is not None

    torch.cuda.synchronize()


if __name__ == '__main__':
    main()
```
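A note on running the snippet above: it only sets `RANK` and `LOCAL_RANK` by hand, while `colossalai.launch_from_torch()` also reads `WORLD_SIZE`, `MASTER_ADDR` and `MASTER_PORT` from the environment, so presumably it is launched via `torchrun --nproc_per_node=1` (or `colossalai run`), which sets all of these automatically.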

liangzz1991 commented 5 hours ago

@Edenzzzz
