[Open] liangzz1991 opened this issue 5 days ago
For this kind of issue, it would help to share exactly what modifications you made, since we cannot support individually modified code.
The problem is exactly as I described. I did not change the training logic of the source code; I used ColossalAI to train Qwen2-VL. The only change to the source code is the two lines shown in the second screenshot, added just to print more error details.
Below is part of the training code. The error occurs at `optimizer.step()`, as shown in the first screenshot.
```python
for step, batch in enumerate(prefetcher, start=st_step):
    collect_metric("time", "dataloader", time_metric)
    if step == args.train_steps:
        logger.info(f"rank-{rank} -> max train step reached ({step}/{args.train_steps}), stop training")
        break
    if not args.only_forward and one_rank_done.item() > 0:
        step -= 1  # to avoid inconsistent step between ranks
        break
    if not args.only_forward and prefetcher.next is None:
        logger.info(f"rank-{rank} -> no next batch in prefetcher, stop training")
        one_rank_done.add_(1.0)
    distributed.all_sum(one_rank_done)

    cur_global_tokens = batch["n_tokens"]
    distributed.all_sum(cur_global_tokens)
    batch_token_pass = cur_global_tokens.item() / args.tp_size * args.extra_dp_size
    global_token_pass += batch_token_pass
    cost_track_acc_tokens += batch_token_pass
    cost_track_acc_batches += args.batch_size
    extra_states["global_token_pass"] = global_token_pass

    cur_global_samples = batch["n_samples"]
    distributed.all_sum(cur_global_samples)
    batch_sample_pass = cur_global_samples.item() / args.tp_size * args.extra_dp_size
    global_sample_pass += batch_sample_pass
    extra_states["global_sample_pass"] = global_sample_pass

    # core logic
    outputs = model(**{key: batch[key] for key in forward_keys})
    dataset.track(batch["stats"])
    collect_metric("time", "forward", time_metric)

    reduction = "mean" if not args.only_forward else "sum"
    loss = lm_cross_entropy(outputs.logits, batch["labels"], reduction=reduction)
    if args.z_loss_coef > 0.0:
        loss += args.z_loss_coef * lm_z_loss(outputs.logits).to(loss.device)

    global_aux_loss = 0.0
    if hasattr(outputs, "aux_loss") and outputs.aux_loss is not None:
        aux_loss = outputs.aux_loss.to(loss.device)
        distributed.all_mean(aux_loss)
        global_aux_loss = aux_loss.item()
        if args.add_aux_loss:
            loss += args.router_aux_loss_coef * outputs.aux_loss.to(loss.device)

    loss /= args.grad_accum_steps
    print(loss)

    if not args.only_forward:
        booster.backward(loss, optimizer)
        collect_metric("time", "backward", time_metric)

    if not args.only_forward and step % args.grad_accum_steps == 0:
        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
```
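One way to narrow this down is to check whether some parameters never receive gradients at all. The following is a minimal diagnostic sketch, assuming the cause is that parameters such as the Qwen2-VL vision tower get no gradients for text-only batches; it runs a forward/backward on a plain, non-boosted copy of the model (the hub id and dtype here are placeholders) and lists parameters whose `.grad` stays `None`:

```python
# Diagnostic sketch (not part of the original training script): run a plain
# forward/backward without the Gemini plugin and list parameters that never
# receive a gradient. With text-only inputs, the Qwen2-VL vision tower is
# expected to show up in this list.
import torch
from transformers import Qwen2VLForConditionalGeneration

model = Qwen2VLForConditionalGeneration.from_pretrained(
    "Qwen/Qwen2-VL-2B-Instruct", torch_dtype=torch.bfloat16
).cuda()

input_ids = torch.randint(0, model.config.vocab_size, (1, 128), device="cuda")
outputs = model(input_ids=input_ids, labels=input_ids)
outputs.loss.backward()

no_grad_params = [n for n, p in model.named_parameters()
                  if p.requires_grad and p.grad is None]
print(f"{len(no_grad_params)} parameters received no gradient")
print(no_grad_params[:10])
```

If the vision tower shows up in that list, the corresponding Gemini gradient chunks would have nothing reduced into them, which would be consistent with `grad_chunk.l2_norm` being `None`.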
Is there an existing issue for this bug?
- [x] I have searched the existing issues
🐛 Describe the bug
I modified my code to adapt it to Qwen2-VL (transformers.Qwen2VLForConditionalGeneration) and found that the loss can be computed, but for some chunks `grad_chunk.l2_norm` is None (a plain LLM works fine).
I modified the ColossalAI source code to print more information; the result is shown in the screenshots:
Environment
No response
The third screenshot shows that the loss can still be printed.
It seems your code is not the newest version. Could you pull the latest main branch and try again?
torch 2.4.0 + colossalai 0.4.5 : same error
Here is a minimal script that reproduces the same error:
```python
import os

import torch
import torch.nn as nn
from transformers import GPT2LMHeadModel, GPT2Config
from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor

import colossalai
from colossalai.booster import Booster
from colossalai.booster.plugin import GeminiPlugin
from colossalai.lazy import LazyInitContext
from colossalai.nn.optimizer import HybridAdam


class GPTLMLoss(nn.Module):
    def __init__(self):
        super().__init__()
        self.loss_fn = nn.CrossEntropyLoss()

    def forward(self, logits, labels):
        shift_logits = logits[..., :-1, :].contiguous()
        shift_labels = labels[..., 1:].contiguous()
        return self.loss_fn(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1))


def get_data(batch_size, seq_len, vocab_size):
    input_ids = torch.randint(0, vocab_size, (batch_size, seq_len), device=torch.cuda.current_device())
    attention_mask = torch.ones_like(input_ids)
    return input_ids, attention_mask


def main():
    BATCH_SIZE = 8
    SEQ_LEN = 1024
    VOCAB_SIZE = 50257
    NUM_STEPS = 10

    colossalai.launch_from_torch()

    # build GPT model
    with LazyInitContext(default_device=torch.device('cuda')):
        # model = gpt2_medium(checkpoint=True)
        model = Qwen2VLForConditionalGeneration.from_pretrained(
            "/data/liuxiaoyu/liuxiaoyu/models/Qwen2-VL-2B-Instruct", device_map="auto"
        )

    # build criterion
    criterion = GPTLMLoss()
    optimizer = HybridAdam(model.parameters(), lr=0.001)
    torch.manual_seed(123)

    # Gemini + ZeRO DP
    plugin = GeminiPlugin(max_norm=1.0, initial_scale=2**5)
    booster = Booster(plugin=plugin)
    model, optimizer, criterion, _, _ = booster.boost(model, optimizer, criterion)

    torch.cuda.synchronize()
    model.train()
    for n in range(NUM_STEPS):
        print(n)
        # we just use randomly generated data here
        input_ids, attn_mask = get_data(BATCH_SIZE, SEQ_LEN, VOCAB_SIZE)
        optimizer.zero_grad()
        outputs = model(input_ids, attn_mask)
        # print(outputs)
        loss = criterion(outputs.logits, input_ids)
        booster.backward(loss, optimizer)
        optimizer.step()
    torch.cuda.synchronize()


if __name__ == '__main__':
    main()
```
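Note that this repro only feeds text inputs, so the vision tower of `Qwen2VLForConditionalGeneration` never produces gradients. If that is indeed the cause, one workaround sketch (an assumption on my part, not confirmed by the maintainers) is to freeze the vision tower before `booster.boost()` so its parameters stay out of the optimizer and Gemini never has to compute a gradient norm for those chunks. The attribute name `visual` matches the current transformers implementation of Qwen2-VL; adjust it if your version differs:

```python
# Hypothetical workaround: freeze the vision tower before boosting so that
# parameters which never receive gradients are excluded from the optimizer.
for p in model.visual.parameters():
    p.requires_grad_(False)

optimizer = HybridAdam(
    [p for p in model.parameters() if p.requires_grad], lr=0.001
)
```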
@Edenzzzz