Question about Supervised instructs tuning in ColossalAI/applications/Chat/

chaojiewang94 commented 1 year ago

🐛 Describe the bug

when I set the lora_rank in example/train_sft.sh to 8, the bug happens as following:

Traceback (most recent call last):
  File "/home/chaojiewang/NeurIPS_2023/Chatgpt/coati/train_sft.py", line 185, in <module>
    train(args)
  File "/home/chaojiewang/NeurIPS_2023/Chatgpt/coati/train_sft.py", line 156, in train
    trainer.fit(logger=logger, log_interval=args.log_interval)
  File "/home/chaojiewang/NeurIPS_2023/Chatgpt/coati/coati/trainer/sft.py", line 110, in fit
    self.strategy.optimizer_step(self.optimizer)
  File "/home/chaojiewang/NeurIPS_2023/Chatgpt/coati/coati/trainer/strategies/colossalai.py", line 154, in optimizer_step
    optimizer.step()
  File "/home/chaojiewang/anaconda3/lib/python3.10/site-packages/torch/optim/lr_scheduler.py", line 68, in wrapper
    return wrapped(*args, **kwargs)
  File "/home/chaojiewang/anaconda3/lib/python3.10/site-packages/colossalai/zero/sharded_optim/low_level_optim.py", line 467, in step
    assert param_shape == flat_fp32_avg_grads.shape, \
AssertionError: fp32 param and grad have different shape torch.Size([5069312]) vs torch.Size([72192])
╭─────────────────────────────── Traceback (most recent call last) ────────────────────────────────╮
│ /home/chaojiewang/NeurIPS_2023/Chatgpt/coati/train_sft.py:185 in <module>                        │
│                                                                                                  │
│   182 │   parser.add_argument('--lr', type=float, default=5e-6)                                  │
│   183 │   parser.add_argument('--accimulation_steps', type=int, default=8)                       │
│   184 │   args = parser.parse_args()                                                             │
│ ❱ 185 │   train(args)                                                                            │
│   186                                                                                            │
│                                                                                                  │
│ /home/chaojiewang/NeurIPS_2023/Chatgpt/coati/train_sft.py:156 in train                           │
│                                                                                                  │
│   153 │   │   │   │   │   │    max_epochs=args.max_epochs,                                       │
│   154 │   │   │   │   │   │    accimulation_steps=args.accimulation_steps)                       │
│   155 │                                                                                          │
│ ❱ 156 │   trainer.fit(logger=logger, log_interval=args.log_interval)                             │
│   157 │                                                                                          │
│   158 │   # save model checkpoint after fitting on only rank0                                    │
│   159 │   trainer.save_model(path=args.save_path, only_rank0=True, tokenizer=tokenizer)          │
│                                                                                                  │
│ /home/chaojiewang/NeurIPS_2023/Chatgpt/coati/coati/trainer/sft.py:110 in fit                     │
│                                                                                                  │
│   107 │   │   │   │                                                                              │
│   108 │   │   │   │   # gradient accumulation                                                    │
│   109 │   │   │   │   if (batch_id + 1) % self.accimulation_steps == 0:                          │
│ ❱ 110 │   │   │   │   │   self.strategy.optimizer_step(self.optimizer)                           │
│   111 │   │   │   │   │   self.optimizer.zero_grad()                                             │
│   112 │   │   │   │   │   self.scheduler.step()                                                  │
│   113 │   │   │   │   │   wandb.log({                                                            │
│                                                                                                  │
│ /home/chaojiewang/NeurIPS_2023/Chatgpt/coati/coati/trainer/strategies/colossalai.py:154 in       │
│ optimizer_step                                                                                   │
│                                                                                                  │
│   151 │   │   optimizer.backward(loss)                                                           │
│   152 │                                                                                          │
│   153 │   def optimizer_step(self, optimizer: optim.Optimizer, **kwargs) -> None:                │
│ ❱ 154 │   │   optimizer.step()                                                                   │
│   155 │                                                                                          │
│   156 │   @staticmethod                                                                          │
│   157 │   def _unwrap_actor(actor: Actor) -> nn.Module:                                          │
│                                                                                                  │
│ /home/chaojiewang/anaconda3/lib/python3.10/site-packages/torch/optim/lr_scheduler.py:68 in       │
│ wrapper                                                                                          │
│                                                                                                  │
│     65 │   │   │   │   instance = instance_ref()                                                 │
│     66 │   │   │   │   instance._step_count += 1                                                 │
│     67 │   │   │   │   wrapped = func.__get__(instance, cls)                                     │
│ ❱   68 │   │   │   │   return wrapped(*args, **kwargs)                                           │
│     69 │   │   │                                                                                 │
│     70 │   │   │   # Note that the returned function here is no longer a bound method,           │
│     71 │   │   │   # so attributes like `__func__` and `__self__` no longer exist.               │
│                                                                                                  │
│ /home/chaojiewang/anaconda3/lib/python3.10/site-packages/colossalai/zero/sharded_optim/low_level │
│ _optim.py:467 in step                                                                            │
│                                                                                                  │
│   464 │   │   │   flat_fp32_avg_grads = flat_fp16_avg_grads.to(dtype)                            │
│   465 │   │   │                                                                                  │
│   466 │   │   │   param_shape = self._fp32_flat_param_groups_of_current_rank[group_id].shape     │
│ ❱ 467 │   │   │   assert param_shape == flat_fp32_avg_grads.shape, \                             │
│   468 │   │   │   │   f'fp32 param and grad have different shape {param_shape} vs {flat_fp32_a   │
│   469 │   │   │                                                                                  │
│   470 │   │   │   single_grad_partition_groups.append(flat_fp32_avg_grads)                       │
╰──────────────────────────────────────────────────────────────────────────────────────────────────╯
AssertionError: fp32 param and grad have different shape torch.Size([5069312]) vs torch.Size([72192])

Environment

torchrun --standalone --nproc_per_node=1 train_sft.py \
    --pretrain "decapoda-research/llama-7b-hf" \
    --model 'llama' \
    --strategy colossalai_zero2 \
    --log_interval 10 \
    --save_path  ./Coati-7B \
    --dataset ./data/instinwild_en.json \
    --batch_size 1 \
    --accimulation_steps 8 \
    --lr 2e-5 \
    --max_datasets_size 512 \
    --max_epochs 1 \
    --lora_rank 8

No response

akk-123 commented 1 year ago

@ht-zhou I get same error

ZHENG518 commented 1 year ago

@ht-zhou I get same error

ht-zhou commented 1 year ago

Thanks for your feedback. And I will try to reproduce the bug and response soon.

suc16 commented 1 year ago

🐛 Describe the bug

when I set the lora_rank in example/train_sft.sh to 8, the bug happens as following:

Traceback (most recent call last): File "/home/chaojiewang/NeurIPS_2023/Chatgpt/coati/train_sft.py", line 185, in train(args) File "/home/chaojiewang/NeurIPS_2023/Chatgpt/coati/train_sft.py", line 156, in train trainer.fit(logger=logger, log_interval=args.log_interval) File "/home/chaojiewang/NeurIPS_2023/Chatgpt/coati/coati/trainer/sft.py", line 110, in fit self.strategy.optimizer_step(self.optimizer) File "/home/chaojiewang/NeurIPS_2023/Chatgpt/coati/coati/trainer/strategies/colossalai.py", line 154, in optimizer_step optimizer.step() File "/home/chaojiewang/anaconda3/lib/python3.10/site-packages/torch/optim/lr_scheduler.py", line 68, in wrapper return wrapped(args, kwargs) File "/home/chaojiewang/anaconda3/lib/python3.10/site-packages/colossalai/zero/sharded_optim/low_level_optim.py", line 467, in step assert param_shape == flat_fp32_avg_grads.shape, AssertionError: fp32 param and grad have different shape torch.Size([5069312]) vs torch.Size([72192]) ╭─────────────────────────────── Traceback (most recent call last) ────────────────────────────────╮ │ /home/chaojiewang/NeurIPS_2023/Chatgpt/coati/train_sft.py:185 in │ │ │ │ 182 │ parser.add_argument('--lr', type=float, default=5e-6) │ │ 183 │ parser.add_argument('--accimulation_steps', type=int, default=8) │ │ 184 │ args = parser.parse_args() │ │ ❱ 185 │ train(args) │ │ 186 │ │ │ │ /home/chaojiewang/NeurIPS_2023/Chatgpt/coati/train_sft.py:156 in train │ │ │ │ 153 │ │ │ │ │ │ max_epochs=args.max_epochs, │ │ 154 │ │ │ │ │ │ accimulation_steps=args.accimulation_steps) │ │ 155 │ │ │ ❱ 156 │ trainer.fit(logger=logger, log_interval=args.log_interval) │ │ 157 │ │ │ 158 │ # save model checkpoint after fitting on only rank0 │ │ 159 │ trainer.save_model(path=args.save_path, only_rank0=True, tokenizer=tokenizer) │ │ │ │ /home/chaojiewang/NeurIPS_2023/Chatgpt/coati/coati/trainer/sft.py:110 in fit │ │ │ │ 107 │ │ │ │ │ │ 108 │ │ │ │ # gradient accumulation │ │ 109 │ │ │ │ if (batch_id + 1) % self.accimulation_steps == 0: │ │ ❱ 110 │ │ │ │ │ self.strategy.optimizer_step(self.optimizer) │ │ 111 │ │ │ │ │ self.optimizer.zero_grad() │ │ 112 │ │ │ │ │ self.scheduler.step() │ │ 113 │ │ │ │ │ wandb.log({ │ │ │ │ /home/chaojiewang/NeurIPS_2023/Chatgpt/coati/coati/trainer/strategies/colossalai.py:154 in │ │ optimizer_step │ │ │ │ 151 │ │ optimizer.backward(loss) │ │ 152 │ │ │ 153 │ def optimizer_step(self, optimizer: optim.Optimizer, kwargs) -> None: │ │ ❱ 154 │ │ optimizer.step() │ │ 155 │ │ │ 156 │ @staticmethod │ │ 157 │ def _unwrap_actor(actor: Actor) -> nn.Module: │ │ │ │ /home/chaojiewang/anaconda3/lib/python3.10/site-packages/torch/optim/lr_scheduler.py:68 in │ │ wrapper │ │ │ │ 65 │ │ │ │ instance = instance_ref() │ │ 66 │ │ │ │ instance._step_count += 1 │ │ 67 │ │ │ │ wrapped = func.get(instance, cls) │ │ ❱ 68 │ │ │ │ return wrapped(args, **kwargs) │ │ 69 │ │ │ │ │ 70 │ │ │ # Note that the returned function here is no longer a bound method, │ │ 71 │ │ │ # so attributes like __func__ and __self__ no longer exist. │ │ │ │ /home/chaojiewang/anaconda3/lib/python3.10/site-packages/colossalai/zero/sharded_optim/low_level │ │ _optim.py:467 in step │ │ │ │ 464 │ │ │ flat_fp32_avg_grads = flat_fp16_avg_grads.to(dtype) │ │ 465 │ │ │ │ │ 466 │ │ │ param_shape = self._fp32_flat_param_groups_of_current_rank[group_id].shape │ │ ❱ 467 │ │ │ assert param_shape == flat_fp32_avg_grads.shape, \ │ │ 468 │ │ │ │ f'fp32 param and grad have different shape {param_shape} vs {flat_fp32_a │ │ 469 │ │ │ │ │ 470 │ │ │ single_grad_partition_groups.append(flat_fp32_avg_grads) │ ╰──────────────────────────────────────────────────────────────────────────────────────────────────╯ AssertionError: fp32 param and grad have different shape torch.Size([5069312]) vs torch.Size([72192])

Environment

torchrun --standalone --nproc_per_node=1 train_sft.py --pretrain "decapoda-research/llama-7b-hf" --model 'llama' --strategy colossalai_zero2 --log_interval 10 --save_path ./Coati-7B --dataset ./data/instinwild_en.json --batch_size 1 --accimulation_steps 8 --lr 2e-5 --max_datasets_size 512 --max_epochs 1 --lora_rank 8 \

No response