训练时带参数 --quantization_bit 4 报错：RuntimeError: self and mat2 must have the same dtype

shangzhensen commented 1 year ago

                                  ┏━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┓
                                  ┃ key                       ┃ value                          ┃
                                  ┡━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┩
                                  │ train_path                │ data/mixed_train_dataset.jsonl │
                                  │ dev_path                  │ data/mixed_dev_dataset.jsonl   │
                                  │ save_dir                  │ checkpoints/finetune           │
                                  │ max_source_seq_len        │ 8                              │
                                  │ max_target_seq_len        │ 8                              │
                                  │ batch_size                │ 1                              │
                                  │ learning_rate             │ 3e-05                          │
                                  │ weight_decay              │ 0.0                            │
                                  │ num_train_epochs          │ 2                              │
                                  │ warmup_ratio              │ 0.0                            │
                                  │ save_freq                 │ 1000                           │
                                  │ logging_steps             │ 100                            │
                                  │ device                    │ cuda:1                         │
                                  │ img_log_dir               │ log/fintune_log                │
                                  │ img_log_name              │ ChatGLM Fine-Tune              │
                                  │ use_lora                  │ True                           │
                                  │ use_ptuning               │ False                          │
                                  │ lora_rank                 │ 4                              │
                                  │ pre_seq_len               │ 128                            │
                                  │ prefix_projection         │ False                          │
                                  │ preprocessing_num_workers │ 1                              │
                                  │ quantization_bit          │ 4                              │
                                  └───────────────────────────┴────────────────────────────────┘

╭─────────────────────────────── Traceback (most recent call last) ────────────────────────────────╮
│ /ml/szs/Project/LLM_szs/transformers_tasks/LLM/finetune/train.py:352 in <module>
│
│   349
│   350
│   351 if __name__ == "__main__":
│ ❱ 352 │   main()
│   353
│
│ /ml/szs/Project/LLM_szs/transformers_tasks/LLM/finetune/train.py:295 in main
│
│   292 │   │   for batch in train_dataloader:
│   293 │   │   │   if args.use_lora:
│   294 │   │   │   │   with autocast():
│ ❱ 295 │   │   │   │   │   loss = model(
│   296 │   │   │   │   │   │   input_ids=batch['input_ids'].to(dtype=torch.long, device=args.de
│   297 │   │   │   │   │   │   labels=batch['labels'].to(dtype=torch.long, device=args.device)
│   298 │   │   │   │   │   ).loss
│
│ /ml/temp/envs/llm_env/lib/python3.8/site-packages/torch/nn/modules/module.py:1501 in _call_impl
│
│   1498 │   │   if not (self._backward_hooks or self._backward_pre_hooks or self._forward_hooks
│   1499 │   │   │   │   or _global_backward_pre_hooks or _global_backward_hooks
│   1500 │   │   │   │   or _global_forward_hooks or _global_forward_pre_hooks):
│ ❱ 1501 │   │   │   return forward_call(*args, **kwargs)
│   1502 │   │   # Do not call functions when jit is used
│   1503 │   │   full_backward_hooks, non_full_backward_hooks = [], []
│   1504 │   │   backward_pre_hooks = []
│
│ /ml/temp/envs/llm_env/lib/python3.8/site-packages/peft/peft_model.py:678 in forward
│
│   675 │   ):
│   676 │   │   peft_config = self.active_peft_config
│   677 │   │   if not isinstance(peft_config, PromptLearningConfig):
│ ❱ 678 │   │   │   return self.base_model(
│   679 │   │   │   │   input_ids=input_ids,
│   680 │   │   │   │   attention_mask=attention_mask,
│   681 │   │   │   │   inputs_embeds=inputs_embeds,
│
│ /ml/temp/envs/llm_env/lib/python3.8/site-packages/torch/nn/modules/module.py:1501 in _call_impl
│
│   1498 │   │   if not (self._backward_hooks or self._backward_pre_hooks or self._forward_hooks
│   1499 │   │   │   │   or _global_backward_pre_hooks or _global_backward_hooks
│   1500 │   │   │   │   or _global_forward_hooks or _global_forward_pre_hooks):
│ ❱ 1501 │   │   │   return forward_call(*args, **kwargs)
│   1502 │   │   # Do not call functions when jit is used
│   1503 │   │   full_backward_hooks, non_full_backward_hooks = [], []
│   1504 │   │   backward_pre_hooks = []
│
│ /home/yrobot/.cache/huggingface/modules/transformers_modules/modeling_chatglm.py:1160 in forward
│
│   1157 │   │   use_cache = use_cache if use_cache is not None else self.config.use_cache
│   1158 │   │   return_dict = return_dict if return_dict is not None else self.config.use_return
│   1159 │   │
│ ❱ 1160 │   │   transformer_outputs = self.transformer(
│   1161 │   │   │   input_ids=input_ids,
│   1162 │   │   │   position_ids=position_ids,
│   1163 │   │   │   attention_mask=attention_mask,
│
│ /ml/temp/envs/llm_env/lib/python3.8/site-packages/torch/nn/modules/module.py:1501 in _call_impl
│
│   1498 │   │   if not (self._backward_hooks or self._backward_pre_hooks or self._forward_hooks
│   1499 │   │   │   │   or _global_backward_pre_hooks or _global_backward_hooks
│   1500 │   │   │   │   or _global_forward_hooks or _global_forward_pre_hooks):
│ ❱ 1501 │   │   │   return forward_call(*args, **kwargs)
│   1502 │   │   # Do not call functions when jit is used
│   1503 │   │   full_backward_hooks, non_full_backward_hooks = [], []
│   1504 │   │   backward_pre_hooks = []
│
│ /home/yrobot/.cache/huggingface/modules/transformers_modules/modeling_chatglm.py:973 in forward
│
│   970 │   │   │   │   │   output_attentions
│   971 │   │   │   │   )
│   972 │   │   │   else:
│ ❱ 973 │   │   │   │   layer_ret = layer(
│   974 │   │   │   │   │   hidden_states,
│   975 │   │   │   │   │   position_ids=position_ids,
│   976 │   │   │   │   │   attention_mask=attention_mask,
│
│ /ml/temp/envs/llm_env/lib/python3.8/site-packages/torch/nn/modules/module.py:1501 in _call_impl
│
│   1498 │   │   if not (self._backward_hooks or self._backward_pre_hooks or self._forward_hooks
│   1499 │   │   │   │   or _global_backward_pre_hooks or _global_backward_hooks
│   1500 │   │   │   │   or _global_forward_hooks or _global_forward_pre_hooks):
│ ❱ 1501 │   │   │   return forward_call(*args, **kwargs)
│   1502 │   │   # Do not call functions when jit is used
│   1503 │   │   full_backward_hooks, non_full_backward_hooks = [], []
│   1504 │   │   backward_pre_hooks = []
│
│ /home/yrobot/.cache/huggingface/modules/transformers_modules/modeling_chatglm.py:614 in forward
│
│   611 │   │   attention_input = self.input_layernorm(hidden_states)
│   612 │   │
│   613 │   │   # Self attention.
│ ❱ 614 │   │   attention_outputs = self.attention(
│   615 │   │   │   attention_input,
│   616 │   │   │   position_ids,
│   617 │   │   │   attention_mask=attention_mask,
│
│ /ml/temp/envs/llm_env/lib/python3.8/site-packages/torch/nn/modules/module.py:1501 in _call_impl
│
│   1498 │   │   if not (self._backward_hooks or self._backward_pre_hooks or self._forward_hooks
│   1499 │   │   │   │   or _global_backward_pre_hooks or _global_backward_hooks
│   1500 │   │   │   │   or _global_forward_hooks or _global_forward_pre_hooks):
│ ❱ 1501 │   │   │   return forward_call(*args, **kwargs)
│   1502 │   │   # Do not call functions when jit is used
│   1503 │   │   full_backward_hooks, non_full_backward_hooks = [], []
│   1504 │   │   backward_pre_hooks = []
│
│ /home/yrobot/.cache/huggingface/modules/transformers_modules/modeling_chatglm.py:439 in forward
│
│   436 │   │   """
│   437 │   │
│   438 │   │   # [seq_len, batch, 3 * hidden_size]
│ ❱ 439 │   │   mixed_raw_layer = self.query_key_value(hidden_states)
│   440 │   │
│   441 │   │   # [seq_len, batch, 3 * hidden_size] --> [seq_len, batch, num_attention_heads, 3
│   442 │   │   new_tensor_shape = mixed_raw_layer.size()[:-1] + (
│
│ /ml/temp/envs/llm_env/lib/python3.8/site-packages/torch/nn/modules/module.py:1501 in _call_impl
│
│   1498 │   │   if not (self._backward_hooks or self._backward_pre_hooks or self._forward_hooks
│   1499 │   │   │   │   or _global_backward_pre_hooks or _global_backward_hooks
│   1500 │   │   │   │   or _global_forward_hooks or _global_forward_pre_hooks):
│ ❱ 1501 │   │   │   return forward_call(*args, **kwargs)
│   1502 │   │   # Do not call functions when jit is used
│   1503 │   │   full_backward_hooks, non_full_backward_hooks = [], []
│   1504 │   │   backward_pre_hooks = []
│
│ /ml/temp/envs/llm_env/lib/python3.8/site-packages/peft/tuners/lora.py:565 in forward
│
│   562 │   │   │   │   self.unmerge()
│   563 │   │   │   result = F.linear(x, transpose(self.weight, self.fan_in_fan_out), bias=self.
│   564 │   │   elif self.r[self.active_adapter] > 0 and not self.merged:
│ ❱ 565 │   │   │   result = F.linear(x, transpose(self.weight, self.fan_in_fan_out), bias=self.
│   566 │   │   │
│   567 │   │   │   x = x.to(self.lora_A[self.active_adapter].weight.dtype)
│   568
╰──────────────────────────────────────────────────────────────────────────────────────────────────╯
RuntimeError: self and mat2 must have the same dtype

量化后的模型在训练的时候会提示类型（dtype）不匹配，多卡训练也是同样的问题。训练时带的参数是 --quantization_bit 4。

@HarderThenHarder 麻烦问下如何解决？

shangzhensen commented 1 year ago

@HarderThenHarder 求解答

chenxyzl commented 1 year ago

@shangzhensen 请问这个问题解决了吗？我也碰到了相同的问题。

YanLingLi-AI commented 1 year ago

同求

HarderThenHarder / transformers_tasks

训练时带参数 --quantization_bit 4 报错：RuntimeError: self and mat2 must have the same dtype #62