⭐️ NLP Algorithms with transformers lib. Supporting Text-Classification, Text-Generation, Information-Extraction, Text-Matching, RLHF, SFT etc.
2.11k
stars
376
forks
source link
训练时带参数--quantization_bit 4 报错RuntimeError: self and mat2 must have the same dtype #62
Open
shangzhensen opened 1 year ago
╭─────────────────────────────── Traceback (most recent call last) ────────────────────────────────╮
│ /ml/szs/Project/LLM_szs/transformers_tasks/LLM/finetune/train.py:352 in <module> │
│ │
│ 349 │
│ 350 │
│ 351 if __name__ == "__main__": │
│ ❱ 352 │ main() │
│ 353 │
│ │
│ /ml/szs/Project/LLM_szs/transformers_tasks/LLM/finetune/train.py:295 in main │
│ │
│ 292 │ │ for batch in train_dataloader: │
│ 293 │ │ │ if args.use_lora: │
│ 294 │ │ │ │ with autocast(): │
│ ❱ 295 │ │ │ │ │ loss = model( │
│ 296 │ │ │ │ │ │ input_ids=batch['input_ids'].to(dtype=torch.long, device=args.de │
│ 297 │ │ │ │ │ │ labels=batch['labels'].to(dtype=torch.long, device=args.device) │
│ 298 │ │ │ │ │ ).loss │
│ │
│ /ml/temp/envs/llm_env/lib/python3.8/site-packages/torch/nn/modules/module.py:1501 in _call_impl │
│ │
│ 1498 │ │ if not (self._backward_hooks or self._backward_pre_hooks or self._forward_hooks │
│ 1499 │ │ │ │ or _global_backward_pre_hooks or _global_backward_hooks │
│ 1500 │ │ │ │ or _global_forward_hooks or _global_forward_pre_hooks): │
│ ❱ 1501 │ │ │ return forward_call(*args, **kwargs) │
│ 1502 │ │ # Do not call functions when jit is used │
│ 1503 │ │ full_backward_hooks, non_full_backward_hooks = [], [] │
│ 1504 │ │ backward_pre_hooks = [] │
│ │
│ /ml/temp/envs/llm_env/lib/python3.8/site-packages/peft/peft_model.py:678 in forward │
│ │
│ 675 │ ): │
│ 676 │ │ peft_config = self.active_peft_config │
│ 677 │ │ if not isinstance(peft_config, PromptLearningConfig): │
│ ❱ 678 │ │ │ return self.base_model( │
│ 679 │ │ │ │ input_ids=input_ids, │
│ 680 │ │ │ │ attention_mask=attention_mask, │
│ 681 │ │ │ │ inputs_embeds=inputs_embeds, │
│ │
│ /ml/temp/envs/llm_env/lib/python3.8/site-packages/torch/nn/modules/module.py:1501 in _call_impl │
│ │
│ 1498 │ │ if not (self._backward_hooks or self._backward_pre_hooks or self._forward_hooks │
│ 1499 │ │ │ │ or _global_backward_pre_hooks or _global_backward_hooks │
│ 1500 │ │ │ │ or _global_forward_hooks or _global_forward_pre_hooks): │
│ ❱ 1501 │ │ │ return forward_call(*args, **kwargs) │
│ 1502 │ │ # Do not call functions when jit is used │
│ 1503 │ │ full_backward_hooks, non_full_backward_hooks = [], [] │
│ 1504 │ │ backward_pre_hooks = [] │
│ │
│ /home/yrobot/.cache/huggingface/modules/transformers_modules/modeling_chatglm.py:1160 in forward │
│ │
│ 1157 │ │ use_cache = use_cache if use_cache is not None else self.config.use_cache │
│ 1158 │ │ return_dict = return_dict if return_dict is not None else self.config.use_return │
│ 1159 │ │ │
│ ❱ 1160 │ │ transformer_outputs = self.transformer( │
│ 1161 │ │ │ input_ids=input_ids, │
│ 1162 │ │ │ position_ids=position_ids, │
│ 1163 │ │ │ attention_mask=attention_mask, │
│ │
│ /ml/temp/envs/llm_env/lib/python3.8/site-packages/torch/nn/modules/module.py:1501 in _call_impl │
│ │
│ 1498 │ │ if not (self._backward_hooks or self._backward_pre_hooks or self._forward_hooks │
│ 1499 │ │ │ │ or _global_backward_pre_hooks or _global_backward_hooks │
│ 1500 │ │ │ │ or _global_forward_hooks or _global_forward_pre_hooks): │
│ ❱ 1501 │ │ │ return forward_call(*args, **kwargs) │
│ 1502 │ │ # Do not call functions when jit is used │
│ 1503 │ │ full_backward_hooks, non_full_backward_hooks = [], [] │
│ 1504 │ │ backward_pre_hooks = [] │
│ │
│ /home/yrobot/.cache/huggingface/modules/transformers_modules/modeling_chatglm.py:973 in forward │
│ │
│ 970 │ │ │ │ │ output_attentions │
│ 971 │ │ │ │ ) │
│ 972 │ │ │ else: │
│ ❱ 973 │ │ │ │ layer_ret = layer( │
│ 974 │ │ │ │ │ hidden_states, │
│ 975 │ │ │ │ │ position_ids=position_ids, │
│ 976 │ │ │ │ │ attention_mask=attention_mask, │
│ │
│ /ml/temp/envs/llm_env/lib/python3.8/site-packages/torch/nn/modules/module.py:1501 in _call_impl │
│ │
│ 1498 │ │ if not (self._backward_hooks or self._backward_pre_hooks or self._forward_hooks │
│ 1499 │ │ │ │ or _global_backward_pre_hooks or _global_backward_hooks │
│ 1500 │ │ │ │ or _global_forward_hooks or _global_forward_pre_hooks): │
│ ❱ 1501 │ │ │ return forward_call(*args, **kwargs) │
│ 1502 │ │ # Do not call functions when jit is used │
│ 1503 │ │ full_backward_hooks, non_full_backward_hooks = [], [] │
│ 1504 │ │ backward_pre_hooks = [] │
│ │
│ /home/yrobot/.cache/huggingface/modules/transformers_modules/modeling_chatglm.py:614 in forward │
│ │
│ 611 │ │ attention_input = self.input_layernorm(hidden_states) │
│ 612 │ │ │
│ 613 │ │ # Self attention. │
│ ❱ 614 │ │ attention_outputs = self.attention( │
│ 615 │ │ │ attention_input, │
│ 616 │ │ │ position_ids, │
│ 617 │ │ │ attention_mask=attention_mask, │
│ │
│ /ml/temp/envs/llm_env/lib/python3.8/site-packages/torch/nn/modules/module.py:1501 in _call_impl │
│ │
│ 1498 │ │ if not (self._backward_hooks or self._backward_pre_hooks or self._forward_hooks │
│ 1499 │ │ │ │ or _global_backward_pre_hooks or _global_backward_hooks │
│ 1500 │ │ │ │ or _global_forward_hooks or _global_forward_pre_hooks): │
│ ❱ 1501 │ │ │ return forward_call(*args, **kwargs) │
│ 1502 │ │ # Do not call functions when jit is used │
│ 1503 │ │ full_backward_hooks, non_full_backward_hooks = [], [] │
│ 1504 │ │ backward_pre_hooks = [] │
│ │
│ /home/yrobot/.cache/huggingface/modules/transformers_modules/modeling_chatglm.py:439 in forward │
│ │
│ 436 │ │ """ │
│ 437 │ │ │
│ 438 │ │ # [seq_len, batch, 3 * hidden_size] │
│ ❱ 439 │ │ mixed_raw_layer = self.query_key_value(hidden_states) │
│ 440 │ │ │
│ 441 │ │ # [seq_len, batch, 3 * hidden_size] --> [seq_len, batch, num_attention_heads, 3 │
│ 442 │ │ new_tensor_shape = mixed_raw_layer.size()[:-1] + ( │
│ │
│ /ml/temp/envs/llm_env/lib/python3.8/site-packages/torch/nn/modules/module.py:1501 in _call_impl │
│ │
│ 1498 │ │ if not (self._backward_hooks or self._backward_pre_hooks or self._forward_hooks │
│ 1499 │ │ │ │ or _global_backward_pre_hooks or _global_backward_hooks │
│ 1500 │ │ │ │ or _global_forward_hooks or _global_forward_pre_hooks): │
│ ❱ 1501 │ │ │ return forward_call(*args, **kwargs) │
│ 1502 │ │ # Do not call functions when jit is used │
│ 1503 │ │ full_backward_hooks, non_full_backward_hooks = [], [] │
│ 1504 │ │ backward_pre_hooks = [] │
│ │
│ /ml/temp/envs/llm_env/lib/python3.8/site-packages/peft/tuners/lora.py:565 in forward │
│ │
│ 562 │ │ │ │ self.unmerge() │
│ 563 │ │ │ result = F.linear(x, transpose(self.weight, self.fan_in_fan_out), bias=self. │
│ 564 │ │ elif self.r[self.active_adapter] > 0 and not self.merged: │
│ ❱ 565 │ │ │ result = F.linear(x, transpose(self.weight, self.fan_in_fan_out), bias=self. │
│ 566 │ │ │ │
│ 567 │ │ │ x = x.to(self.lora_A[self.active_adapter].weight.dtype) │
│ 568 │
╰──────────────────────────────────────────────────────────────────────────────────────────────────╯
RuntimeError: self and mat2 must have the same dtype
量化后的模型,训练的时候会提示类型不匹配,多卡也是同样的问题,训练时带参数--quantization_bit 4
@HarderThenHarder 麻烦问下如何解决