shuxueslpi / chatGLM-6B-QLoRA

Uses the peft library to perform efficient 4-bit QLoRA fine-tuning of chatGLM-6B/chatGLM2-6B, and to merge the LoRA model into the base model and quantize the merged model to 4 bits.
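For orientation, the merge-and-quantize step mentioned above typically looks like the following minimal sketch (illustrative only, using the public peft/transformers APIs; the checkpoint and output paths are placeholders, not the repo's actual scripts):

```python
# Minimal sketch: fold a trained LoRA adapter into the base model, then reload it in 4-bit.
# Paths are placeholders; this is not copied from the repo's merge/quantize scripts.
import torch
from transformers import AutoModel, BitsAndBytesConfig
from peft import PeftModel

# 1) Merge: load the base model in fp16 and fold the adapter weights into it.
base = AutoModel.from_pretrained(
    "THUDM/chatglm2-6b", torch_dtype=torch.float16, trust_remote_code=True
)
merged = PeftModel.from_pretrained(base, "saved_files/lora_adapter").merge_and_unload()
merged.save_pretrained("saved_files/merged_model")

# 2) Quantize: reload the merged checkpoint with 4-bit bitsandbytes weights for inference.
quantized = AutoModel.from_pretrained(
    "saved_files/merged_model",
    quantization_config=BitsAndBytesConfig(load_in_4bit=True),
    trust_remote_code=True,
)
```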

RuntimeError: mat1 and mat2 shapes cannot be multiplied (588x4096 and 1x9437184) when fine-tuning ChatGLM2-6B. How should the parameters be set? #37

Open jhonffe opened 1 year ago

jhonffe commented 1 year ago

The sh parameters are as follows:

python3 train_qlora.py \
    --train_args_json chatGLM_6B_QLoRA.json \
    --model_name_or_path /mnt/disk_data/soft/text-generation-webui-main/models/ChatGLM2-6B \
    --train_data_path data/train.jsonl \
    --eval_data_path data/dev.jsonl \
    --lora_rank 4 \
    --lora_dropout 0.05 \
    --compute_dtype fp16 \
    --max_input_length 64 \
    --max_output_length 128 \
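With these flags, the script presumably ends up building a 4-bit model load plus a LoRA configuration roughly like the sketch below (an assumption for reference, not the actual code of train_qlora.py; the lora_alpha value and the target module name are guesses based on library defaults and on the traceback further down):

```python
# Assumed shape of the configuration implied by the command-line flags above (not repo code).
import torch
from transformers import AutoModel, BitsAndBytesConfig
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.float16,  # --compute_dtype fp16
)
model = AutoModel.from_pretrained(
    "/mnt/disk_data/soft/text-generation-webui-main/models/ChatGLM2-6B",  # --model_name_or_path
    quantization_config=bnb_config,
    trust_remote_code=True,
)
model = prepare_model_for_kbit_training(model)

lora_config = LoraConfig(
    r=4,                                 # --lora_rank 4
    lora_dropout=0.05,                   # --lora_dropout 0.05
    lora_alpha=32,                       # assumed; not set on the command line
    target_modules=["query_key_value"],  # the layer name that later appears in the traceback
    task_type="CAUSAL_LM",
)
model = get_peft_model(model, lora_config)
```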

bin /mnt/disk_data/soft/oobabooga_linux/installer_files/conda/lib/python3.10/site-packages/bitsandbytes/libbitsandbytes_cuda117.so
CUDA SETUP: CUDA runtime path found: /usr/local/cuda-11.7/lib64/libcudart.so.11.0
CUDA SETUP: Highest compute capability among GPUs detected: 8.0
CUDA SETUP: Detected CUDA version 117
CUDA SETUP: Loading binary /mnt/disk_data/soft/oobabooga_linux/installer_files/conda/lib/python3.10/site-packages/bitsandbytes/libbitsandbytes_cuda117.so...
Using the WANDB_DISABLED environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
The model weights are not tied. Please use the tie_weights method before using the infer_auto_device function.
Loading checkpoint shards: 100%|████████| 7/7 [00:08<00:00, 1.24s/it]
trainable params: 974,848 || all params: 3,287,312,384 || trainable%: 0.029654863491062736
Found cached dataset json (/root/.cache/huggingface/datasets/json/default-e0d9f754b035be5e/0.0.0/e347ab1c932092252e717ff3f949105a4dd28b27e842dd53157d2f72e276c2e4)
100%|████████| 1/1 [00:00<00:00, 321.06it/s]
Loading cached processed dataset at /root/.cache/huggingface/datasets/json/default-e0d9f754b035be5e/0.0.0/e347ab1c932092252e717ff3f949105a4dd28b27e842dd53157d2f72e276c2e4/cache-55c45c43ff9cb6e7.arrow
Loading cached shuffled indices for dataset at /root/.cache/huggingface/datasets/json/default-e0d9f754b035be5e/0.0.0/e347ab1c932092252e717ff3f949105a4dd28b27e842dd53157d2f72e276c2e4/cache-ac4d0aad0908daff.arrow
Loading cached processed dataset at /root/.cache/huggingface/datasets/json/default-e0d9f754b035be5e/0.0.0/e347ab1c932092252e717ff3f949105a4dd28b27e842dd53157d2f72e276c2e4/cache-be803c5d4b114f15.arrow
Found cached dataset json (/root/.cache/huggingface/datasets/json/default-bc6f7aa3b7d4f48f/0.0.0/e347ab1c932092252e717ff3f949105a4dd28b27e842dd53157d2f72e276c2e4)
100%|████████| 1/1 [00:00<00:00, 938.95it/s]
Loading cached processed dataset at /root/.cache/huggingface/datasets/json/default-bc6f7aa3b7d4f48f/0.0.0/e347ab1c932092252e717ff3f949105a4dd28b27e842dd53157d2f72e276c2e4/cache-d09ce473a7ac4a01.arrow
Loading cached shuffled indices for dataset at /root/.cache/huggingface/datasets/json/default-bc6f7aa3b7d4f48f/0.0.0/e347ab1c932092252e717ff3f949105a4dd28b27e842dd53157d2f72e276c2e4/cache-ec63b687e0fed7f9.arrow
Loading cached processed dataset at /root/.cache/huggingface/datasets/json/default-bc6f7aa3b7d4f48f/0.0.0/e347ab1c932092252e717ff3f949105a4dd28b27e842dd53157d2f72e276c2e4/cache-8d547816b9814051.arrow
  0%|          | 0/3581 [00:00<?, ?it/s]
use_cache=True is incompatible with gradient checkpointing. Setting use_cache=False...
Traceback (most recent call last):
  File "/mnt/disk_data/LLM/chatGLM-6B-QLoRA-main/train_qlora.py", line 229, in <module>
    train(args)
  File "/mnt/disk_data/LLM/chatGLM-6B-QLoRA-main/train_qlora.py", line 223, in train
    trainer.train(resume_from_checkpoint=resume_from_checkpoint)
  File "/mnt/disk_data/soft/oobabooga_linux/installer_files/conda/lib/python3.10/site-packages/transformers/trainer.py", line 1645, in train
    return inner_training_loop(
  File "/mnt/disk_data/soft/oobabooga_linux/installer_files/conda/lib/python3.10/site-packages/transformers/trainer.py", line 1938, in _inner_training_loop
    tr_loss_step = self.training_step(model, inputs)
  File "/mnt/disk_data/soft/oobabooga_linux/installer_files/conda/lib/python3.10/site-packages/transformers/trainer.py", line 2759, in training_step
    loss = self.compute_loss(model, inputs)
  File "/mnt/disk_data/soft/oobabooga_linux/installer_files/conda/lib/python3.10/site-packages/transformers/trainer.py", line 2784, in compute_loss
    outputs = model(**inputs)
  File "/mnt/disk_data/soft/oobabooga_linux/installer_files/conda/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1501, in _call_impl
    return forward_call(*args, **kwargs)
  File "/mnt/disk_data/soft/oobabooga_linux/installer_files/conda/lib/python3.10/site-packages/accelerate/hooks.py", line 165, in new_forward
    output = old_forward(*args, **kwargs)
  File "/root/.cache/huggingface/modules/transformers_modules/ChatGLM2-6B/modeling_chatglm.py", line 934, in forward
    transformer_outputs = self.transformer(
  File "/mnt/disk_data/soft/oobabooga_linux/installer_files/conda/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1501, in _call_impl
    return forward_call(*args, **kwargs)
  File "/root/.cache/huggingface/modules/transformers_modules/ChatGLM2-6B/modeling_chatglm.py", line 830, in forward
    hidden_states, presents, all_hidden_states, all_self_attentions = self.encoder(
  File "/mnt/disk_data/soft/oobabooga_linux/installer_files/conda/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1501, in _call_impl
    return forward_call(*args, **kwargs)
  File "/root/.cache/huggingface/modules/transformers_modules/ChatGLM2-6B/modeling_chatglm.py", line 631, in forward
    layer_ret = torch.utils.checkpoint.checkpoint(
  File "/mnt/disk_data/soft/oobabooga_linux/installer_files/conda/lib/python3.10/site-packages/torch/utils/checkpoint.py", line 249, in checkpoint
    return CheckpointFunction.apply(function, preserve, *args)
  File "/mnt/disk_data/soft/oobabooga_linux/installer_files/conda/lib/python3.10/site-packages/torch/autograd/function.py", line 506, in apply
    return super().apply(*args, **kwargs)  # type: ignore[misc]
  File "/mnt/disk_data/soft/oobabooga_linux/installer_files/conda/lib/python3.10/site-packages/torch/utils/checkpoint.py", line 107, in forward
    outputs = run_function(*args)
  File "/mnt/disk_data/soft/oobabooga_linux/installer_files/conda/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1501, in _call_impl
    return forward_call(*args, **kwargs)
  File "/mnt/disk_data/soft/oobabooga_linux/installer_files/conda/lib/python3.10/site-packages/accelerate/hooks.py", line 165, in new_forward
    output = old_forward(*args, **kwargs)
  File "/root/.cache/huggingface/modules/transformers_modules/ChatGLM2-6B/modeling_chatglm.py", line 544, in forward
    attention_output, kv_cache = self.self_attention(
  File "/mnt/disk_data/soft/oobabooga_linux/installer_files/conda/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1501, in _call_impl
    return forward_call(*args, **kwargs)
  File "/mnt/disk_data/soft/oobabooga_linux/installer_files/conda/lib/python3.10/site-packages/accelerate/hooks.py", line 165, in new_forward
    output = old_forward(*args, **kwargs)
  File "/root/.cache/huggingface/modules/transformers_modules/ChatGLM2-6B/modeling_chatglm.py", line 376, in forward
    mixed_x_layer = self.query_key_value(hidden_states)
  File "/mnt/disk_data/soft/oobabooga_linux/installer_files/conda/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1501, in _call_impl
    return forward_call(*args, **kwargs)
  File "/mnt/disk_data/soft/oobabooga_linux/installer_files/conda/lib/python3.10/site-packages/accelerate/hooks.py", line 165, in new_forward
    output = old_forward(*args, **kwargs)
  File "/mnt/disk_data/soft/oobabooga_linux/installer_files/conda/lib/python3.10/site-packages/peft/tuners/lora.py", line 1123, in forward
    result = super().forward(x)
  File "/mnt/disk_data/soft/oobabooga_linux/installer_files/conda/lib/python3.10/site-packages/bitsandbytes/nn/modules.py", line 219, in forward
    out = bnb.matmul_4bit(x, self.weight.t(), bias=bias, quant_state=self.weight.quant_state)
  File "/mnt/disk_data/soft/oobabooga_linux/installer_files/conda/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py", line 564, in matmul_4bit
    return MatMul4Bit.apply(A, B, out, bias, quant_state)
  File "/mnt/disk_data/soft/oobabooga_linux/installer_files/conda/lib/python3.10/site-packages/torch/autograd/function.py", line 506, in apply
    return super().apply(*args, **kwargs)  # type: ignore[misc]
  File "/mnt/disk_data/soft/oobabooga_linux/installer_files/conda/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py", line 512, in forward
    output = torch.nn.functional.linear(A, F.dequantize_fp4(B, state).to(A.dtype).t(), bias)
RuntimeError: mat1 and mat2 shapes cannot be multiplied (588x4096 and 1x9437184)
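The failure happens inside the 4-bit query_key_value projection that the LoRA adapter wraps: mat1 (588x4096) is the flattened batch-by-sequence input with hidden size 4096, while mat2 (1x9437184) matches the packed 4-bit storage of a (4608, 4096) query_key_value weight (4608 × 4096 / 2 = 9,437,184 packed bytes), i.e. the weight reaches the matmul without being dequantized back to its original shape through its quant_state. A hedged diagnostic sketch for confirming this on the loaded model follows (it assumes `model` is the PEFT-wrapped, 4-bit-loaded ChatGLM2-6B from train_qlora.py; the usual first step reported for this class of error is aligning the bitsandbytes/peft/transformers versions with the repo's requirements rather than changing the training flags):

```python
# Hypothetical diagnostic: inspect the 4-bit query_key_value layers the traceback points at.
# Assumes `model` is the PEFT-wrapped, 4-bit-loaded ChatGLM2-6B from train_qlora.py.
import bitsandbytes as bnb

for name, module in model.named_modules():
    if isinstance(module, bnb.nn.Linear4bit) and "query_key_value" in name:
        print(
            name,
            "packed weight shape:", tuple(module.weight.shape),   # packed 4-bit storage, e.g. (9437184, 1)
            "quant_state present:", module.weight.quant_state is not None,
        )
```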

L3LeTrigger-F commented 11 months ago

I had the same problem