Use the peft library to implement efficient 4-bit QLoRA fine-tuning of chatGLM-6B/chatGLM2-6B, then merge the LoRA model with the base model and quantize the merged model to 4 bits.
RuntimeError: mat1 and mat2 shapes cannot be multiplied (588x4096 and 1x9437184) when fine-tuning ChatGLM2-6B. How should the parameters be set? #37
Open
jhonffe opened 1 year ago
The parameters are as follows:
python3 train_qlora.py \
    --train_args_json chatGLM_6B_QLoRA.json \
    --model_name_or_path /mnt/disk_data/soft/text-generation-webui-main/models/ChatGLM2-6B \
    --train_data_path data/train.jsonl \
    --eval_data_path data/dev.jsonl \
    --lora_rank 4 \
    --lora_dropout 0.05 \
    --compute_dtype fp16 \
    --max_input_length 64 \
    --max_output_length 128 \
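The layer that fails in the traceback below is the 4-bit quantized query_key_value projection wrapped by a LoRA adapter. For reference, here is a minimal sketch of the QLoRA setup such a command implies, using the standard transformers/peft/bitsandbytes APIs; the actual train_qlora.py may differ, and values not present in the command (lora_alpha, target_modules, quant_type) are assumptions:

import torch
from transformers import AutoModel, AutoTokenizer, BitsAndBytesConfig
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training

model_path = "/mnt/disk_data/soft/text-generation-webui-main/models/ChatGLM2-6B"

# 4-bit quantization config for QLoRA; compute dtype matches --compute_dtype fp16.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="fp4",          # the traceback goes through dequantize_fp4
    bnb_4bit_compute_dtype=torch.float16,
)

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
model = AutoModel.from_pretrained(
    model_path,
    quantization_config=bnb_config,
    trust_remote_code=True,
    device_map="auto",                   # the accelerate hooks in the traceback come from this
)

model = prepare_model_for_kbit_training(model)

# ChatGLM2-6B fuses Q/K/V into a single "query_key_value" Linear (4096 -> 4608),
# which is the module the traceback fails in.
lora_config = LoraConfig(
    r=4,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
    target_modules=["query_key_value"],
)
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()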
bin /mnt/disk_data/soft/oobabooga_linux/installer_files/conda/lib/python3.10/site-packages/bitsandbytes/libbitsandbytes_cuda117.so
CUDA SETUP: CUDA runtime path found: /usr/local/cuda-11.7/lib64/libcudart.so.11.0
CUDA SETUP: Highest compute capability among GPUs detected: 8.0
CUDA SETUP: Detected CUDA version 117
CUDA SETUP: Loading binary /mnt/disk_data/soft/oobabooga_linux/installer_files/conda/lib/python3.10/site-packages/bitsandbytes/libbitsandbytes_cuda117.so...
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
The model weights are not tied. Please use the `tie_weights` method before using the `infer_auto_device` function.
Loading checkpoint shards: 100%|████████████████████████████████████████████████████████████████| 7/7 [00:08<00:00, 1.24s/it]
trainable params: 974,848 || all params: 3,287,312,384 || trainable%: 0.029654863491062736
Found cached dataset json (/root/.cache/huggingface/datasets/json/default-e0d9f754b035be5e/0.0.0/e347ab1c932092252e717ff3f949105a4dd28b27e842dd53157d2f72e276c2e4)
100%|██████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 321.06it/s]
Loading cached processed dataset at /root/.cache/huggingface/datasets/json/default-e0d9f754b035be5e/0.0.0/e347ab1c932092252e717ff3f949105a4dd28b27e842dd53157d2f72e276c2e4/cache-55c45c43ff9cb6e7.arrow
Loading cached shuffled indices for dataset at /root/.cache/huggingface/datasets/json/default-e0d9f754b035be5e/0.0.0/e347ab1c932092252e717ff3f949105a4dd28b27e842dd53157d2f72e276c2e4/cache-ac4d0aad0908daff.arrow
Loading cached processed dataset at /root/.cache/huggingface/datasets/json/default-e0d9f754b035be5e/0.0.0/e347ab1c932092252e717ff3f949105a4dd28b27e842dd53157d2f72e276c2e4/cache-be803c5d4b114f15.arrow
Found cached dataset json (/root/.cache/huggingface/datasets/json/default-bc6f7aa3b7d4f48f/0.0.0/e347ab1c932092252e717ff3f949105a4dd28b27e842dd53157d2f72e276c2e4)
100%|██████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 938.95it/s]
Loading cached processed dataset at /root/.cache/huggingface/datasets/json/default-bc6f7aa3b7d4f48f/0.0.0/e347ab1c932092252e717ff3f949105a4dd28b27e842dd53157d2f72e276c2e4/cache-d09ce473a7ac4a01.arrow
Loading cached shuffled indices for dataset at /root/.cache/huggingface/datasets/json/default-bc6f7aa3b7d4f48f/0.0.0/e347ab1c932092252e717ff3f949105a4dd28b27e842dd53157d2f72e276c2e4/cache-ec63b687e0fed7f9.arrow
Loading cached processed dataset at /root/.cache/huggingface/datasets/json/default-bc6f7aa3b7d4f48f/0.0.0/e347ab1c932092252e717ff3f949105a4dd28b27e842dd53157d2f72e276c2e4/cache-8d547816b9814051.arrow
  0%|          | 0/3581 [00:00<?, ?it/s]
`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...
╭─────────────────────────────── Traceback (most recent call last) ────────────────────────────────╮
│ /mnt/disk_data/LLM/chatGLM-6B-QLoRA-main/train_qlora.py:229 in <module>                           │
│ │
│ 226 │
│ 227 if __name__ == "__main__": │
│ 228 │ args = parse_args() │
│ ❱ 229 │ train(args) │
│ 230 │
│ 231 │
│ │
│ /mnt/disk_data/LLM/chatGLM-6B-QLoRA-main/train_qlora.py:223 in train │
│ │
│ 220 │ │ data_collator=data_collator │
│ 221 │ ) │
│ 222 │ │
│ ❱ 223 │ trainer.train(resume_from_checkpoint=resume_from_checkpoint) │
│ 224 │ trainer.model.save_pretrained(hf_train_args.output_dir) │
│ 225 │
│ 226 │
│ │
│ /mnt/disk_data/soft/oobabooga_linux/installer_files/conda/lib/python3.10/site-packages/transform │
│ ers/trainer.py:1645 in train │
│ │
│ 1642 │ │ inner_training_loop = find_executable_batch_size( │
│ 1643 │ │ │ self._inner_training_loop, self._train_batch_size, args.auto_find_batch_size │
│ 1644 │ │ ) │
│ ❱ 1645 │ │ return inner_training_loop( │
│ 1646 │ │ │ args=args, │
│ 1647 │ │ │ resume_from_checkpoint=resume_from_checkpoint, │
│ 1648 │ │ │ trial=trial, │
│ │
│ /mnt/disk_data/soft/oobabooga_linux/installer_files/conda/lib/python3.10/site-packages/transform │
│ ers/trainer.py:1938 in _inner_training_loop │
│ │
│ 1935 │ │ │ │ │ self.control = self.callback_handler.on_step_begin(args, self.state, │
│ 1936 │ │ │ │ │
│ 1937 │ │ │ │ with self.accelerator.accumulate(model): │
│ ❱ 1938 │ │ │ │ │ tr_loss_step = self.training_step(model, inputs) │
│ 1939 │ │ │ │ │
│ 1940 │ │ │ │ if ( │
│ 1941 │ │ │ │ │ args.logging_nan_inf_filter │
│ │
│ /mnt/disk_data/soft/oobabooga_linux/installer_files/conda/lib/python3.10/site-packages/transform │
│ ers/trainer.py:2759 in training_step │
│ │
│ 2756 │ │ │ return loss_mb.reduce_mean().detach().to(self.args.device) │
│ 2757 │ │ │
│ 2758 │ │ with self.compute_loss_context_manager(): │
│ ❱ 2759 │ │ │ loss = self.compute_loss(model, inputs) │
│ 2760 │ │ │
│ 2761 │ │ if self.args.n_gpu > 1: │
│ 2762 │ │ │ loss = loss.mean() # mean() to average on multi-gpu parallel training │
│ │
│ /mnt/disk_data/soft/oobabooga_linux/installer_files/conda/lib/python3.10/site-packages/transform │
│ ers/trainer.py:2784 in compute_loss │
│ │
│ 2781 │ │ │ labels = inputs.pop("labels") │
│ 2782 │ │ else: │
│ 2783 │ │ │ labels = None │
│ ❱ 2784 │ │ outputs = model(inputs) │
│ 2785 │ │ # Save past state if it exists │
│ 2786 │ │ # TODO: this needs to be fixed and made cleaner later. │
│ 2787 │ │ if self.args.past_index >= 0: │
│ │
│ /mnt/disk_data/soft/oobabooga_linux/installer_files/conda/lib/python3.10/site-packages/torch/nn/ │
│ modules/module.py:1501 in _call_impl │
│ │
│ 1498 │ │ if not (self._backward_hooks or self._backward_pre_hooks or self._forward_hooks │
│ 1499 │ │ │ │ or _global_backward_pre_hooks or _global_backward_hooks │
│ 1500 │ │ │ │ or _global_forward_hooks or _global_forward_pre_hooks): │
│ ❱ 1501 │ │ │ return forward_call(*args, **kwargs) │
│ 1502 │ │ # Do not call functions when jit is used │
│ 1503 │ │ full_backward_hooks, non_full_backward_hooks = [], [] │
│ 1504 │ │ backward_pre_hooks = [] │
│ │
│ /mnt/disk_data/soft/oobabooga_linux/installer_files/conda/lib/python3.10/site-packages/accelerat │
│ e/hooks.py:165 in new_forward │
│ │
│ 162 │ │ │ with torch.no_grad(): │
│ 163 │ │ │ │ output = old_forward(*args, **kwargs) │
│ 164 │ │ else: │
│ ❱ 165 │ │ │ output = old_forward(*args, **kwargs) │
│ 166 │ │ return module._hf_hook.post_forward(module, output) │
│ 167 │ │
│ 168 │ module.forward = new_forward │
│ │
│ /root/.cache/huggingface/modules/transformers_modules/ChatGLM2-6B/modeling_chatglm.py:934 in │
│ forward │
│ │
│ 931 │ │ use_cache = use_cache if use_cache is not None else self.config.use_cache │
│ 932 │ │ return_dict = return_dict if return_dict is not None else self.config.use_return │
│ 933 │ │ │
│ ❱ 934 │ │ transformer_outputs = self.transformer( │
│ 935 │ │ │ input_ids=input_ids, │
│ 936 │ │ │ position_ids=position_ids, │
│ 937 │ │ │ attention_mask=attention_mask, │
│ │
│ /mnt/disk_data/soft/oobabooga_linux/installer_files/conda/lib/python3.10/site-packages/torch/nn/ │
│ modules/module.py:1501 in _call_impl │
│ │
│ 1498 │ │ if not (self._backward_hooks or self._backward_pre_hooks or self._forward_hooks │
│ 1499 │ │ │ │ or _global_backward_pre_hooks or _global_backward_hooks │
│ 1500 │ │ │ │ or _global_forward_hooks or _global_forward_pre_hooks): │
│ ❱ 1501 │ │ │ return forward_call(*args, **kwargs) │
│ 1502 │ │ # Do not call functions when jit is used │
│ 1503 │ │ full_backward_hooks, non_full_backward_hooks = [], [] │
│ 1504 │ │ backward_pre_hooks = [] │
│ │
│ /root/.cache/huggingface/modules/transformers_modules/ChatGLM2-6B/modeling_chatglm.py:830 in │
│ forward │
│ │
│ 827 │ │ rotary_pos_emb = rotary_pos_emb.transpose(0, 1).contiguous() │
│ 828 │ │ │
│ 829 │ │ # Run encoder. │
│ ❱ 830 │ │ hidden_states, presents, all_hidden_states, all_self_attentions = self.encoder( │
│ 831 │ │ │ inputs_embeds, full_attention_mask, rotary_pos_emb=rotary_pos_emb, │
│ 832 │ │ │ kv_caches=past_key_values, use_cache=use_cache, output_hidden_states=output │
│ 833 │ │ ) │
│ │
│ /mnt/disk_data/soft/oobabooga_linux/installer_files/conda/lib/python3.10/site-packages/torch/nn/ │
│ modules/module.py:1501 in _call_impl │
│ │
│ 1498 │ │ if not (self._backward_hooks or self._backward_pre_hooks or self._forward_hooks │
│ 1499 │ │ │ │ or _global_backward_pre_hooks or _global_backward_hooks │
│ 1500 │ │ │ │ or _global_forward_hooks or _global_forward_pre_hooks): │
│ ❱ 1501 │ │ │ return forward_call(*args, **kwargs) │
│ 1502 │ │ # Do not call functions when jit is used │
│ 1503 │ │ full_backward_hooks, non_full_backward_hooks = [], [] │
│ 1504 │ │ backward_pre_hooks = [] │
│ │
│ /root/.cache/huggingface/modules/transformers_modules/ChatGLM2-6B/modeling_chatglm.py:631 in │
│ forward │
│ │
│ 628 │ │ │ │
│ 629 │ │ │ layer = self._get_layer(index) │
│ 630 │ │ │ if self.gradient_checkpointing and self.training: │
│ ❱ 631 │ │ │ │ layer_ret = torch.utils.checkpoint.checkpoint( │
│ 632 │ │ │ │ │ layer, │
│ 633 │ │ │ │ │ hidden_states, │
│ 634 │ │ │ │ │ attention_mask, │
│ │
│ /mnt/disk_data/soft/oobabooga_linux/installer_files/conda/lib/python3.10/site-packages/torch/uti │
│ ls/checkpoint.py:249 in checkpoint │
│ │
│ 246 │ │ raise ValueError("Unexpected keyword arguments: " + ",".join(arg for arg in kwar │
│ 247 │ │
│ 248 │ if use_reentrant: │
│ ❱ 249 │ │ return CheckpointFunction.apply(function, preserve, *args) │
│ 250 │ else: │
│ 251 │ │ return _checkpoint_without_reentrant( │
│ 252 │ │ │ function, │
│ │
│ /mnt/disk_data/soft/oobabooga_linux/installer_files/conda/lib/python3.10/site-packages/torch/aut │
│ ograd/function.py:506 in apply │
│ │
│ 503 │ │ if not torch._C._are_functorch_transforms_active(): │
│ 504 │ │ │ # See NOTE: [functorch vjp and autograd interaction] │
│ 505 │ │ │ args = _functorch.utils.unwrap_dead_wrappers(args) │
│ ❱ 506 │ │ │ return super().apply(*args, **kwargs) # type: ignore[misc] │
│ 507 │ │ │
│ 508 │ │ if cls.setup_context == _SingleLevelFunction.setup_context: │
│ 509 │ │ │ raise RuntimeError( │
│ │
│ /mnt/disk_data/soft/oobabooga_linux/installer_files/conda/lib/python3.10/site-packages/torch/uti │
│ ls/checkpoint.py:107 in forward │
│ │
│ 104 │ │ ctx.save_for_backward(*tensor_inputs) │
│ 105 │ │ │
│ 106 │ │ with torch.no_grad(): │
│ ❱ 107 │ │ │ outputs = run_function(*args) │
│ 108 │ │ return outputs │
│ 109 │ │
│ 110 │ @staticmethod │
│ │
│ /mnt/disk_data/soft/oobabooga_linux/installer_files/conda/lib/python3.10/site-packages/torch/nn/ │
│ modules/module.py:1501 in _call_impl │
│ │
│ 1498 │ │ if not (self._backward_hooks or self._backward_pre_hooks or self._forward_hooks │
│ 1499 │ │ │ │ or _global_backward_pre_hooks or _global_backward_hooks │
│ 1500 │ │ │ │ or _global_forward_hooks or _global_forward_pre_hooks): │
│ ❱ 1501 │ │ │ return forward_call(*args, **kwargs) │
│ 1502 │ │ # Do not call functions when jit is used │
│ 1503 │ │ full_backward_hooks, non_full_backward_hooks = [], [] │
│ 1504 │ │ backward_pre_hooks = [] │
│ │
│ /mnt/disk_data/soft/oobabooga_linux/installer_files/conda/lib/python3.10/site-packages/accelerat │
│ e/hooks.py:165 in new_forward │
│ │
│ 162 │ │ │ with torch.no_grad(): │
│ 163 │ │ │ │ output = old_forward(*args, **kwargs) │
│ 164 │ │ else: │
│ ❱ 165 │ │ │ output = old_forward(*args, **kwargs) │
│ 166 │ │ return module._hf_hook.post_forward(module, output) │
│ 167 │ │
│ 168 │ module.forward = new_forward │
│ │
│ /root/.cache/huggingface/modules/transformers_modules/ChatGLM2-6B/modeling_chatglm.py:544 in │
│ forward │
│ │
│ 541 │ │ # Layer norm at the beginning of the transformer layer. │
│ 542 │ │ layernorm_output = self.input_layernorm(hidden_states) │
│ 543 │ │ # Self attention. │
│ ❱ 544 │ │ attention_output, kv_cache = self.self_attention( │
│ 545 │ │ │ layernorm_output, │
│ 546 │ │ │ attention_mask, │
│ 547 │ │ │ rotary_pos_emb, │
│ │
│ /mnt/disk_data/soft/oobabooga_linux/installer_files/conda/lib/python3.10/site-packages/torch/nn/ │
│ modules/module.py:1501 in _call_impl │
│ │
│ 1498 │ │ if not (self._backward_hooks or self._backward_pre_hooks or self._forward_hooks │
│ 1499 │ │ │ │ or _global_backward_pre_hooks or _global_backward_hooks │
│ 1500 │ │ │ │ or _global_forward_hooks or _global_forward_pre_hooks): │
│ ❱ 1501 │ │ │ return forward_call(*args, **kwargs) │
│ 1502 │ │ # Do not call functions when jit is used │
│ 1503 │ │ full_backward_hooks, non_full_backward_hooks = [], [] │
│ 1504 │ │ backward_pre_hooks = [] │
│ │
│ /mnt/disk_data/soft/oobabooga_linux/installer_files/conda/lib/python3.10/site-packages/accelerat │
│ e/hooks.py:165 in new_forward │
│ │
│ 162 │ │ │ with torch.no_grad(): │
│ 163 │ │ │ │ output = old_forward(*args, **kwargs) │
│ 164 │ │ else: │
│ ❱ 165 │ │ │ output = old_forward(*args, **kwargs) │
│ 166 │ │ return module._hf_hook.post_forward(module, output) │
│ 167 │ │
│ 168 │ module.forward = new_forward │
│ │
│ /root/.cache/huggingface/modules/transformers_modules/ChatGLM2-6B/modeling_chatglm.py:376 in │
│ forward │
│ │
│ 373 │ │ # ===================== │
│ 374 │ │ │
│ 375 │ │ # Attention heads [sq, b, h] --> [sq, b, (np * 3 * hn)] │
│ ❱ 376 │ │ mixed_x_layer = self.query_key_value(hidden_states) │
│ 377 │ │ │
│ 378 │ │ if self.multi_query_attention: │
│ 379 │ │ │ (query_layer, key_layer, value_layer) = mixed_x_layer.split( │
│ │
│ /mnt/disk_data/soft/oobabooga_linux/installer_files/conda/lib/python3.10/site-packages/torch/nn/ │
│ modules/module.py:1501 in _call_impl │
│ │
│ 1498 │ │ if not (self._backward_hooks or self._backward_pre_hooks or self._forward_hooks │
│ 1499 │ │ │ │ or _global_backward_pre_hooks or _global_backward_hooks │
│ 1500 │ │ │ │ or _global_forward_hooks or _global_forward_pre_hooks): │
│ ❱ 1501 │ │ │ return forward_call(*args, **kwargs) │
│ 1502 │ │ # Do not call functions when jit is used │
│ 1503 │ │ full_backward_hooks, non_full_backward_hooks = [], [] │
│ 1504 │ │ backward_pre_hooks = [] │
│ │
│ /mnt/disk_data/soft/oobabooga_linux/installer_files/conda/lib/python3.10/site-packages/accelerat │
│ e/hooks.py:165 in new_forward │
│ │
│ 162 │ │ │ with torch.no_grad(): │
│ 163 │ │ │ │ output = old_forward(*args, **kwargs) │
│ 164 │ │ else: │
│ ❱ 165 │ │ │ output = old_forward(*args, **kwargs) │
│ 166 │ │ return module._hf_hook.post_forward(module, output) │
│ 167 │ │
│ 168 │ module.forward = new_forward │
│ │
│ /mnt/disk_data/soft/oobabooga_linux/installer_files/conda/lib/python3.10/site-packages/peft/tune │
│ rs/lora.py:1123 in forward │
│ │
│ 1120 │ │ │ │ self.active_adapter = adapter_name │
│ 1121 │ │ │ │
│ 1122 │ │ │ def forward(self, x: torch.Tensor): │
│ ❱ 1123 │ │ │ │ result = super().forward(x) │
│ 1124 │ │ │ │ │
│ 1125 │ │ │ │ if self.disable_adapters or self.active_adapter not in self.lora_A.keys( │
│ 1126 │ │ │ │ │ return result │
│ │
│ /mnt/disk_data/soft/oobabooga_linux/installer_files/conda/lib/python3.10/site-packages/bitsandby │
│ tes/nn/modules.py:219 in forward │
│ │
│ 216 │ │ │ x = x.to(self.compute_dtype) │
│ 217 │ │ │
│ 218 │ │ bias = None if self.bias is None else self.bias.to(self.compute_dtype) │
│ ❱ 219 │ │ out = bnb.matmul_4bit(x, self.weight.t(), bias=bias, quant_state=self.weight.qua │
│ 220 │ │ │
│ 221 │ │ out = out.to(inp_dtype) │
│ 222 │
│ │
│ /mnt/disk_data/soft/oobabooga_linux/installer_files/conda/lib/python3.10/site-packages/bitsandby │
│ tes/autograd/_functions.py:564 in matmul_4bit │
│ │
│ 561 │
│ 562 def matmul_4bit(A: tensor, B: tensor, quant_state: List, out: tensor = None, bias=None): │
│ 563 │ assert quant_state is not None │
│ ❱ 564 │ return MatMul4Bit.apply(A, B, out, bias, quant_state) │
│ 565 │
│ │
│ /mnt/disk_data/soft/oobabooga_linux/installer_files/conda/lib/python3.10/site-packages/torch/aut │
│ ograd/function.py:506 in apply │
│ │
│ 503 │ │ if not torch._C._are_functorch_transforms_active(): │
│ 504 │ │ │ # See NOTE: [functorch vjp and autograd interaction] │
│ 505 │ │ │ args = _functorch.utils.unwrap_dead_wrappers(args) │
│ ❱ 506 │ │ │ return super().apply(*args, **kwargs) # type: ignore[misc] │
│ 507 │ │ │
│ 508 │ │ if cls.setup_context == _SingleLevelFunction.setup_context: │
│ 509 │ │ │ raise RuntimeError( │
│ │
│ /mnt/disk_data/soft/oobabooga_linux/installer_files/conda/lib/python3.10/site-packages/bitsandby │
│ tes/autograd/_functions.py:512 in forward │
│ │
│ 509 │ │ │
│ 510 │ │ # 1. Dequantize │
│ 511 │ │ # 2. MatmulnN │
│ ❱ 512 │ │ output = torch.nn.functional.linear(A, F.dequantize_fp4(B, state).to(A.dtype).t( │
│ 513 │ │ │
│ 514 │ │ # 3. Save state │
│ 515 │ │ ctx.state = state │
╰──────────────────────────────────────────────────────────────────────────────────────────────────╯
RuntimeError: mat1 and mat2 shapes cannot be multiplied (588x4096 and 1x9437184)
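The numbers in the error are consistent with the packed 4-bit weight reaching the matmul without being restored to its original shape: ChatGLM2-6B's fused query_key_value projection maps 4096 -> 4608, i.e. 4608 * 4096 = 18,874,368 parameters, and bitsandbytes stores two 4-bit values per byte, giving a (9,437,184, 1) uint8 buffer whose transpose is exactly the 1x9437184 mat2 reported above. That suggests the dequantize step is returning the packed storage rather than a (4608, 4096) matrix, so it is worth verifying that the checkpoint under the --model_name_or_path directory is the full fp16 ChatGLM2-6B (not a pre-quantized int4 one) and that it was loaded through BitsAndBytesConfig(load_in_4bit=True). A quick check, as a sketch only; the attribute path below is an assumption based on modeling_chatglm.py and the usual peft wrapping, so adjust it if your object tree differs:

# Hypothetical sanity check on the layer the traceback fails in.
qkv = model.base_model.model.transformer.encoder.layers[0].self_attention.query_key_value

print(type(qkv))         # expect a 4-bit Linear (bnb Linear4bit wrapped by peft), not a plain nn.Linear
print(qkv.weight.shape)  # packed 4-bit storage, e.g. torch.Size([9437184, 1])
print(getattr(qkv.weight, "quant_state", None))
# The quant_state should not be None and should carry the original (4608, 4096) shape;
# if it does not, the weights were never quantized through load_in_4bit and the
# matmul_4bit call above will fail exactly as reported.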