Open jhonffe opened 1 year ago
Loading cached processed dataset at /root/.cache/huggingface/datasets/json/default-bc6f7aa3b7d4f48f/0.0.0/e347ab1c932092252e717ff3f949105a4dd28b27e842dd53157d2f72e276c2e4/cache-8d547816b9814051.arrow
  0%|          | 0/3581 [00:00<?, ?it/s]
`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.
...
╭─────────────────────────────── Traceback (most recent call last) ────────────────────────────────╮
│ /mnt/disk_data/LLM/chatGLM-6B-QLoRA-main/train_qlora.py:229 in <module>                          │
│ │
│ 226 │
│ 227 if __name__ == "__main__": │
│ 228 │ args = parse_args() │
│ ❱ 229 │ train(args) │
│ 230 │
│ 231 │
│ │
│ /mnt/disk_data/LLM/chatGLM-6B-QLoRA-main/train_qlora.py:223 in train │
│ │
│ 220 │ │ data_collator=data_collator │
│ 221 │ ) │
│ 222 │ │
│ ❱ 223 │ trainer.train(resume_from_checkpoint=resume_from_checkpoint) │
│ 224 │ trainer.model.save_pretrained(hf_train_args.output_dir) │
│ 225 │
│ 226 │
│ │
│ /mnt/disk_data/soft/oobabooga_linux/installer_files/conda/lib/python3.10/site-packages/transform │
│ ers/trainer.py:1645 in train │
│ │
│ 1642 │ │ inner_training_loop = find_executable_batch_size( │
│ 1643 │ │ │ self._inner_training_loop, self._train_batch_size, args.auto_find_batch_size │
│ 1644 │ │ ) │
│ ❱ 1645 │ │ return inner_training_loop( │
│ 1646 │ │ │ args=args, │
│ 1647 │ │ │ resume_from_checkpoint=resume_from_checkpoint, │
│ 1648 │ │ │ trial=trial, │
│ │
│ /mnt/disk_data/soft/oobabooga_linux/installer_files/conda/lib/python3.10/site-packages/transform │
│ ers/trainer.py:1938 in _inner_training_loop │
│ │
│ 1935 │ │ │ │ │ self.control = self.callback_handler.on_step_begin(args, self.state, │
│ 1936 │ │ │ │ │
│ 1937 │ │ │ │ with self.accelerator.accumulate(model): │
│ ❱ 1938 │ │ │ │ │ tr_loss_step = self.training_step(model, inputs) │
│ 1939 │ │ │ │ │
│ 1940 │ │ │ │ if ( │
│ 1941 │ │ │ │ │ args.logging_nan_inf_filter │
│ │
│ /mnt/disk_data/soft/oobabooga_linux/installer_files/conda/lib/python3.10/site-packages/transform │
│ ers/trainer.py:2759 in training_step │
│ │
│ 2756 │ │ │ return loss_mb.reduce_mean().detach().to(self.args.device) │
│ 2757 │ │ │
│ 2758 │ │ with self.compute_loss_context_manager(): │
│ ❱ 2759 │ │ │ loss = self.compute_loss(model, inputs) │
│ 2760 │ │ │
│ 2761 │ │ if self.args.n_gpu > 1: │
│ 2762 │ │ │ loss = loss.mean() # mean() to average on multi-gpu parallel training │
│ │
│ /mnt/disk_data/soft/oobabooga_linux/installer_files/conda/lib/python3.10/site-packages/transform │
│ ers/trainer.py:2784 in compute_loss │
│ │
│ 2781 │ │ │ labels = inputs.pop("labels") │
│ 2782 │ │ else: │
│ 2783 │ │ │ labels = None │
│ ❱ 2784 │ │ outputs = model(**inputs) │
│ 2785 │ │ # Save past state if it exists │
│ 2786 │ │ # TODO: this needs to be fixed and made cleaner later. │
│ 2787 │ │ if self.args.past_index >= 0: │
│ │
│ /mnt/disk_data/soft/oobabooga_linux/installer_files/conda/lib/python3.10/site-packages/torch/nn/ │
│ modules/module.py:1501 in _call_impl │
│ │
│ 1498 │ │ if not (self._backward_hooks or self._backward_pre_hooks or self._forward_hooks │
│ 1499 │ │ │ │ or _global_backward_pre_hooks or _global_backward_hooks │
│ 1500 │ │ │ │ or _global_forward_hooks or _global_forward_pre_hooks): │
│ ❱ 1501 │ │ │ return forward_call(*args, **kwargs) │
│ 1502 │ │ # Do not call functions when jit is used │
│ 1503 │ │ full_backward_hooks, non_full_backward_hooks = [], [] │
│ 1504 │ │ backward_pre_hooks = [] │
│ │
│ /mnt/disk_data/soft/oobabooga_linux/installer_files/conda/lib/python3.10/site-packages/peft/peft │
│ _model.py:922 in forward │
│ │
│ 919 │ │ │ │ │ **kwargs, │
│ 920 │ │ │ │ ) │
│ 921 │ │ │ │
│ ❱ 922 │ │ │ return self.base_model( │
│ 923 │ │ │ │ input_ids=input_ids, │
│ 924 │ │ │ │ attention_mask=attention_mask, │
│ 925 │ │ │ │ inputs_embeds=inputs_embeds, │
│ │
│ /mnt/disk_data/soft/oobabooga_linux/installer_files/conda/lib/python3.10/site-packages/torch/nn/ │
│ modules/module.py:1501 in _call_impl │
│ │
│ 1498 │ │ if not (self._backward_hooks or self._backward_pre_hooks or self._forward_hooks │
│ 1499 │ │ │ │ or _global_backward_pre_hooks or _global_backward_hooks │
│ 1500 │ │ │ │ or _global_forward_hooks or _global_forward_pre_hooks): │
│ ❱ 1501 │ │ │ return forward_call(*args, **kwargs) │
│ 1502 │ │ # Do not call functions when jit is used │
│ 1503 │ │ full_backward_hooks, non_full_backward_hooks = [], [] │
│ 1504 │ │ backward_pre_hooks = [] │
│ │
│ /mnt/disk_data/soft/oobabooga_linux/installer_files/conda/lib/python3.10/site-packages/accelerat │
│ e/hooks.py:165 in new_forward │
│ │
│ 162 │ │ │ with torch.no_grad(): │
│ 163 │ │ │ │ output = old_forward(*args, **kwargs) │
│ 164 │ │ else: │
│ ❱ 165 │ │ │ output = old_forward(*args, **kwargs) │
│ 166 │ │ return module._hf_hook.post_forward(module, output) │
│ 167 │ │
│ 168 │ module.forward = new_forward │
│ │
│ /root/.cache/huggingface/modules/transformers_modules/ChatGLM2-6B/modeling_chatglm.py:960 in │
│ forward │
│ │
│ 957 │ │ │ shift_labels = labels[..., 1:].contiguous() │
│ 958 │ │ │ # Flatten the tokens │
│ 959 │ │ │ loss_fct = CrossEntropyLoss(ignore_index=-100) │
│ ❱ 960 │ │ │ loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.v │
│ 961 │ │ │ │
│ 962 │ │ │ lm_logits = lm_logits.to(hidden_states.dtype) │
│ 963 │ │ │ loss = loss.to(hidden_states.dtype) │
│ │
│ /mnt/disk_data/soft/oobabooga_linux/installer_files/conda/lib/python3.10/site-packages/torch/nn/ │
│ modules/module.py:1501 in _call_impl │
│ │
│ 1498 │ │ if not (self._backward_hooks or self._backward_pre_hooks or self._forward_hooks │
│ 1499 │ │ │ │ or _global_backward_pre_hooks or _global_backward_hooks │
│ 1500 │ │ │ │ or _global_forward_hooks or _global_forward_pre_hooks): │
│ ❱ 1501 │ │ │ return forward_call(*args, **kwargs) │
│ 1502 │ │ # Do not call functions when jit is used │
│ 1503 │ │ full_backward_hooks, non_full_backward_hooks = [], [] │
│ 1504 │ │ backward_pre_hooks = [] │
│ │
│ /mnt/disk_data/soft/oobabooga_linux/installer_files/conda/lib/python3.10/site-packages/torch/nn/ │
│ modules/loss.py:1174 in forward │
│ │
│ 1171 │ │ self.label_smoothing = label_smoothing │
│ 1172 │ │
│ 1173 │ def forward(self, input: Tensor, target: Tensor) -> Tensor: │
│ ❱ 1174 │ │ return F.cross_entropy(input, target, weight=self.weight, │
│ 1175 │ │ │ │ │ │ │ ignore_index=self.ignore_index, reduction=self.reduction, │
│ 1176 │ │ │ │ │ │ │ label_smoothing=self.label_smoothing) │
│ 1177 │
│ │
│ /mnt/disk_data/soft/oobabooga_linux/installer_files/conda/lib/python3.10/site-packages/torch/nn/ │
│ functional.py:3029 in cross_entropy │
│ │
│ 3026 │ │ ) │
│ 3027 │ if size_average is not None or reduce is not None: │
│ 3028 │ │ reduction = _Reduction.legacy_get_string(size_average, reduce) │
│ ❱ 3029 │ return torch._C._nn.cross_entropy_loss(input, target, weight, _Reduction.get_enum(re │
│ 3030 │
│ 3031 │
│ 3032 def binary_cross_entropy( │
╰──────────────────────────────────────────────────────────────────────────────────────────────────╯
RuntimeError: Expected all tensors to be on the same device, but found at least two devices, cuda:1 and cuda:0! (when checking
argument for argument target in method wrapper_CUDA_nll_loss_forward)
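
The failure is in the loss call quoted above from modeling_chatglm.py: the target tensor reaches cross_entropy on cuda:1 while the logits sit on cuda:0, which is what typically happens when the model is loaded with `device_map="auto"` and gets sharded across both GPUs. A minimal sketch of one common workaround (not the repository's or the model's official fix, and the helper name below is purely illustrative) is to move the shifted labels onto the logits' device right before the loss is computed; the function mirrors the block at modeling_chatglm.py:957-960 and only adds that `.to(...)` step.

```python
import torch
from torch.nn import CrossEntropyLoss


def chatglm_style_lm_loss(lm_logits: torch.Tensor, labels: torch.Tensor) -> torch.Tensor:
    """Shifted LM loss mirroring modeling_chatglm.py:957-960 from the traceback,
    with one extra step: the labels are moved onto the logits' device so the
    cross-entropy call never sees cuda:0 and cuda:1 at the same time."""
    shift_logits = lm_logits[..., :-1, :].contiguous()
    # The key change: keep the target on the same device as the input logits.
    shift_labels = labels[..., 1:].contiguous().to(shift_logits.device)
    loss_fct = CrossEntropyLoss(ignore_index=-100)
    return loss_fct(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1))
```

The alternative is to avoid sharding the model across GPUs in the first place; a sketch of that approach follows the next comment.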
Hello, have you solved this problem? I would also like to fine-tune with QLoRA on a single machine with multiple GPUs. How should the code be modified?
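
For single-machine multi-GPU QLoRA fine-tuning, one widely used pattern (sketched here under assumptions; the actual train_qlora.py in this repository may differ) is to launch one process per GPU with torchrun and load the 4-bit model entirely onto that process's GPU via `device_map={"": local_rank}`, so that data-parallel training never splits a single forward pass across cuda:0 and cuda:1. The model id and quantization settings below are placeholders, not values taken from this repository.

```python
import os
import torch
from transformers import AutoModel, BitsAndBytesConfig

# One process per GPU: torchrun sets LOCAL_RANK for each worker.
local_rank = int(os.environ.get("LOCAL_RANK", 0))

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
)

model = AutoModel.from_pretrained(
    "THUDM/chatglm2-6b",            # placeholder model id
    trust_remote_code=True,
    quantization_config=bnb_config,
    # Put the whole model on this process's GPU instead of device_map="auto",
    # so the loss never mixes tensors from different devices.
    device_map={"": local_rank},
)
```

Launched with something like `torchrun --nproc_per_node=2 train_qlora.py ...`, each rank then holds a full replica and the Hugging Face Trainer should wrap it in DistributedDataParallel, rather than model-parallel sharding that splits layers between cuda:0 and cuda:1.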