Open jhonffe opened 1 year ago
Loading cached processed dataset at /root/.cache/huggingface/datasets/json/default-bc6f7aa3b7d4f48f/0.0.0/e347ab1c932092252e717ff3f949105a4dd28b27e842dd53157d2f72e276c2e4/cache-8d547816b9814051.arrow
  0%|          | 0/3581 [00:00<?, ?it/s]
`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.
...
╭─────────────────────────────── Traceback (most recent call last) ────────────────────────────────╮
│ /mnt/disk_data/LLM/chatGLM-6B-QLoRA-main/train_qlora.py:229 in <module>                          │
│ │
│ 226 │
│ 227 if __name__ == "__main__": │
│ 228 │ args = parse_args() │
│ ❱ 229 │ train(args) │
│ 230 │
│ 231 │
│ │
│ /mnt/disk_data/LLM/chatGLM-6B-QLoRA-main/train_qlora.py:223 in train │
│ │
│ 220 │ │ data_collator=data_collator │
│ 221 │ ) │
│ 222 │ │
│ ❱ 223 │ trainer.train(resume_from_checkpoint=resume_from_checkpoint) │
│ 224 │ trainer.model.save_pretrained(hf_train_args.output_dir) │
│ 225 │
│ 226 │
│ │
│ /mnt/disk_data/soft/oobabooga_linux/installer_files/conda/lib/python3.10/site-packages/transform │
│ ers/trainer.py:1645 in train │
│ │
│ 1642 │ │ inner_training_loop = find_executable_batch_size( │
│ 1643 │ │ │ self._inner_training_loop, self._train_batch_size, args.auto_find_batch_size │
│ 1644 │ │ ) │
│ ❱ 1645 │ │ return inner_training_loop( │
│ 1646 │ │ │ args=args, │
│ 1647 │ │ │ resume_from_checkpoint=resume_from_checkpoint, │
│ 1648 │ │ │ trial=trial, │
│ │
│ /mnt/disk_data/soft/oobabooga_linux/installer_files/conda/lib/python3.10/site-packages/transform │
│ ers/trainer.py:1938 in _inner_training_loop │
│ │
│ 1935 │ │ │ │ │ self.control = self.callback_handler.on_step_begin(args, self.state, │
│ 1936 │ │ │ │ │
│ 1937 │ │ │ │ with self.accelerator.accumulate(model): │
│ ❱ 1938 │ │ │ │ │ tr_loss_step = self.training_step(model, inputs) │
│ 1939 │ │ │ │ │
│ 1940 │ │ │ │ if ( │
│ 1941 │ │ │ │ │ args.logging_nan_inf_filter │
│ │
│ /mnt/disk_data/soft/oobabooga_linux/installer_files/conda/lib/python3.10/site-packages/transform │
│ ers/trainer.py:2759 in training_step │
│ │
│ 2756 │ │ │ return loss_mb.reduce_mean().detach().to(self.args.device) │
│ 2757 │ │ │
│ 2758 │ │ with self.compute_loss_context_manager(): │
│ ❱ 2759 │ │ │ loss = self.compute_loss(model, inputs) │
│ 2760 │ │ │
│ 2761 │ │ if self.args.n_gpu > 1: │
│ 2762 │ │ │ loss = loss.mean() # mean() to average on multi-gpu parallel training │
│ │
│ /mnt/disk_data/soft/oobabooga_linux/installer_files/conda/lib/python3.10/site-packages/transform │
│ ers/trainer.py:2784 in compute_loss │
│ │
│ 2781 │ │ │ labels = inputs.pop("labels") │
│ 2782 │ │ else: │
│ 2783 │ │ │ labels = None │
│ ❱ 2784 │ │ outputs = model(**inputs) │
│ 2785 │ │ # Save past state if it exists │
│ 2786 │ │ # TODO: this needs to be fixed and made cleaner later. │
│ 2787 │ │ if self.args.past_index >= 0: │
│ │
│ /mnt/disk_data/soft/oobabooga_linux/installer_files/conda/lib/python3.10/site-packages/torch/nn/ │
│ modules/module.py:1501 in _call_impl │
│ │
│ 1498 │ │ if not (self._backward_hooks or self._backward_pre_hooks or self._forward_hooks │
│ 1499 │ │ │ │ or _global_backward_pre_hooks or _global_backward_hooks │
│ 1500 │ │ │ │ or _global_forward_hooks or _global_forward_pre_hooks): │
│ ❱ 1501 │ │ │ return forward_call(*args, **kwargs) │
│ 1502 │ │ # Do not call functions when jit is used │
│ 1503 │ │ full_backward_hooks, non_full_backward_hooks = [], [] │
│ 1504 │ │ backward_pre_hooks = [] │
│ │
│ /mnt/disk_data/soft/oobabooga_linux/installer_files/conda/lib/python3.10/site-packages/peft/peft │
│ _model.py:922 in forward │
│ │
│ 919 │ │ │ │ │ **kwargs, │
│ 920 │ │ │ │ ) │
│ 921 │ │ │ │
│ ❱ 922 │ │ │ return self.base_model( │
│ 923 │ │ │ │ input_ids=input_ids, │
│ 924 │ │ │ │ attention_mask=attention_mask, │
│ 925 │ │ │ │ inputs_embeds=inputs_embeds, │
│ │
│ /mnt/disk_data/soft/oobabooga_linux/installer_files/conda/lib/python3.10/site-packages/torch/nn/ │
│ modules/module.py:1501 in _call_impl │
│ │
│ 1498 │ │ if not (self._backward_hooks or self._backward_pre_hooks or self._forward_hooks │
│ 1499 │ │ │ │ or _global_backward_pre_hooks or _global_backward_hooks │
│ 1500 │ │ │ │ or _global_forward_hooks or _global_forward_pre_hooks): │
│ ❱ 1501 │ │ │ return forward_call(*args, **kwargs) │
│ 1502 │ │ # Do not call functions when jit is used │
│ 1503 │ │ full_backward_hooks, non_full_backward_hooks = [], [] │
│ 1504 │ │ backward_pre_hooks = [] │
│ │
│ /mnt/disk_data/soft/oobabooga_linux/installer_files/conda/lib/python3.10/site-packages/accelerat │
│ e/hooks.py:165 in new_forward │
│ │
│ 162 │ │ │ with torch.no_grad(): │
│ 163 │ │ │ │ output = old_forward(*args, **kwargs) │
│ 164 │ │ else: │
│ ❱ 165 │ │ │ output = old_forward(*args, **kwargs) │
│ 166 │ │ return module._hf_hook.post_forward(module, output) │
│ 167 │ │
│ 168 │ module.forward = new_forward │
│ │
│ /root/.cache/huggingface/modules/transformers_modules/ChatGLM2-6B/modeling_chatglm.py:960 in │
│ forward │
│ │
│ 957 │ │ │ shift_labels = labels[..., 1:].contiguous() │
│ 958 │ │ │ # Flatten the tokens │
│ 959 │ │ │ loss_fct = CrossEntropyLoss(ignore_index=-100) │
│ ❱ 960 │ │ │ loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.v │
│ 961 │ │ │ │
│ 962 │ │ │ lm_logits = lm_logits.to(hidden_states.dtype) │
│ 963 │ │ │ loss = loss.to(hidden_states.dtype) │
│ │
│ /mnt/disk_data/soft/oobabooga_linux/installer_files/conda/lib/python3.10/site-packages/torch/nn/ │
│ modules/module.py:1501 in _call_impl │
│ │
│ 1498 │ │ if not (self._backward_hooks or self._backward_pre_hooks or self._forward_hooks │
│ 1499 │ │ │ │ or _global_backward_pre_hooks or _global_backward_hooks │
│ 1500 │ │ │ │ or _global_forward_hooks or _global_forward_pre_hooks): │
│ ❱ 1501 │ │ │ return forward_call(*args, **kwargs) │
│ 1502 │ │ # Do not call functions when jit is used │
│ 1503 │ │ full_backward_hooks, non_full_backward_hooks = [], [] │
│ 1504 │ │ backward_pre_hooks = [] │
│ │
│ /mnt/disk_data/soft/oobabooga_linux/installer_files/conda/lib/python3.10/site-packages/torch/nn/ │
│ modules/loss.py:1174 in forward │
│ │
│ 1171 │ │ self.label_smoothing = label_smoothing │
│ 1172 │ │
│ 1173 │ def forward(self, input: Tensor, target: Tensor) -> Tensor: │
│ ❱ 1174 │ │ return F.cross_entropy(input, target, weight=self.weight, │
│ 1175 │ │ │ │ │ │ │ ignore_index=self.ignore_index, reduction=self.reduction, │
│ 1176 │ │ │ │ │ │ │ label_smoothing=self.label_smoothing) │
│ 1177 │
│ │
│ /mnt/disk_data/soft/oobabooga_linux/installer_files/conda/lib/python3.10/site-packages/torch/nn/ │
│ functional.py:3029 in cross_entropy │
│ │
│ 3026 │ │ ) │
│ 3027 │ if size_average is not None or reduce is not None: │
│ 3028 │ │ reduction = _Reduction.legacy_get_string(size_average, reduce) │
│ ❱ 3029 │ return torch._C._nn.cross_entropy_loss(input, target, weight, _Reduction.get_enum(re │
│ 3030 │
│ 3031 │
│ 3032 def binary_cross_entropy( │
╰──────────────────────────────────────────────────────────────────────────────────────────────────╯
RuntimeError: Expected all tensors to be on the same device, but found at least two devices, cuda:1 and cuda:0! (when checking
argument for argument target in method wrapper_CUDA_nll_loss_forward)
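
The failure is in the loss call quoted above from modeling_chatglm.py: the target tensor reaches cross_entropy on cuda:1 while the logits sit on cuda:0, which is what typically happens when the model is loaded with `device_map="auto"` and gets sharded across both GPUs. A minimal sketch of one common workaround (not the repository's or the model's official fix, and the helper name below is purely illustrative) is to move the shifted labels onto the logits' device right before the loss is computed; the function mirrors the block at modeling_chatglm.py:957-960 and only adds that `.to(...)` step.

```python
import torch
from torch.nn import CrossEntropyLoss


def chatglm_style_lm_loss(lm_logits: torch.Tensor, labels: torch.Tensor) -> torch.Tensor:
    """Shifted LM loss mirroring modeling_chatglm.py:957-960 from the traceback,
    with one extra step: the labels are moved onto the logits' device so the
    cross-entropy call never sees cuda:0 and cuda:1 at the same time."""
    shift_logits = lm_logits[..., :-1, :].contiguous()
    # The key change: keep the target on the same device as the input logits.
    shift_labels = labels[..., 1:].contiguous().to(shift_logits.device)
    loss_fct = CrossEntropyLoss(ignore_index=-100)
    return loss_fct(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1))
```

The alternative is to avoid sharding the model across GPUs in the first place; a sketch of that approach follows the next comment.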
Hello, have you solved this problem? I would also like to fine-tune with QLoRA on a single machine with multiple GPUs. How should the code be modified?
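
For single-machine multi-GPU QLoRA fine-tuning, one widely used pattern (sketched here under assumptions; the actual train_qlora.py in this repository may differ) is to launch one process per GPU with torchrun and load the 4-bit model entirely onto that process's GPU via `device_map={"": local_rank}`, so that data-parallel training never splits a single forward pass across cuda:0 and cuda:1. The model id and quantization settings below are placeholders, not values taken from this repository.

```python
import os
import torch
from transformers import AutoModel, BitsAndBytesConfig

# One process per GPU: torchrun sets LOCAL_RANK for each worker.
local_rank = int(os.environ.get("LOCAL_RANK", 0))

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
)

model = AutoModel.from_pretrained(
    "THUDM/chatglm2-6b",            # placeholder model id
    trust_remote_code=True,
    quantization_config=bnb_config,
    # Put the whole model on this process's GPU instead of device_map="auto",
    # so the loss never mixes tensors from different devices.
    device_map={"": local_rank},
)
```

Launched with something like `torchrun --nproc_per_node=2 train_qlora.py ...`, each rank then holds a full replica and the Hugging Face Trainer should wrap it in DistributedDataParallel, rather than model-parallel sharding that splits layers between cuda:0 and cuda:1.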