Closed: pipiyapi closed this issue 4 months ago.
Does this code support distributed training? I get an error when I try to run it on two GPUs, and I would appreciate some help. The launch script is:

export WANDB_DISABLED=true
wandb offline
CUDA_VISIBLE_DEVICES=0,1 python finetune_kopa.py \
    --base_model 'llama-7b' \
    --data_path 'data/CoDeX-S-train.json' \
    --output_dir 'train-adapter' \
    --num_epochs 3 \
    --lora_r 64 \
    --learning_rate 3e-4 \
    --batch_size 12 \
    --micro_batch_size 12 \
    --num_prefix 1 \
    --kge_model 'data/CoDeX-S-rotate.pth' \
    --lora_target_modules='[q_proj,k_proj,v_proj,o_proj]'

The error is:

Traceback (most recent call last):
  File "/home/jupyter-xingcheng/KoPA/finetune_kopa.py", line 278, in <module>
    fire.Fire(train)
  File "/home/jupyter-xingcheng/.conda/envs/KoPA/lib/python3.9/site-packages/fire/core.py", line 143, in Fire
    component_trace = _Fire(component, args, parsed_flag_args, context, name)
  File "/home/jupyter-xingcheng/.conda/envs/KoPA/lib/python3.9/site-packages/fire/core.py", line 477, in _Fire
    component, remaining_args = _CallAndUpdateTrace(
  File "/home/jupyter-xingcheng/.conda/envs/KoPA/lib/python3.9/site-packages/fire/core.py", line 693, in _CallAndUpdateTrace
    component = fn(*varargs, **kwargs)
  File "/home/jupyter-xingcheng/KoPA/finetune_kopa.py", line 267, in train
    trainer.train(resume_from_checkpoint=resume_from_checkpoint)
  File "/home/jupyter-xingcheng/.conda/envs/KoPA/lib/python3.9/site-packages/transformers/trainer.py", line 1885, in train
    return inner_training_loop(
  File "/home/jupyter-xingcheng/.conda/envs/KoPA/lib/python3.9/site-packages/transformers/trainer.py", line 2216, in _inner_training_loop
    tr_loss_step = self.training_step(model, inputs)
  File "/home/jupyter-xingcheng/.conda/envs/KoPA/lib/python3.9/site-packages/transformers/trainer.py", line 3238, in training_step
    loss = self.compute_loss(model, inputs)
  File "/home/jupyter-xingcheng/.conda/envs/KoPA/lib/python3.9/site-packages/transformers/trainer.py", line 3264, in compute_loss
    outputs = model(**inputs)
  File "/home/jupyter-xingcheng/.conda/envs/KoPA/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1501, in _call_impl
    return forward_call(*args, **kwargs)
  File "/home/jupyter-xingcheng/.conda/envs/KoPA/lib/python3.9/site-packages/torch/nn/parallel/data_parallel.py", line 171, in forward
    outputs = self.parallel_apply(replicas, inputs, kwargs)
  File "/home/jupyter-xingcheng/.conda/envs/KoPA/lib/python3.9/site-packages/torch/nn/parallel/data_parallel.py", line 181, in parallel_apply
    return parallel_apply(replicas, inputs, kwargs, self.device_ids[:len(replicas)])
  File "/home/jupyter-xingcheng/.conda/envs/KoPA/lib/python3.9/site-packages/torch/nn/parallel/parallel_apply.py", line 89, in parallel_apply
    output.reraise()
  File "/home/jupyter-xingcheng/.conda/envs/KoPA/lib/python3.9/site-packages/torch/_utils.py", line 644, in reraise
    raise exception
RuntimeError: Caught RuntimeError in replica 0 on device 0.

Original Traceback (most recent call last):
  File "/home/jupyter-xingcheng/.conda/envs/KoPA/lib/python3.9/site-packages/torch/nn/parallel/parallel_apply.py", line 64, in _worker
    output = module(*input, **kwargs)
  File "/home/jupyter-xingcheng/.conda/envs/KoPA/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1501, in _call_impl
    return forward_call(*args, **kwargs)
  File "/home/jupyter-xingcheng/KoPA/kopa.py", line 106, in forward
    return self.llama_model(
  File "/home/jupyter-xingcheng/.conda/envs/KoPA/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1501, in _call_impl
    return forward_call(*args, **kwargs)
  File "/home/jupyter-xingcheng/.conda/envs/KoPA/lib/python3.9/site-packages/peft/peft_model.py", line 678, in forward
    return self.base_model(
  File "/home/jupyter-xingcheng/.conda/envs/KoPA/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1501, in _call_impl
    return forward_call(*args, **kwargs)
  File "/home/jupyter-xingcheng/.conda/envs/KoPA/lib/python3.9/site-packages/accelerate/hooks.py", line 166, in new_forward
    output = module._old_forward(*args, **kwargs)
  File "/home/jupyter-xingcheng/.conda/envs/KoPA/lib/python3.9/site-packages/transformers/models/llama/modeling_llama.py", line 1164, in forward
    outputs = self.model(
  File "/home/jupyter-xingcheng/.conda/envs/KoPA/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1501, in _call_impl
    return forward_call(*args, **kwargs)
  File "/home/jupyter-xingcheng/.conda/envs/KoPA/lib/python3.9/site-packages/transformers/models/llama/modeling_llama.py", line 968, in forward
    layer_outputs = decoder_layer(
  File "/home/jupyter-xingcheng/.conda/envs/KoPA/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1501, in _call_impl
    return forward_call(*args, **kwargs)
  File "/home/jupyter-xingcheng/.conda/envs/KoPA/lib/python3.9/site-packages/accelerate/hooks.py", line 166, in new_forward
    output = module._old_forward(*args, **kwargs)
  File "/home/jupyter-xingcheng/.conda/envs/KoPA/lib/python3.9/site-packages/transformers/models/llama/modeling_llama.py", line 713, in forward
    hidden_states, self_attn_weights, present_key_value = self.self_attn(
  File "/home/jupyter-xingcheng/.conda/envs/KoPA/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1501, in _call_impl
    return forward_call(*args, **kwargs)
  File "/home/jupyter-xingcheng/.conda/envs/KoPA/lib/python3.9/site-packages/accelerate/hooks.py", line 166, in new_forward
    output = module._old_forward(*args, **kwargs)
  File "/home/jupyter-xingcheng/.conda/envs/KoPA/lib/python3.9/site-packages/transformers/models/llama/modeling_llama.py", line 327, in forward
    query_states = self.q_proj(hidden_states)
  File "/home/jupyter-xingcheng/.conda/envs/KoPA/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1501, in _call_impl
    return forward_call(*args, **kwargs)
  File "/home/jupyter-xingcheng/.conda/envs/KoPA/lib/python3.9/site-packages/peft/tuners/lora.py", line 565, in forward
    result = F.linear(x, transpose(self.weight, self.fan_in_fan_out), bias=self.bias)
RuntimeError: expected scalar type Float but found Half

Hi, this is probably a common problem when doing distributed training with LoRA. I haven't tried multi-GPU parallel training myself, so I don't know how to solve it.
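For background, the data_parallel.py frames in the traceback show what is going on: launched with plain python and two visible GPUs, the transformers Trainer wraps the model in torch.nn.DataParallel, and inside each replica half-precision and full-precision tensors appear to meet in F.linear (the base model is loaded in fp16 while part of the LoRA path stays fp32), hence the Float/Half mismatch. One commonly cited workaround, shown below as a minimal sketch, is to run the training step under CUDA autocast so dtypes are reconciled automatically; the trainer object is assumed to be the transformers.Trainer built in finetune_kopa.py, and this is an illustration, not code from the repo:

# Hypothetical workaround sketch: run training under CUDA autocast so
# activations are cast to fp16 before hitting the half-precision weights.
# Assumes `trainer` is the transformers.Trainer from finetune_kopa.py.
import torch

with torch.autocast(device_type="cuda", dtype=torch.float16):
    trainer.train()

Whether this is sufficient for the DataParallel path is untested here; the fix that was actually confirmed in this thread is switching the launcher, as described below.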
Hi, when you ran distributed training, did you ever hit this error: RuntimeError: Expected all tensors to be on the same device, but found at least two devices, cuda:0 and cuda:1!?
No, I didn't hit that one. After I changed the launch command to the following, it ran fine:

WORLD_SIZE=2 CUDA_VISIBLE_DEVICES=0,1 torchrun --nproc_per_node=2 --master_port 1234 finetune_kopa.py \
    --base_model 'llama-7b' \
    --data_path 'data/CoDeX-S-train.json' \
    --output_dir 'train-adapter-muti3' \
    --batch_size 6 \
    --micro_batch_size 6 \
    --num_prefix 1 \
    --kge_model 'data/CoDeX-S-rotate.pth' \
    --num_epochs 3 \
    --learning_rate 3e-4 \
    --lora_r 64 \
    --lora_target_modules='[q_proj,k_proj,v_proj,o_proj]'
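The key change is launching with torchrun instead of plain python: torchrun starts one process per GPU and trains with DistributedDataParallel, so the single-process torch.nn.DataParallel replica path from the traceback above is never taken. A sketch of the DDP-detection pattern that alpaca-lora-style finetuning scripts commonly use follows; the variable names and numbers are illustrative assumptions, not necessarily KoPA's exact code:

# Illustrative sketch of the usual torchrun/DDP detection logic in
# alpaca-lora-style finetune scripts (names and values are assumptions).
import os

batch_size = 12
micro_batch_size = 6
gradient_accumulation_steps = batch_size // micro_batch_size

# torchrun exports WORLD_SIZE and LOCAL_RANK for every worker process.
world_size = int(os.environ.get("WORLD_SIZE", 1))
ddp = world_size != 1
if ddp:
    # Pin this process to its own GPU and split the accumulation steps
    # across ranks so the effective global batch size is unchanged.
    device_map = {"": int(os.environ.get("LOCAL_RANK", 0))}
    gradient_accumulation_steps = max(gradient_accumulation_steps // world_size, 1)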
Got it, I'll use that as a reference. Thanks!
Hi, after distributed training, when you run inference (inference_kopa), do you hit the same error as during training?

  File "/home/leike/miniconda3/envs/kopa/lib/python3.9/site-packages/peft/tuners/lora.py", line 565, in forward
    result = F.linear(x, transpose(self.weight, self.fan_in_fan_out), bias=self.bias)
RuntimeError: expected scalar type Float but found Half
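This question went unanswered in the thread. For anyone hitting the same Float/Half mismatch at inference time, one common workaround, offered here only as a hedged sketch and not confirmed by this thread, is to run generation under CUDA autocast so fp32 inputs are cast to match the fp16 weights; model and tokenizer are assumed to be the fine-tuned KoPA model and its tokenizer, already loaded onto the GPU:

# Hypothetical inference-side workaround: generate under CUDA autocast.
# Assumes `model` and `tokenizer` are already loaded and on the GPU.
import torch

prompt = "example prompt"
input_ids = tokenizer(prompt, return_tensors="pt").input_ids.cuda()
with torch.inference_mode(), torch.autocast(device_type="cuda", dtype=torch.float16):
    output_ids = model.generate(input_ids=input_ids, max_new_tokens=64)
print(tokenizer.decode(output_ids[0], skip_special_tokens=True))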