Traceback (most recent call last):
Traceback (most recent call last):
File "src/train_bash.py", line 14, in
File "src/train_bash.py", line 14, in
main()
main()
File "src/train_bash.py", line 5, in main
File "src/train_bash.py", line 5, in main
run_exp()run_exp()
File "/data1/wangtengfei/LLaMA-Factory/src/llmtuner/tuner/tune.py", line 30, in run_exp
File "/data1/wangtengfei/LLaMA-Factory/src/llmtuner/tuner/tune.py", line 30, in run_exp
run_ppo(model_args, data_args, training_args, finetuning_args, generating_args, callbacks)run_ppo(model_args, data_args, training_args, finetuning_args, generating_args, callbacks)
File "/data1/wangtengfei/LLaMA-Factory/src/llmtuner/tuner/ppo/workflow.py", line 30, in run_ppo
File "/data1/wangtengfei/LLaMA-Factory/src/llmtuner/tuner/ppo/workflow.py", line 30, in run_ppo
model, tokenizer = load_model_and_tokenizer(model_args, finetuning_args, training_args.do_train, stage="ppo")model, tokenizer = load_model_and_tokenizer(model_args, finetuning_args, training_args.do_train, stage="ppo")
File "/data1/wangtengfei/LLaMA-Factory/src/llmtuner/tuner/core/loader.py", line 184, in load_model_and_tokenizer
File "/data1/wangtengfei/LLaMA-Factory/src/llmtuner/tuner/core/loader.py", line 184, in load_model_and_tokenizer
model = AutoModelForCausalLM.from_pretrained(
File "/root/miniconda3/envs/deepspeed/lib/python3.8/site-packages/transformers/models/auto/auto_factory.py", line 493, in from_pretrained
model = AutoModelForCausalLM.from_pretrained(
File "/root/miniconda3/envs/deepspeed/lib/python3.8/site-packages/transformers/models/auto/auto_factory.py", line 493, in from_pretrained
return model_class.from_pretrained(return model_class.from_pretrained(
File "/root/miniconda3/envs/deepspeed/lib/python3.8/site-packages/transformers/modeling_utils.py", line 2903, in from_pretrained
File "/root/miniconda3/envs/deepspeed/lib/python3.8/site-packages/transformers/modeling_utils.py", line 2903, in from_pretrained
) = cls._load_pretrained_model() = cls._load_pretrained_model(
File "/root/miniconda3/envs/deepspeed/lib/python3.8/site-packages/transformers/modeling_utils.py", line 3260, in _load_pretrained_model
File "/root/miniconda3/envs/deepspeed/lib/python3.8/site-packages/transformers/modeling_utils.py", line 3260, in _load_pretrained_model
new_error_msgs, offload_index, state_dict_index = _load_state_dict_into_meta_model(
new_error_msgs, offload_index, state_dict_index = _load_state_dict_into_meta_model(
File "/root/miniconda3/envs/deepspeed/lib/python3.8/site-packages/transformers/modeling_utils.py", line 694, in _load_state_dict_into_meta_model
File "/root/miniconda3/envs/deepspeed/lib/python3.8/site-packages/transformers/modeling_utils.py", line 694, in _load_state_dict_into_meta_model
param = param.to(old_param.dtype)param = param.to(old_param.dtype)
RuntimeErrorRuntimeError: [enforce fail at alloc_cpu.cpp:75] err == 0. DefaultCPUAllocator: can't allocate memory: you tried to allocate 939524096 bytes. Error code 12 (Cannot allocate memory):
[enforce fail at alloc_cpu.cpp:75] err == 0. DefaultCPUAllocator: can't allocate memory: you tried to allocate 939524096 bytes. Error code 12 (Cannot allocate memory)
Loading checkpoint shards: 40%|██████████████████████████████▊ | 6/15 [01:44<02:36, 17.43s/it]
Traceback (most recent call last):
File "src/train_bash.py", line 14, in
main()
File "src/train_bash.py", line 5, in main
run_exp()
File "/data1/wangtengfei/LLaMA-Factory/src/llmtuner/tuner/tune.py", line 30, in run_exp
run_ppo(model_args, data_args, training_args, finetuning_args, generating_args, callbacks)
File "/data1/wangtengfei/LLaMA-Factory/src/llmtuner/tuner/ppo/workflow.py", line 30, in run_ppo
model, tokenizer = load_model_and_tokenizer(model_args, finetuning_args, training_args.do_train, stage="ppo")
File "/data1/wangtengfei/LLaMA-Factory/src/llmtuner/tuner/core/loader.py", line 184, in load_model_and_tokenizer
model = AutoModelForCausalLM.from_pretrained(
File "/root/miniconda3/envs/deepspeed/lib/python3.8/site-packages/transformers/models/auto/auto_factory.py", line 493, in from_pretrained
return model_class.from_pretrained(
File "/root/miniconda3/envs/deepspeed/lib/python3.8/site-packages/transformers/modeling_utils.py", line 2903, in from_pretrained
) = cls._load_pretrained_model(
File "/root/miniconda3/envs/deepspeed/lib/python3.8/site-packages/transformers/modeling_utils.py", line 3260, in _load_pretrained_model
new_error_msgs, offload_index, state_dict_index = _load_state_dict_into_meta_model(
File "/root/miniconda3/envs/deepspeed/lib/python3.8/site-packages/transformers/modeling_utils.py", line 694, in _load_state_dict_into_meta_model
param = param.to(old_param.dtype)
RuntimeError: [enforce fail at alloc_cpu.cpp:75] err == 0. DefaultCPUAllocator: can't allocate memory: you tried to allocate 939524096 bytes. Error code 12 (Cannot allocate memory)
Loading checkpoint shards: 40%|██████████████████████████████▊ | 6/15 [01:44<02:36, 17.41s/it]
Traceback (most recent call last):
File "src/train_bash.py", line 14, in
main()
File "src/train_bash.py", line 5, in main
run_exp()
File "/data1/wangtengfei/LLaMA-Factory/src/llmtuner/tuner/tune.py", line 30, in run_exp
run_ppo(model_args, data_args, training_args, finetuning_args, generating_args, callbacks)
File "/data1/wangtengfei/LLaMA-Factory/src/llmtuner/tuner/ppo/workflow.py", line 30, in run_ppo
model, tokenizer = load_model_and_tokenizer(model_args, finetuning_args, training_args.do_train, stage="ppo")
File "/data1/wangtengfei/LLaMA-Factory/src/llmtuner/tuner/core/loader.py", line 184, in load_model_and_tokenizer
model = AutoModelForCausalLM.from_pretrained(
File "/root/miniconda3/envs/deepspeed/lib/python3.8/site-packages/transformers/models/auto/auto_factory.py", line 493, in from_pretrained
return model_class.from_pretrained(
File "/root/miniconda3/envs/deepspeed/lib/python3.8/site-packages/transformers/modeling_utils.py", line 2903, in from_pretrained
) = cls._load_pretrained_model(
File "/root/miniconda3/envs/deepspeed/lib/python3.8/site-packages/transformers/modeling_utils.py", line 3260, in _load_pretrained_model
new_error_msgs, offload_index, state_dict_index = _load_state_dict_into_meta_model(
File "/root/miniconda3/envs/deepspeed/lib/python3.8/site-packages/transformers/modeling_utils.py", line 694, in _load_state_dict_into_meta_model
param = param.to(old_param.dtype)
RuntimeError: [enforce fail at alloc_cpu.cpp:75] err == 0. DefaultCPUAllocator: can't allocate memory: you tried to allocate 939524096 bytes. Error code 12 (Cannot allocate memory)
Loading checkpoint shards: 47%|███████████████████████████████████▉ | 7/15 [01:54<02:40, 20.04s/it]WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 34615 closing signal SIGTERM
WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 34617 closing signal SIGTERM
WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 34622 closing signal SIGTERM
WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 34623 closing signal SIGTERM
WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 34625 closing signal SIGTERM
WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 34626 closing signal SIGTERM
WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 34627 closing signal SIGTERM
ERROR:torch.distributed.elastic.multiprocessing.api:failed (exitcode: -9) local_rank: 4 (pid: 34624) of binary: /root/miniconda3/envs/deepspeed/bin/python
Traceback (most recent call last):
File "/root/miniconda3/envs/deepspeed/bin/accelerate", line 8, in
sys.exit(main())
File "/root/miniconda3/envs/deepspeed/lib/python3.8/site-packages/accelerate/commands/accelerate_cli.py", line 47, in main
args.func(args)
File "/root/miniconda3/envs/deepspeed/lib/python3.8/site-packages/accelerate/commands/launch.py", line 977, in launch_command
multi_gpu_launcher(args)
File "/root/miniconda3/envs/deepspeed/lib/python3.8/site-packages/accelerate/commands/launch.py", line 646, in multi_gpu_launcher
distrib_run.run(args)
File "/root/miniconda3/envs/deepspeed/lib/python3.8/site-packages/torch/distributed/run.py", line 753, in run
elastic_launch(
File "/root/miniconda3/envs/deepspeed/lib/python3.8/site-packages/torch/distributed/launcher/api.py", line 132, in call
return launch_agent(self._config, self._entrypoint, list(args))
File "/root/miniconda3/envs/deepspeed/lib/python3.8/site-packages/torch/distributed/launcher/api.py", line 246, in launch_agent
raise ChildFailedError(
torch.distributed.elastic.multiprocessing.errors.ChildFailedError:
src/train_bash.py FAILED
Failures:
------------------------------------------------------
Root Cause (first observed failure):
[0]:
time : 2023-10-20_09:18:47
host : VM-0-11-centos
rank : 4 (local_rank: 4)
exitcode : -9 (pid: 34624)
error_file:
traceback : Signal 9 (SIGKILL) received by PID 34624
======================================================
Launch script:
output_model=ppo_lora_output
if [ ! -d ${output_model} ];then
    mkdir ${output_model}
fi
export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
accelerate launch src/train_bash.py \
    --stage ppo \
    --model_name_or_path /data1/ptmodels/OpenBuddy/openbuddy-llama2-70b-v10.1-bf16 \
    --do_train \
    --dataset alpaca_gpt4_en \
    --template default \
    --finetuning_type lora \
    --lora_target q_proj,v_proj \
    --resume_lora_training False \
    --reward_model /data1/ptmodels/OpenBuddy/openbuddy-llama2-70b-v10.1-bf16 \
    --output_dir ${output_model} \
    --per_device_train_batch_size 2 \
    --gradient_accumulation_steps 4 \
    --lr_scheduler_type cosine \
    --logging_steps 10 \
    --save_steps 1000 \
    --learning_rate 1e-5 \
    --num_train_epochs 1.0 \
    --plot_loss \
    --fp16
Accelerate CONFIG:
compute_environment: LOCAL_MACHINE
distributed_type: MULTI_GPU
downcast_bf16: 'no'
gpu_ids: all
machine_rank: 0
main_training_function: main
mixed_precision: fp16
num_machines: 1
num_processes: 8
rdzv_backend: static
same_network: true
tpu_env: []
tpu_use_cluster: false
tpu_use_sudo: false
use_cpu: false
Error output: Loading checkpoint shards: 40%|██████████████████████████████▊ | 6/15 [01:44<02:36, 17.37s/it]
Traceback (most recent call last): Traceback (most recent call last): File "src/train_bash.py", line 14, in
File "src/train_bash.py", line 14, in
main()
main() File "src/train_bash.py", line 5, in main File "src/train_bash.py", line 5, in main run_exp()run_exp()
File "/data1/wangtengfei/LLaMA-Factory/src/llmtuner/tuner/tune.py", line 30, in run_exp File "/data1/wangtengfei/LLaMA-Factory/src/llmtuner/tuner/tune.py", line 30, in run_exp run_ppo(model_args, data_args, training_args, finetuning_args, generating_args, callbacks)run_ppo(model_args, data_args, training_args, finetuning_args, generating_args, callbacks)
File "/data1/wangtengfei/LLaMA-Factory/src/llmtuner/tuner/ppo/workflow.py", line 30, in run_ppo File "/data1/wangtengfei/LLaMA-Factory/src/llmtuner/tuner/ppo/workflow.py", line 30, in run_ppo model, tokenizer = load_model_and_tokenizer(model_args, finetuning_args, training_args.do_train, stage="ppo")model, tokenizer = load_model_and_tokenizer(model_args, finetuning_args, training_args.do_train, stage="ppo")
File "/data1/wangtengfei/LLaMA-Factory/src/llmtuner/tuner/core/loader.py", line 184, in load_model_and_tokenizer File "/data1/wangtengfei/LLaMA-Factory/src/llmtuner/tuner/core/loader.py", line 184, in load_model_and_tokenizer model = AutoModelForCausalLM.from_pretrained( File "/root/miniconda3/envs/deepspeed/lib/python3.8/site-packages/transformers/models/auto/auto_factory.py", line 493, in from_pretrained model = AutoModelForCausalLM.from_pretrained( File "/root/miniconda3/envs/deepspeed/lib/python3.8/site-packages/transformers/models/auto/auto_factory.py", line 493, in from_pretrained return model_class.from_pretrained(return model_class.from_pretrained(
File "/root/miniconda3/envs/deepspeed/lib/python3.8/site-packages/transformers/modeling_utils.py", line 2903, in from_pretrained File "/root/miniconda3/envs/deepspeed/lib/python3.8/site-packages/transformers/modeling_utils.py", line 2903, in from_pretrained ) = cls._load_pretrained_model() = cls._load_pretrained_model(
File "/root/miniconda3/envs/deepspeed/lib/python3.8/site-packages/transformers/modeling_utils.py", line 3260, in _load_pretrained_model File "/root/miniconda3/envs/deepspeed/lib/python3.8/site-packages/transformers/modeling_utils.py", line 3260, in _load_pretrained_model new_error_msgs, offload_index, state_dict_index = _load_state_dict_into_meta_model(
new_error_msgs, offload_index, state_dict_index = _load_state_dict_into_meta_model( File "/root/miniconda3/envs/deepspeed/lib/python3.8/site-packages/transformers/modeling_utils.py", line 694, in _load_state_dict_into_meta_model File "/root/miniconda3/envs/deepspeed/lib/python3.8/site-packages/transformers/modeling_utils.py", line 694, in _load_state_dict_into_meta_model param = param.to(old_param.dtype)param = param.to(old_param.dtype)
RuntimeErrorRuntimeError: [enforce fail at alloc_cpu.cpp:75] err == 0. DefaultCPUAllocator: can't allocate memory: you tried to allocate 939524096 bytes. Error code 12 (Cannot allocate memory): [enforce fail at alloc_cpu.cpp:75] err == 0. DefaultCPUAllocator: can't allocate memory: you tried to allocate 939524096 bytes. Error code 12 (Cannot allocate memory) Loading checkpoint shards: 40%|██████████████████████████████▊ | 6/15 [01:44<02:36, 17.43s/it] Traceback (most recent call last): File "src/train_bash.py", line 14, in
main()
File "src/train_bash.py", line 5, in main
run_exp()
File "/data1/wangtengfei/LLaMA-Factory/src/llmtuner/tuner/tune.py", line 30, in run_exp
run_ppo(model_args, data_args, training_args, finetuning_args, generating_args, callbacks)
File "/data1/wangtengfei/LLaMA-Factory/src/llmtuner/tuner/ppo/workflow.py", line 30, in run_ppo
model, tokenizer = load_model_and_tokenizer(model_args, finetuning_args, training_args.do_train, stage="ppo")
File "/data1/wangtengfei/LLaMA-Factory/src/llmtuner/tuner/core/loader.py", line 184, in load_model_and_tokenizer
model = AutoModelForCausalLM.from_pretrained(
File "/root/miniconda3/envs/deepspeed/lib/python3.8/site-packages/transformers/models/auto/auto_factory.py", line 493, in from_pretrained
return model_class.from_pretrained(
File "/root/miniconda3/envs/deepspeed/lib/python3.8/site-packages/transformers/modeling_utils.py", line 2903, in from_pretrained
) = cls._load_pretrained_model(
File "/root/miniconda3/envs/deepspeed/lib/python3.8/site-packages/transformers/modeling_utils.py", line 3260, in _load_pretrained_model
new_error_msgs, offload_index, state_dict_index = _load_state_dict_into_meta_model(
File "/root/miniconda3/envs/deepspeed/lib/python3.8/site-packages/transformers/modeling_utils.py", line 694, in _load_state_dict_into_meta_model
param = param.to(old_param.dtype)
RuntimeError: [enforce fail at alloc_cpu.cpp:75] err == 0. DefaultCPUAllocator: can't allocate memory: you tried to allocate 939524096 bytes. Error code 12 (Cannot allocate memory)
Loading checkpoint shards: 40%|██████████████████████████████▊ | 6/15 [01:44<02:36, 17.41s/it]
Traceback (most recent call last):
File "src/train_bash.py", line 14, in
main()
File "src/train_bash.py", line 5, in main
run_exp()
File "/data1/wangtengfei/LLaMA-Factory/src/llmtuner/tuner/tune.py", line 30, in run_exp
run_ppo(model_args, data_args, training_args, finetuning_args, generating_args, callbacks)
File "/data1/wangtengfei/LLaMA-Factory/src/llmtuner/tuner/ppo/workflow.py", line 30, in run_ppo
model, tokenizer = load_model_and_tokenizer(model_args, finetuning_args, training_args.do_train, stage="ppo")
File "/data1/wangtengfei/LLaMA-Factory/src/llmtuner/tuner/core/loader.py", line 184, in load_model_and_tokenizer
model = AutoModelForCausalLM.from_pretrained(
File "/root/miniconda3/envs/deepspeed/lib/python3.8/site-packages/transformers/models/auto/auto_factory.py", line 493, in from_pretrained
return model_class.from_pretrained(
File "/root/miniconda3/envs/deepspeed/lib/python3.8/site-packages/transformers/modeling_utils.py", line 2903, in from_pretrained
) = cls._load_pretrained_model(
File "/root/miniconda3/envs/deepspeed/lib/python3.8/site-packages/transformers/modeling_utils.py", line 3260, in _load_pretrained_model
new_error_msgs, offload_index, state_dict_index = _load_state_dict_into_meta_model(
File "/root/miniconda3/envs/deepspeed/lib/python3.8/site-packages/transformers/modeling_utils.py", line 694, in _load_state_dict_into_meta_model
param = param.to(old_param.dtype)
RuntimeError: [enforce fail at alloc_cpu.cpp:75] err == 0. DefaultCPUAllocator: can't allocate memory: you tried to allocate 939524096 bytes. Error code 12 (Cannot allocate memory)
Loading checkpoint shards: 47%|███████████████████████████████████▉ | 7/15 [01:54<02:40, 20.04s/it]WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 34615 closing signal SIGTERM
WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 34617 closing signal SIGTERM
WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 34622 closing signal SIGTERM
WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 34623 closing signal SIGTERM
WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 34625 closing signal SIGTERM
WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 34626 closing signal SIGTERM
WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 34627 closing signal SIGTERM
ERROR:torch.distributed.elastic.multiprocessing.api:failed (exitcode: -9) local_rank: 4 (pid: 34624) of binary: /root/miniconda3/envs/deepspeed/bin/python
Traceback (most recent call last):
File "/root/miniconda3/envs/deepspeed/bin/accelerate", line 8, in
sys.exit(main())
File "/root/miniconda3/envs/deepspeed/lib/python3.8/site-packages/accelerate/commands/accelerate_cli.py", line 47, in main
args.func(args)
File "/root/miniconda3/envs/deepspeed/lib/python3.8/site-packages/accelerate/commands/launch.py", line 977, in launch_command
multi_gpu_launcher(args)
File "/root/miniconda3/envs/deepspeed/lib/python3.8/site-packages/accelerate/commands/launch.py", line 646, in multi_gpu_launcher
distrib_run.run(args)
File "/root/miniconda3/envs/deepspeed/lib/python3.8/site-packages/torch/distributed/run.py", line 753, in run
elastic_launch(
File "/root/miniconda3/envs/deepspeed/lib/python3.8/site-packages/torch/distributed/launcher/api.py", line 132, in call
return launch_agent(self._config, self._entrypoint, list(args))
File "/root/miniconda3/envs/deepspeed/lib/python3.8/site-packages/torch/distributed/launcher/api.py", line 246, in launch_agent
raise ChildFailedError(
torch.distributed.elastic.multiprocessing.errors.ChildFailedError:
src/train_bash.py FAILED
Failures: