root@DESKTOP-SG3UNG7:/mnt/d/ChatGLM/ChatGLM-1/ptuning# bash ds_train_finetune.sh
[2023-04-26 17:54:06,251] [WARNING] [runner.py:190:fetch_hostfile] Unable to find hostfile, will proceed with training with local resources only.
[2023-04-26 17:54:06,265] [INFO] [runner.py:540:main] cmd = /usr/bin/python3 -u -m deepspeed.launcher.launch --world_info=eyJsb2NhbGhvc3QiOiBbMCwgMSwgMiwgM119 --master_addr=127.0.0.1 --master_port=38353 --enable_each_rank_log=None main.py --deepspeed deepspeed.json --do_train --train_file AdvertiseGen/train.json --test_file AdvertiseGen/dev.json --prompt_column content --response_column summary --overwrite_cache --model_name_or_path ./THUDM/chatglm-6b --output_dir ./output/adgen-chatglm-6b-ft-1e-4 --overwrite_output_dir --max_source_length 64 --max_target_length 64 --per_device_train_batch_size 4 --per_device_eval_batch_size 1 --gradient_accumulation_steps 1 --predict_with_generate --max_steps 5000 --logging_steps 10 --save_steps 1000 --learning_rate 1e-4 --fp16
[2023-04-26 17:54:07,114] [INFO] [launch.py:229:main] WORLD INFO DICT: {'localhost': [0, 1, 2, 3]}
[2023-04-26 17:54:07,114] [INFO] [launch.py:235:main] nnodes=1, num_local_procs=4, node_rank=0
[2023-04-26 17:54:07,114] [INFO] [launch.py:246:main] global_rank_mapping=defaultdict(<class 'list'>, {'localhost': [0, 1, 2, 3]})
[2023-04-26 17:54:07,114] [INFO] [launch.py:247:main] dist_world_size=4
[2023-04-26 17:54:07,114] [INFO] [launch.py:249:main] Setting CUDA_VISIBLE_DEVICES=0,1,2,3
[2023-04-26 17:54:08,522] [INFO] [comm.py:586:init_distributed] Initializing TorchBackend in DeepSpeed with backend nccl
Traceback (most recent call last):
File "/mnt/d/ChatGLM/ChatGLM-1/ptuning/main.py", line 429, in <module>
main()
File "/mnt/d/ChatGLM/ChatGLM-1/ptuning/main.py", line 58, in main
model_args, data_args, training_args = parser.parse_args_into_dataclasses()
File "/usr/local/lib/python3.10/dist-packages/transformers/hf_argparser.py", line 332, in parse_args_into_dataclasses
obj = dtype(**inputs)
File "<string>", line 113, in __init__
File "/usr/local/lib/python3.10/dist-packages/transformers/training_args.py", line 1227, in __post_init__
and (self.device.type != "cuda")
File "/usr/local/lib/python3.10/dist-packages/transformers/training_args.py", line 1659, in device
return self._setup_devices
File "/usr/local/lib/python3.10/dist-packages/transformers/utils/generic.py", line 54, in __get__
cached = self.fget(obj)
File "/usr/local/lib/python3.10/dist-packages/transformers/training_args.py", line 1594, in _setup_devices
deepspeed.init_distributed(timeout=timedelta(seconds=self.ddp_timeout))
File "/usr/local/lib/python3.10/dist-packages/deepspeed/comm/comm.py", line 588, in init_distributed
cdb = TorchBackend(dist_backend, timeout, init_method, rank, world_size)
File "/usr/local/lib/python3.10/dist-packages/deepspeed/comm/torch.py", line 32, in __init__
self.init_process_group(backend, timeout, init_method, rank, world_size)
File "/usr/local/lib/python3.10/dist-packages/deepspeed/comm/torch.py", line 58, in init_process_group
torch.distributed.init_process_group(backend,
File "/usr/local/lib/python3.10/dist-packages/torch/distributed/distributed_c10d.py", line 895, in init_process_group
default_pg = _new_process_group_helper(
File "/usr/local/lib/python3.10/dist-packages/torch/distributed/distributed_c10d.py", line 1009, in _new_process_group_helper
backend_class = ProcessGroupNCCL(backend_prefix_store, group_rank, group_size, pg_options)
RuntimeError: ProcessGroupNCCL is only supported with GPUs, no GPUs found!
Traceback (most recent call last):
File "/mnt/d/ChatGLM/ChatGLM-1/ptuning/main.py", line 429, in <module>
Traceback (most recent call last):
File "/mnt/d/ChatGLM/ChatGLM-1/ptuning/main.py", line 429, in <module>
main()
File "/mnt/d/ChatGLM/ChatGLM-1/ptuning/main.py", line 58, in main
main()
File "/mnt/d/ChatGLM/ChatGLM-1/ptuning/main.py", line 58, in main
Traceback (most recent call last):
File "/mnt/d/ChatGLM/ChatGLM-1/ptuning/main.py", line 429, in <module>
model_args, data_args, training_args = parser.parse_args_into_dataclasses()
File "/usr/local/lib/python3.10/dist-packages/transformers/hf_argparser.py", line 332, in parse_args_into_dataclasses
obj = dtype(**inputs)
File "<string>", line 113, in __init__
File "/usr/local/lib/python3.10/dist-packages/transformers/training_args.py", line 1227, in __post_init__
and (self.device.type != "cuda")
File "/usr/local/lib/python3.10/dist-packages/transformers/training_args.py", line 1659, in device
model_args, data_args, training_args = parser.parse_args_into_dataclasses()
File "/usr/local/lib/python3.10/dist-packages/transformers/hf_argparser.py", line 332, in parse_args_into_dataclasses
return self._setup_devices
obj = dtype(**inputs)
File "/usr/local/lib/python3.10/dist-packages/transformers/utils/generic.py", line 54, in __get__
File "<string>", line 113, in __init__
cached = self.fget(obj)
File "/usr/local/lib/python3.10/dist-packages/transformers/training_args.py", line 1594, in _setup_devices
deepspeed.init_distributed(timeout=timedelta(seconds=self.ddp_timeout))
File "/usr/local/lib/python3.10/dist-packages/deepspeed/comm/comm.py", line 588, in init_distributed
cdb = TorchBackend(dist_backend, timeout, init_method, rank, world_size)
File "/usr/local/lib/python3.10/dist-packages/deepspeed/comm/torch.py", line 32, in __init__
self.init_process_group(backend, timeout, init_method, rank, world_size)
File "/usr/local/lib/python3.10/dist-packages/deepspeed/comm/torch.py", line 58, in init_process_group
torch.distributed.init_process_group(backend,
File "/usr/local/lib/python3.10/dist-packages/torch/distributed/distributed_c10d.py", line 895, in init_process_group
default_pg = _new_process_group_helper(
File "/usr/local/lib/python3.10/dist-packages/torch/distributed/distributed_c10d.py", line 1009, in _new_process_group_helper
backend_class = ProcessGroupNCCL(backend_prefix_store, group_rank, group_size, pg_options)
File "/usr/local/lib/python3.10/dist-packages/transformers/training_args.py", line 1227, in __post_init__
RuntimeError: ProcessGroupNCCL is only supported with GPUs, no GPUs found!
and (self.device.type != "cuda")
File "/usr/local/lib/python3.10/dist-packages/transformers/training_args.py", line 1659, in device
return self._setup_devices
File "/usr/local/lib/python3.10/dist-packages/transformers/utils/generic.py", line 54, in __get__
cached = self.fget(obj)
File "/usr/local/lib/python3.10/dist-packages/transformers/training_args.py", line 1594, in _setup_devices
deepspeed.init_distributed(timeout=timedelta(seconds=self.ddp_timeout))
File "/usr/local/lib/python3.10/dist-packages/deepspeed/comm/comm.py", line 588, in init_distributed
cdb = TorchBackend(dist_backend, timeout, init_method, rank, world_size)
File "/usr/local/lib/python3.10/dist-packages/deepspeed/comm/torch.py", line 32, in __init__
self.init_process_group(backend, timeout, init_method, rank, world_size)
File "/usr/local/lib/python3.10/dist-packages/deepspeed/comm/torch.py", line 58, in init_process_group
torch.distributed.init_process_group(backend,
File "/usr/local/lib/python3.10/dist-packages/torch/distributed/distributed_c10d.py", line 895, in init_process_group
default_pg = _new_process_group_helper(
File "/usr/local/lib/python3.10/dist-packages/torch/distributed/distributed_c10d.py", line 1009, in _new_process_group_helper
backend_class = ProcessGroupNCCL(backend_prefix_store, group_rank, group_size, pg_options)
RuntimeError: ProcessGroupNCCL is only supported with GPUs, no GPUs found!
main()
File "/mnt/d/ChatGLM/ChatGLM-1/ptuning/main.py", line 58, in main
model_args, data_args, training_args = parser.parse_args_into_dataclasses()
File "/usr/local/lib/python3.10/dist-packages/transformers/hf_argparser.py", line 332, in parse_args_into_dataclasses
obj = dtype(**inputs)
File "<string>", line 113, in __init__
File "/usr/local/lib/python3.10/dist-packages/transformers/training_args.py", line 1227, in __post_init__
and (self.device.type != "cuda")
File "/usr/local/lib/python3.10/dist-packages/transformers/training_args.py", line 1659, in device
return self._setup_devices
File "/usr/local/lib/python3.10/dist-packages/transformers/utils/generic.py", line 54, in __get__
cached = self.fget(obj)
File "/usr/local/lib/python3.10/dist-packages/transformers/training_args.py", line 1594, in _setup_devices
deepspeed.init_distributed(timeout=timedelta(seconds=self.ddp_timeout))
File "/usr/local/lib/python3.10/dist-packages/deepspeed/comm/comm.py", line 588, in init_distributed
cdb = TorchBackend(dist_backend, timeout, init_method, rank, world_size)
File "/usr/local/lib/python3.10/dist-packages/deepspeed/comm/torch.py", line 32, in __init__
self.init_process_group(backend, timeout, init_method, rank, world_size)
File "/usr/local/lib/python3.10/dist-packages/deepspeed/comm/torch.py", line 58, in init_process_group
torch.distributed.init_process_group(backend,
File "/usr/local/lib/python3.10/dist-packages/torch/distributed/distributed_c10d.py", line 895, in init_process_group
default_pg = _new_process_group_helper(
File "/usr/local/lib/python3.10/dist-packages/torch/distributed/distributed_c10d.py", line 1009, in _new_process_group_helper
backend_class = ProcessGroupNCCL(backend_prefix_store, group_rank, group_size, pg_options)
RuntimeError: ProcessGroupNCCL is only supported with GPUs, no GPUs found!
[2023-04-26 17:54:09,117] [INFO] [launch.py:428:sigkill_handler] Killing subprocess 3200
[2023-04-26 17:54:09,118] [INFO] [launch.py:428:sigkill_handler] Killing subprocess 3201
[2023-04-26 17:54:09,118] [INFO] [launch.py:428:sigkill_handler] Killing subprocess 3202
[2023-04-26 17:54:09,119] [INFO] [launch.py:428:sigkill_handler] Killing subprocess 3203
[2023-04-26 17:54:09,120] [ERROR] [launch.py:434:sigkill_handler] ['/usr/bin/python3', '-u', 'main.py', '--local_rank=3', '--deepspeed', 'deepspeed.json', '--do_train', '--train_file', 'AdvertiseGen/train.json', '--test_file', 'AdvertiseGen/dev.json', '--prompt_column', 'content', '--response_column', 'summary', '--overwrite_cache', '--model_name_or_path', './THUDM/chatglm-6b', '--output_dir', './output/adgen-chatglm-6b-ft-1e-4', '--overwrite_output_dir', '--max_source_length', '64', '--max_target_length', '64', '--per_device_train_batch_size', '4', '--per_device_eval_batch_size', '1', '--gradient_accumulation_steps', '1', '--predict_with_generate', '--max_steps', '5000', '--logging_steps', '10', '--save_steps', '1000', '--learning_rate', '1e-4', '--fp16'] exits with return code = 1
Expected Behavior
No response
Steps To Reproduce
DeepSpeed 环境已经安装
执行脚本bash ds_train_finetune.sh报错
Environment
- OS:WSL
- Python:3.10.6
- Transformers:4.27.1
- PyTorch:2.0.0+cu117
- CUDA Support (`python -c "import torch; print(torch.cuda.is_available())"`) :
Is there an existing issue for this?
Current Behavior
执行bash ds_train_finetune.sh
root@DESKTOP-SG3UNG7:/mnt/d/ChatGLM/ChatGLM-1/ptuning# bash ds_train_finetune.sh [2023-04-26 17:54:06,251] [WARNING] [runner.py:190:fetch_hostfile] Unable to find hostfile, will proceed with training with local resources only. [2023-04-26 17:54:06,265] [INFO] [runner.py:540:main] cmd = /usr/bin/python3 -u -m deepspeed.launcher.launch --world_info=eyJsb2NhbGhvc3QiOiBbMCwgMSwgMiwgM119 --master_addr=127.0.0.1 --master_port=38353 --enable_each_rank_log=None main.py --deepspeed deepspeed.json --do_train --train_file AdvertiseGen/train.json --test_file AdvertiseGen/dev.json --prompt_column content --response_column summary --overwrite_cache --model_name_or_path ./THUDM/chatglm-6b --output_dir ./output/adgen-chatglm-6b-ft-1e-4 --overwrite_output_dir --max_source_length 64 --max_target_length 64 --per_device_train_batch_size 4 --per_device_eval_batch_size 1 --gradient_accumulation_steps 1 --predict_with_generate --max_steps 5000 --logging_steps 10 --save_steps 1000 --learning_rate 1e-4 --fp16 [2023-04-26 17:54:07,114] [INFO] [launch.py:229:main] WORLD INFO DICT: {'localhost': [0, 1, 2, 3]} [2023-04-26 17:54:07,114] [INFO] [launch.py:235:main] nnodes=1, num_local_procs=4, node_rank=0 [2023-04-26 17:54:07,114] [INFO] [launch.py:246:main] global_rank_mapping=defaultdict(<class 'list'>, {'localhost': [0, 1, 2, 3]}) [2023-04-26 17:54:07,114] [INFO] [launch.py:247:main] dist_world_size=4 [2023-04-26 17:54:07,114] [INFO] [launch.py:249:main] Setting CUDA_VISIBLE_DEVICES=0,1,2,3 [2023-04-26 17:54:08,522] [INFO] [comm.py:586:init_distributed] Initializing TorchBackend in DeepSpeed with backend nccl Traceback (most recent call last): File "/mnt/d/ChatGLM/ChatGLM-1/ptuning/main.py", line 429, in
main()
File "/mnt/d/ChatGLM/ChatGLM-1/ptuning/main.py", line 58, in main
model_args, data_args, training_args = parser.parse_args_into_dataclasses()
File "/usr/local/lib/python3.10/dist-packages/transformers/hf_argparser.py", line 332, in parse_args_into_dataclasses
obj = dtype(**inputs)
File "<string>", line 113, in __init__
File "/usr/local/lib/python3.10/dist-packages/transformers/training_args.py", line 1227, in __post_init__
and (self.device.type != "cuda")
File "/usr/local/lib/python3.10/dist-packages/transformers/training_args.py", line 1659, in device
return self._setup_devices
File "/usr/local/lib/python3.10/dist-packages/transformers/utils/generic.py", line 54, in __get__
cached = self.fget(obj)
File "/usr/local/lib/python3.10/dist-packages/transformers/training_args.py", line 1594, in _setup_devices
deepspeed.init_distributed(timeout=timedelta(seconds=self.ddp_timeout))
File "/usr/local/lib/python3.10/dist-packages/deepspeed/comm/comm.py", line 588, in init_distributed
cdb = TorchBackend(dist_backend, timeout, init_method, rank, world_size)
File "/usr/local/lib/python3.10/dist-packages/deepspeed/comm/torch.py", line 32, in __init__
self.init_process_group(backend, timeout, init_method, rank, world_size)
File "/usr/local/lib/python3.10/dist-packages/deepspeed/comm/torch.py", line 58, in init_process_group
torch.distributed.init_process_group(backend,
File "/usr/local/lib/python3.10/dist-packages/torch/distributed/distributed_c10d.py", line 895, in init_process_group
default_pg = _new_process_group_helper(
File "/usr/local/lib/python3.10/dist-packages/torch/distributed/distributed_c10d.py", line 1009, in _new_process_group_helper
backend_class = ProcessGroupNCCL(backend_prefix_store, group_rank, group_size, pg_options)
RuntimeError: ProcessGroupNCCL is only supported with GPUs, no GPUs found!
Traceback (most recent call last):
File "/mnt/d/ChatGLM/ChatGLM-1/ptuning/main.py", line 429, in <module>
Traceback (most recent call last):
File "/mnt/d/ChatGLM/ChatGLM-1/ptuning/main.py", line 429, in <module>
main()
File "/mnt/d/ChatGLM/ChatGLM-1/ptuning/main.py", line 58, in main
main()
File "/mnt/d/ChatGLM/ChatGLM-1/ptuning/main.py", line 58, in main
Traceback (most recent call last):
File "/mnt/d/ChatGLM/ChatGLM-1/ptuning/main.py", line 429, in <module>
model_args, data_args, training_args = parser.parse_args_into_dataclasses()
File "/usr/local/lib/python3.10/dist-packages/transformers/hf_argparser.py", line 332, in parse_args_into_dataclasses
obj = dtype(**inputs)
File "<string>", line 113, in __init__
File "/usr/local/lib/python3.10/dist-packages/transformers/training_args.py", line 1227, in __post_init__
and (self.device.type != "cuda")
File "/usr/local/lib/python3.10/dist-packages/transformers/training_args.py", line 1659, in device
model_args, data_args, training_args = parser.parse_args_into_dataclasses()
File "/usr/local/lib/python3.10/dist-packages/transformers/hf_argparser.py", line 332, in parse_args_into_dataclasses
return self._setup_devices
obj = dtype(**inputs)
File "/usr/local/lib/python3.10/dist-packages/transformers/utils/generic.py", line 54, in __get__
File "<string>", line 113, in __init__
cached = self.fget(obj)
File "/usr/local/lib/python3.10/dist-packages/transformers/training_args.py", line 1594, in _setup_devices
deepspeed.init_distributed(timeout=timedelta(seconds=self.ddp_timeout))
File "/usr/local/lib/python3.10/dist-packages/deepspeed/comm/comm.py", line 588, in init_distributed
cdb = TorchBackend(dist_backend, timeout, init_method, rank, world_size)
File "/usr/local/lib/python3.10/dist-packages/deepspeed/comm/torch.py", line 32, in __init__
self.init_process_group(backend, timeout, init_method, rank, world_size)
File "/usr/local/lib/python3.10/dist-packages/deepspeed/comm/torch.py", line 58, in init_process_group
torch.distributed.init_process_group(backend,
File "/usr/local/lib/python3.10/dist-packages/torch/distributed/distributed_c10d.py", line 895, in init_process_group
default_pg = _new_process_group_helper(
File "/usr/local/lib/python3.10/dist-packages/torch/distributed/distributed_c10d.py", line 1009, in _new_process_group_helper
backend_class = ProcessGroupNCCL(backend_prefix_store, group_rank, group_size, pg_options)
File "/usr/local/lib/python3.10/dist-packages/transformers/training_args.py", line 1227, in __post_init__
RuntimeError: ProcessGroupNCCL is only supported with GPUs, no GPUs found!
and (self.device.type != "cuda")
File "/usr/local/lib/python3.10/dist-packages/transformers/training_args.py", line 1659, in device
return self._setup_devices
File "/usr/local/lib/python3.10/dist-packages/transformers/utils/generic.py", line 54, in __get__
cached = self.fget(obj)
File "/usr/local/lib/python3.10/dist-packages/transformers/training_args.py", line 1594, in _setup_devices
deepspeed.init_distributed(timeout=timedelta(seconds=self.ddp_timeout))
File "/usr/local/lib/python3.10/dist-packages/deepspeed/comm/comm.py", line 588, in init_distributed
cdb = TorchBackend(dist_backend, timeout, init_method, rank, world_size)
File "/usr/local/lib/python3.10/dist-packages/deepspeed/comm/torch.py", line 32, in __init__
self.init_process_group(backend, timeout, init_method, rank, world_size)
File "/usr/local/lib/python3.10/dist-packages/deepspeed/comm/torch.py", line 58, in init_process_group
torch.distributed.init_process_group(backend,
File "/usr/local/lib/python3.10/dist-packages/torch/distributed/distributed_c10d.py", line 895, in init_process_group
default_pg = _new_process_group_helper(
File "/usr/local/lib/python3.10/dist-packages/torch/distributed/distributed_c10d.py", line 1009, in _new_process_group_helper
backend_class = ProcessGroupNCCL(backend_prefix_store, group_rank, group_size, pg_options)
RuntimeError: ProcessGroupNCCL is only supported with GPUs, no GPUs found!
main()
File "/mnt/d/ChatGLM/ChatGLM-1/ptuning/main.py", line 58, in main
model_args, data_args, training_args = parser.parse_args_into_dataclasses()
File "/usr/local/lib/python3.10/dist-packages/transformers/hf_argparser.py", line 332, in parse_args_into_dataclasses
obj = dtype(**inputs)
File "<string>", line 113, in __init__
File "/usr/local/lib/python3.10/dist-packages/transformers/training_args.py", line 1227, in __post_init__
and (self.device.type != "cuda")
File "/usr/local/lib/python3.10/dist-packages/transformers/training_args.py", line 1659, in device
return self._setup_devices
File "/usr/local/lib/python3.10/dist-packages/transformers/utils/generic.py", line 54, in __get__
cached = self.fget(obj)
File "/usr/local/lib/python3.10/dist-packages/transformers/training_args.py", line 1594, in _setup_devices
deepspeed.init_distributed(timeout=timedelta(seconds=self.ddp_timeout))
File "/usr/local/lib/python3.10/dist-packages/deepspeed/comm/comm.py", line 588, in init_distributed
cdb = TorchBackend(dist_backend, timeout, init_method, rank, world_size)
File "/usr/local/lib/python3.10/dist-packages/deepspeed/comm/torch.py", line 32, in __init__
self.init_process_group(backend, timeout, init_method, rank, world_size)
File "/usr/local/lib/python3.10/dist-packages/deepspeed/comm/torch.py", line 58, in init_process_group
torch.distributed.init_process_group(backend,
File "/usr/local/lib/python3.10/dist-packages/torch/distributed/distributed_c10d.py", line 895, in init_process_group
default_pg = _new_process_group_helper(
File "/usr/local/lib/python3.10/dist-packages/torch/distributed/distributed_c10d.py", line 1009, in _new_process_group_helper
backend_class = ProcessGroupNCCL(backend_prefix_store, group_rank, group_size, pg_options)
RuntimeError: ProcessGroupNCCL is only supported with GPUs, no GPUs found!
[2023-04-26 17:54:09,117] [INFO] [launch.py:428:sigkill_handler] Killing subprocess 3200
[2023-04-26 17:54:09,118] [INFO] [launch.py:428:sigkill_handler] Killing subprocess 3201
[2023-04-26 17:54:09,118] [INFO] [launch.py:428:sigkill_handler] Killing subprocess 3202
[2023-04-26 17:54:09,119] [INFO] [launch.py:428:sigkill_handler] Killing subprocess 3203
[2023-04-26 17:54:09,120] [ERROR] [launch.py:434:sigkill_handler] ['/usr/bin/python3', '-u', 'main.py', '--local_rank=3', '--deepspeed', 'deepspeed.json', '--do_train', '--train_file', 'AdvertiseGen/train.json', '--test_file', 'AdvertiseGen/dev.json', '--prompt_column', 'content', '--response_column', 'summary', '--overwrite_cache', '--model_name_or_path', './THUDM/chatglm-6b', '--output_dir', './output/adgen-chatglm-6b-ft-1e-4', '--overwrite_output_dir', '--max_source_length', '64', '--max_target_length', '64', '--per_device_train_batch_size', '4', '--per_device_eval_batch_size', '1', '--gradient_accumulation_steps', '1', '--predict_with_generate', '--max_steps', '5000', '--logging_steps', '10', '--save_steps', '1000', '--learning_rate', '1e-4', '--fp16'] exits with return code = 1
Expected Behavior
No response
Steps To Reproduce
DeepSpeed 环境已经安装 ![1682503327726](https://user-images.githubusercontent.com/4435827/234542253-cd9b4388-7a2a-424e-884a-5298254c78e8.png)
执行脚本bash ds_train_finetune.sh报错
Environment
Anything else?
No response