[2024-09-05 09:40:49,421] [INFO] [comm.py:668:init_distributed] Initializing TorchBackend in DeepSpeed with backend nccl
[W socket.cpp:426] [c10d] The server socket has failed to bind to [::]:29500 (errno: 98 - Address already in use).
[W socket.cpp:426] [c10d] The server socket has failed to bind to 0.0.0.0:29500 (errno: 98 - Address already in use).
[E socket.cpp:462] [c10d] The server socket has failed to listen on any local network address.
Traceback (most recent call last):
File "~/DeepSeek-Coder/finetune_v2/finetune_deepseekcoder.py", line 194, in <module>
train()
File "~/DeepSeek-Coder/finetune_v2/finetune_deepseekcoder.py", line 124, in train
model_args, data_args, training_args = parser.parse_args_into_dataclasses()
File "~/miniconda3/envs/onlyft/lib/python3.9/site-packages/transformers/hf_argparser.py", line 338, in parse_args_into_dataclasses
obj = dtype(inputs)
File "<string>", line 119, in __init__
File "~/miniconda3/envs/onlyft/lib/python3.9/site-packages/transformers/training_args.py", line 1442, in __post_init__
and (self.device.type != "cuda")
File "~/miniconda3/envs/onlyft/lib/python3.9/site-packages/transformers/training_args.py", line 1887, in device
return self._setup_devices
File "~/miniconda3/envs/onlyft/lib/python3.9/site-packages/transformers/utils/generic.py", line 54, in __get__
cached = self.fget(obj)
File "~/miniconda3/envs/onlyft/lib/python3.9/site-packages/transformers/training_args.py", line 1813, in _setup_devices
self.distributed_state = PartialState(timeout=timedelta(seconds=self.ddp_timeout))
File "~/miniconda3/envs/onlyft/lib/python3.9/site-packages/accelerate/state.py", line 180, in __init__
dist.init_distributed(dist_backend=self.backend, auto_mpi_discovery=False, **kwargs)
File "~/miniconda3/envs/onlyft/lib/python3.9/site-packages/deepspeed/comm/comm.py", line 670, in init_distributed
cdb = TorchBackend(dist_backend, timeout, init_method, rank, world_size)
File "~/miniconda3/envs/onlyft/lib/python3.9/site-packages/deepspeed/comm/torch.py", line 120, in __init__
self.init_process_group(backend, timeout, init_method, rank, world_size)
File "~/miniconda3/envs/onlyft/lib/python3.9/site-packages/deepspeed/comm/torch.py", line 146, in init_process_group
torch.distributed.init_process_group(backend,
File "~/miniconda3/envs/onlyft/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py", line 900, in init_process_group
store, rank, world_size = next(rendezvous_iterator)
File "~/miniconda3/envs/onlyft/lib/python3.9/site-packages/torch/distributed/rendezvous.py", line 245, in _env_rendezvous_handler
store = _create_c10d_store(master_addr, master_port, rank, world_size, timeout)
File "~/miniconda3/envs/onlyft/lib/python3.9/site-packages/torch/distributed/rendezvous.py", line 176, in _create_c10d_store
return TCPStore(
RuntimeError: The server socket has failed to listen on any local network address. The server socket has failed to bind to [::]:29500 (errno: 98 - Address already in use). The server socket has failed to bind to 0.0.0.0:29500 (errno: 98 - Address already in use).
[2024-09-05 09:40:50,578] [INFO] [launch.py:315:sigkill_handler] Killing subprocess 777243
[2024-09-05 09:40:50,579] [INFO] [launch.py:315:sigkill_handler] Killing subprocess 777244
[2024-09-05 09:40:50,632] [INFO] [launch.py:315:sigkill_handler] Killing subprocess 777245
[2024-09-05 09:40:50,685] [INFO] [launch.py:315:sigkill_handler] Killing subprocess 777246
[2024-09-05 09:40:49,421] [INFO] [comm.py:668:init_distributed] Initializing TorchBackend in DeepSpeed with backend nccl [W socket.cpp:426] [c10d] The server socket has failed to bind to [::]:29500 (errno: 98 - Address already in use). [W socket.cpp:426] [c10d] The server socket has failed to bind to 0.0.0.0:29500 (errno: 98 - Address already in use). [E socket.cpp:462] [c10d] The server socket has failed to listen on any local network address. Traceback (most recent call last): File "~/DeepSeek-Coder/finetune_v2/finetune_deepseekcoder.py", line 194, in <module>
train()
File "~/DeepSeek-Coder/finetune_v2/finetune_deepseekcoder.py", line 124, in train
model_args, data_args, training_args = parser.parse_args_into_dataclasses()
File "~/miniconda3/envs/onlyft/lib/python3.9/site-packages/transformers/hf_argparser.py", line 338, in parse_args_into_dataclasses
obj = dtype(inputs)
File "<string>", line 119, in __init__
File "~/miniconda3/envs/onlyft/lib/python3.9/site-packages/transformers/training_args.py", line 1442, in __post_init__
and (self.device.type != "cuda")
File "~/miniconda3/envs/onlyft/lib/python3.9/site-packages/transformers/training_args.py", line 1887, in device
return self._setup_devices
File "~/miniconda3/envs/onlyft/lib/python3.9/site-packages/transformers/utils/generic.py", line 54, in __get__
cached = self.fget(obj)
File "~/miniconda3/envs/onlyft/lib/python3.9/site-packages/transformers/training_args.py", line 1813, in _setup_devices
self.distributed_state = PartialState(timeout=timedelta(seconds=self.ddp_timeout))
File "~/miniconda3/envs/onlyft/lib/python3.9/site-packages/accelerate/state.py", line 180, in __init__
dist.init_distributed(dist_backend=self.backend, auto_mpi_discovery=False, **kwargs)
File "~/miniconda3/envs/onlyft/lib/python3.9/site-packages/deepspeed/comm/comm.py", line 670, in init_distributed
cdb = TorchBackend(dist_backend, timeout, init_method, rank, world_size)
File "~/miniconda3/envs/onlyft/lib/python3.9/site-packages/deepspeed/comm/torch.py", line 120, in __init__
self.init_process_group(backend, timeout, init_method, rank, world_size)
File "~/miniconda3/envs/onlyft/lib/python3.9/site-packages/deepspeed/comm/torch.py", line 146, in init_process_group
torch.distributed.init_process_group(backend,
File "~/miniconda3/envs/onlyft/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py", line 900, in init_process_group
store, rank, world_size = next(rendezvous_iterator)
File "~/miniconda3/envs/onlyft/lib/python3.9/site-packages/torch/distributed/rendezvous.py", line 245, in _env_rendezvous_handler
store = _create_c10d_store(master_addr, master_port, rank, world_size, timeout)
File "~/miniconda3/envs/onlyft/lib/python3.9/site-packages/torch/distributed/rendezvous.py", line 176, in _create_c10d_store
return TCPStore(
RuntimeError: The server socket has failed to listen on any local network address. The server socket has failed to bind to [::]:29500 (errno: 98 - Address already in use). The server socket has failed to bind to 0.0.0.0:29500 (errno: 98 - Address already in use).
[2024-09-05 09:40:50,578] [INFO] [launch.py:315:sigkill_handler] Killing subprocess 777243
[2024-09-05 09:40:50,579] [INFO] [launch.py:315:sigkill_handler] Killing subprocess 777244
[2024-09-05 09:40:50,632] [INFO] [launch.py:315:sigkill_handler] Killing subprocess 777245
[2024-09-05 09:40:50,685] [INFO] [launch.py:315:sigkill_handler] Killing subprocess 777246