Error while creating shared memory segment /dev/shm/nccl-CsYXMW (size 9637888)
Traceback (most recent call last):
File "/workspace/VisualGLM-6B-main/finetune_visualglm.py", line 194, in <module>
training_main(args, model_cls=model, forward_step_function=forward_step, create_dataset_function=create_dataset_function, collate_fn=data_collator)
File "/home/li/.conda/envs/vglm2/lib/python3.10/site-packages/sat/training/deepspeed_training.py", line 110, in training_main
model, optimizer = setup_model_untrainable_params_and_optimizer(args, model)
File "/home/li/.conda/envs/vglm2/lib/python3.10/site-packages/sat/training/deepspeed_training.py", line 218, in setup_model_untrainable_params_and_optimizer
model, optimizer, _, _ = deepspeed.initialize(
File "/home/li/.conda/envs/vglm2/lib/python3.10/site-packages/deepspeed/__init__.py", line 181, in initialize
engine = DeepSpeedEngine(args=args,
File "/home/li/.conda/envs/vglm2/lib/python3.10/site-packages/deepspeed/runtime/engine.py", line 262, in __init__
self._configure_distributed_model(model)
File "/home/li/.conda/envs/vglm2/lib/python3.10/site-packages/deepspeed/runtime/engine.py", line 1157, in _configure_distributed_model
self._broadcast_model()
File "/home/li/.conda/envs/vglm2/lib/python3.10/site-packages/deepspeed/runtime/engine.py", line 1077, in _broadcast_model
dist.broadcast(p.data, groups._get_broadcast_src_rank(), group=self.seq_data_parallel_group)
File "/home/li/.conda/envs/vglm2/lib/python3.10/site-packages/deepspeed/comm/comm.py", line 117, in log_wrapper
return func(*args, **kwargs)
File "/home/li/.conda/envs/vglm2/lib/python3.10/site-packages/deepspeed/comm/comm.py", line 224, in broadcast
return cdb.broadcast(tensor=tensor, src=src, group=group, async_op=async_op)
File "/home/li/.conda/envs/vglm2/lib/python3.10/site-packages/torch/_dynamo/eval_frame.py", line 489, in _fn
return fn(*args, **kwargs)
File "/home/li/.conda/envs/vglm2/lib/python3.10/site-packages/deepspeed/comm/torch.py", line 199, in broadcast
return torch.distributed.broadcast(tensor=tensor, src=src, group=group, async_op=async_op)
File "/home/li/.conda/envs/vglm2/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 72, in wrapper
return func(*args, **kwargs)
File "/home/li/.conda/envs/vglm2/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1914, in broadcast
work = group.broadcast([tensor], opts)
torch.distributed.DistBackendError: NCCL error in: /opt/conda/conda-bld/pytorch_1708025847130/work/torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1691, unhandled system error (run with NCCL_DEBUG=INFO for details), NCCL version 2.19.3
ncclSystemError: System call (e.g. socket, malloc) or external library call failed or device error.
Error while creating shared memory segment /dev/shm/nccl-CsYXMW (size 9637888) Traceback (most recent call last): File "/workspace/VisualGLM-6B-main/finetune_visualglm.py", line 194, in <module>
training_main(args, model_cls=model, forward_step_function=forward_step, create_dataset_function=create_dataset_function, collate_fn=data_collator)
File "/home/li/.conda/envs/vglm2/lib/python3.10/site-packages/sat/training/deepspeed_training.py", line 110, in training_main
model, optimizer = setup_model_untrainable_params_and_optimizer(args, model)
File "/home/li/.conda/envs/vglm2/lib/python3.10/site-packages/sat/training/deepspeed_training.py", line 218, in setup_model_untrainable_params_and_optimizer
model, optimizer, _, _ = deepspeed.initialize(
File "/home/li/.conda/envs/vglm2/lib/python3.10/site-packages/deepspeed/__init__.py", line 181, in initialize
engine = DeepSpeedEngine(args=args,
File "/home/li/.conda/envs/vglm2/lib/python3.10/site-packages/deepspeed/runtime/engine.py", line 262, in __init__
self._configure_distributed_model(model)
File "/home/li/.conda/envs/vglm2/lib/python3.10/site-packages/deepspeed/runtime/engine.py", line 1157, in _configure_distributed_model
self._broadcast_model()
File "/home/li/.conda/envs/vglm2/lib/python3.10/site-packages/deepspeed/runtime/engine.py", line 1077, in _broadcast_model
dist.broadcast(p.data, groups._get_broadcast_src_rank(), group=self.seq_data_parallel_group)
File "/home/li/.conda/envs/vglm2/lib/python3.10/site-packages/deepspeed/comm/comm.py", line 117, in log_wrapper
return func(*args, **kwargs)
File "/home/li/.conda/envs/vglm2/lib/python3.10/site-packages/deepspeed/comm/comm.py", line 224, in broadcast
return cdb.broadcast(tensor=tensor, src=src, group=group, async_op=async_op)
File "/home/li/.conda/envs/vglm2/lib/python3.10/site-packages/torch/_dynamo/eval_frame.py", line 489, in _fn
return fn(*args, **kwargs)
File "/home/li/.conda/envs/vglm2/lib/python3.10/site-packages/deepspeed/comm/torch.py", line 199, in broadcast
return torch.distributed.broadcast(tensor=tensor, src=src, group=group, async_op=async_op)
File "/home/li/.conda/envs/vglm2/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 72, in wrapper
return func(*args, **kwargs)
File "/home/li/.conda/envs/vglm2/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1914, in broadcast
work = group.broadcast([tensor], opts)
torch.distributed.DistBackendError: NCCL error in: /opt/conda/conda-bld/pytorch_1708025847130/work/torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1691, unhandled system error (run with NCCL_DEBUG=INFO for details), NCCL version 2.19.3
ncclSystemError: System call (e.g. socket, malloc) or external library call failed or device error.