I tried to run Pretrain.py on the COCO dataset, but it failed with the error below.
Can anybody help me solve this?
utils.init_distributed_mode(args) utils.init_distributed_mode(args) [30/763]
File "/Volume/ALBEF/utils.py", line 259, in init_distributed_mode File "/Volume/ALBEF/utils.py", line 259, in init_distributed_mode
utils.init_distributed_mode(args) File "/Volume/ALBEF/utils.py", line 259, in init_distributed_mode
main(args, config) File "Pretrain.py", line 87, in main
torch.distributed.barrier()torch.distributed.barrier()
File "/usr/local/lib/python3.8/dist-packages/torch/distributed/distributed_c10d.py", line 2776, in barrier File "/usr/local/lib/python3.8/dist-packages/torch/distributed/distributed_c10d.py", line 2776, in barrier
utils.init_distributed_mode(args) torch.distributed.barrier()
File "/Volume/ALBEF/utils.py", line 259, in init_distributed_mode File "/usr/local/lib/python3.8/dist-packages/torch/distributed/distributed_c10d.py", line 2776, in barrier
torch.distributed.barrier() File "/usr/local/lib/python3.8/dist-packages/torch/distributed/distributed_c10d.py", line 2776, in barrier
work = default_pg.barrier(opts=opts) RuntimeError: NCCL error in: ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1169, unhandled system error, NCCL version 21.0.3
ncclSystemError: System call (socket, malloc, munmap, etc) failed. work = default_pg.barrier(opts=opts)
RuntimeError: NCCL error in: ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1169, unhandled system error, NCCL version 21.0.3 ncclSystemError: System call (socket, malloc, munmap, etc) failed.work = default_pg.barrier(opts=opts)
RuntimeError: NCCL error in: ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1169, unhandled system error, NCCL version 21.0.3
ncclSystemError: System call (socket, malloc, munmap, etc) failed. work = default_pg.barrier(opts=opts)
RuntimeError: NCCL error in: ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1169, unhandled system error, NCCL version 21.0.3
ncclSystemError: System call (socket, malloc, munmap, etc) failed.
ERROR:torch.distributed.elastic.multiprocessing.api:failed (exitcode: 1) local_rank: 0 (pid: 8304) of binary: /usr/local/bin/python
Traceback (most recent call last):
File "/usr/lib/python3.8/runpy.py", line 194, in _run_module_as_main
return _run_code(code, main_globals, None,
File "/usr/lib/python3.8/runpy.py", line 87, in _run_code
exec(code, run_globals)
File "/usr/local/lib/python3.8/dist-packages/torch/distributed/launch.py", line 193, in <module>
main()
File "/usr/local/lib/python3.8/dist-packages/torch/distributed/launch.py", line 189, in main
launch(args)
File "/usr/local/lib/python3.8/dist-packages/torch/distributed/launch.py", line 174, in launch
run(args)
File "/usr/local/lib/python3.8/dist-packages/torch/distributed/run.py", line 715, in run
elastic_launch(
File "/usr/local/lib/python3.8/dist-packages/torch/distributed/launcher/api.py", line 131, in __call__
return launch_agent(self._config, self._entrypoint, list(args))
File "/usr/local/lib/python3.8/dist-packages/torch/distributed/launcher/api.py", line 245, in launch_agent
raise ChildFailedError(
torch.distributed.elastic.multiprocessing.errors.ChildFailedError:
Hi.
I tried to run Pretrain.py on the COCO dataset, but it failed with the error below.
Can anybody help me solve this?
utils.init_distributed_mode(args) utils.init_distributed_mode(args) [30/763]
main()
File "/usr/local/lib/python3.8/dist-packages/torch/distributed/launch.py", line 189, in main
launch(args)
File "/usr/local/lib/python3.8/dist-packages/torch/distributed/launch.py", line 174, in launch
run(args)
File "/usr/local/lib/python3.8/dist-packages/torch/distributed/run.py", line 715, in run
elastic_launch(
File "/usr/local/lib/python3.8/dist-packages/torch/distributed/launcher/api.py", line 131, in __call__
return launch_agent(self._config, self._entrypoint, list(args))
File "/usr/local/lib/python3.8/dist-packages/torch/distributed/launcher/api.py", line 245, in launch_agent
raise ChildFailedError(
torch.distributed.elastic.multiprocessing.errors.ChildFailedError:
File "/Volume/ALBEF/utils.py", line 259, in init_distributed_mode File "/Volume/ALBEF/utils.py", line 259, in init_distributed_mode
utils.init_distributed_mode(args) File "/Volume/ALBEF/utils.py", line 259, in init_distributed_mode
main(args, config) File "Pretrain.py", line 87, in main
torch.distributed.barrier()torch.distributed.barrier()
File "/usr/local/lib/python3.8/dist-packages/torch/distributed/distributed_c10d.py", line 2776, in barrier File "/usr/local/lib/python3.8/dist-packages/torch/distributed/distributed_c10d.py", line 2776, in barrier
utils.init_distributed_mode(args) torch.distributed.barrier()
File "/Volume/ALBEF/utils.py", line 259, in init_distributed_mode File "/usr/local/lib/python3.8/dist-packages/torch/distributed/distributed_c10d.py", line 2776, in barrier
torch.distributed.barrier() File "/usr/local/lib/python3.8/dist-packages/torch/distributed/distributed_c10d.py", line 2776, in barrier
work = default_pg.barrier(opts=opts) RuntimeError: NCCL error in: ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1169, unhandled system error, NCCL version 21.0.3
ncclSystemError: System call (socket, malloc, munmap, etc) failed. work = default_pg.barrier(opts=opts)
RuntimeError: NCCL error in: ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1169, unhandled system error, NCCL version 21.0.3 ncclSystemError: System call (socket, malloc, munmap, etc) failed.work = default_pg.barrier(opts=opts)
RuntimeError: NCCL error in: ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1169, unhandled system error, NCCL version 21.0.3
ncclSystemError: System call (socket, malloc, munmap, etc) failed. work = default_pg.barrier(opts=opts)
RuntimeError: NCCL error in: ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1169, unhandled system error, NCCL version 21.0.3
ncclSystemError: System call (socket, malloc, munmap, etc) failed.
ERROR:torch.distributed.elastic.multiprocessing.api:failed (exitcode: 1) local_rank: 0 (pid: 8304) of binary: /usr/local/bin/python
Traceback (most recent call last):
File "/usr/lib/python3.8/runpy.py", line 194, in _run_module_as_main
return _run_code(code, main_globals, None,
File "/usr/lib/python3.8/runpy.py", line 87, in _run_code
exec(code, run_globals)
File "/usr/local/lib/python3.8/dist-packages/torch/distributed/launch.py", line 193, in <module>