When running bash tools/my_dist_train_detr_ssod.sh dino_detr_ssod 1 10 4, I ran into what I suspect may be a bug. The specific error output is as follows:
2023-09-14 10:26:17,052 - mmdet.detr_ssod - INFO - Checkpoints will be saved to /root/Semi-DETR-main/work_dirs/detr_ssod_dino_detr_r50_tiny_coco_0.1_240k-0914/10/1 by HardDiskBackend.
Traceback (most recent call last):
File "tools/train_detr_ssod.py", line 201, in
main()
File "tools/train_detr_ssod.py", line 189, in main
train_detector(
File "/root/Semi-DETR-main/detr_ssod/apis/train.py", line 224, in train_detector
runner.run(data_loaders, cfg.workflow)
File "/root/miniconda3/lib/python3.8/site-packages/mmcv/runner/iter_based_runner.py", line 117, in run
iter_loaders = [IterLoader(x) for x in data_loaders]
File "/root/miniconda3/lib/python3.8/site-packages/mmcv/runner/iter_based_runner.py", line 117, in
iter_loaders = [IterLoader(x) for x in data_loaders]
File "/root/miniconda3/lib/python3.8/site-packages/mmcv/runner/iter_based_runner.py", line 23, in init
self.iter_loader = iter(self._dataloader)
File "/root/miniconda3/lib/python3.8/site-packages/torch/utils/data/dataloader.py", line 359, in iter
return self._get_iterator()
File "/root/miniconda3/lib/python3.8/site-packages/torch/utils/data/dataloader.py", line 305, in _get_iterator
return _MultiProcessingDataLoaderIter(self)
File "/root/miniconda3/lib/python3.8/site-packages/torch/utils/data/dataloader.py", line 944, in init
self._reset(loader, first_iter=True)
File "/root/miniconda3/lib/python3.8/site-packages/torch/utils/data/dataloader.py", line 975, in _reset
self._try_put_index()
File "/root/miniconda3/lib/python3.8/site-packages/torch/utils/data/dataloader.py", line 1209, in _try_put_index
index = self._next_index()
File "/root/miniconda3/lib/python3.8/site-packages/torch/utils/data/dataloader.py", line 512, in _next_index
return next(self._sampler_iter) # may raise StopIteration
File "/root/miniconda3/lib/python3.8/site-packages/torch/utils/data/sampler.py", line 226, in iter
for idx in self.sampler:
File "/root/Semi-DETR-main/detr_ssod/datasets/samplers/semi_sampler.py", line 187, in iter
assert len(indices) == len(self)
AssertionError
ERROR:torch.distributed.elastic.multiprocessing.api:failed (exitcode: 1) local_rank: 3 (pid: 58789) of binary: /root/miniconda3/bin/python
ERROR:torch.distributed.elastic.agent.server.local_elastic_agent:[default] Worker group failed
INFO:torch.distributed.elastic.agent.server.api:[default] Worker group FAILED. 3/3 attempts left; will restart worker group
INFO:torch.distributed.elastic.agent.server.api:[default] Stopping worker group
INFO:torch.distributed.elastic.agent.server.api:[default] Rendezvous'ing worker group
INFO:torch.distributed.elastic.agent.server.api:[default] Rendezvous complete for workers. Result:
restart_count=1
master_addr=127.0.0.1
master_port=29625
group_rank=0
group_world_size=1
local_ranks=[0, 1, 2, 3]
role_ranks=[0, 1, 2, 3]
global_ranks=[0, 1, 2, 3]
role_world_sizes=[4, 4, 4, 4]
global_world_sizes=[4, 4, 4, 4]
INFO:torch.distributed.elastic.agent.server.api:[default] Starting worker group
INFO:torch.distributed.elastic.multiprocessing:Setting worker0 reply file to: /tmp/torchelastic_esa_wjr0/none_627jvqv3/attempt_1/0/error.json
INFO:torch.distributed.elastic.multiprocessing:Setting worker1 reply file to: /tmp/torchelastic_esa_wjr0/none_627jvqv3/attempt_1/1/error.json
INFO:torch.distributed.elastic.multiprocessing:Setting worker2 reply file to: /tmp/torchelastic_esa_wjr0/none_627jvqv3/attempt_1/2/error.json
INFO:torch.distributed.elastic.multiprocessing:Setting worker3 reply file to: /tmp/torchelastic_esa_wjr0/none_627jvqv3/attempt_1/3/error.json
/root/miniconda3/lib/python3.8/multiprocessing/resource_tracker.py:216: UserWarning: resource_tracker: There appear to be 32 leaked semaphore objects to clean up at shutdown
warnings.warn('resource_tracker: There appear to be %d '
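For context on where this fails: the assert at semi_sampler.py line 187 checks that the index list the sampler builds in __iter__ has exactly the length it advertises via len(self). Below is a minimal, hypothetical sketch (not the actual Semi-DETR sampler code, which I have not stepped through) of how a sampler that interleaves labeled and unlabeled indices at a fixed per-batch ratio can violate that invariant when the requested epoch length is not a multiple of the ratio sum:

```python
# Hypothetical sketch of the invariant behind `assert len(indices) == len(self)`.
# This is NOT the actual semi_sampler implementation; it only illustrates how a
# sampler that interleaves labeled and unlabeled ids at a fixed per-batch ratio
# can produce an index list whose length disagrees with its advertised length.

def build_epoch_indices(labeled_ids, unlabeled_ids, ratio, epoch_len):
    """Interleave ids so every batch holds ratio[0] labeled + ratio[1] unlabeled."""
    indices = []
    li = ui = 0
    while len(indices) < epoch_len:
        for _ in range(ratio[0]):  # labeled slots, cycling through the pool
            indices.append(labeled_ids[li % len(labeled_ids)])
            li += 1
        for _ in range(ratio[1]):  # unlabeled slots
            indices.append(unlabeled_ids[ui % len(unlabeled_ids)])
            ui += 1
    return indices

# If epoch_len is not a multiple of sum(ratio), the final pass overshoots:
indices = build_epoch_indices(list(range(10)), list(range(100)),
                              ratio=(1, 4), epoch_len=12)
print(len(indices))  # 15 != 12 -- the same shape of mismatch the assert guards
```

If that is the failure mode here, whether the lengths line up would depend on the labeled/unlabeled split sizes, samples_per_gpu, and the world size, which might explain why the assert fires with this particular 4-GPU launch. This is only a guess on my part, though.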