Hello, author. The following errors occurred while training on my COCO-format dataset. What are the causes, and how can I solve them? Thank you very much.
2022-06-06 22:11:33,374 - mmseg - INFO - Iter [50/160000] lr: 9.997e-03, eta: 11:50:52, time: 0.267, data_time: 0.007, memory: 828, decode.loss_seg: 0.1570, decode.acc_seg: 94.3870, loss: 0.1570
2022-06-06 22:11:44,482 - mmseg - INFO - Iter [100/160000] lr: 9.994e-03, eta: 10:51:19, time: 0.222, data_time: 0.003, memory: 828, decode.loss_seg: 0.1489, decode.acc_seg: 94.5770, loss: 0.1489
2022-06-06 22:11:55,525 - mmseg - INFO - Iter [150/160000] lr: 9.992e-03, eta: 10:30:11, time: 0.221, data_time: 0.003, memory: 828, decode.loss_seg: 0.1607, decode.acc_seg: 94.2994, loss: 0.1607
2022-06-06 22:11:57,747 - mmseg - INFO - Saving checkpoint at 160 iterations
[ ] 0/94768, elapsed: 0s, ETA:Traceback (most recent call last):
File "tools/train.py", line 161, in <module>
main()
File "tools/train.py", line 157, in main
meta=meta)
File "/data/home/scv4589/run/BPR-main/mmseg/apis/train.py", line 116, in train_segmentor
runner.run(data_loaders, cfg.workflow)
File "/data/home/scv4589/.conda/envs/bpr/lib/python3.7/site-packages/mmcv/runner/iter_based_runner.py", line 130, in run
iter_runner(iter_loaders[i], **kwargs)
File "/data/home/scv4589/.conda/envs/bpr/lib/python3.7/site-packages/mmcv/runner/iter_based_runner.py", line 66, in train
self.call_hook('after_train_iter')
File "/data/home/scv4589/.conda/envs/bpr/lib/python3.7/site-packages/mmcv/runner/base_runner.py", line 307, in call_hook
getattr(hook, fn_name)(self)
File "/data/home/scv4589/run/BPR-main/mmseg/core/evaluation/eval_hooks.py", line 89, in after_train_iter
gpu_collect=self.gpu_collect)
File "/data/home/scv4589/run/BPR-main/mmseg/apis/test.py", line 99, in multi_gpu_test
result = model(return_loss=False, rescale=True, **data)
File "/data/home/scv4589/.conda/envs/bpr/lib/python3.7/site-packages/torch/nn/modules/module.py", line 727, in _call_impl
result = self.forward(*input, **kwargs)
File "/data/home/scv4589/.conda/envs/bpr/lib/python3.7/site-packages/torch/nn/parallel/distributed.py", line 606, in forward
if self.reducer._rebuild_buckets():
RuntimeError: replicas[0].size() == rebuilt_param_indices.size() INTERNAL ASSERT FAILED at "/pytorch/torch/csrc/distributed/c10d/reducer.cpp":1326, please report a bug to PyTorch. rebuilt parameter indices size is not same as original model parameters size. 438 versus 70080
Traceback (most recent call last):
File "tools/train.py", line 161, in <module>
main()
File "tools/train.py", line 157, in main
meta=meta)
File "/data/home/scv4589/run/BPR-main/mmseg/apis/train.py", line 116, in train_segmentor
runner.run(data_loaders, cfg.workflow)
File "/data/home/scv4589/.conda/envs/bpr/lib/python3.7/site-packages/mmcv/runner/iter_based_runner.py", line 130, in run
iter_runner(iter_loaders[i], **kwargs)
File "/data/home/scv4589/.conda/envs/bpr/lib/python3.7/site-packages/mmcv/runner/iter_based_runner.py", line 66, in train
self.call_hook('after_train_iter')
File "/data/home/scv4589/.conda/envs/bpr/lib/python3.7/site-packages/mmcv/runner/base_runner.py", line 307, in call_hook
getattr(hook, fn_name)(self)
File "/data/home/scv4589/run/BPR-main/mmseg/core/evaluation/eval_hooks.py", line 89, in after_train_iter
gpu_collect=self.gpu_collect)
File "/data/home/scv4589/run/BPR-main/mmseg/apis/test.py", line 99, in multi_gpu_test
result = model(return_loss=False, rescale=True, **data)
File "/data/home/scv4589/.conda/envs/bpr/lib/python3.7/site-packages/torch/nn/modules/module.py", line 727, in _call_impl
result = self.forward(*input, **kwargs)
File "/data/home/scv4589/.conda/envs/bpr/lib/python3.7/site-packages/torch/nn/parallel/distributed.py", line 606, in forward
if self.reducer._rebuild_buckets():
RuntimeError: replicas[0].size() == rebuilt_param_indices.size() INTERNAL ASSERT FAILED at "/pytorch/torch/csrc/distributed/c10d/reducer.cpp":1326, please report a bug to PyTorch. rebuilt parameter indices size is not same as original model parameters size. 438 versus 70080
Traceback (most recent call last):
File "tools/train.py", line 161, in <module>
main()
File "tools/train.py", line 157, in main
meta=meta)
File "/data/home/scv4589/run/BPR-main/mmseg/apis/train.py", line 116, in train_segmentor
runner.run(data_loaders, cfg.workflow)
File "/data/home/scv4589/.conda/envs/bpr/lib/python3.7/site-packages/mmcv/runner/iter_based_runner.py", line 130, in run
iter_runner(iter_loaders[i], **kwargs)
File "/data/home/scv4589/.conda/envs/bpr/lib/python3.7/site-packages/mmcv/runner/iter_based_runner.py", line 66, in train
self.call_hook('after_train_iter')
File "/data/home/scv4589/.conda/envs/bpr/lib/python3.7/site-packages/mmcv/runner/base_runner.py", line 307, in call_hook
getattr(hook, fn_name)(self)
File "/data/home/scv4589/run/BPR-main/mmseg/core/evaluation/eval_hooks.py", line 89, in after_train_iter
gpu_collect=self.gpu_collect)
File "/data/home/scv4589/run/BPR-main/mmseg/apis/test.py", line 99, in multi_gpu_test
result = model(return_loss=False, rescale=True, **data)
File "/data/home/scv4589/.conda/envs/bpr/lib/python3.7/site-packages/torch/nn/modules/module.py", line 727, in _call_impl
result = self.forward(*input, **kwargs)
File "/data/home/scv4589/.conda/envs/bpr/lib/python3.7/site-packages/torch/nn/parallel/distributed.py", line 606, in forward
if self.reducer._rebuild_buckets():
RuntimeError: replicas[0].size() == rebuilt_param_indices.size() INTERNAL ASSERT FAILED at "/pytorch/torch/csrc/distributed/c10d/reducer.cpp":1326, please report a bug to PyTorch. rebuilt parameter indices size is not same as original model parameters size. 438 versus 70080
Traceback (most recent call last):
File "tools/train.py", line 161, in <module>
main()
File "tools/train.py", line 157, in main
meta=meta)
File "/data/home/scv4589/run/BPR-main/mmseg/apis/train.py", line 116, in train_segmentor
runner.run(data_loaders, cfg.workflow)
File "/data/home/scv4589/.conda/envs/bpr/lib/python3.7/site-packages/mmcv/runner/iter_based_runner.py", line 130, in run
iter_runner(iter_loaders[i], **kwargs)
File "/data/home/scv4589/.conda/envs/bpr/lib/python3.7/site-packages/mmcv/runner/iter_based_runner.py", line 66, in train
self.call_hook('after_train_iter')
File "/data/home/scv4589/.conda/envs/bpr/lib/python3.7/site-packages/mmcv/runner/base_runner.py", line 307, in call_hook
getattr(hook, fn_name)(self)
File "/data/home/scv4589/run/BPR-main/mmseg/core/evaluation/eval_hooks.py", line 89, in after_train_iter
gpu_collect=self.gpu_collect)
File "/data/home/scv4589/run/BPR-main/mmseg/apis/test.py", line 99, in multi_gpu_test
result = model(return_loss=False, rescale=True, **data)
File "/data/home/scv4589/.conda/envs/bpr/lib/python3.7/site-packages/torch/nn/modules/module.py", line 727, in _call_impl
result = self.forward(*input, **kwargs)
File "/data/home/scv4589/.conda/envs/bpr/lib/python3.7/site-packages/torch/nn/parallel/distributed.py", line 606, in forward
if self.reducer._rebuild_buckets():
RuntimeError: replicas[0].size() == rebuilt_param_indices.size() INTERNAL ASSERT FAILED at "/pytorch/torch/csrc/distributed/c10d/reducer.cpp":1326, please report a bug to PyTorch. rebuilt parameter indices size is not same as original model parameters size. 438 versus 70080
Traceback (most recent call last):
File "/data/home/scv4589/.conda/envs/bpr/lib/python3.7/runpy.py", line 193, in _run_module_as_main
"__main__", mod_spec)
File "/data/home/scv4589/.conda/envs/bpr/lib/python3.7/runpy.py", line 85, in _run_code
exec(code, run_globals)
File "/data/home/scv4589/.conda/envs/bpr/lib/python3.7/site-packages/torch/distributed/launch.py", line 260, in <module>
main()
File "/data/home/scv4589/.conda/envs/bpr/lib/python3.7/site-packages/torch/distributed/launch.py", line 256, in main
cmd=cmd)
subprocess.CalledProcessError: Command '['/data/home/scv4589/.conda/envs/bpr/bin/python', '-u', 'tools/train.py', '--local_rank=3', 'configs/bpr/hrnet18s_128.py', '--launcher', 'pytorch']' returned non-zero exit status 1.
Hello, author. The following errors occurred while training on my COCO-format dataset. What are the causes, and how can I solve them? Thank you very much. 2022-06-06 22:11:33,374 - mmseg - INFO - Iter [50/160000] lr: 9.997e-03, eta: 11:50:52, time: 0.267, data_time: 0.007, memory: 828, decode.loss_seg: 0.1570, decode.acc_seg: 94.3870, loss: 0.1570 2022-06-06 22:11:44,482 - mmseg - INFO - Iter [100/160000] lr: 9.994e-03, eta: 10:51:19, time: 0.222, data_time: 0.003, memory: 828, decode.loss_seg: 0.1489, decode.acc_seg: 94.5770, loss: 0.1489 2022-06-06 22:11:55,525 - mmseg - INFO - Iter [150/160000] lr: 9.992e-03, eta: 10:30:11, time: 0.221, data_time: 0.003, memory: 828, decode.loss_seg: 0.1607, decode.acc_seg: 94.2994, loss: 0.1607 2022-06-06 22:11:57,747 - mmseg - INFO - Saving checkpoint at 160 iterations [ ] 0/94768, elapsed: 0s, ETA:Traceback (most recent call last): File "tools/train.py", line 161, in <module>
main()
File "tools/train.py", line 157, in main
meta=meta)
File "/data/home/scv4589/run/BPR-main/mmseg/apis/train.py", line 116, in train_segmentor
runner.run(data_loaders, cfg.workflow)
File "/data/home/scv4589/.conda/envs/bpr/lib/python3.7/site-packages/mmcv/runner/iter_based_runner.py", line 130, in run
iter_runner(iter_loaders[i], **kwargs)
File "/data/home/scv4589/.conda/envs/bpr/lib/python3.7/site-packages/mmcv/runner/iter_based_runner.py", line 66, in train
self.call_hook('after_train_iter')
File "/data/home/scv4589/.conda/envs/bpr/lib/python3.7/site-packages/mmcv/runner/base_runner.py", line 307, in call_hook
getattr(hook, fn_name)(self)
File "/data/home/scv4589/run/BPR-main/mmseg/core/evaluation/eval_hooks.py", line 89, in after_train_iter
gpu_collect=self.gpu_collect)
File "/data/home/scv4589/run/BPR-main/mmseg/apis/test.py", line 99, in multi_gpu_test
result = model(return_loss=False, rescale=True, **data)
File "/data/home/scv4589/.conda/envs/bpr/lib/python3.7/site-packages/torch/nn/modules/module.py", line 727, in _call_impl
result = self.forward(*input, **kwargs)
File "/data/home/scv4589/.conda/envs/bpr/lib/python3.7/site-packages/torch/nn/parallel/distributed.py", line 606, in forward
if self.reducer._rebuild_buckets():
RuntimeError: replicas[0].size() == rebuilt_param_indices.size() INTERNAL ASSERT FAILED at "/pytorch/torch/csrc/distributed/c10d/reducer.cpp":1326, please report a bug to PyTorch. rebuilt parameter indices size is not same as original model parameters size. 438 versus 70080
Traceback (most recent call last):
File "tools/train.py", line 161, in <module>
main()
File "tools/train.py", line 157, in main
meta=meta)
File "/data/home/scv4589/run/BPR-main/mmseg/apis/train.py", line 116, in train_segmentor
runner.run(data_loaders, cfg.workflow)
File "/data/home/scv4589/.conda/envs/bpr/lib/python3.7/site-packages/mmcv/runner/iter_based_runner.py", line 130, in run
iter_runner(iter_loaders[i], **kwargs)
File "/data/home/scv4589/.conda/envs/bpr/lib/python3.7/site-packages/mmcv/runner/iter_based_runner.py", line 66, in train
self.call_hook('after_train_iter')
File "/data/home/scv4589/.conda/envs/bpr/lib/python3.7/site-packages/mmcv/runner/base_runner.py", line 307, in call_hook
getattr(hook, fn_name)(self)
File "/data/home/scv4589/run/BPR-main/mmseg/core/evaluation/eval_hooks.py", line 89, in after_train_iter
gpu_collect=self.gpu_collect)
File "/data/home/scv4589/run/BPR-main/mmseg/apis/test.py", line 99, in multi_gpu_test
result = model(return_loss=False, rescale=True, **data)
File "/data/home/scv4589/.conda/envs/bpr/lib/python3.7/site-packages/torch/nn/modules/module.py", line 727, in _call_impl
result = self.forward(*input, **kwargs)
File "/data/home/scv4589/.conda/envs/bpr/lib/python3.7/site-packages/torch/nn/parallel/distributed.py", line 606, in forward
if self.reducer._rebuild_buckets():
RuntimeError: replicas[0].size() == rebuilt_param_indices.size() INTERNAL ASSERT FAILED at "/pytorch/torch/csrc/distributed/c10d/reducer.cpp":1326, please report a bug to PyTorch. rebuilt parameter indices size is not same as original model parameters size. 438 versus 70080
Traceback (most recent call last):
File "tools/train.py", line 161, in <module>
main()
File "tools/train.py", line 157, in main
meta=meta)
File "/data/home/scv4589/run/BPR-main/mmseg/apis/train.py", line 116, in train_segmentor
runner.run(data_loaders, cfg.workflow)
File "/data/home/scv4589/.conda/envs/bpr/lib/python3.7/site-packages/mmcv/runner/iter_based_runner.py", line 130, in run
iter_runner(iter_loaders[i], **kwargs)
File "/data/home/scv4589/.conda/envs/bpr/lib/python3.7/site-packages/mmcv/runner/iter_based_runner.py", line 66, in train
self.call_hook('after_train_iter')
File "/data/home/scv4589/.conda/envs/bpr/lib/python3.7/site-packages/mmcv/runner/base_runner.py", line 307, in call_hook
getattr(hook, fn_name)(self)
File "/data/home/scv4589/run/BPR-main/mmseg/core/evaluation/eval_hooks.py", line 89, in after_train_iter
gpu_collect=self.gpu_collect)
File "/data/home/scv4589/run/BPR-main/mmseg/apis/test.py", line 99, in multi_gpu_test
result = model(return_loss=False, rescale=True, **data)
File "/data/home/scv4589/.conda/envs/bpr/lib/python3.7/site-packages/torch/nn/modules/module.py", line 727, in _call_impl
result = self.forward(*input, **kwargs)
File "/data/home/scv4589/.conda/envs/bpr/lib/python3.7/site-packages/torch/nn/parallel/distributed.py", line 606, in forward
if self.reducer._rebuild_buckets():
RuntimeError: replicas[0].size() == rebuilt_param_indices.size() INTERNAL ASSERT FAILED at "/pytorch/torch/csrc/distributed/c10d/reducer.cpp":1326, please report a bug to PyTorch. rebuilt parameter indices size is not same as original model parameters size. 438 versus 70080
Traceback (most recent call last):
File "tools/train.py", line 161, in <module>
main()
File "tools/train.py", line 157, in main
meta=meta)
File "/data/home/scv4589/run/BPR-main/mmseg/apis/train.py", line 116, in train_segmentor
runner.run(data_loaders, cfg.workflow)
File "/data/home/scv4589/.conda/envs/bpr/lib/python3.7/site-packages/mmcv/runner/iter_based_runner.py", line 130, in run
iter_runner(iter_loaders[i], **kwargs)
File "/data/home/scv4589/.conda/envs/bpr/lib/python3.7/site-packages/mmcv/runner/iter_based_runner.py", line 66, in train
self.call_hook('after_train_iter')
File "/data/home/scv4589/.conda/envs/bpr/lib/python3.7/site-packages/mmcv/runner/base_runner.py", line 307, in call_hook
getattr(hook, fn_name)(self)
File "/data/home/scv4589/run/BPR-main/mmseg/core/evaluation/eval_hooks.py", line 89, in after_train_iter
gpu_collect=self.gpu_collect)
File "/data/home/scv4589/run/BPR-main/mmseg/apis/test.py", line 99, in multi_gpu_test
result = model(return_loss=False, rescale=True, **data)
File "/data/home/scv4589/.conda/envs/bpr/lib/python3.7/site-packages/torch/nn/modules/module.py", line 727, in _call_impl
result = self.forward(*input, **kwargs)
File "/data/home/scv4589/.conda/envs/bpr/lib/python3.7/site-packages/torch/nn/parallel/distributed.py", line 606, in forward
if self.reducer._rebuild_buckets():
RuntimeError: replicas[0].size() == rebuilt_param_indices.size() INTERNAL ASSERT FAILED at "/pytorch/torch/csrc/distributed/c10d/reducer.cpp":1326, please report a bug to PyTorch. rebuilt parameter indices size is not same as original model parameters size. 438 versus 70080
Traceback (most recent call last):
File "/data/home/scv4589/.conda/envs/bpr/lib/python3.7/runpy.py", line 193, in _run_module_as_main
"__main__", mod_spec)
File "/data/home/scv4589/.conda/envs/bpr/lib/python3.7/runpy.py", line 85, in _run_code
exec(code, run_globals)
File "/data/home/scv4589/.conda/envs/bpr/lib/python3.7/site-packages/torch/distributed/launch.py", line 260, in <module>
main()
File "/data/home/scv4589/.conda/envs/bpr/lib/python3.7/site-packages/torch/distributed/launch.py", line 256, in main
cmd=cmd)
subprocess.CalledProcessError: Command '['/data/home/scv4589/.conda/envs/bpr/bin/python', '-u', 'tools/train.py', '--local_rank=3', 'configs/bpr/hrnet18s_128.py', '--launcher', 'pytorch']' returned non-zero exit status 1.