Hi, I got the following error when training with multiple GPUs. How should I fix it?
Traceback (most recent call last):
File "train_rcnn.py", line 270, in <module>
trainer.train(
File "/root/autodl-tmp/EPNet/tools/../tools/train_utils/train_utils.py", line 197, in train
loss, tb_dict, disp_dict = self._train_it(batch)
File "/root/autodl-tmp/EPNet/tools/../tools/train_utils/train_utils.py", line 130, in _train_it
loss, tb_dict, disp_dict = self.model_fn(self.model, batch)
File "/root/autodl-tmp/EPNet/tools/../lib/net/train_functions.py", line 46, in model_fn
ret_dict = model(input_data)
File "/root/miniconda3/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1102, in _call_impl
return forward_call(*input, **kwargs)
File "/root/miniconda3/lib/python3.8/site-packages/torch/nn/parallel/data_parallel.py", line 168, in forward
outputs = self.parallel_apply(replicas, inputs, kwargs)
File "/root/miniconda3/lib/python3.8/site-packages/torch/nn/parallel/data_parallel.py", line 178, in parallel_apply
return parallel_apply(replicas, inputs, kwargs, self.device_ids[:len(replicas)])
File "/root/miniconda3/lib/python3.8/site-packages/torch/nn/parallel/parallel_apply.py", line 86, in parallel_apply
output.reraise()
File "/root/miniconda3/lib/python3.8/site-packages/torch/_utils.py", line 434, in reraise
raise exception
AssertionError: Caught AssertionError in replica 1 on device 1.
Original Traceback (most recent call last):
File "/root/miniconda3/lib/python3.8/site-packages/torch/nn/parallel/parallel_apply.py", line 61, in _worker
output = module(*input, **kwargs)
File "/root/miniconda3/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1102, in _call_impl
return forward_call(*input, **kwargs)
File "/root/autodl-tmp/EPNet/tools/../lib/net/point_rcnn.py", line 52, in forward
rois, roi_scores_raw = self.rpn.proposal_layer(rpn_scores_raw, rpn_reg, backbone_xyz) # (B, M, 7)
File "/root/miniconda3/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1102, in _call_impl
return forward_call(*input, **kwargs)
File "/root/autodl-tmp/EPNet/tools/../lib/rpn/proposal_layer.py", line 46, in forward
scores_single, proposals_single = self.distance_based_proposal(scores_single, proposals_single,
File "/root/autodl-tmp/EPNet/tools/../lib/rpn/proposal_layer.py", line 93, in distance_based_proposal
assert i == 2, '%d' % i
AssertionError: 1
Hi, I got the following error when training with multiple GPUs. How should I fix it?
Traceback (most recent call last):
trainer.train(
File "/root/autodl-tmp/EPNet/tools/../tools/train_utils/train_utils.py", line 197, in train
loss, tb_dict, disp_dict = self._train_it(batch)
File "/root/autodl-tmp/EPNet/tools/../tools/train_utils/train_utils.py", line 130, in _train_it
loss, tb_dict, disp_dict = self.model_fn(self.model, batch)
File "/root/autodl-tmp/EPNet/tools/../lib/net/train_functions.py", line 46, in model_fn
ret_dict = model(input_data)
File "/root/miniconda3/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1102, in _call_impl
return forward_call(*input, **kwargs)
File "/root/miniconda3/lib/python3.8/site-packages/torch/nn/parallel/data_parallel.py", line 168, in forward
outputs = self.parallel_apply(replicas, inputs, kwargs)
File "/root/miniconda3/lib/python3.8/site-packages/torch/nn/parallel/data_parallel.py", line 178, in parallel_apply
return parallel_apply(replicas, inputs, kwargs, self.device_ids[:len(replicas)])
File "/root/miniconda3/lib/python3.8/site-packages/torch/nn/parallel/parallel_apply.py", line 86, in parallel_apply
output.reraise()
File "/root/miniconda3/lib/python3.8/site-packages/torch/_utils.py", line 434, in reraise
raise exception
AssertionError: Caught AssertionError in replica 1 on device 1.
Original Traceback (most recent call last):
File "/root/miniconda3/lib/python3.8/site-packages/torch/nn/parallel/parallel_apply.py", line 61, in _worker
output = module(*input, **kwargs)
File "/root/miniconda3/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1102, in _call_impl
return forward_call(*input, **kwargs)
File "/root/autodl-tmp/EPNet/tools/../lib/net/point_rcnn.py", line 52, in forward
rois, roi_scores_raw = self.rpn.proposal_layer(rpn_scores_raw, rpn_reg, backbone_xyz) # (B, M, 7)
File "/root/miniconda3/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1102, in _call_impl
return forward_call(*input, **kwargs)
File "/root/autodl-tmp/EPNet/tools/../lib/rpn/proposal_layer.py", line 46, in forward
scores_single, proposals_single = self.distance_based_proposal(scores_single, proposals_single,
File "/root/autodl-tmp/EPNet/tools/../lib/rpn/proposal_layer.py", line 93, in distance_based_proposal
assert i == 2, '%d' % i
AssertionError: 1
File "train_rcnn.py", line 270, in <module>