WXinlong / SOLO

SOLO and SOLOv2 for instance segmentation, ECCV 2020 & NeurIPS 2020.
Other
1.69k stars 307 forks source link

Error when using 4 cards. #156

Open ztt0821 opened 3 years ago

ztt0821 commented 3 years ago

Hi, I ran into this problem when running SOLO and SOLOv2. /home/grads/ttzhang9/anaconda3/envs/open-mmlabv1/lib/python3.7/site-packages/torch/nn/functional.py:2479: UserWarning: Default upsampling behavior when mode=bilinear is changed to align_corners=False since 0.4.0. Please specify align_corners=True if the old behavior is desired. See the documentation of nn.Upsample for details. "See the documentation of nn.Upsample for details.".format(mode)) /home/grads/ttzhang9/anaconda3/envs/open-mmlabv1/lib/python3.7/site-packages/torch/nn/functional.py:2479: UserWarning: Default upsampling behavior when mode=bilinear is changed to align_corners=False since 0.4.0. Please specify align_corners=True if the old behavior is desired. See the documentation of nn.Upsample for details. "See the documentation of nn.Upsample for details.".format(mode)) /home/grads/ttzhang9/anaconda3/envs/open-mmlabv1/lib/python3.7/site-packages/torch/nn/functional.py:2479: UserWarning: Default upsampling behavior when mode=bilinear is changed to align_corners=False since 0.4.0. Please specify align_corners=True if the old behavior is desired. See the documentation of nn.Upsample for details. "See the documentation of nn.Upsample for details.".format(mode)) /home/grads/ttzhang9/anaconda3/envs/open-mmlabv1/lib/python3.7/site-packages/torch/nn/functional.py:2479: UserWarning: Default upsampling behavior when mode=bilinear is changed to align_corners=False since 0.4.0. Please specify align_corners=True if the old behavior is desired. See the documentation of nn.Upsample for details. 
"See the documentation of nn.Upsample for details.".format(mode)) Traceback (most recent call last): File "./tools/train.py", line 125, in <module> main() File "./tools/train.py", line 121, in main timestamp=timestamp) File "/public/ttzhang9/SOLO/mmdet/apis/train.py", line 103, in train_detector timestamp=timestamp) File "/public/ttzhang9/SOLO/mmdet/apis/train.py", line 250, in _dist_train runner.run(data_loaders, cfg.workflow, cfg.total_epochs) File "/home/grads/ttzhang9/anaconda3/envs/open-mmlabv1/lib/python3.7/site-packages/mmcv/runner/runner.py", line 364, in run epoch_runner(data_loaders[i], **kwargs) File "/home/grads/ttzhang9/anaconda3/envs/open-mmlabv1/lib/python3.7/site-packages/mmcv/runner/runner.py", line 268, in train self.model, data_batch, train_mode=True, **kwargs) File "/public/ttzhang9/SOLO/mmdet/apis/train.py", line 78, in batch_processor losses = model(**data) File "/home/grads/ttzhang9/anaconda3/envs/open-mmlabv1/lib/python3.7/site-packages/torch/nn/modules/module.py", line 547, in __call__ result = self.forward(*input, **kwargs) File "/home/grads/ttzhang9/anaconda3/envs/open-mmlabv1/lib/python3.7/site-packages/mmcv/parallel/distributed.py", line 54, in forward return self.module(*inputs[0], **kwargs[0]) File "/home/grads/ttzhang9/anaconda3/envs/open-mmlabv1/lib/python3.7/site-packages/torch/nn/modules/module.py", line 547, in __call__ result = self.forward(*input, **kwargs) File "/public/ttzhang9/SOLO/mmdet/core/fp16/decorators.py", line 49, in new_func return old_func(*args, **kwargs) File "/public/ttzhang9/SOLO/mmdet/models/detectors/base.py", line 142, in forward return self.forward_train(img, img_meta, **kwargs) File "/public/ttzhang9/SOLO/mmdet/models/detectors/single_stage_ins.py", line 78, in forward_train *loss_inputs, gt_bboxes_ignore=gt_bboxes_ignore) File "/public/ttzhang9/SOLO/mmdet/models/anchor_heads/solo_head.py", line 192, in loss featmap_sizes=featmap_sizes) File "/public/ttzhang9/SOLO/mmdet/core/utils/misc.py", line 24, in multi_apply return tuple(map(list, 
zip(*map_results))) File "/public/ttzhang9/SOLO/mmdet/models/anchor_heads/solo_head.py", line 281, in solo_target_single center_ws, center_hs = center_of_mass(gt_masks_pt) File "/public/ttzhang9/SOLO/mmdet/models/anchor_heads/solo_head.py", line 20, in center_of_mass m10 = (bitmasks * xs).sum(dim=-1).sum(dim=-1) RuntimeError: expected device cuda:0 and dtype Float but got device cuda:0 and dtype Byte Traceback (most recent call last): File "./tools/train.py", line 125, in <module> main() File "./tools/train.py", line 121, in main timestamp=timestamp) File "/public/ttzhang9/SOLO/mmdet/apis/train.py", line 103, in train_detector timestamp=timestamp) File "/public/ttzhang9/SOLO/mmdet/apis/train.py", line 250, in _dist_train runner.run(data_loaders, cfg.workflow, cfg.total_epochs) File "/home/grads/ttzhang9/anaconda3/envs/open-mmlabv1/lib/python3.7/site-packages/mmcv/runner/runner.py", line 364, in run epoch_runner(data_loaders[i], **kwargs) File "/home/grads/ttzhang9/anaconda3/envs/open-mmlabv1/lib/python3.7/site-packages/mmcv/runner/runner.py", line 268, in train self.model, data_batch, train_mode=True, **kwargs) File "/public/ttzhang9/SOLO/mmdet/apis/train.py", line 78, in batch_processor losses = model(**data) File "/home/grads/ttzhang9/anaconda3/envs/open-mmlabv1/lib/python3.7/site-packages/torch/nn/modules/module.py", line 547, in __call__ result = self.forward(*input, **kwargs) File "/home/grads/ttzhang9/anaconda3/envs/open-mmlabv1/lib/python3.7/site-packages/mmcv/parallel/distributed.py", line 54, in forward return self.module(*inputs[0], **kwargs[0]) File "/home/grads/ttzhang9/anaconda3/envs/open-mmlabv1/lib/python3.7/site-packages/torch/nn/modules/module.py", line 547, in __call__ result = self.forward(*input, **kwargs) File "/public/ttzhang9/SOLO/mmdet/core/fp16/decorators.py", line 49, in new_func return old_func(*args, **kwargs) File "/public/ttzhang9/SOLO/mmdet/models/detectors/base.py", line 142, in forward return self.forward_train(img, img_meta, **kwargs) File 
"/public/ttzhang9/SOLO/mmdet/models/detectors/single_stage_ins.py", line 78, in forward_train *loss_inputs, gt_bboxes_ignore=gt_bboxes_ignore) File "/public/ttzhang9/SOLO/mmdet/models/anchor_heads/solo_head.py", line 192, in loss featmap_sizes=featmap_sizes) File "/public/ttzhang9/SOLO/mmdet/core/utils/misc.py", line 24, in multi_apply return tuple(map(list, zip(*map_results))) File "/public/ttzhang9/SOLO/mmdet/models/anchor_heads/solo_head.py", line 281, in solo_target_single center_ws, center_hs = center_of_mass(gt_masks_pt) File "/public/ttzhang9/SOLO/mmdet/models/anchor_heads/solo_head.py", line 20, in center_of_mass m10 = (bitmasks * xs).sum(dim=-1).sum(dim=-1) RuntimeError: expected device cuda:1 and dtype Float but got device cuda:1 and dtype Byte Traceback (most recent call last): File "./tools/train.py", line 125, in <module> main() File "./tools/train.py", line 121, in main timestamp=timestamp) File "/public/ttzhang9/SOLO/mmdet/apis/train.py", line 103, in train_detector timestamp=timestamp) File "/public/ttzhang9/SOLO/mmdet/apis/train.py", line 250, in _dist_train runner.run(data_loaders, cfg.workflow, cfg.total_epochs) File "/home/grads/ttzhang9/anaconda3/envs/open-mmlabv1/lib/python3.7/site-packages/mmcv/runner/runner.py", line 364, in run epoch_runner(data_loaders[i], **kwargs) File "/home/grads/ttzhang9/anaconda3/envs/open-mmlabv1/lib/python3.7/site-packages/mmcv/runner/runner.py", line 268, in train self.model, data_batch, train_mode=True, **kwargs) File "/public/ttzhang9/SOLO/mmdet/apis/train.py", line 78, in batch_processor losses = model(**data) File "/home/grads/ttzhang9/anaconda3/envs/open-mmlabv1/lib/python3.7/site-packages/torch/nn/modules/module.py", line 547, in __call__ result = self.forward(*input, **kwargs) File "/home/grads/ttzhang9/anaconda3/envs/open-mmlabv1/lib/python3.7/site-packages/mmcv/parallel/distributed.py", line 54, in forward return self.module(*inputs[0], **kwargs[0]) File 
"/home/grads/ttzhang9/anaconda3/envs/open-mmlabv1/lib/python3.7/site-packages/torch/nn/modules/module.py", line 547, in __call__ result = self.forward(*input, **kwargs) File "/public/ttzhang9/SOLO/mmdet/core/fp16/decorators.py", line 49, in new_func return old_func(*args, **kwargs) File "/public/ttzhang9/SOLO/mmdet/models/detectors/base.py", line 142, in forward return self.forward_train(img, img_meta, **kwargs) File "/public/ttzhang9/SOLO/mmdet/models/detectors/single_stage_ins.py", line 78, in forward_train *loss_inputs, gt_bboxes_ignore=gt_bboxes_ignore) File "/public/ttzhang9/SOLO/mmdet/models/anchor_heads/solo_head.py", line 192, in loss featmap_sizes=featmap_sizes) File "/public/ttzhang9/SOLO/mmdet/core/utils/misc.py", line 24, in multi_apply return tuple(map(list, zip(*map_results))) File "/public/ttzhang9/SOLO/mmdet/models/anchor_heads/solo_head.py", line 281, in solo_target_single center_ws, center_hs = center_of_mass(gt_masks_pt) File "/public/ttzhang9/SOLO/mmdet/models/anchor_heads/solo_head.py", line 20, in center_of_mass m10 = (bitmasks * xs).sum(dim=-1).sum(dim=-1) RuntimeError: expected device cuda:3 and dtype Float but got device cuda:3 and dtype Byte Traceback (most recent call last): File "./tools/train.py", line 125, in <module> main() File "./tools/train.py", line 121, in main timestamp=timestamp) File "/public/ttzhang9/SOLO/mmdet/apis/train.py", line 103, in train_detector timestamp=timestamp) File "/public/ttzhang9/SOLO/mmdet/apis/train.py", line 250, in _dist_train runner.run(data_loaders, cfg.workflow, cfg.total_epochs) File "/home/grads/ttzhang9/anaconda3/envs/open-mmlabv1/lib/python3.7/site-packages/mmcv/runner/runner.py", line 364, in run epoch_runner(data_loaders[i], **kwargs) File "/home/grads/ttzhang9/anaconda3/envs/open-mmlabv1/lib/python3.7/site-packages/mmcv/runner/runner.py", line 268, in train self.model, data_batch, train_mode=True, **kwargs) File "/public/ttzhang9/SOLO/mmdet/apis/train.py", line 78, in batch_processor losses = model(**data) File 
"/home/grads/ttzhang9/anaconda3/envs/open-mmlabv1/lib/python3.7/site-packages/torch/nn/modules/module.py", line 547, in __call__ result = self.forward(*input, **kwargs) File "/home/grads/ttzhang9/anaconda3/envs/open-mmlabv1/lib/python3.7/site-packages/mmcv/parallel/distributed.py", line 54, in forward return self.module(*inputs[0], **kwargs[0]) File "/home/grads/ttzhang9/anaconda3/envs/open-mmlabv1/lib/python3.7/site-packages/torch/nn/modules/module.py", line 547, in __call__ result = self.forward(*input, **kwargs) File "/public/ttzhang9/SOLO/mmdet/core/fp16/decorators.py", line 49, in new_func return old_func(*args, **kwargs) File "/public/ttzhang9/SOLO/mmdet/models/detectors/base.py", line 142, in forward return self.forward_train(img, img_meta, **kwargs) File "/public/ttzhang9/SOLO/mmdet/models/detectors/single_stage_ins.py", line 78, in forward_train *loss_inputs, gt_bboxes_ignore=gt_bboxes_ignore) File "/public/ttzhang9/SOLO/mmdet/models/anchor_heads/solo_head.py", line 192, in loss featmap_sizes=featmap_sizes) File "/public/ttzhang9/SOLO/mmdet/core/utils/misc.py", line 24, in multi_apply return tuple(map(list, zip(*map_results))) File "/public/ttzhang9/SOLO/mmdet/models/anchor_heads/solo_head.py", line 281, in solo_target_single center_ws, center_hs = center_of_mass(gt_masks_pt) File "/public/ttzhang9/SOLO/mmdet/models/anchor_heads/solo_head.py", line 20, in center_of_mass m10 = (bitmasks * xs).sum(dim=-1).sum(dim=-1) RuntimeError: expected device cuda:2 and dtype Float but got device cuda:2 and dtype Byte Traceback (most recent call last): File "/home/grads/ttzhang9/anaconda3/envs/open-mmlabv1/lib/python3.7/runpy.py", line 193, in _run_module_as_main "__main__", mod_spec) File "/home/grads/ttzhang9/anaconda3/envs/open-mmlabv1/lib/python3.7/runpy.py", line 85, in _run_code exec(code, run_globals) File "/home/grads/ttzhang9/anaconda3/envs/open-mmlabv1/lib/python3.7/site-packages/torch/distributed/launch.py", line 246, in <module> main() File 
"/home/grads/ttzhang9/anaconda3/envs/open-mmlabv1/lib/python3.7/site-packages/torch/distributed/launch.py", line 242, in main cmd=cmd) subprocess.CalledProcessError: Command '['/home/grads/ttzhang9/anaconda3/envs/open-mmlabv1/bin/python', '-u', './tools/train.py', '--local_rank=3', 'configs/solo/solo_r50_fpn_4gpu_3x.py', '--launcher', 'pytorch']' returned non-zero exit status 1.