RuntimeError: Expected object of scalar type Double but got scalar type Float for argument #2 'other' in call to _th_max when using mosaic mixup in cascade RCNN

jamiechoi1995 commented 3 years ago

Since mmdetection recently released 2.15.1, I want to use Mosaic and MixUp with Cascade R-CNN. However, when I moved the relevant config from YOLOX to Cascade R-CNN, I got the following error:

2021-08-12 06:53:24,512 - mmdet - INFO - workflow: [('train', 1)], max: 40 epochs
Traceback (most recent call last):
  File "tools/train.py", line 188, in <module>
    main()
  File "tools/train.py", line 184, in main
    meta=meta)
  File "mmdetection-2.15.1/mmdet/apis/train.py", line 170, in train_detector
    runner.run(data_loaders, cfg.workflow)
  File "mmcv-1.3.9/mmcv/runner/epoch_based_runner.py", line 127, in run
    epoch_runner(data_loaders[i], **kwargs)
  File "mmcv-1.3.9/mmcv/runner/epoch_based_runner.py", line 50, in train
    self.run_iter(data_batch, train_mode=True, **kwargs)
  File "mmcv-1.3.9/mmcv/runner/epoch_based_runner.py", line 30, in run_iter
    **kwargs)
  File "mmcv-1.3.9/mmcv/parallel/data_parallel.py", line 67, in train_step
    return self.module.train_step(*inputs[0], **kwargs[0])
  File "mmdetection-2.15.1/mmdet/models/detectors/base.py", line 237, in train_step
    losses = self(**data)
  File "/opt/conda/lib/python3.6/site-packages/torch/nn/modules/module.py", line 532, in __call__
    result = self.forward(*input, **kwargs)
  File "mmcv-1.3.9/mmcv/runner/fp16_utils.py", line 98, in new_func
    return old_func(*args, **kwargs)
  File "mmdetection-2.15.1/mmdet/models/detectors/base.py", line 171, in forward
    return self.forward_train(img, img_metas, **kwargs)
  File "mmdetection-2.15.1/mmdet/models/detectors/two_stage.py", line 140, in forward_train
    proposal_cfg=proposal_cfg)
  File "mmdetection-2.15.1/mmdet/models/dense_heads/base_dense_head.py", line 54, in forward_train
    losses = self.loss(*loss_inputs, gt_bboxes_ignore=gt_bboxes_ignore)
  File "mmdetection-2.15.1/mmdet/models/dense_heads/rpn_head.py", line 74, in loss
    gt_bboxes_ignore=gt_bboxes_ignore)
  File "mmcv-1.3.9/mmcv/runner/fp16_utils.py", line 186, in new_func
    return old_func(*args, **kwargs)
  File "mmdetection-2.15.1/mmdet/models/dense_heads/anchor_head.py", line 463, in loss
    label_channels=label_channels)
  File "mmdetection-2.15.1/mmdet/models/dense_heads/anchor_head.py", line 345, in get_targets
    unmap_outputs=unmap_outputs)
  File "mmdetection-2.15.1/mmdet/core/utils/misc.py", line 29, in multi_apply
    return tuple(map(list, zip(*map_results)))
  File "mmdetection-2.15.1/mmdet/models/dense_heads/anchor_head.py", line 219, in _get_targets_single
    None if self.sampling else gt_labels)
  File "mmdetection-2.15.1/mmdet/core/bbox/assigners/max_iou_assigner.py", line 105, in assign
    overlaps = self.iou_calculator(gt_bboxes, bboxes)
  File "mmdetection-2.15.1/mmdet/core/bbox/iou_calculators/iou2d_calculator.py", line 65, in __call__
    return bbox_overlaps(bboxes1, bboxes2, mode, is_aligned)
  File "mmdetection-2.15.1/mmdet/core/bbox/iou_calculators/iou2d_calculator.py", line 233, in bbox_overlaps
    bboxes2[..., None, :, :2])  # [B, rows, cols, 2]
RuntimeError: Expected object of scalar type Double but got scalar type Float for argument #2 'other' in call to _th_max

And below is my config:

model = dict( type='CascadeRCNN', backbone=dict( type='ResNeXt', depth=101, num_stages=4, out_indices=(0, 1, 2, 3), frozen_stages=1, norm_cfg=dict(type='BN', requires_grad=True), norm_eval=True, style='pytorch', init_cfg=dict( type='Pretrained', checkpoint= 'pretrained_model/resnext101_64x4d-ee2c6f71.pth' ), groups=64, base_width=4), neck=dict( type='FPN', in_channels=[256, 512, 1024, 2048], out_channels=256, num_outs=5), rpn_head=dict( type='RPNHead', in_channels=256, feat_channels=256, anchor_generator=dict( type='AnchorGenerator', scales=[8], ratios=[0.5, 1.0, 2.0], strides=[4, 8, 16, 32, 64]), bbox_coder=dict( type='DeltaXYWHBBoxCoder', target_means=[0.0, 0.0, 0.0, 0.0], target_stds=[1.0, 1.0, 1.0, 1.0]), loss_cls=dict( type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0), loss_bbox=dict( type='SmoothL1Loss', beta=0.1111111111111111, loss_weight=1.0)), roi_head=dict( type='CascadeRoIHead', num_stages=3, stage_loss_weights=[1, 0.5, 0.25], bbox_roi_extractor=dict( type='SingleRoIExtractor', roi_layer=dict(type='RoIAlign', output_size=7, sampling_ratio=0), out_channels=256, featmap_strides=[4, 8, 16, 32]), bbox_head=[ dict( type='Shared2FCBBoxHead', in_channels=256, fc_out_channels=1024, roi_feat_size=7, num_classes=12, bbox_coder=dict( type='DeltaXYWHBBoxCoder', target_means=[0.0, 0.0, 0.0, 0.0], target_stds=[0.1, 0.1, 0.2, 0.2]), reg_class_agnostic=True, loss_cls=dict( type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0), loss_bbox=dict(type='SmoothL1Loss', beta=1.0, loss_weight=1.0)), dict( type='Shared2FCBBoxHead', in_channels=256, fc_out_channels=1024, roi_feat_size=7, num_classes=12, bbox_coder=dict( type='DeltaXYWHBBoxCoder', target_means=[0.0, 0.0, 0.0, 0.0], target_stds=[0.05, 0.05, 0.1, 0.1]), reg_class_agnostic=True, loss_cls=dict( type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0), loss_bbox=dict(type='SmoothL1Loss', beta=1.0, loss_weight=1.0)), dict( type='Shared2FCBBoxHead', in_channels=256, fc_out_channels=1024, 
roi_feat_size=7, num_classes=12, bbox_coder=dict( type='DeltaXYWHBBoxCoder', target_means=[0.0, 0.0, 0.0, 0.0], target_stds=[0.033, 0.033, 0.067, 0.067]), reg_class_agnostic=True, loss_cls=dict( type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0), loss_bbox=dict(type='SmoothL1Loss', beta=1.0, loss_weight=1.0)) ]), train_cfg=dict( rpn=dict( assigner=dict( type='MaxIoUAssigner', pos_iou_thr=0.7, neg_iou_thr=0.3, min_pos_iou=0.3, match_low_quality=True, ignore_iof_thr=-1), sampler=dict( type='RandomSampler', num=256, pos_fraction=0.5, neg_pos_ub=-1, add_gt_as_proposals=False), allowed_border=0, pos_weight=-1, debug=False), rpn_proposal=dict( nms_pre=2000, max_per_img=2000, nms=dict(type='nms', iou_threshold=0.7), min_bbox_size=0), rcnn=[ dict( assigner=dict( type='MaxIoUAssigner', pos_iou_thr=0.5, neg_iou_thr=0.5, min_pos_iou=0.5, match_low_quality=False, ignore_iof_thr=-1), sampler=dict( type='RandomSampler', num=512, pos_fraction=0.25, neg_pos_ub=-1, add_gt_as_proposals=True), pos_weight=-1, debug=False), dict( assigner=dict( type='MaxIoUAssigner', pos_iou_thr=0.6, neg_iou_thr=0.6, min_pos_iou=0.6, match_low_quality=False, ignore_iof_thr=-1), sampler=dict( type='RandomSampler', num=512, pos_fraction=0.25, neg_pos_ub=-1, add_gt_as_proposals=True), pos_weight=-1, debug=False), dict( assigner=dict( type='MaxIoUAssigner', pos_iou_thr=0.7, neg_iou_thr=0.7, min_pos_iou=0.7, match_low_quality=False, ignore_iof_thr=-1), sampler=dict( type='RandomSampler', num=512, pos_fraction=0.25, neg_pos_ub=-1, add_gt_as_proposals=True), pos_weight=-1, debug=False) ]), test_cfg=dict( rpn=dict( nms_pre=1000, max_per_img=1000, nms=dict(type='nms', iou_threshold=0.7), min_bbox_size=0), rcnn=dict( score_thr=0.05, nms=dict(type='nms', iou_threshold=0.5), max_per_img=100))) dataset_type = 'CocoDataset' data_root = 'dataset/trainval/' img_norm_cfg = dict( mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True)

img_scale = (832, 832)

train_pipeline = [ dict(type='Mosaic', img_scale=img_scale, pad_val=114.0), dict( type='RandomAffine', scaling_ratio_range=(0.1, 2), border=(-img_scale[0] // 2, -img_scale[1] // 2)), dict( type='MixUp', img_scale=img_scale, ratio_range=(0.8, 1.6), pad_val=114.0), dict( type='PhotoMetricDistortion', brightness_delta=32, contrast_range=(0.5, 1.5), saturation_range=(0.5, 1.5), hue_delta=18), dict(type='RandomFlip', flip_ratio=0.5), dict(type='Resize', keep_ratio=True), dict(type='Pad', pad_to_square=True, pad_val=114.0), dict(type='Normalize', **img_norm_cfg), dict(type='DefaultFormatBundle'), dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels']) ]

test_pipeline = [ dict(type='LoadImageFromFile'), dict( type='MultiScaleFlipAug', img_scale=img_scale, flip=False, transforms=[ dict(type='Resize', keep_ratio=True), dict(type='RandomFlip'), dict(type='Pad', size=img_scale, pad_val=114.0), dict(type='Normalize', **img_norm_cfg), dict(type='ImageToTensor', keys=['img']), #try dict(type='DefaultFormatBundle'), dict(type='Collect', keys=['img']) ]) ]

data = dict( samples_per_gpu=8, workers_per_gpu=2, train=dict( type='MultiImageMixDataset', dataset=dict( type=dataset_type, ann_file= 'dataset/trainval/annotations/instances_train2017.json', img_prefix='dataset/trainval/', pipeline=[ dict(type='LoadImageFromFile', to_float32=True), dict(type='LoadAnnotations', with_bbox=True) ], filter_empty_gt=False, ), pipeline=train_pipeline, dynamic_scale=img_scale), val=dict( type=dataset_type, ann_file= 'dataset/trainval/annotations/instances_val2017.json', img_prefix='dataset/trainval/', pipeline=test_pipeline), test=dict( type=dataset_type, ann_file= 'dataset/trainval/annotations/instances_val2017.json', img_prefix='dataset/trainval/', ) ) evaluation = dict(interval=1, metric='bbox', save_best='bbox_mAP_50') optimizer = dict( type='SGD', lr=0.01, momentum=0.9, weight_decay=0.0005, nesterov=True, paramwise_cfg=dict(norm_decay_mult=0.0, bias_decay_mult=0.0)) optimizer_config = dict(grad_clip=None) lr_config = dict( policy='YOLOX', warmup='exp', by_epoch=False, warmup_by_epoch=True, warmup_ratio=1, warmup_iters=5, num_last_epochs=15, min_lr_ratio=0.05) runner = dict(type='EpochBasedRunner', max_epochs=40) checkpoint_config = dict(interval=1) log_config = dict(interval=50, hooks=[dict(type='TextLoggerHook')]) custom_hooks = [ dict(type='YOLOXModeSwitchHook', num_last_epochs=5, priority=48), dict( type='SyncRandomSizeHook', ratio_range=(14, 26), img_scale=(640, 640), interval=1, priority=48), dict(type='SyncNormHook', num_last_epochs=15, interval=1, priority=48), dict(type='ExpMomentumEMAHook', resume_from=None, priority=49) ]

dist_params = dict(backend='nccl') log_level = 'INFO' load_from = None resume_from = None workflow = [('train', 1)] work_dir = './work_dirs/cascade_rcnn_x101_64x4d_fpn_20e_coco-3-832-mixupmosaic' gpu_ids = range(0, 1)

hhaAndroid commented 3 years ago

@jamiechoi1995 In fact, we did not test the above combination configuration, and there may be an incompatibility.

jamiechoi1995 commented 3 years ago

> @jamiechoi1995 In fact, we did not test the above combination configuration, and there may be an incompatibility.

I found that this error is caused by the RandomAffine, MixUp and Mosaic augmentations returning bboxes of type Double (float64). I solved it by forcing the bbox dtype in these augmentations to float32.

hhaAndroid commented 3 years ago

> @jamiechoi1995 In fact, we did not test the above combination configuration, and there may be an incompatibility.

> I found that this error is caused by the RandomAffine, MixUp and Mosaic augmentations returning bboxes of type Double (float64). I solved it by forcing the bbox dtype in these augmentations to float32.

Indeed possible. Can you create a PR to fix it?

RangiLyu commented 3 years ago

I found that this is because the dtype of warp_matrix in RandomAffine is float64, so after the boxes are multiplied by this matrix, their dtype becomes float64.

open-mmlab / mmdetection

RuntimeError: Expected object of scalar type Double but got scalar type Float for argument #2 'other' in call to _th_max when using mosaic mixup in cascade RCNN #5870