luoz66 opened 8 months ago
I found that this bug comes from bad labels. After an image is split into smaller patches, some bounding boxes are cut apart, which leaves extremely narrow bounding boxes near the new patch borders. During forward propagation these boxes cause the network to produce prediction tensors with extremely large values, and the run is then terminated because it exceeds the available compute/memory. Once the cause is known, the fix is easy. Two possible solutions:

1. When the process is terminated, find the image and label file that were being processed and delete them.
2. Search the annotation files for degenerate labels of this kind and delete them (a rough sketch of this follows below).
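For reference, here is a minimal sketch of the second fix. This is not official mmrotate tooling; the annotation directory path, the `MIN_SIDE` threshold, and the helper name `filter_ann_file` are placeholders I made up for illustration. It assumes the split tool wrote DOTA-style txt files in which each object line is eight polygon coordinates followed by the class name and difficulty, and it drops any box whose minimum-area rectangle has a near-zero side. Back up the annfiles directory before running it.

```python
# Sketch: remove degenerate (extremely narrow) rotated boxes from DOTA-style
# annotation files produced by the image-split step. Paths and threshold are
# placeholders; adapt them to your own split_ss_fair1m layout.
import glob
import os

import cv2
import numpy as np

ANN_DIR = '/mnt/data/luozheng/data/split_ss_fair1m/train/annfiles/'  # placeholder
MIN_SIDE = 2.0  # pixels; boxes with a shorter side below this are treated as bad


def filter_ann_file(path, min_side=MIN_SIDE):
    """Rewrite one annotation file in place, dropping degenerate boxes."""
    kept, dropped = [], 0
    with open(path) as f:
        for line in f:
            parts = line.split()
            if len(parts) < 9:           # keep header / malformed lines untouched
                kept.append(line)
                continue
            poly = np.array(parts[:8], dtype=np.float32).reshape(4, 2)
            # minAreaRect returns ((cx, cy), (w, h), angle) for the polygon
            _, (w, h), _ = cv2.minAreaRect(poly)
            if min(w, h) < min_side:
                dropped += 1             # degenerate sliver along the patch border
            else:
                kept.append(line)
    if dropped:
        with open(path, 'w') as f:       # overwrite in place; back up first
            f.writelines(kept)
    return dropped


total = sum(filter_ann_file(p) for p in glob.glob(os.path.join(ANN_DIR, '*.txt')))
print(f'removed {total} degenerate boxes')
```

The same check could instead be done once at dataset-loading time, but cleaning the labels offline keeps the training code untouched.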
Prerequisite
Task
I'm using the official example scripts/configs for the officially supported tasks/models/datasets.
Branch
master branch https://github.com/open-mmlab/mmrotate
Environment
```python
dataset_type = 'FAIR1MDataset'
data_root = '/mnt/data/luozheng/data/split_ss_fair1m/'
img_norm_cfg = dict(
    mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True)
train_pipeline = [
    dict(type='LoadImageFromFile'),
    dict(type='LoadAnnotations', with_bbox=True),
    dict(type='RResize', img_scale=(1024, 1024)),
    dict(
        type='RRandomFlip',
        flip_ratio=[0.25, 0.25, 0.25],
        direction=['horizontal', 'vertical', 'diagonal'],
        version='le135'),
    dict(
        type='Normalize',
        mean=[123.675, 116.28, 103.53],
        std=[58.395, 57.12, 57.375],
        to_rgb=True),
    dict(type='Pad', size_divisor=32),
    dict(type='DefaultFormatBundle'),
    dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels'])
]
test_pipeline = [
    dict(type='LoadImageFromFile'),
    dict(
        type='MultiScaleFlipAug',
        img_scale=(1024, 1024),
        flip=False,
        transforms=[
            dict(type='RResize'),
            dict(
                type='Normalize',
                mean=[123.675, 116.28, 103.53],
                std=[58.395, 57.12, 57.375],
                to_rgb=True),
            dict(type='Pad', size_divisor=32),
            dict(type='DefaultFormatBundle'),
            dict(type='Collect', keys=['img'])
        ])
]
data = dict(
    samples_per_gpu=4,
    workers_per_gpu=4,
    train=dict(
        type='FAIR1MDataset',
        ann_file='/mnt/data/luozheng/data/split_ss_fair1m/train/annfiles/',
        img_prefix='/mnt/data/luozheng/data/split_ss_fair1m/train/images/',
        pipeline=[
            dict(type='LoadImageFromFile'),
            dict(type='LoadAnnotations', with_bbox=True),
            dict(type='RResize', img_scale=(1024, 1024)),
            dict(
                type='RRandomFlip',
                flip_ratio=[0.25, 0.25, 0.25],
                direction=['horizontal', 'vertical', 'diagonal'],
                version='le135'),
            dict(
                type='Normalize',
                mean=[123.675, 116.28, 103.53],
                std=[58.395, 57.12, 57.375],
                to_rgb=True),
            dict(type='Pad', size_divisor=32),
            dict(type='DefaultFormatBundle'),
            dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels'])
        ],
        version='le135'),
    val=dict(
        type='FAIR1MDataset',
        ann_file='/mnt/data/luozheng/data/split_ss_fair1m/val/annfiles/',
        img_prefix='/mnt/data/luozheng/data/split_ss_fair1m/val/images/',
        pipeline=[
            dict(type='LoadImageFromFile'),
            dict(
                type='MultiScaleFlipAug',
                img_scale=(1024, 1024),
                flip=False,
                transforms=[
                    dict(type='RResize'),
                    dict(
                        type='Normalize',
                        mean=[123.675, 116.28, 103.53],
                        std=[58.395, 57.12, 57.375],
                        to_rgb=True),
                    dict(type='Pad', size_divisor=32),
                    dict(type='DefaultFormatBundle'),
                    dict(type='Collect', keys=['img'])
                ])
        ],
        version='le135'),
    test=dict(
        type='FAIR1MDataset',
        ann_file='/mnt/data/luozheng/data/split_ss_fair1m/test1/images/',
        img_prefix='/mnt/data/luozheng/data/split_ss_fair1m/test1/images/',
        pipeline=[
            dict(type='LoadImageFromFile'),
            dict(
                type='MultiScaleFlipAug',
                img_scale=(1024, 1024),
                flip=False,
                transforms=[
                    dict(type='RResize'),
                    dict(
                        type='Normalize',
                        mean=[123.675, 116.28, 103.53],
                        std=[58.395, 57.12, 57.375],
                        to_rgb=True),
                    dict(type='Pad', size_divisor=32),
                    dict(type='DefaultFormatBundle'),
                    dict(type='Collect', keys=['img'])
                ])
        ],
        version='le135'))
evaluation = dict(interval=1, metric='mAP')
optimizer = dict(type='SGD', lr=0.01, momentum=0.9, weight_decay=0.0001)
optimizer_config = dict(grad_clip=dict(max_norm=35, norm_type=2))
lr_config = dict(
    policy='step',
    warmup='linear',
    warmup_iters=500,
    warmup_ratio=0.001,
    step=[8, 11])
runner = dict(type='EpochBasedRunner', max_epochs=12)
checkpoint_config = dict(interval=1)
log_config = dict(interval=50, hooks=[dict(type='TextLoggerHook')])
dist_params = dict(backend='nccl')
log_level = 'INFO'
load_from = None
resume_from = 'work_dirs/rotated_reppoints_r50_fpn_1x_all-fair1m_le135/latest.pth'
workflow = [('train', 1)]
opencv_num_threads = 0
mp_start_method = 'fork'
angle_version = 'le135'
norm_cfg = dict(type='GN', num_groups=32, requires_grad=True)
model = dict(
    type='RotatedRepPoints',
    backbone=dict(
        type='ResNet',
        depth=50,
        num_stages=4,
        out_indices=(0, 1, 2, 3),
        frozen_stages=1,
        zero_init_residual=False,
        norm_cfg=dict(type='BN', requires_grad=True),
        norm_eval=True,
        style='pytorch',
        init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet50')),
    neck=dict(
        type='FPN',
        in_channels=[256, 512, 1024, 2048],
        out_channels=256,
        start_level=1,
        add_extra_convs='on_input',
        num_outs=5,
        norm_cfg=dict(type='GN', num_groups=32, requires_grad=True)),
    bbox_head=dict(
        type='RotatedRepPointsHead',
        num_classes=34,
        in_channels=256,
        feat_channels=256,
        point_feat_channels=256,
        stacked_convs=3,
        num_points=9,
        gradient_mul=0.3,
        point_strides=[8, 16, 32, 64, 128],
        point_base_scale=2,
        norm_cfg=dict(type='GN', num_groups=32, requires_grad=True),
        loss_cls=dict(
            type='FocalLoss',
            use_sigmoid=True,
            gamma=2.0,
            alpha=0.25,
            loss_weight=1.0),
        loss_bbox_init=dict(type='ConvexGIoULoss', loss_weight=0.375),
        loss_bbox_refine=dict(type='ConvexGIoULoss', loss_weight=1.0),
        transform_method='rotrect',
        use_reassign=False,
        topk=6,
        anti_factor=0.75),
    train_cfg=dict(
        init=dict(
            assigner=dict(type='ConvexAssigner', scale=4, pos_num=1),
            allowed_border=-1,
            pos_weight=-1,
            debug=False),
        refine=dict(
            assigner=dict(
                type='MaxConvexIoUAssigner',
                pos_iou_thr=0.4,
                neg_iou_thr=0.3,
                min_pos_iou=0,
                ignore_iof_thr=-1),
            allowed_border=-1,
            pos_weight=-1,
            debug=False)),
    test_cfg=dict(
        nms_pre=2000,
        min_bbox_size=0,
        score_thr=0.05,
        nms=dict(iou_thr=0.4),
        max_per_img=2000))
work_dir = './work_dirs/rotated_reppoints_r50_fpn_1x_all-fair1m_le135'
auto_resume = False
gpu_ids = range(0, 1)
```
Reproduces the problem - code sample
File "/home/luozheng/miniconda3/envs/mmrotate/lib/python3.8/site-packages/mmcv/parallel/data_parallel.py", line 51, in forward return super().forward(*inputs, kwargs) File "/home/luozheng/miniconda3/envs/mmrotate/lib/python3.8/site-packages/torch/nn/parallel/data_parallel.py", line 166, in forward return self.module(*inputs[0], *kwargs[0]) File "/home/luozheng/miniconda3/envs/mmrotate/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1130, in _call_impl return forward_call(input, kwargs) File "/home/luozheng/miniconda3/envs/mmrotate/lib/python3.8/site-packages/mmcv/runner/fp16_utils.py", line 119, in new_func return old_func(*args, kwargs) File "/home/luozheng/miniconda3/envs/mmrotate/lib/python3.8/site-packages/mmdet/models/detectors/base.py", line 174, in forward return self.forward_test(img, img_metas, kwargs) File "/home/luozheng/miniconda3/envs/mmrotate/lib/python3.8/site-packages/mmdet/models/detectors/base.py", line 147, in forward_test return self.simple_test(imgs[0], img_metas[0], *kwargs) File "/home/luozheng/PyCharmProject/mmrotate/mmrotate/models/detectors/single_stage.py", line 101, in simple_test bbox_list = self.bbox_head.get_bboxes( File "/home/luozheng/miniconda3/envs/mmrotate/lib/python3.8/site-packages/mmcv/runner/fp16_utils.py", line 208, in new_func return old_func(args, **kwargs) File "/home/luozheng/PyCharmProject/mmrotate/mmrotate/models/dense_heads/rotated_reppoints_head.py", line 1066, in get_bboxes results = self._get_bboxes_single(cls_score_list, point_pred_list, File "/home/luozheng/PyCharmProject/mmrotate/mmrotate/models/dense_heads/rotated_reppoints_head.py", line 1159, in _get_bboxes_single mlvl_bboxes[..., :4] /= mlvl_bboxes[..., :4].new_tensor(
Reproduces the problem - command or script
```shell
CUDA_VISIBLE_DEVICES=2 python tools/test.py \
    work_dirs/rotated_reppoints_r50_fpn_1x_all-fair1m_le135/rotated_reppoints_r50_fpn_1x_all-fair1m_le135.py \
    work_dirs/rotated_reppoints_r50_fpn_1x_all-fair1m_le135/latest.pth \
    --format-only \
    --eval-options submission_dir=work_dirs/rotated_reppoints_r50_fpn_1x_all-fair1m_le135/Task1_results
```
Reproduces the problem - error message
```
    return forward_call(*input, **kwargs)
  File "/home/luozheng/miniconda3/envs/mmrotate/lib/python3.8/site-packages/mmcv/parallel/data_parallel.py", line 51, in forward
    return super().forward(*inputs, **kwargs)
  File "/home/luozheng/miniconda3/envs/mmrotate/lib/python3.8/site-packages/torch/nn/parallel/data_parallel.py", line 166, in forward
    return self.module(*inputs[0], **kwargs[0])
  File "/home/luozheng/miniconda3/envs/mmrotate/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1130, in _call_impl
    return forward_call(*input, **kwargs)
  File "/home/luozheng/miniconda3/envs/mmrotate/lib/python3.8/site-packages/mmcv/runner/fp16_utils.py", line 119, in new_func
    return old_func(*args, **kwargs)
  File "/home/luozheng/miniconda3/envs/mmrotate/lib/python3.8/site-packages/mmdet/models/detectors/base.py", line 174, in forward
    return self.forward_test(img, img_metas, **kwargs)
  File "/home/luozheng/miniconda3/envs/mmrotate/lib/python3.8/site-packages/mmdet/models/detectors/base.py", line 147, in forward_test
    return self.simple_test(imgs[0], img_metas[0], **kwargs)
  File "/home/luozheng/PyCharmProject/mmrotate/mmrotate/models/detectors/single_stage.py", line 101, in simple_test
    bbox_list = self.bbox_head.get_bboxes(
  File "/home/luozheng/miniconda3/envs/mmrotate/lib/python3.8/site-packages/mmcv/runner/fp16_utils.py", line 208, in new_func
    return old_func(*args, **kwargs)
  File "/home/luozheng/PyCharmProject/mmrotate/mmrotate/models/dense_heads/rotated_reppoints_head.py", line 1066, in get_bboxes
    results = self._get_bboxes_single(cls_score_list, point_pred_list,
  File "/home/luozheng/PyCharmProject/mmrotate/mmrotate/models/dense_heads/rotated_reppoints_head.py", line 1159, in _get_bboxes_single
    mlvl_bboxes[..., :4] /= mlvl_bboxes[..., :4].new_tensor(
RuntimeError: CUDA error: an illegal memory access was encountered
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1.
terminate called after throwing an instance of 'c10::CUDAError'
  what():  CUDA error: an illegal memory access was encountered
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
```
Additional information
No response