open-mmlab / mmdetection

OpenMMLab Detection Toolbox and Benchmark
https://mmdetection.readthedocs.io
Apache License 2.0
29.57k stars 9.46k forks source link

Memory usage is continuously increasing #5096

Closed yan9qu closed 3 years ago

yan9qu commented 3 years ago

PyTorch: 1.7.1+cu101 PyTorch compiling details: PyTorch built with:

TorchVision: 0.8.2+cu101 OpenCV: 4.5.1 MMCV: 1.2.4 MMCV Compiler: GCC 7.3 MMCV CUDA Compiler: 10.1 MMDetection: 2.11.0+

2021-04-30 16:43:14,905 - mmdet - INFO - Distributed training: False 2021-04-30 16:43:17,437 - mmdet - INFO - Config: model = dict( type='MaskRCNN', pretrained='pretrained/swin_small_patch4_window7_224.pth', backbone=dict( type='SwinTransformer', embed_dim=96, depths=[2, 2, 18, 2], num_heads=[3, 6, 12, 24], window_size=7, mlp_ratio=4.0, qkv_bias=True, qk_scale=None, drop_rate=0.0, attn_drop_rate=0.0, drop_path_rate=0.2, ape=False, patch_norm=True, out_indices=(0, 1, 2, 3), use_checkpoint=False), neck=dict( type='FPN', in_channels=[96, 192, 384, 768], out_channels=256, num_outs=5), rpn_head=dict( type='RPNHead', in_channels=256, feat_channels=256, anchor_generator=dict( type='AnchorGenerator', scales=[8], ratios=[0.5, 1.0, 2.0], strides=[4, 8, 16, 32, 64]), bbox_coder=dict( type='DeltaXYWHBBoxCoder', target_means=[0.0, 0.0, 0.0, 0.0], target_stds=[1.0, 1.0, 1.0, 1.0]), loss_cls=dict( type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0), loss_bbox=dict(type='L1Loss', loss_weight=1.0)), roi_head=dict( type='StandardRoIHead', bbox_roi_extractor=dict( type='SingleRoIExtractor', roi_layer=dict(type='RoIAlign', output_size=7, sampling_ratio=0), out_channels=256, featmap_strides=[4, 8, 16, 32]), bbox_head=dict( type='Shared2FCBBoxHead', in_channels=256, fc_out_channels=1024, roi_feat_size=7, num_classes=15, bbox_coder=dict( type='DeltaXYWHBBoxCoder', target_means=[0.0, 0.0, 0.0, 0.0], target_stds=[0.1, 0.1, 0.2, 0.2]), reg_class_agnostic=False, loss_cls=dict( type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0), loss_bbox=dict(type='L1Loss', loss_weight=1.0)), mask_roi_extractor=dict( type='SingleRoIExtractor', roi_layer=dict(type='RoIAlign', output_size=14, sampling_ratio=0), out_channels=256, featmap_strides=[4, 8, 16, 32]), mask_head=dict( type='FCNMaskHead', num_convs=4, in_channels=256, conv_out_channels=256, num_classes=15, loss_mask=dict( type='CrossEntropyLoss', use_mask=True, loss_weight=1.0))), train_cfg=dict( rpn=dict( assigner=dict( 
type='MaxIoUAssigner', pos_iou_thr=0.7, neg_iou_thr=0.3, min_pos_iou=0.3, match_low_quality=True, ignore_iof_thr=-1), sampler=dict( type='RandomSampler', num=256, pos_fraction=0.5, neg_pos_ub=-1, add_gt_as_proposals=False), allowed_border=-1, pos_weight=-1, debug=False), rpn_proposal=dict( nms_pre=2000, max_per_img=1000, nms=dict(type='nms', iou_threshold=0.7), min_bbox_size=0), rcnn=dict( assigner=dict( type='MaxIoUAssigner', pos_iou_thr=0.5, neg_iou_thr=0.5, min_pos_iou=0.5, match_low_quality=True, ignore_iof_thr=-1), sampler=dict( type='RandomSampler', num=512, pos_fraction=0.25, neg_pos_ub=-1, add_gt_as_proposals=True), mask_size=28, pos_weight=-1, debug=False)), test_cfg=dict( rpn=dict( nms_pre=1000, max_per_img=1000, nms=dict(type='nms', iou_threshold=0.7), min_bbox_size=0), rcnn=dict( score_thr=0.05, nms=dict(type='nms', iou_threshold=0.5), max_per_img=100, mask_thr_binary=0.5))) dataset_type = 'CocoDataset' data_root = '/home/ds/DOTA/' img_norm_cfg = dict( mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) train_pipeline = [ dict(type='LoadImageFromFile'), dict(type='LoadAnnotations', with_bbox=True, with_mask=True), dict(type='RandomFlip', flip_ratio=0.5), dict( type='AutoAugment', policies=[[{ 'type': 'Resize', 'img_scale': [(480, 1333), (512, 1333), (544, 1333), (576, 1333), (608, 1333), (640, 1333), (672, 1333), (704, 1333), (736, 1333), (768, 1333), (800, 1333)], 'multiscale_mode': 'value', 'keep_ratio': True }], [{ 'type': 'Resize', 'img_scale': [(400, 1333), (500, 1333), (600, 1333)], 'multiscale_mode': 'value', 'keep_ratio': True }, { 'type': 'RandomCrop', 'crop_type': 'absolute_range', 'crop_size': (384, 600), 'allow_negative_crop': True }, { 'type': 'Resize', 'img_scale': [(480, 1333), (512, 1333), (544, 1333), (576, 1333), (608, 1333), (640, 1333), (672, 1333), (704, 1333), (736, 1333), (768, 1333), (800, 1333)], 'multiscale_mode': 'value', 'override': True, 'keep_ratio': True }]]), dict( type='Normalize', mean=[123.675, 
116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True), dict(type='Pad', size_divisor=32), dict(type='DefaultFormatBundle'), dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels', 'gt_masks']) ] test_pipeline = [ dict(type='LoadImageFromFile'), dict( type='MultiScaleFlipAug', img_scale=(800, 800), flip=False, transforms=[ dict(type='Resize', keep_ratio=True), dict(type='RandomFlip'), dict( type='Normalize', mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True), dict(type='Pad', size_divisor=32), dict(type='ImageToTensor', keys=['img']), dict(type='Collect', keys=['img']) ]) ] data = dict( samples_per_gpu=1, workers_per_gpu=2, train=dict( type='CocoDataset', ann_file='/home/ds/DOTA/DOTA.json', img_prefix='/home/ds/DOTA/800/images/', pipeline=[ dict(type='LoadImageFromFile'), dict(type='LoadAnnotations', with_bbox=True, with_mask=True), dict(type='RandomFlip', flip_ratio=0.5), dict( type='AutoAugment', policies=[[{ 'type': 'Resize', 'img_scale': [(480, 1333), (512, 1333), (544, 1333), (576, 1333), (608, 1333), (640, 1333), (672, 1333), (704, 1333), (736, 1333), (768, 1333), (800, 1333)], 'multiscale_mode': 'value', 'keep_ratio': True }], [{ 'type': 'Resize', 'img_scale': [(400, 1333), (500, 1333), (600, 1333)], 'multiscale_mode': 'value', 'keep_ratio': True }, { 'type': 'RandomCrop', 'crop_type': 'absolute_range', 'crop_size': (384, 600), 'allow_negative_crop': True }, { 'type': 'Resize', 'img_scale': [(480, 1333), (512, 1333), (544, 1333), (576, 1333), (608, 1333), (640, 1333), (672, 1333), (704, 1333), (736, 1333), (768, 1333), (800, 1333)], 'multiscale_mode': 'value', 'override': True, 'keep_ratio': True }]]), dict( type='Normalize', mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True), dict(type='Pad', size_divisor=32), dict(type='DefaultFormatBundle'), dict( type='Collect', keys=['img', 'gt_bboxes', 'gt_labels', 'gt_masks']) ]), val=dict( type='CocoDataset', ann_file='/home/ds/DOTA/DOTA.json', 
img_prefix='/home/ds/DOTA/800/images/', pipeline=[ dict(type='LoadImageFromFile'), dict( type='MultiScaleFlipAug', img_scale=(800, 800), flip=False, transforms=[ dict(type='Resize', keep_ratio=True), dict(type='RandomFlip'), dict( type='Normalize', mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True), dict(type='Pad', size_divisor=32), dict(type='ImageToTensor', keys=['img']), dict(type='Collect', keys=['img']) ]) ]), test=dict( type='CocoDataset', ann_file='/home/ds/DOTA/DOTA.json', img_prefix='/home/ds/DOTA/800/images/', pipeline=[ dict(type='LoadImageFromFile'), dict( type='MultiScaleFlipAug', img_scale=(800, 800), flip=False, transforms=[ dict(type='Resize', keep_ratio=True), dict(type='RandomFlip'), dict( type='Normalize', mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True), dict(type='Pad', size_divisor=32), dict(type='ImageToTensor', keys=['img']), dict(type='Collect', keys=['img']) ]) ])) evaluation = dict(metric=['bbox', 'segm']) optimizer = dict( type='AdamW', lr=0.0001, betas=(0.9, 0.999), weight_decay=0.05, paramwise_cfg=dict( custom_keys=dict( absolute_pos_embed=dict(decay_mult=0.0), relative_position_bias_table=dict(decay_mult=0.0), norm=dict(decay_mult=0.0)))) optimizer_config = dict( grad_clip=None, type='DistOptimizerHook', update_interval=1, coalesce=True, bucket_size_mb=-1, use_fp16=True) lr_config = dict( policy='step', warmup='linear', warmup_iters=500, warmup_ratio=0.001, step=[27, 33]) runner = dict(type='EpochBasedRunnerAmp', max_epochs=12) checkpoint_config = dict(interval=1) log_config = dict(interval=50, hooks=[dict(type='TextLoggerHook')]) custom_hooks = [dict(type='NumClassCheckHook')] dist_params = dict(backend='nccl') log_level = 'INFO' load_from = None resume_from = None workflow = [('train', 1)] fp16 = None work_dir = './work_dirs/mask_rcnn_swin_small_patch4_window7_mstrain_480-800_adamw_3x_coco' gpu_ids = range(0, 1)

2021-04-30 16:43:17,903 - mmdet - INFO - load model from: pretrained/swin_small_patch4_window7_224.pth 2021-04-30 16:43:18,315 - mmdet - WARNING - The model and loaded state dict do not match exactly

unexpected key in source state_dict: norm.weight, norm.bias, head.weight, head.bias, layers.0.blocks.1.attn_mask, layers.1.blocks.1.attn_mask, layers.2.blocks.1.attn_mask, layers.2.blocks.3.attn_mask, layers.2.blocks.5.attn_mask, layers.2.blocks.7.attn_mask, layers.2.blocks.9.attn_mask, layers.2.blocks.11.attn_mask, layers.2.blocks.13.attn_mask, layers.2.blocks.15.attn_mask, layers.2.blocks.17.attn_mask

missing keys in source state_dict: norm0.weight, norm0.bias, norm1.weight, norm1.bias, norm2.weight, norm2.bias, norm3.weight, norm3.bias

2021-04-30 16:43:23,735 - mmdet - INFO - Start running, host: ds@ds-zmv, work_dir: /home/ds/code_base/Swin-Transformer-Object-Detection-master/work_dirs/mask_rcnn_swin_small_patch4_window7_mstrain_480-800_adamw_3x_coco 2021-04-30 16:43:23,735 - mmdet - INFO - workflow: [('train', 1)], max: 12 epochs 2021-04-30 16:43:46,409 - mmdet - INFO - Epoch [1][50/21306] lr: 9.890e-06, eta: 1 day, 8:11:37, time: 0.453, data_time: 0.062, memory: 4643, loss_rpn_cls: 0.6752, loss_rpn_bbox: 0.1157, loss_cls: 1.6888, acc: 59.2578, loss_bbox: 0.0478, loss_mask: 0.8694, loss: 3.3970 2021-04-30 16:44:09,449 - mmdet - INFO - Epoch [1][100/21306] lr: 1.988e-05, eta: 1 day, 8:27:01, time: 0.461, data_time: 0.046, memory: 5737, loss_rpn_cls: 0.5354, loss_rpn_bbox: 0.0927, loss_cls: 0.3128, acc: 95.4453, loss_bbox: 0.0394, loss_mask: 0.6069, loss: 1.5873 2021-04-30 16:44:28,821 - mmdet - INFO - Epoch [1][150/21306] lr: 2.987e-05, eta: 1 day, 6:47:43, time: 0.387, data_time: 0.010, memory: 5737, loss_rpn_cls: 0.3781, loss_rpn_bbox: 0.0976, loss_cls: 0.2065, acc: 95.6992, loss_bbox: 0.0562, loss_mask: 0.4559, loss: 1.1944 2021-04-30 16:44:49,829 - mmdet - INFO - Epoch [1][200/21306] lr: 3.986e-05, eta: 1 day, 6:32:45, time: 0.420, data_time: 0.025, memory: 5737, loss_rpn_cls: 0.3576, loss_rpn_bbox: 0.1303, loss_cls: 0.3682, acc: 91.1953, loss_bbox: 0.1695, loss_mask: 0.4374, loss: 1.4629 2021-04-30 16:45:10,512 - mmdet - INFO - Epoch [1][250/21306] lr: 4.985e-05, eta: 1 day, 6:18:06, time: 0.414, data_time: 0.017, memory: 5737, loss_rpn_cls: 0.2253, loss_rpn_bbox: 0.0762, loss_cls: 0.3006, acc: 93.8125, loss_bbox: 0.1631, loss_mask: 0.4036, loss: 1.1688 2021-04-30 16:45:29,581 - mmdet - INFO - Epoch [1][300/21306] lr: 5.984e-05, eta: 1 day, 5:45:19, time: 0.381, data_time: 0.007, memory: 5737, loss_rpn_cls: 0.2112, loss_rpn_bbox: 0.1039, loss_cls: 0.3917, acc: 91.1523, loss_bbox: 0.2605, loss_mask: 0.4451, loss: 1.4124 2021-04-30 16:45:49,922 - mmdet - INFO - Epoch [1][350/21306] lr: 
6.983e-05, eta: 1 day, 5:37:16, time: 0.407, data_time: 0.018, memory: 5737, loss_rpn_cls: 0.2150, loss_rpn_bbox: 0.1069, loss_cls: 0.3294, acc: 92.2266, loss_bbox: 0.2004, loss_mask: 0.3659, loss: 1.2177 2021-04-30 16:46:09,013 - mmdet - INFO - Epoch [1][400/21306] lr: 7.982e-05, eta: 1 day, 5:17:52, time: 0.382, data_time: 0.010, memory: 5737, loss_rpn_cls: 0.2114, loss_rpn_bbox: 0.0950, loss_cls: 0.3481, acc: 91.6680, loss_bbox: 0.2058, loss_mask: 0.3928, loss: 1.2530 2021-04-30 16:46:28,983 - mmdet - INFO - Epoch [1][450/21306] lr: 8.981e-05, eta: 1 day, 5:11:00, time: 0.399, data_time: 0.009, memory: 5737, loss_rpn_cls: 0.1744, loss_rpn_bbox: 0.0986, loss_cls: 0.3765, acc: 91.8281, loss_bbox: 0.2650, loss_mask: 0.3543, loss: 1.2688 2021-04-30 16:46:48,980 - mmdet - INFO - Epoch [1][500/21306] lr: 9.980e-05, eta: 1 day, 5:05:41, time: 0.400, data_time: 0.016, memory: 5737, loss_rpn_cls: 0.1920, loss_rpn_bbox: 0.1307, loss_cls: 0.4430, acc: 88.2617, loss_bbox: 0.3553, loss_mask: 0.3898, loss: 1.5108 2021-04-30 16:47:10,576 - mmdet - INFO - Epoch [1][550/21306] lr: 1.000e-04, eta: 1 day, 5:13:37, time: 0.432, data_time: 0.021, memory: 5737, loss_rpn_cls: 0.1559, loss_rpn_bbox: 0.1154, loss_cls: 0.4082, acc: 89.4414, loss_bbox: 0.3372, loss_mask: 0.4380, loss: 1.4548 2021-04-30 16:47:31,738 - mmdet - INFO - Epoch [1][600/21306] lr: 1.000e-04, eta: 1 day, 5:17:06, time: 0.423, data_time: 0.023, memory: 5737, loss_rpn_cls: 0.2020, loss_rpn_bbox: 0.1366, loss_cls: 0.4348, acc: 88.4180, loss_bbox: 0.2813, loss_mask: 0.3627, loss: 1.4174 2021-04-30 16:47:50,597 - mmdet - INFO - Epoch [1][650/21306] lr: 1.000e-04, eta: 1 day, 5:04:57, time: 0.377, data_time: 0.010, memory: 5737, loss_rpn_cls: 0.1684, loss_rpn_bbox: 0.0853, loss_cls: 0.3858, acc: 89.8828, loss_bbox: 0.2886, loss_mask: 0.3843, loss: 1.3124 2021-04-30 16:48:10,535 - mmdet - INFO - Epoch [1][700/21306] lr: 1.000e-04, eta: 1 day, 5:01:02, time: 0.399, data_time: 0.016, memory: 5737, loss_rpn_cls: 0.1625, 
loss_rpn_bbox: 0.1089, loss_cls: 0.4761, acc: 87.4883, loss_bbox: 0.3826, loss_mask: 0.4025, loss: 1.5326 2021-04-30 16:48:30,053 - mmdet - INFO - Epoch [1][750/21306] lr: 1.000e-04, eta: 1 day, 4:55:12, time: 0.390, data_time: 0.008, memory: 5737, loss_rpn_cls: 0.1233, loss_rpn_bbox: 0.0821, loss_cls: 0.3691, acc: 89.4141, loss_bbox: 0.3223, loss_mask: 0.3882, loss: 1.2850 2021-04-30 16:48:49,254 - mmdet - INFO - Epoch [1][800/21306] lr: 1.000e-04, eta: 1 day, 4:48:23, time: 0.384, data_time: 0.012, memory: 5737, loss_rpn_cls: 0.1673, loss_rpn_bbox: 0.1004, loss_cls: 0.4539, acc: 88.8516, loss_bbox: 0.3543, loss_mask: 0.3961, loss: 1.4720 2021-04-30 16:49:08,144 - mmdet - INFO - Epoch [1][850/21306] lr: 1.000e-04, eta: 1 day, 4:40:47, time: 0.378, data_time: 0.011, memory: 5737, loss_rpn_cls: 0.1161, loss_rpn_bbox: 0.0837, loss_cls: 0.3776, acc: 90.4062, loss_bbox: 0.2869, loss_mask: 0.4194, loss: 1.2839 2021-04-30 16:49:28,820 - mmdet - INFO - Epoch [1][900/21306] lr: 1.000e-04, eta: 1 day, 4:42:25, time: 0.414, data_time: 0.023, memory: 5737, loss_rpn_cls: 0.1202, loss_rpn_bbox: 0.0972, loss_cls: 0.4380, acc: 88.5898, loss_bbox: 0.3504, loss_mask: 0.3959, loss: 1.4017 2021-04-30 16:49:48,799 - mmdet - INFO - Epoch [1][950/21306] lr: 1.000e-04, eta: 1 day, 4:40:43, time: 0.400, data_time: 0.015, memory: 5737, loss_rpn_cls: 0.1384, loss_rpn_bbox: 0.0940, loss_cls: 0.4897, acc: 88.3047, loss_bbox: 0.3499, loss_mask: 0.3674, loss: 1.4394 2021-04-30 16:50:10,098 - mmdet - INFO - Exp name: mask_rcnn_swin_small_patch4_window7_mstrain_480-800_adamw_3x_coco.py 2021-04-30 16:50:10,098 - mmdet - INFO - Epoch [1][1000/21306] lr: 1.000e-04, eta: 1 day, 4:44:46, time: 0.426, data_time: 0.017, memory: 5737, loss_rpn_cls: 0.1420, loss_rpn_bbox: 0.1062, loss_cls: 0.4146, acc: 88.7031, loss_bbox: 0.3501, loss_mask: 0.3843, loss: 1.3972 2021-04-30 16:50:30,530 - mmdet - INFO - Epoch [1][1050/21306] lr: 1.000e-04, eta: 1 day, 4:44:53, time: 0.409, data_time: 0.014, memory: 5737, 
loss_rpn_cls: 0.1245, loss_rpn_bbox: 0.0624, loss_cls: 0.4010, acc: 88.8906, loss_bbox: 0.3014, loss_mask: 0.3468, loss: 1.2360 2021-04-30 16:50:51,189 - mmdet - INFO - Epoch [1][1100/21306] lr: 1.000e-04, eta: 1 day, 4:45:50, time: 0.413, data_time: 0.014, memory: 5737, loss_rpn_cls: 0.1480, loss_rpn_bbox: 0.0858, loss_cls: 0.3865, acc: 88.4297, loss_bbox: 0.3645, loss_mask: 0.3507, loss: 1.3354 2021-04-30 16:51:10,939 - mmdet - INFO - Epoch [1][1150/21306] lr: 1.000e-04, eta: 1 day, 4:43:20, time: 0.395, data_time: 0.013, memory: 5737, loss_rpn_cls: 0.1191, loss_rpn_bbox: 0.0665, loss_cls: 0.3268, acc: 90.5195, loss_bbox: 0.2741, loss_mask: 0.3653, loss: 1.1517 2021-04-30 16:51:29,668 - mmdet - INFO - Epoch [1][1200/21306] lr: 1.000e-04, eta: 1 day, 4:37:24, time: 0.375, data_time: 0.010, memory: 5737, loss_rpn_cls: 0.0974, loss_rpn_bbox: 0.0770, loss_cls: 0.3449, acc: 89.6406, loss_bbox: 0.3273, loss_mask: 0.3673, loss: 1.2138 2021-04-30 16:51:50,950 - mmdet - INFO - Epoch [1][1250/21306] lr: 1.000e-04, eta: 1 day, 4:40:34, time: 0.426, data_time: 0.016, memory: 5737, loss_rpn_cls: 0.1030, loss_rpn_bbox: 0.0874, loss_cls: 0.3812, acc: 89.1641, loss_bbox: 0.3138, loss_mask: 0.3795, loss: 1.2650 2021-04-30 16:52:11,706 - mmdet - INFO - Epoch [1][1300/21306] lr: 1.000e-04, eta: 1 day, 4:41:45, time: 0.415, data_time: 0.016, memory: 5737, loss_rpn_cls: 0.1081, loss_rpn_bbox: 0.0748, loss_cls: 0.3342, acc: 89.6367, loss_bbox: 0.3203, loss_mask: 0.3901, loss: 1.2274 2021-04-30 16:52:31,156 - mmdet - INFO - Epoch [1][1350/21306] lr: 1.000e-04, eta: 1 day, 4:38:44, time: 0.389, data_time: 0.017, memory: 5737, loss_rpn_cls: 0.1230, loss_rpn_bbox: 0.0699, loss_cls: 0.3083, acc: 91.1211, loss_bbox: 0.2575, loss_mask: 0.3478, loss: 1.1064 2021-04-30 16:52:52,868 - mmdet - INFO - Epoch [1][1400/21306] lr: 1.000e-04, eta: 1 day, 4:42:45, time: 0.434, data_time: 0.028, memory: 5737, loss_rpn_cls: 0.1096, loss_rpn_bbox: 0.0987, loss_cls: 0.4365, acc: 86.7148, loss_bbox: 0.4093, 
loss_mask: 0.3603, loss: 1.4144 2021-04-30 16:53:13,472 - mmdet - INFO - Epoch [1][1450/21306] lr: 1.000e-04, eta: 1 day, 4:43:13, time: 0.412, data_time: 0.018, memory: 5737, loss_rpn_cls: 0.0971, loss_rpn_bbox: 0.0745, loss_cls: 0.3540, acc: 89.2383, loss_bbox: 0.3331, loss_mask: 0.3351, loss: 1.1938 2021-04-30 16:53:34,320 - mmdet - INFO - Epoch [1][1500/21306] lr: 1.000e-04, eta: 1 day, 4:44:19, time: 0.417, data_time: 0.016, memory: 5737, loss_rpn_cls: 0.0979, loss_rpn_bbox: 0.0828, loss_cls: 0.3408, acc: 90.2461, loss_bbox: 0.3027, loss_mask: 0.3750, loss: 1.1990 2021-04-30 16:53:55,083 - mmdet - INFO - Epoch [1][1550/21306] lr: 1.000e-04, eta: 1 day, 4:45:06, time: 0.415, data_time: 0.014, memory: 5737, loss_rpn_cls: 0.0781, loss_rpn_bbox: 0.0742, loss_cls: 0.3872, acc: 87.4023, loss_bbox: 0.3822, loss_mask: 0.3291, loss: 1.2508 2021-04-30 16:54:16,051 - mmdet - INFO - Epoch [1][1600/21306] lr: 1.000e-04, eta: 1 day, 4:46:22, time: 0.419, data_time: 0.021, memory: 5737, loss_rpn_cls: 0.0964, loss_rpn_bbox: 0.0789, loss_cls: 0.3909, acc: 87.2500, loss_bbox: 0.3988, loss_mask: 0.3297, loss: 1.2947 2021-04-30 16:54:36,138 - mmdet - INFO - Epoch [1][1650/21306] lr: 1.000e-04, eta: 1 day, 4:45:15, time: 0.402, data_time: 0.013, memory: 5737, loss_rpn_cls: 0.1300, loss_rpn_bbox: 0.0847, loss_cls: 0.3254, acc: 89.8047, loss_bbox: 0.3239, loss_mask: 0.3627, loss: 1.2268 2021-04-30 16:54:56,157 - mmdet - INFO - Epoch [1][1700/21306] lr: 1.000e-04, eta: 1 day, 4:44:02, time: 0.400, data_time: 0.010, memory: 5737, loss_rpn_cls: 0.1344, loss_rpn_bbox: 0.0855, loss_cls: 0.3569, acc: 89.0430, loss_bbox: 0.3668, loss_mask: 0.3629, loss: 1.3066 2021-04-30 16:55:16,011 - mmdet - INFO - Epoch [1][1750/21306] lr: 1.000e-04, eta: 1 day, 4:42:27, time: 0.397, data_time: 0.014, memory: 5737, loss_rpn_cls: 0.1063, loss_rpn_bbox: 0.0756, loss_cls: 0.3223, acc: 89.2891, loss_bbox: 0.3462, loss_mask: 0.3379, loss: 1.1884 2021-04-30 16:55:36,129 - mmdet - INFO - Epoch [1][1800/21306] 
lr: 1.000e-04, eta: 1 day, 4:41:34, time: 0.402, data_time: 0.014, memory: 5737, loss_rpn_cls: 0.1168, loss_rpn_bbox: 0.0622, loss_cls: 0.3183, acc: 89.7031, loss_bbox: 0.3284, loss_mask: 0.3735, loss: 1.1992 2021-04-30 16:55:57,035 - mmdet - INFO - Epoch [1][1850/21306] lr: 1.000e-04, eta: 1 day, 4:42:31, time: 0.418, data_time: 0.016, memory: 5737, loss_rpn_cls: 0.1502, loss_rpn_bbox: 0.1255, loss_cls: 0.4343, acc: 85.7070, loss_bbox: 0.4205, loss_mask: 0.3414, loss: 1.4717 2021-04-30 16:56:18,937 - mmdet - INFO - Epoch [1][1900/21306] lr: 1.000e-04, eta: 1 day, 4:45:36, time: 0.438, data_time: 0.022, memory: 5737, loss_rpn_cls: 0.1068, loss_rpn_bbox: 0.1059, loss_cls: 0.4277, acc: 87.2187, loss_bbox: 0.3930, loss_mask: 0.3477, loss: 1.3811 2021-04-30 16:56:40,518 - mmdet - INFO - Epoch [1][1950/21306] lr: 1.000e-04, eta: 1 day, 4:47:50, time: 0.432, data_time: 0.020, memory: 5737, loss_rpn_cls: 0.1063, loss_rpn_bbox: 0.0932, loss_cls: 0.4130, acc: 87.1016, loss_bbox: 0.4352, loss_mask: 0.3560, loss: 1.4037 2021-04-30 16:57:01,316 - mmdet - INFO - Exp name: mask_rcnn_swin_small_patch4_window7_mstrain_480-800_adamw_3x_coco.py 2021-04-30 16:57:01,316 - mmdet - INFO - Epoch [1][2000/21306] lr: 1.000e-04, eta: 1 day, 4:48:16, time: 0.416, data_time: 0.014, memory: 5737, loss_rpn_cls: 0.1005, loss_rpn_bbox: 0.0788, loss_cls: 0.3227, acc: 89.7031, loss_bbox: 0.3518, loss_mask: 0.3478, loss: 1.2017 2021-04-30 16:57:21,427 - mmdet - INFO - Epoch [1][2050/21306] lr: 1.000e-04, eta: 1 day, 4:47:15, time: 0.402, data_time: 0.015, memory: 5737, loss_rpn_cls: 0.1008, loss_rpn_bbox: 0.0810, loss_cls: 0.2912, acc: 90.6016, loss_bbox: 0.2889, loss_mask: 0.3628, loss: 1.1247 2021-04-30 16:57:42,035 - mmdet - INFO - Epoch [1][2100/21306] lr: 1.000e-04, eta: 1 day, 4:47:16, time: 0.412, data_time: 0.010, memory: 5737, loss_rpn_cls: 0.1316, loss_rpn_bbox: 0.0825, loss_cls: 0.3279, acc: 90.1758, loss_bbox: 0.3207, loss_mask: 0.3584, loss: 1.2211 2021-04-30 16:58:02,381 - mmdet - INFO 
- Epoch [1][2150/21306] lr: 1.000e-04, eta: 1 day, 4:46:45, time: 0.407, data_time: 0.014, memory: 5737, loss_rpn_cls: 0.1134, loss_rpn_bbox: 0.0789, loss_cls: 0.3364, acc: 89.8477, loss_bbox: 0.3267, loss_mask: 0.3703, loss: 1.2257 2021-04-30 16:58:22,204 - mmdet - INFO - Epoch [1][2200/21306] lr: 1.000e-04, eta: 1 day, 4:45:14, time: 0.396, data_time: 0.017, memory: 5737, loss_rpn_cls: 0.1134, loss_rpn_bbox: 0.0705, loss_cls: 0.2986, acc: 90.9961, loss_bbox: 0.2869, loss_mask: 0.3637, loss: 1.1331 2021-04-30 16:58:41,110 - mmdet - INFO - Epoch [1][2250/21306] lr: 1.000e-04, eta: 1 day, 4:42:03, time: 0.378, data_time: 0.005, memory: 5737, loss_rpn_cls: 0.0943, loss_rpn_bbox: 0.0667, loss_cls: 0.2806, acc: 91.3438, loss_bbox: 0.3066, loss_mask: 0.3231, loss: 1.0714 2021-04-30 16:59:02,045 - mmdet - INFO - Epoch [1][2300/21306] lr: 1.000e-04, eta: 1 day, 4:42:43, time: 0.419, data_time: 0.027, memory: 5737, loss_rpn_cls: 0.1028, loss_rpn_bbox: 0.0664, loss_cls: 0.3184, acc: 89.1758, loss_bbox: 0.3378, loss_mask: 0.3764, loss: 1.2018 2021-04-30 16:59:22,679 - mmdet - INFO - Epoch [1][2350/21306] lr: 1.000e-04, eta: 1 day, 4:42:48, time: 0.413, data_time: 0.009, memory: 5737, loss_rpn_cls: 0.0914, loss_rpn_bbox: 0.0754, loss_cls: 0.3666, acc: 88.7852, loss_bbox: 0.3623, loss_mask: 0.3417, loss: 1.2374 2021-04-30 16:59:42,440 - mmdet - INFO - Epoch [1][2400/21306] lr: 1.000e-04, eta: 1 day, 4:41:20, time: 0.395, data_time: 0.017, memory: 5737, loss_rpn_cls: 0.1181, loss_rpn_bbox: 0.0894, loss_cls: 0.3738, acc: 86.7891, loss_bbox: 0.3990, loss_mask: 0.3439, loss: 1.3243 2021-04-30 17:00:02,698 - mmdet - INFO - Epoch [1][2450/21306] lr: 1.000e-04, eta: 1 day, 4:40:46, time: 0.405, data_time: 0.012, memory: 5737, loss_rpn_cls: 0.1010, loss_rpn_bbox: 0.0852, loss_cls: 0.3944, acc: 86.5713, loss_bbox: 0.4285, loss_mask: 0.3462, loss: 1.3552 2021-04-30 17:00:24,660 - mmdet - INFO - Epoch [1][2500/21306] lr: 1.000e-04, eta: 1 day, 4:43:05, time: 0.439, data_time: 0.026, 
memory: 5737, loss_rpn_cls: 0.0980, loss_rpn_bbox: 0.0921, loss_cls: 0.3787, acc: 87.6992, loss_bbox: 0.3752, loss_mask: 0.3299, loss: 1.2740 2021-04-30 17:00:44,417 - mmdet - INFO - Epoch [1][2550/21306] lr: 1.000e-04, eta: 1 day, 4:41:39, time: 0.395, data_time: 0.010, memory: 5737, loss_rpn_cls: 0.0964, loss_rpn_bbox: 0.0816, loss_cls: 0.2864, acc: 90.8867, loss_bbox: 0.3216, loss_mask: 0.3602, loss: 1.1462 2021-04-30 17:01:04,405 - mmdet - INFO - Epoch [1][2600/21306] lr: 1.000e-04, eta: 1 day, 4:40:38, time: 0.400, data_time: 0.005, memory: 5737, loss_rpn_cls: 0.1054, loss_rpn_bbox: 0.0667, loss_cls: 0.2956, acc: 90.6406, loss_bbox: 0.3548, loss_mask: 0.3502, loss: 1.1726 2021-04-30 17:01:23,890 - mmdet - INFO - Epoch [1][2650/21306] lr: 1.000e-04, eta: 1 day, 4:38:51, time: 0.390, data_time: 0.013, memory: 5737, loss_rpn_cls: 0.0819, loss_rpn_bbox: 0.0656, loss_cls: 0.3245, acc: 88.7891, loss_bbox: 0.3463, loss_mask: 0.3627, loss: 1.1810 2021-04-30 17:01:46,654 - mmdet - INFO - Epoch [1][2700/21306] lr: 1.000e-04, eta: 1 day, 4:42:14, time: 0.455, data_time: 0.028, memory: 5737, loss_rpn_cls: 0.0979, loss_rpn_bbox: 0.1034, loss_cls: 0.3648, acc: 87.6484, loss_bbox: 0.4013, loss_mask: 0.3477, loss: 1.3151 2021-04-30 17:02:06,997 - mmdet - INFO - Epoch [1][2750/21306] lr: 1.000e-04, eta: 1 day, 4:41:46, time: 0.407, data_time: 0.011, memory: 5737, loss_rpn_cls: 0.0941, loss_rpn_bbox: 0.0841, loss_cls: 0.3174, acc: 88.8906, loss_bbox: 0.3430, loss_mask: 0.3304, loss: 1.1690 2021-04-30 17:02:28,024 - mmdet - INFO - Epoch [1][2800/21306] lr: 1.000e-04, eta: 1 day, 4:42:20, time: 0.421, data_time: 0.020, memory: 5737, loss_rpn_cls: 0.1151, loss_rpn_bbox: 0.0805, loss_cls: 0.3449, acc: 88.5586, loss_bbox: 0.3796, loss_mask: 0.3313, loss: 1.2514 2021-04-30 17:02:48,795 - mmdet - INFO - Epoch [1][2850/21306] lr: 1.000e-04, eta: 1 day, 4:42:29, time: 0.415, data_time: 0.023, memory: 5737, loss_rpn_cls: 0.0763, loss_rpn_bbox: 0.0906, loss_cls: 0.3194, acc: 88.7227, 
loss_bbox: 0.3650, loss_mask: 0.3327, loss: 1.1840

yan9qu commented 3 years ago

After Epoch [1][2850/21306], there was a CUDA OOM error. I used Mask R-CNN last year, and the memory usage didn't change at all. Is this log normal?

ZwwWayne commented 3 years ago

The log seems to be normal. If you check the logs released in our model zoo, you will observe a similar phenomenon.

shinya7y commented 3 years ago

DETR-like random data augmentation tends to increase memory usage at random times. Sparse R-CNN logs (RandomCrop False vs. True) are good examples.

Giving up on using RandomCrop or setting use_checkpoint=True will avoid OOM.

yan9qu commented 3 years ago

Thank you all. The problem was the huge number of bboxes.

ramanathan831 commented 3 years ago

@chris075966 How can the problem be solved when a large number of bboxes is needed, then?

yan9qu commented 3 years ago
    if bboxes1.shape[0] == 0 or bboxes2.shape[0] == 0:
        raise ValueError('No gt or bboxes')
    gpu_assign_thr = 400
    assign_on_cpu = True if (gpu_assign_thr > 0) and (
        bboxes1.shape[0] > gpu_assign_thr) else False
    # compute overlap and assign gt on CPU when number of GT is large
    if assign_on_cpu:
        bboxes1 = bboxes1.cpu().detach().numpy()
        bboxes2 = bboxes2.cpu().detach().numpy()
        tl = np.maximum(bboxes1[:, None, :2], bboxes2[:, :2])
        br = np.minimum(bboxes1[:, None, 2:], bboxes2[:, 2:])

        iw = (br - tl + 1)[:, :, 0]
        ih = (br - tl + 1)[:, :, 1]
        iw[iw < 0] = 0
        ih[ih < 0] = 0
        overlaps = iw * ih

        area1 = (bboxes1[:, 2] - bboxes1[:, 0] + 1) * (
            bboxes1[:, 3] - bboxes1[:, 1] + 1)

        if mode == 'iou':
            area2 = (bboxes2[:, 2] - bboxes2[:, 0] + 1) * (
                bboxes2[:, 3] - bboxes2[:, 1] + 1)
            ious = overlaps / (area1[:, None] + area2 - overlaps)
        else:
            ious = overlaps / (area1[:, None])
        ious = torch.from_numpy(ious).cuda()         
    else:
        lt = torch.max(bboxes1[:, None, :2], bboxes2[:, :2])  # [rows, cols, 2]
        rb = torch.min(bboxes1[:, None, 2:], bboxes2[:, 2:])  # [rows, cols, 2]

        wh = (rb - lt + 1).clamp(min=0)  # [rows, cols, 2]
        overlap = wh[:, :, 0] * wh[:, :, 1]
        area1 = (bboxes1[:, 2] - bboxes1[:, 0] + 1) * (
            bboxes1[:, 3] - bboxes1[:, 1] + 1)

        if mode == 'iou':
            area2 = (bboxes2[:, 2] - bboxes2[:, 0] + 1) * (
                bboxes2[:, 3] - bboxes2[:, 1] + 1)
            ious = overlap / (area1[:, None] + area2 - overlap)
        else:
            ious = overlap / (area1[:, None])
yan9qu commented 3 years ago

I modified Geometry.py as shown above to solve the problem.

yan9qu commented 3 years ago

@chris075966 How can the problem be solved when a large number of bboxes is needed, then?

ramanathan831 commented 3 years ago

@chris075966 I can't find geometry.py. Which directory is it in?