OpenGVLab / InternImage

[CVPR 2023 Highlight] InternImage: Exploring Large-Scale Vision Foundation Models with Deformable Convolutions
https://arxiv.org/abs/2211.05778

Poor performance when training the model on a 2-class, highly imbalanced dataset #158

Open newasu opened 1 year ago

newasu commented 1 year ago

Hi, I've run into a problem when training the model on a dataset that contains 2 classes, i.e., 'bg' and 'corrosion'. Most of the pixels are bg (background).
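To quantify the imbalance, here is a rough sketch that counts how often each label value appears in the annotation masks (the path matches my data_root/ann_dir above; the script is only an illustrative check, not part of the training code):

import numpy as np
from PIL import Image
from pathlib import Path

# Count how often each label value (0 = bg, 1 = corrosion, 255 = ignore)
# appears across the annotation PNGs. Path is illustrative.
ann_dir = Path('/root/dataset/corrosion/prepared/annotations')

counts = {}
for mask_path in sorted(ann_dir.glob('*.png')):
    mask = np.array(Image.open(mask_path))
    values, freqs = np.unique(mask, return_counts=True)
    for v, f in zip(values, freqs):
        counts[int(v)] = counts.get(int(v), 0) + int(f)

total = sum(counts.values())
for v, f in sorted(counts.items()):
    print(f'label {v}: {f} pixels ({100.0 * f / total:.2f}%)')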

After the first 100 iterations I got these performance metrics:

2023-05-19 14:46:10,250 - mmseg - INFO - Iter [100/100000] lr: 3.956e-06, eta: 3 days, 10:30:32, time: 3.068, data_time: 0.194, memory: 12331, decode.loss_ce: 0.0392, decode.acc_seg: 76.7968, aux.loss_ce: 0.0427, aux.acc_seg: 19.7051, loss: 0.0819
[>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>] 49/49, 0.7 task/s, elapsed: 69s, ETA: 0s
2023-05-19 14:47:19,316 - mmseg - INFO - per class results:
2023-05-19 14:47:19,317 - mmseg - INFO -
+-----------+-------+-------+--------+-----------+--------+
| Class     | IoU   | Acc   | Fscore | Precision | Recall |
+-----------+-------+-------+--------+-----------+--------+
| bg        | 58.73 | 64.02 | 74.0   | 87.65     | 64.02  |
| corrosion | 1.52  | 7.15  | 2.99   | 1.89      | 7.15   |
+-----------+-------+-------+--------+-----------+--------+

and then, after 4,600 iterations, the model performed worse than at the first evaluation:

2023-05-19 19:19:15,285 - mmseg - INFO - Iter [4600/100000] lr: 5.724e-05, eta: 4 days, 0:06:17, time: 3.051, data_time: 0.186, memory: 12331, decode.loss_ce: 0.0000, decode.acc_seg: 99.9998, aux.loss_ce: 0.0000, aux.acc_seg: 99.9945, loss: 0.0000
[>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>] 49/49, 0.7 task/s, elapsed: 69s, ETA: 0s
2023-05-19 19:20:24,408 - mmseg - INFO - per class results:
2023-05-19 19:20:24,409 - mmseg - INFO -
+-----------+-------+-------+--------+-----------+--------+
| Class     | IoU   | Acc   | Fscore | Precision | Recall |
+-----------+-------+-------+--------+-----------+--------+
| bg        | 59.75 | 65.55 | 74.8   | 87.09     | 65.55  |
| corrosion | 0.0   | 0.0   | nan    | 0.0       | 0.0    |
+-----------+-------+-------+--------+-----------+--------+

So, this is my config

# mmseg (0.x) dataset registry and base class
from mmseg.datasets import DATASETS, CustomDataset


@DATASETS.register_module(force=True)
class WKDataset(CustomDataset):
    CLASSES = ('bg', 'corrosion')
    PALETTE = [[0, 0, 0], [0, 255, 0]]

    def __init__(self, **kwargs):
        super().__init__(
            img_suffix='.JPG',
            seg_map_suffix='.png',
            reduce_zero_label=False,
            **kwargs)

Config:

norm_cfg = dict(type='BN', requires_grad=True)
model = dict(
    type='EncoderDecoder',
    pretrained='open-mmlab://resnet50_v1c',
    backbone=dict(
        type='InternImage',
        core_op='DCNv3',
        channels=64,
        depths=[4, 4, 18, 4],
        groups=[4, 8, 16, 32],
        mlp_ratio=4.0,
        drop_path_rate=0.2,
        norm_layer='LN',
        layer_scale=1.0,
        offset_scale=1.0,
        post_norm=False,
        with_cp=False,
        out_indices=(0, 1, 2, 3),
        init_cfg=dict(
            type='Pretrained',
            checkpoint='https://huggingface.co/OpenGVLab/InternImage/resolve/main/internimage_t_1k_224.pth'),
        norm_cfg=dict(type='BN', requires_grad=True),
        pretrained='open-mmlab://resnet50_v1c'),
    decode_head=dict(
        type='UPerHead',
        in_channels=[64, 128, 256, 512],
        in_index=[0, 1, 2, 3],
        pool_scales=(1, 2, 3, 6),
        channels=512,
        dropout_ratio=0.1,
        num_classes=2,
        norm_cfg=dict(type='BN', requires_grad=True),
        align_corners=False,
        loss_decode=dict(
            type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0)),
    auxiliary_head=dict(
        type='FCNHead',
        in_channels=256,
        in_index=2,
        channels=256,
        num_convs=1,
        concat_input=False,
        dropout_ratio=0.1,
        num_classes=2,
        norm_cfg=dict(type='BN', requires_grad=True),
        align_corners=False,
        loss_decode=dict(
            type='CrossEntropyLoss', use_sigmoid=True, loss_weight=0.4)),
    train_cfg=dict(),
    test_cfg=dict(mode='whole'))
dataset_type = 'WKDataset'
data_root = '/root/dataset/corrosion/prepared'
img_norm_cfg = dict(
    mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True)
crop_size = (512, 512)
train_pipeline = [
    dict(type='LoadImageFromFile'),
    dict(type='LoadAnnotations', reduce_zero_label=True),
    dict(type='Resize', img_scale=(2048, 512), ratio_range=(0.5, 2.0)),
    dict(type='RandomCrop', crop_size=(512, 512), cat_max_ratio=0.75),
    dict(type='RandomFlip', prob=0.5),
    dict(type='PhotoMetricDistortion'),
    dict(type='Normalize',
         mean=[123.675, 116.28, 103.53],
         std=[58.395, 57.12, 57.375],
         to_rgb=True),
    dict(type='Pad', size=(512, 512), pad_val=0, seg_pad_val=255),
    dict(type='DefaultFormatBundle'),
    dict(type='Collect', keys=['img', 'gt_semantic_seg'])
]
test_pipeline = [
    dict(type='LoadImageFromFile'),
    dict(type='MultiScaleFlipAug',
         img_scale=(2048, 512),
         flip=False,
         transforms=[
             dict(type='Resize', keep_ratio=True),
             dict(type='ResizeToMultiple', size_divisor=32),
             dict(type='RandomFlip'),
             dict(type='Normalize',
                  mean=[123.675, 116.28, 103.53],
                  std=[58.395, 57.12, 57.375],
                  to_rgb=True),
             dict(type='ImageToTensor', keys=['img']),
             dict(type='Collect', keys=['img'])
         ])
]
data = dict(
    samples_per_gpu=6,
    workers_per_gpu=6,
    train=dict(
        type='WKDataset',
        data_root='/root/dataset/corrosion/prepared',
        img_dir='images',
        ann_dir='annotations',
        pipeline=train_pipeline,
        split='splits/training.txt'),
    val=dict(
        type='WKDataset',
        data_root='/root/dataset/corrosion/prepared',
        img_dir='images',
        ann_dir='annotations',
        pipeline=test_pipeline,
        split='splits/validation.txt'),
    test=dict(
        type='WKDataset',
        data_root='/root/dataset/corrosion/prepared',
        img_dir='images',
        ann_dir='annotations',
        pipeline=test_pipeline,
        split='splits/test.txt'))
log_config = dict(
    interval=50, hooks=[dict(type='TextLoggerHook', by_epoch=False)])
dist_params = dict(backend='nccl')
log_level = 'INFO'
load_from = '/root/corr_1/InternImage/segmentation/checkpoint/upernet_internimage_t_512_160k_ade20k.pth'
resume_from = None
workflow = [('train', 1)]
cudnn_benchmark = True
optimizer = dict(
    type='AdamW',
    lr=6e-05,
    betas=(0.9, 0.999),
    weight_decay=0.05,
    constructor='CustomLayerDecayOptimizerConstructor',
    paramwise_cfg=dict(
        num_layers=30, layer_decay_rate=1.0, depths=[4, 4, 18, 4]))
optimizer_config = dict(type='OptimizerHook')
lr_config = dict(
    warmup='linear',
    warmup_iters=1500,
    warmup_ratio=1e-06,
    power=1.0,
    min_lr=0.0,
    by_epoch=False,
    type='PolyLrUpdaterHook')
runner = dict(type='IterBasedRunner', max_iters=100000)
checkpoint_config = dict(
    by_epoch=False, interval=100000, max_keep_ckpts=1, type='CheckpointHook')
evaluation = dict(
    interval=100,
    metric=['mIoU', 'mFscore'],
    pre_eval=True,
    save_best='mIoU',
    by_epoch=False)
pretrained = 'https://huggingface.co/OpenGVLab/InternImage/resolve/main/internimage_t_1k_224.pth'
work_dir = '/root/corr_1/run_2'
seed = 0
gpu_ids = range(0, 1)
device = 'cuda'

So, how can I configure the model to train on this dataset?

ntirupathirao18 commented 1 year ago

Hi, can you please refer to this doc link? I hope it helps. InternImage uses the mmsegmentation library for training, and mmsegmentation sets 255 as the default ignore index during both training and evaluation. Try reassigning any label that currently ends up as 255 to a different value, and see the linked doc for details.
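One possible reading of that suggestion, as a sketch against the config above rather than an official fix: with CLASSES = ('bg', 'corrosion') and bg stored as 0 in the masks, reduce_zero_label=True in the train pipeline remaps bg to 255 (which is then ignored) and corrosion to 0, which no longer matches the dataset's reduce_zero_label=False or the 2-class heads. Keeping reduce_zero_label=False in the pipeline and, optionally, down-weighting the dominant bg class in the loss could look roughly like this (use_sigmoid=False and the class_weight values are illustrative choices, not tuned):

# Replaces the corresponding entries in the config above.
train_pipeline = [
    dict(type='LoadImageFromFile'),
    # Keep label 0 (bg) as a real class instead of remapping it to the
    # ignore index 255.
    dict(type='LoadAnnotations', reduce_zero_label=False),
    dict(type='Resize', img_scale=(2048, 512), ratio_range=(0.5, 2.0)),
    dict(type='RandomCrop', crop_size=(512, 512), cat_max_ratio=0.75),
    dict(type='RandomFlip', prob=0.5),
    dict(type='PhotoMetricDistortion'),
    dict(type='Normalize',
         mean=[123.675, 116.28, 103.53],
         std=[58.395, 57.12, 57.375],
         to_rgb=True),
    dict(type='Pad', size=(512, 512), pad_val=0, seg_pad_val=255),
    dict(type='DefaultFormatBundle'),
    dict(type='Collect', keys=['img', 'gt_semantic_seg'])
]

# Illustrative loss settings for the decode head: softmax cross-entropy with
# a class_weight that down-weights bg relative to corrosion.
loss_decode = dict(
    type='CrossEntropyLoss',
    use_sigmoid=False,
    class_weight=[0.2, 1.0],  # [bg, corrosion], not tuned
    loss_weight=1.0)

The auxiliary head's loss_decode would need the same change, and whether class weighting actually helps is something to verify against the per-class IoU/Fscore on the validation split.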