OpenGVLab / InternImage

[CVPR 2023 Highlight] InternImage: Exploring Large-Scale Vision Foundation Models with Deformable Convolutions
https://arxiv.org/abs/2211.05778

Poor performance when training the model on a 2-class, highly imbalanced dataset #158

Open newasu opened 1 year ago

newasu commented 1 year ago

Hi, I've run into a problem when training the model on a dataset that contains 2 classes, i.e., 'bg' and 'corrosion'. Most of the pixels are bg (background).
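To quantify the imbalance, here is a rough sketch that counts how often each label value appears in the annotation masks (the path matches my data_root/ann_dir above; the script is only an illustrative check, not part of the training code):

import numpy as np
from PIL import Image
from pathlib import Path

# Count how often each label value (0 = bg, 1 = corrosion, 255 = ignore)
# appears across the annotation PNGs. Path is illustrative.
ann_dir = Path('/root/dataset/corrosion/prepared/annotations')

counts = {}
for mask_path in sorted(ann_dir.glob('*.png')):
    mask = np.array(Image.open(mask_path))
    values, freqs = np.unique(mask, return_counts=True)
    for v, f in zip(values, freqs):
        counts[int(v)] = counts.get(int(v), 0) + int(f)

total = sum(counts.values())
for v, f in sorted(counts.items()):
    print(f'label {v}: {f} pixels ({100.0 * f / total:.2f}%)')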

After the first 100 iterations I got these performance metrics:

2023-05-19 14:46:10,250 - mmseg - INFO - Iter [100/100000] lr: 3.956e-06, eta: 3 days, 10:30:32, time: 3.068, data_time: 0.194, memory: 12331, decode.loss_ce: 0.0392, decode.acc_seg: 76.7968, aux.loss_ce: 0.0427, aux.acc_seg: 19.7051, loss: 0.0819
[>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>] 49/49, 0.7 task/s, elapsed: 69s, ETA: 0s
2023-05-19 14:47:19,316 - mmseg - INFO - per class results:
2023-05-19 14:47:19,317 - mmseg - INFO -
+-----------+-------+-------+--------+-----------+--------+
| Class     | IoU   | Acc   | Fscore | Precision | Recall |
+-----------+-------+-------+--------+-----------+--------+
| bg        | 58.73 | 64.02 | 74.0   | 87.65     | 64.02  |
| corrosion | 1.52  | 7.15  | 2.99   | 1.89      | 7.15   |
+-----------+-------+-------+--------+-----------+--------+

and then, after 4,600 iterations, the model performed worse than at the first evaluation:

2023-05-19 19:19:15,285 - mmseg - INFO - Iter [4600/100000] lr: 5.724e-05, eta: 4 days, 0:06:17, time: 3.051, data_time: 0.186, memory: 12331, decode.loss_ce: 0.0000, decode.acc_seg: 99.9998, aux.loss_ce: 0.0000, aux.acc_seg: 99.9945, loss: 0.0000
[>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>] 49/49, 0.7 task/s, elapsed: 69s, ETA: 0s
2023-05-19 19:20:24,408 - mmseg - INFO - per class results:
2023-05-19 19:20:24,409 - mmseg - INFO -
+-----------+-------+-------+--------+-----------+--------+
| Class     | IoU   | Acc   | Fscore | Precision | Recall |
+-----------+-------+-------+--------+-----------+--------+
| bg        | 59.75 | 65.55 | 74.8   | 87.09     | 65.55  |
| corrosion | 0.0   | 0.0   | nan    | 0.0       | 0.0    |
+-----------+-------+-------+--------+-----------+--------+

So, this is my config

# mmseg (0.x) dataset registry and base class
from mmseg.datasets import DATASETS, CustomDataset


@DATASETS.register_module(force=True)
class WKDataset(CustomDataset):
    CLASSES = ('bg', 'corrosion')
    PALETTE = [[0, 0, 0], [0, 255, 0]]

    def __init__(self, **kwargs):
        super().__init__(
            img_suffix='.JPG',
            seg_map_suffix='.png',
            reduce_zero_label=False,
            **kwargs)

Config:

norm_cfg = dict(type='BN', requires_grad=True)
model = dict(
    type='EncoderDecoder',
    pretrained='open-mmlab://resnet50_v1c',
    backbone=dict(
        type='InternImage',
        core_op='DCNv3',
        channels=64,
        depths=[4, 4, 18, 4],
        groups=[4, 8, 16, 32],
        mlp_ratio=4.0,
        drop_path_rate=0.2,
        norm_layer='LN',
        layer_scale=1.0,
        offset_scale=1.0,
        post_norm=False,
        with_cp=False,
        out_indices=(0, 1, 2, 3),
        init_cfg=dict(
            type='Pretrained',
            checkpoint='https://huggingface.co/OpenGVLab/InternImage/resolve/main/internimage_t_1k_224.pth'),
        norm_cfg=dict(type='BN', requires_grad=True),
        pretrained='open-mmlab://resnet50_v1c'),
    decode_head=dict(
        type='UPerHead',
        in_channels=[64, 128, 256, 512],
        in_index=[0, 1, 2, 3],
        pool_scales=(1, 2, 3, 6),
        channels=512,
        dropout_ratio=0.1,
        num_classes=2,
        norm_cfg=dict(type='BN', requires_grad=True),
        align_corners=False,
        loss_decode=dict(
            type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0)),
    auxiliary_head=dict(
        type='FCNHead',
        in_channels=256,
        in_index=2,
        channels=256,
        num_convs=1,
        concat_input=False,
        dropout_ratio=0.1,
        num_classes=2,
        norm_cfg=dict(type='BN', requires_grad=True),
        align_corners=False,
        loss_decode=dict(
            type='CrossEntropyLoss', use_sigmoid=True, loss_weight=0.4)),
    train_cfg=dict(),
    test_cfg=dict(mode='whole'))
dataset_type = 'WKDataset'
data_root = '/root/dataset/corrosion/prepared'
img_norm_cfg = dict(
    mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True)
crop_size = (512, 512)
train_pipeline = [
    dict(type='LoadImageFromFile'),
    dict(type='LoadAnnotations', reduce_zero_label=True),
    dict(type='Resize', img_scale=(2048, 512), ratio_range=(0.5, 2.0)),
    dict(type='RandomCrop', crop_size=(512, 512), cat_max_ratio=0.75),
    dict(type='RandomFlip', prob=0.5),
    dict(type='PhotoMetricDistortion'),
    dict(type='Normalize',
         mean=[123.675, 116.28, 103.53],
         std=[58.395, 57.12, 57.375],
         to_rgb=True),
    dict(type='Pad', size=(512, 512), pad_val=0, seg_pad_val=255),
    dict(type='DefaultFormatBundle'),
    dict(type='Collect', keys=['img', 'gt_semantic_seg'])
]
test_pipeline = [
    dict(type='LoadImageFromFile'),
    dict(type='MultiScaleFlipAug',
         img_scale=(2048, 512),
         flip=False,
         transforms=[
             dict(type='Resize', keep_ratio=True),
             dict(type='ResizeToMultiple', size_divisor=32),
             dict(type='RandomFlip'),
             dict(type='Normalize',
                  mean=[123.675, 116.28, 103.53],
                  std=[58.395, 57.12, 57.375],
                  to_rgb=True),
             dict(type='ImageToTensor', keys=['img']),
             dict(type='Collect', keys=['img'])
         ])
]
data = dict(
    samples_per_gpu=6,
    workers_per_gpu=6,
    train=dict(
        type='WKDataset',
        data_root='/root/dataset/corrosion/prepared',
        img_dir='images',
        ann_dir='annotations',
        pipeline=train_pipeline,
        split='splits/training.txt'),
    val=dict(
        type='WKDataset',
        data_root='/root/dataset/corrosion/prepared',
        img_dir='images',
        ann_dir='annotations',
        pipeline=test_pipeline,
        split='splits/validation.txt'),
    test=dict(
        type='WKDataset',
        data_root='/root/dataset/corrosion/prepared',
        img_dir='images',
        ann_dir='annotations',
        pipeline=test_pipeline,
        split='splits/test.txt'))
log_config = dict(
    interval=50, hooks=[dict(type='TextLoggerHook', by_epoch=False)])
dist_params = dict(backend='nccl')
log_level = 'INFO'
load_from = '/root/corr_1/InternImage/segmentation/checkpoint/upernet_internimage_t_512_160k_ade20k.pth'
resume_from = None
workflow = [('train', 1)]
cudnn_benchmark = True
optimizer = dict(
    type='AdamW',
    lr=6e-05,
    betas=(0.9, 0.999),
    weight_decay=0.05,
    constructor='CustomLayerDecayOptimizerConstructor',
    paramwise_cfg=dict(
        num_layers=30, layer_decay_rate=1.0, depths=[4, 4, 18, 4]))
optimizer_config = dict(type='OptimizerHook')
lr_config = dict(
    warmup='linear',
    warmup_iters=1500,
    warmup_ratio=1e-06,
    power=1.0,
    min_lr=0.0,
    by_epoch=False,
    type='PolyLrUpdaterHook')
runner = dict(type='IterBasedRunner', max_iters=100000)
checkpoint_config = dict(
    by_epoch=False, interval=100000, max_keep_ckpts=1, type='CheckpointHook')
evaluation = dict(
    interval=100,
    metric=['mIoU', 'mFscore'],
    pre_eval=True,
    save_best='mIoU',
    by_epoch=False)
pretrained = 'https://huggingface.co/OpenGVLab/InternImage/resolve/main/internimage_t_1k_224.pth'
work_dir = '/root/corr_1/run_2'
seed = 0
gpu_ids = range(0, 1)
device = 'cuda'

So, how can I configure the model to train on this dataset?

ntirupathirao18 commented 1 year ago

Hi, can you please refer to this doc link? I hope it helps. InternImage uses the mmsegmentation library for training, and mmsegmentation sets 255 as the default ignore index during both training and evaluation. Try reassigning any label that currently ends up as 255 to a different value, and see the linked doc for details.
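One possible reading of that suggestion, as a sketch against the config above rather than an official fix: with CLASSES = ('bg', 'corrosion') and bg stored as 0 in the masks, reduce_zero_label=True in the train pipeline remaps bg to 255 (which is then ignored) and corrosion to 0, which no longer matches the dataset's reduce_zero_label=False or the 2-class heads. Keeping reduce_zero_label=False in the pipeline and, optionally, down-weighting the dominant bg class in the loss could look roughly like this (use_sigmoid=False and the class_weight values are illustrative choices, not tuned):

# Replaces the corresponding entries in the config above.
train_pipeline = [
    dict(type='LoadImageFromFile'),
    # Keep label 0 (bg) as a real class instead of remapping it to the
    # ignore index 255.
    dict(type='LoadAnnotations', reduce_zero_label=False),
    dict(type='Resize', img_scale=(2048, 512), ratio_range=(0.5, 2.0)),
    dict(type='RandomCrop', crop_size=(512, 512), cat_max_ratio=0.75),
    dict(type='RandomFlip', prob=0.5),
    dict(type='PhotoMetricDistortion'),
    dict(type='Normalize',
         mean=[123.675, 116.28, 103.53],
         std=[58.395, 57.12, 57.375],
         to_rgb=True),
    dict(type='Pad', size=(512, 512), pad_val=0, seg_pad_val=255),
    dict(type='DefaultFormatBundle'),
    dict(type='Collect', keys=['img', 'gt_semantic_seg'])
]

# Illustrative loss settings for the decode head: softmax cross-entropy with
# a class_weight that down-weights bg relative to corrosion.
loss_decode = dict(
    type='CrossEntropyLoss',
    use_sigmoid=False,
    class_weight=[0.2, 1.0],  # [bg, corrosion], not tuned
    loss_weight=1.0)

The auxiliary head's loss_decode would need the same change, and whether class weighting actually helps is something to verify against the per-class IoU/Fscore on the validation split.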