open-mmlab / mmocr

OpenMMLab Text Detection, Recognition and Understanding Toolbox
https://mmocr.readthedocs.io/en/dev-1.x/
Apache License 2.0

FCENet performance on CTW1500 without DCN #446

Closed Wei-ucas closed 3 years ago

Wei-ucas commented 3 years ago

I trained an FCENet model with a ResNet-50 backbone (without DCN) on CTW1500. The paper reports 83.1 hmean for this setting, but after 1500 epochs of training the best I got is 81.7, and many easy text instances still do not get a good boundary. How can I reach the performance reported in the paper? Is the DCN backbone really that important for this model?

The config I used is below; it only changes the backbone of https://github.com/open-mmlab/mmocr/blob/main/configs/textdet/fcenet/fcenet_r50dcnv2_fpn_1500e_ctw1500.py (the DCN settings it drops are sketched after the dump for comparison). Is any other modification needed?

fourier_degree = 5
model = dict(
    type='FCENet',
    backbone=dict(
        type='ResNet',
        depth=50,
        num_stages=4,
        out_indices=(1, 2, 3),
        frozen_stages=-1,
        norm_cfg=dict(type='BN', requires_grad=True),
        init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet50'),
        norm_eval=False,
        style='pytorch'),
    neck=dict(
        type='FPN',
        in_channels=[512, 1024, 2048],
        out_channels=256,
        add_extra_convs='on_output',
        num_outs=3,
        relu_before_extra_convs=True,
        act_cfg=None),
    bbox_head=dict(
        type='FCEHead',
        in_channels=256,
        scales=(8, 16, 32),
        loss=dict(type='FCELoss'),
        fourier_degree=5))
train_cfg = None
test_cfg = None
dataset_type = 'IcdarDataset'
data_root = 'data/ctw1500/'
img_norm_cfg = dict(
    mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True)
train_pipeline = [
    dict(type='LoadImageFromFile'),
    dict(
        type='LoadTextAnnotations',
        with_bbox=True,
        with_mask=True,
        poly2mask=False),
    dict(
        type='ColorJitter',
        brightness=0.12549019607843137,
        saturation=0.5,
        contrast=0.5),
    dict(
        type='Normalize',
        mean=[123.675, 116.28, 103.53],
        std=[58.395, 57.12, 57.375],
        to_rgb=True),
    dict(type='RandomScaling', size=800, scale=(0.75, 2.5)),
    dict(
        type='RandomCropFlip', crop_ratio=0.5, iter_num=1, min_area_ratio=0.2),
    dict(
        type='RandomCropPolyInstances',
        instance_key='gt_masks',
        crop_ratio=0.8,
        min_side_ratio=0.3),
    dict(
        type='RandomRotatePolyInstances',
        rotate_ratio=0.5,
        max_angle=30,
        pad_with_fixed_color=False),
    dict(type='SquareResizePad', target_size=800, pad_ratio=0.6),
    dict(type='RandomFlip', flip_ratio=0.5, direction='horizontal'),
    dict(type='Pad', size_divisor=32),
    dict(
        type='FCENetTargets',
        fourier_degree=5,
        level_proportion_range=((0, 0.25), (0.2, 0.65), (0.55, 1.0))),
    dict(
        type='CustomFormatBundle',
        keys=['p3_maps', 'p4_maps', 'p5_maps'],
        visualize=dict(flag=False, boundary_key=None)),
    dict(type='Collect', keys=['img', 'p3_maps', 'p4_maps', 'p5_maps'])
]
test_pipeline = [
    dict(type='LoadImageFromFile'),
    dict(
        type='MultiScaleFlipAug',
        img_scale=(1080, 736),
        flip=False,
        transforms=[
            dict(type='Resize', img_scale=(1280, 800), keep_ratio=True),
            dict(
                type='Normalize',
                mean=[123.675, 116.28, 103.53],
                std=[58.395, 57.12, 57.375],
                to_rgb=True),
            dict(type='Pad', size_divisor=32),
            dict(type='ImageToTensor', keys=['img']),
            dict(type='Collect', keys=['img'])
        ])
]
data = dict(
    samples_per_gpu=6,
    workers_per_gpu=6,
    val_dataloader=dict(samples_per_gpu=1),
    test_dataloader=dict(samples_per_gpu=1),
    train=dict(
        type='IcdarDataset',
        ann_file='data/ctw1500//instances_training.json',
        img_prefix='data/ctw1500//imgs',
        pipeline=[
            dict(type='LoadImageFromFile'),
            dict(
                type='LoadTextAnnotations',
                with_bbox=True,
                with_mask=True,
                poly2mask=False),
            dict(
                type='ColorJitter',
                brightness=0.12549019607843137,
                saturation=0.5,
                contrast=0.5),
            dict(
                type='Normalize',
                mean=[123.675, 116.28, 103.53],
                std=[58.395, 57.12, 57.375],
                to_rgb=True),
            dict(type='RandomScaling', size=800, scale=(0.75, 2.5)),
            dict(
                type='RandomCropFlip',
                crop_ratio=0.5,
                iter_num=1,
                min_area_ratio=0.2),
            dict(
                type='RandomCropPolyInstances',
                instance_key='gt_masks',
                crop_ratio=0.8,
                min_side_ratio=0.3),
            dict(
                type='RandomRotatePolyInstances',
                rotate_ratio=0.5,
                max_angle=30,
                pad_with_fixed_color=False),
            dict(type='SquareResizePad', target_size=800, pad_ratio=0.6),
            dict(type='RandomFlip', flip_ratio=0.5, direction='horizontal'),
            dict(type='Pad', size_divisor=32),
            dict(
                type='FCENetTargets',
                fourier_degree=5,
                level_proportion_range=((0, 0.25), (0.2, 0.65), (0.55, 1.0))),
            dict(
                type='CustomFormatBundle',
                keys=['p3_maps', 'p4_maps', 'p5_maps'],
                visualize=dict(flag=False, boundary_key=None)),
            dict(
                type='Collect', keys=['img', 'p3_maps', 'p4_maps', 'p5_maps'])
        ]),
    val=dict(
        type='IcdarDataset',
        ann_file='data/ctw1500//instances_test.json',
        img_prefix='data/ctw1500//imgs',
        pipeline=[
            dict(type='LoadImageFromFile'),
            dict(
                type='MultiScaleFlipAug',
                img_scale=(1080, 736),
                flip=False,
                transforms=[
                    dict(
                        type='Resize', img_scale=(1280, 800), keep_ratio=True),
                    dict(
                        type='Normalize',
                        mean=[123.675, 116.28, 103.53],
                        std=[58.395, 57.12, 57.375],
                        to_rgb=True),
                    dict(type='Pad', size_divisor=32),
                    dict(type='ImageToTensor', keys=['img']),
                    dict(type='Collect', keys=['img'])
                ])
        ]),
    test=dict(
        type='IcdarDataset',
        ann_file='data/ctw1500//instances_test.json',
        img_prefix='data/ctw1500//imgs',
        pipeline=[
            dict(type='LoadImageFromFile'),
            dict(
                type='MultiScaleFlipAug',
                img_scale=(1080, 736),
                flip=False,
                transforms=[
                    dict(
                        type='Resize', img_scale=(1280, 800), keep_ratio=True),
                    dict(
                        type='Normalize',
                        mean=[123.675, 116.28, 103.53],
                        std=[58.395, 57.12, 57.375],
                        to_rgb=True),
                    dict(type='Pad', size_divisor=32),
                    dict(type='ImageToTensor', keys=['img']),
                    dict(type='Collect', keys=['img'])
                ])
        ]))
evaluation = dict(interval=5, metric='hmean-iou')
optimizer = dict(type='SGD', lr=0.001, momentum=0.9, weight_decay=0.0005)
optimizer_config = dict(grad_clip=None)
lr_config = dict(policy='poly', power=0.9, min_lr=1e-07, by_epoch=True)
total_epochs = 1500
checkpoint_config = dict(interval=5)
log_config = dict(interval=20, hooks=[dict(type='TextLoggerHook')])
dist_params = dict(backend='nccl')
log_level = 'INFO'
load_from = None
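
For comparison, the main difference in the upstream fcenet_r50dcnv2_fpn_1500e_ctw1500.py config is that its backbone enables DCNv2 in the last three ResNet stages. A minimal sketch of that backbone block, recalled from the upstream config rather than quoted from this thread (double-check the exact deform_groups value against the repository):

backbone=dict(
    type='ResNet',
    depth=50,
    num_stages=4,
    out_indices=(1, 2, 3),
    frozen_stages=-1,
    norm_cfg=dict(type='BN', requires_grad=True),
    init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet50'),
    norm_eval=False,
    style='pytorch',
    # Deformable convolution (DCNv2) in stages 2-4; this is what the
    # config above removes relative to the upstream file.
    dcn=dict(type='DCNv2', deform_groups=2, fallback_on_stride=False),
    stage_with_dcn=(False, True, True, True))
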
Zyq-scut commented 3 years ago

Hi, you can adjust the 'img_scale' parameter used at test time; it affects the performance a lot. Training for more epochs may also improve the results.
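
As a concrete sketch of that suggestion (the scale below is only an example value to sweep, not one reported in this thread), the change amounts to editing img_scale in the test pipeline. With MultiScaleFlipAug, the outer img_scale is normally the one applied, since it injects the scale that the nested Resize then uses:

test_pipeline = [
    dict(type='LoadImageFromFile'),
    dict(
        type='MultiScaleFlipAug',
        # Example value only: try several resolutions, e.g. (1080, 736),
        # (1280, 800), (1600, 1000), and keep the one with the best hmean-iou.
        img_scale=(1280, 800),
        flip=False,
        transforms=[
            dict(type='Resize', img_scale=(1280, 800), keep_ratio=True),
            dict(
                type='Normalize',
                mean=[123.675, 116.28, 103.53],
                std=[58.395, 57.12, 57.375],
                to_rgb=True),
            dict(type='Pad', size_divisor=32),
            dict(type='ImageToTensor', keys=['img']),
            dict(type='Collect', keys=['img'])
        ])
]
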

gaotongxiao commented 3 years ago

I'm closing this issue as there is no further update.