Sense-X / Co-DETR

[ICCV 2023] DETRs with Collaborative Hybrid Assignments Training
MIT License

Unable to reproduce Co-DINO results on LVIS benchmark - Config guidance needed #33

Closed kxqt closed 1 year ago

kxqt commented 1 year ago

Hello, thanks for your awesome work!

I am trying to reproduce the results of Co-DINO on the LVIS benchmark. Unfortunately, the config file is not released, and I am unable to achieve the claimed AP of 56.9. With my config, I can only reach an AP of 55.9.

I have added the following 3 files to projects/configs/co_dino/ and used projects/configs/co_dino/co_dino_5scale_lsj_swin_large_1x_lvis.py as the config file.

Here are the config files I added:

* projects/configs/co_dino/co_dino_5scale_r50_1x_lvis.py
```python
# loss-weight multipliers used below (values as in the repo's co_dino COCO configs)
num_dec_layer = 6
lambda_2 = 2.0

model = dict(
    type='CoDETR',
    backbone=dict(
        type='ResNet',
        depth=50,
        num_stages=4,
        out_indices=(0, 1, 2, 3),
        frozen_stages=1,
        norm_cfg=dict(type='BN', requires_grad=False),
        norm_eval=True,
        style='pytorch',
        init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet50')),
    neck=dict(
        type='ChannelMapper',
        in_channels=[256, 512, 1024, 2048],
        kernel_size=1,
        out_channels=256,
        act_cfg=None,
        norm_cfg=dict(type='GN', num_groups=32),
        num_outs=5),
    rpn_head=dict(
        type='RPNHead',
        in_channels=256,
        feat_channels=256,
        anchor_generator=dict(
            type='AnchorGenerator',
            octave_base_scale=4,
            scales_per_octave=3,
            ratios=[0.5, 1.0, 2.0],
            strides=[4, 8, 16, 32, 64, 128]),
        bbox_coder=dict(
            type='DeltaXYWHBBoxCoder',
            target_means=[.0, .0, .0, .0],
            target_stds=[1.0, 1.0, 1.0, 1.0]),
        loss_cls=dict(
            type='CrossEntropyLoss',
            use_sigmoid=True,
            loss_weight=1.0*num_dec_layer*lambda_2),
        loss_bbox=dict(type='L1Loss', loss_weight=1.0*num_dec_layer*lambda_2)),
    query_head=dict(
        type='CoDINOHead',
        num_query=900,
        num_classes=1203,
        num_feature_levels=5,
        in_channels=2048,
        sync_cls_avg_factor=True,
        as_two_stage=True,
        with_box_refine=True,
        mixed_selection=True,
        dn_cfg=dict(
            type='CdnQueryGenerator',
            noise_scale=dict(label=0.5, box=1.0),  # 0.5, 0.4 for DN-DETR
            group_cfg=dict(dynamic=True, num_groups=None, num_dn_queries=100)),
        transformer=dict(
            type='CoDinoTransformer',
            with_pos_coord=True,
            with_coord_feat=False,
            num_co_heads=2,
            num_feature_levels=5,
            encoder=dict(
                type='DetrTransformerEncoder',
                num_layers=6,
                with_cp=4,  # number of layers that use checkpoint
                transformerlayers=dict(
                    type='BaseTransformerLayer',
                    attn_cfgs=dict(
                        type='MultiScaleDeformableAttention',
                        embed_dims=256,
                        num_levels=5,
                        dropout=0.0),
                    feedforward_channels=2048,
                    ffn_dropout=0.0,
                    operation_order=('self_attn', 'norm', 'ffn', 'norm'))),
            decoder=dict(
                type='DinoTransformerDecoder',
                num_layers=6,
                return_intermediate=True,
                transformerlayers=dict(
                    type='DetrTransformerDecoderLayer',
                    attn_cfgs=[
                        dict(
                            type='MultiheadAttention',
                            embed_dims=256,
                            num_heads=8,
                            dropout=0.0),
                        dict(
                            type='MultiScaleDeformableAttention',
                            embed_dims=256,
                            num_levels=5,
                            dropout=0.0),
                    ],
                    feedforward_channels=2048,
                    ffn_dropout=0.0,
                    operation_order=('self_attn', 'norm', 'cross_attn', 'norm',
                                     'ffn', 'norm')))),
        positional_encoding=dict(
            type='SinePositionalEncoding',
            num_feats=128,
            temperature=20,
            normalize=True),
        loss_cls=dict(
            type='QualityFocalLoss',
            use_sigmoid=True,
            beta=2.0,
            loss_weight=1.0),
        loss_bbox=dict(type='L1Loss', loss_weight=5.0),
        loss_iou=dict(type='GIoULoss', loss_weight=2.0)),
    roi_head=[dict(
        type='CoStandardRoIHead',
        bbox_roi_extractor=dict(
            type='SingleRoIExtractor',
            roi_layer=dict(type='RoIAlign', output_size=7, sampling_ratio=0),
            out_channels=256,
            featmap_strides=[4, 8, 16, 32, 64],
            finest_scale=56),
        bbox_head=dict(
            type='Shared2FCBBoxHead',
            in_channels=256,
            fc_out_channels=1024,
            roi_feat_size=7,
            num_classes=1203,
            bbox_coder=dict(
                type='DeltaXYWHBBoxCoder',
                target_means=[0., 0., 0., 0.],
                target_stds=[0.1, 0.1, 0.2, 0.2]),
            reg_class_agnostic=False,
            reg_decoded_bbox=True,
            loss_cls=dict(
                type='CrossEntropyLoss',
                use_sigmoid=False,
                loss_weight=1.0*num_dec_layer*lambda_2),
            loss_bbox=dict(
                type='GIoULoss',
                loss_weight=10.0*num_dec_layer*lambda_2)))],
    bbox_head=[dict(
        type='CoATSSHead',
        num_classes=1203,
        in_channels=256,
        stacked_convs=1,
        feat_channels=256,
        anchor_generator=dict(
            type='AnchorGenerator',
            ratios=[1.0],
            octave_base_scale=8,
            scales_per_octave=1,
            strides=[4, 8, 16, 32, 64, 128]),
        bbox_coder=dict(
            type='DeltaXYWHBBoxCoder',
            target_means=[.0, .0, .0, .0],
            target_stds=[0.1, 0.1, 0.2, 0.2]),
        loss_cls=dict(
            type='FocalLoss',
            use_sigmoid=True,
            gamma=2.0,
            alpha=0.25,
            loss_weight=1.0*num_dec_layer*lambda_2),
        loss_bbox=dict(
            type='GIoULoss',
            loss_weight=2.0*num_dec_layer*lambda_2),
        loss_centerness=dict(
            type='CrossEntropyLoss',
            use_sigmoid=True,
            loss_weight=1.0*num_dec_layer*lambda_2))],
    # model training and testing settings
train_cfg=[
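    # one-to-one Hungarian matching for the CoDINOHead query head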
    dict(
        assigner=dict(
            type='HungarianAssigner',
            cls_cost=dict(type='FocalLossCost', weight=2.0),
            reg_cost=dict(type='BBoxL1Cost', weight=5.0, box_format='xywh'),
            iou_cost=dict(type='IoUCost', iou_mode='giou', weight=2.0))),
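    # assigner/sampler for the auxiliary CoStandardRoIHead (RPN and R-CNN stages)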
    dict(
        rpn=dict(
            assigner=dict(
                type='MaxIoUAssigner',
                pos_iou_thr=0.7,
                neg_iou_thr=0.3,
                min_pos_iou=0.3,
                match_low_quality=True,
                ignore_iof_thr=-1),
            sampler=dict(
                type='RandomSampler',
                num=256,
                pos_fraction=0.5,
                neg_pos_ub=-1,
                add_gt_as_proposals=False),
            allowed_border=-1,
            pos_weight=-1,
            debug=False),
        rpn_proposal=dict(
            nms_pre=4000,
            max_per_img=1000,
            nms=dict(type='nms', iou_threshold=0.7),
            min_bbox_size=0),
        rcnn=dict(
            assigner=dict(
                type='MaxIoUAssigner',
                pos_iou_thr=0.5,
                neg_iou_thr=0.5,
                min_pos_iou=0.5,
                match_low_quality=False,
                ignore_iof_thr=-1),
            sampler=dict(
                type='RandomSampler',
                num=512,
                pos_fraction=0.25,
                neg_pos_ub=-1,
                add_gt_as_proposals=True),
            pos_weight=-1,
            debug=False)),
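    # ATSS assignment for the auxiliary CoATSSHead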
    dict(
        assigner=dict(type='ATSSAssigner', topk=9),
        allowed_border=-1,
        pos_weight=-1,
        debug=False),],
test_cfg=[
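    # test-time settings per head, in the same order: query head, RoI head, ATSS head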
    dict(
        max_per_img=1000,
        nms=dict(type='soft_nms', iou_threshold=0.8)
    ),
    dict(
        rpn=dict(
            nms_pre=1000,
            max_per_img=1000,
            nms=dict(type='nms', iou_threshold=0.7),
            min_bbox_size=0),
        rcnn=dict(
            score_thr=0.0,
            nms=dict(type='nms', iou_threshold=0.5),
            max_per_img=100)),
    dict(
        nms_pre=1000,
        min_bbox_size=0,
        score_thr=0.0,
        nms=dict(type='nms', iou_threshold=0.6),
        max_per_img=100),
    # soft-nms is also supported for rcnn testing
    # e.g., nms=dict(type='soft_nms', iou_threshold=0.5, min_score=0.05)
])

find_unused_parameters = True

fp16 = dict(loss_scale=dict(init_scale=512))

img_norm_cfg = dict(
    mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True)

# train_pipeline, NOTE the img_scale and the Pad's size_divisor are different
# from the default setting in mmdet.
train_pipeline = [
    dict(type='LoadImageFromFile'),
    dict(type='LoadAnnotations', with_bbox=True),
    dict(type='RandomFlip', flip_ratio=0.5),
    dict(
        type='AutoAugment',
        policies=[
            [
                dict(
                    type='Resize',
                    img_scale=[(480, 1333), (512, 1333), (544, 1333),
                               (576, 1333), (608, 1333), (640, 1333),
                               (672, 1333), (704, 1333), (736, 1333),
                               (768, 1333), (800, 1333)],
                    multiscale_mode='value',
                    keep_ratio=True)
            ],
            [
                dict(
                    type='Resize',
                    # The ratio of all images in the train dataset is < 7
                    # follow the original impl
                    img_scale=[(400, 4200), (500, 4200), (600, 4200)],
                    multiscale_mode='value',
                    keep_ratio=True),
                dict(
                    type='RandomCrop',
                    crop_type='absolute_range',
                    crop_size=(384, 600),
                    allow_negative_crop=True),
                dict(
                    type='Resize',
                    img_scale=[(480, 1333), (512, 1333), (544, 1333),
                               (576, 1333), (608, 1333), (640, 1333),
                               (672, 1333), (704, 1333), (736, 1333),
                               (768, 1333), (800, 1333)],
                    multiscale_mode='value',
                    override=True,
                    keep_ratio=True)
            ]
        ]),
    dict(type='Normalize', **img_norm_cfg),
    dict(type='Pad', size_divisor=1),
    dict(type='DefaultFormatBundle'),
    dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels'])
]

# test_pipeline, NOTE the Pad's size_divisor is different from the default
# setting (size_divisor=32). There is little effect on the performance
# whether we use the default setting or size_divisor=1.
test_pipeline = [
    dict(type='LoadImageFromFile'),
    dict(
        type='MultiScaleFlipAug',
        img_scale=(1333, 800),
        flip=False,
        transforms=[
            dict(type='Resize', keep_ratio=True),
            dict(type='RandomFlip'),
            dict(type='Normalize', **img_norm_cfg),
            dict(type='Pad', size_divisor=1),
            dict(type='ImageToTensor', keys=['img']),
            dict(type='Collect', keys=['img'])
        ])
]

data = dict(
    samples_per_gpu=2,
    workers_per_gpu=2,
    train=dict(filter_empty_gt=False, pipeline=train_pipeline),
    val=dict(pipeline=test_pipeline),
    test=dict(pipeline=test_pipeline))

# optimizer
optimizer = dict(
    type='AdamW',
    lr=2e-4,
    weight_decay=0.0001,
    # custom_keys of sampling_offsets and reference_points in DeformDETR
    paramwise_cfg=dict(custom_keys={'backbone': dict(lr_mult=0.1)}))
optimizer_config = dict(grad_clip=dict(max_norm=0.1, norm_type=2))

# learning policy
lr_config = dict(policy='step', step=[11])
runner = dict(type='EpochBasedRunner', max_epochs=12)

# NOTE: `auto_scale_lr` is for automatically scaling LR,
# USER SHOULD NOT CHANGE ITS VALUES.
# base_batch_size = (8 GPUs) x (2 samples per GPU)
auto_scale_lr = dict(base_batch_size=16)
```


* projects/configs/co_dino/co_dino_5scale_lsj_r50_1x_lvis.py
```python
_base_ = [
    'co_dino_5scale_r50_1x_lvis.py'
]
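# with LSJ, every image is padded to the same fixed size, so the image
# attention mask can be dropped (cf. the author's comments below)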

model = dict(with_attn_mask=False)

img_norm_cfg = dict(
    mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True)

image_size = (1536, 1536)
load_pipeline = [
    dict(type='LoadImageFromFile'),
    dict(type='LoadAnnotations', with_bbox=True, with_mask=True),
    dict(
        type='Resize',
        img_scale=image_size,
        ratio_range=(0.1, 2.0),
        multiscale_mode='range',
        keep_ratio=True),
    dict(
        type='RandomCrop',
        crop_type='absolute_range',
        crop_size=image_size,
        recompute_bbox=True,
        allow_negative_crop=True),
    dict(type='FilterAnnotations', min_gt_bbox_wh=(1e-2, 1e-2)),
    dict(type='RandomFlip', flip_ratio=0.5),
    dict(type='Pad', size=image_size, pad_val=dict(img=(114, 114, 114))),
]
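# CopyPaste needs instance masks, hence with_mask=True above and 'gt_masks' in Collect below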
train_pipeline = [
    dict(type='CopyPaste', max_num_pasted=100),
    dict(type='Normalize', **img_norm_cfg),
    dict(type='DefaultFormatBundle'),
    dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels', 'gt_masks']),
]
test_pipeline = [
    dict(type='LoadImageFromFile'),
    dict(
        type='MultiScaleFlipAug',
        img_scale=image_size,
        flip=False,
        transforms=[
            dict(type='Resize', keep_ratio=True),
            dict(type='RandomFlip'),
            dict(type='Pad', size=image_size, pad_val=dict(img=(114, 114, 114))),
            dict(type='Normalize', **img_norm_cfg),
            dict(type='ImageToTensor', keys=['img']),
            dict(type='Collect', keys=['img'])
        ])
]
dataset_type = 'LVISV1Dataset'
data_root = 'data/lvis_v1/'
img_data_root = 'data/coco/'
data = dict(
    samples_per_gpu=2,
    workers_per_gpu=2,
    train=dict(
        type='MultiImageMixDataset',
        dataset=dict(
            type=dataset_type,
            ann_file=data_root + 'lvis_v1_train.json',
            img_prefix=img_data_root,
            filter_empty_gt=False,
            pipeline=load_pipeline),
        pipeline=train_pipeline),
    val=dict(pipeline=test_pipeline),
    test=dict(pipeline=test_pipeline))

```

* projects/configs/co_dino/co_dino_5scale_lsj_swin_large_1x_lvis.py
```python
img_norm_cfg = dict(
    mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True)
image_size = (1536, 1536)

load_pipeline = [
    dict(type='LoadImageFromFile'),
    dict(type='LoadAnnotations', with_bbox=True, with_mask=True),
    dict(
        type='Resize',
        img_scale=image_size,
        ratio_range=(0.1, 2.0),
        multiscale_mode='range',
        keep_ratio=True),
    dict(
        type='RandomCrop',
        crop_type='absolute_range',
        crop_size=image_size,
        recompute_bbox=True,
        allow_negative_crop=True),
    dict(type='FilterAnnotations', min_gt_bbox_wh=(1e-2, 1e-2)),
    dict(type='RandomFlip', flip_ratio=0.5),
    dict(type='Pad', size=image_size, pad_val=dict(img=(114, 114, 114))),
]
train_pipeline = [
    dict(type='CopyPaste', max_num_pasted=100),
    dict(type='Normalize', **img_norm_cfg),
    dict(type='DefaultFormatBundle'),
    dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels', 'gt_masks']),
]
test_pipeline = [
    dict(type='LoadImageFromFile'),
    dict(
        type='MultiScaleFlipAug',
        img_scale=image_size,
        flip=False,
        transforms=[
            dict(type='Resize', keep_ratio=True),
            dict(type='RandomFlip'),
            dict(type='Pad', size=image_size, pad_val=dict(img=(114, 114, 114))),
            dict(type='Normalize', **img_norm_cfg),
            dict(type='ImageToTensor', keys=['img']),
            dict(type='Collect', keys=['img'])
        ])
]
dataset_type = 'LVISV1Dataset'
data_root = 'data/lvis_v1/'
img_data_root = 'data/coco/'
data = dict(
    samples_per_gpu=1,
    workers_per_gpu=1,
    train=dict(
        type='MultiImageMixDataset',
        dataset=dict(
            type=dataset_type,
            ann_file=data_root + 'lvis_v1_train.json',
            img_prefix=img_data_root,
            filter_empty_gt=False,
            pipeline=load_pipeline),
        pipeline=train_pipeline),
    val=dict(pipeline=test_pipeline),
    test=dict(pipeline=test_pipeline))
```



I would be grateful if the author could provide some insights or guidance on how to achieve the claimed result of AP=56.9. Any help would be highly appreciated!

Thank you!
TempleX98 commented 1 year ago

Hi, thanks for your interest in our work. There are some differences:

  1. Use fedloss as the DETR classification loss
  2. Image resolution 1280x1280
  3. Repeat factor sampling (RFS, sketched below)
  4. Set with_attn_mask=False to remove the image attention mask when using LSJ

We will release LVIS training codes in several days.
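For reference, a sketch (an assumption, not the released config) of how difference 3 could be written in an mmdet 2.x config: repeat factor sampling on LVIS is usually obtained by wrapping the dataset in a `ClassBalancedDataset`, nested inside the `MultiImageMixDataset` used above (`load_pipeline`/`train_pipeline` refer to the variables from the configs earlier in this thread):

```python
# RFS sketch: hypothetical wiring, reusing load_pipeline/train_pipeline
# defined in the LSJ configs above.
data = dict(
    train=dict(
        type='MultiImageMixDataset',
        dataset=dict(
            type='ClassBalancedDataset',
            oversample_thr=1e-3,  # repeat-factor threshold t from the LVIS paper
            dataset=dict(
                type='LVISV1Dataset',
                ann_file='data/lvis_v1/lvis_v1_train.json',
                img_prefix='data/coco/',
                filter_empty_gt=False,
                pipeline=load_pipeline)),
        pipeline=train_pipeline))
```

Differences 2 and 4 are plain overrides (`image_size = (1280, 1280)`, `model = dict(with_attn_mask=False)`); fedloss is not a stock mmdet loss, so it presumably arrives with the LVIS training code.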

kxqt commented 1 year ago

Thank you for your response. I apologize for any misunderstanding caused by my previous description. My goal is to achieve an AP of 56.9 on the LVIS benchmark using the provided checkpoint. To do this, I have written a config file and performed evaluation using the provided checkpoint on the LVIS benchmark, resulting in an AP of 55.9. I am not attempting to perform training.
 
Furthermore, I have a few additional questions:

  1. In your response, you mentioned that the image resolution should be set to 1280 x 1280. However, I found a passage in the paper that describes the following:

> System-level comparison on LVIS. In contrast to the COCO setting, we use Co-DINO-Deformable-DETR++ to perform intermediate finetuning on the Objects365 dataset, as we find LSJ augmentation works better on the LVIS dataset. A batch size of 192, an initial learning rate of 2×10⁻⁴, and an input image size of 1280×1280 are used. We use 900 object queries and 1000 DN queries for this model. During finetuning on LVIS, we arm it with an additional auxiliary mask branch and increase the input size to 1536×1536. Besides, we train the model without EMA for 16 epochs, where the batch size is set to 64, and the initial learning rate is set to 5×10⁻⁵, which is reduced by a factor of 0.1 at the 9th and 15th epoch.

This passage states that a resolution of 1536×1536 is used when fine-tuning on the LVIS dataset, which confuses me. Which resolution should I use for evaluation?
 

  2. How should I configure the settings related to RFS?

Looking forward to your reply.
TempleX98 commented 1 year ago

I get it. You should set the image size to 1280x1280 and with_attn_mask=False when evaluating this Swin-L model. And I need to clarify that the "system-level comparison" settings in the paper refer to the ViT-L model.
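In config terms, a minimal sketch of these two evaluation-time overrides (assuming the LSJ config layout posted above):

```python
# evaluation-time overrides for the Swin-L LVIS checkpoint (sketch)
model = dict(with_attn_mask=False)
image_size = (1280, 1280)  # the test_pipeline's Resize/Pad should use this size
```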

TempleX98 commented 1 year ago

The performance of this Swin-L model is:

```
Epoch(val) [36][1239]
bbox_AP: 0.5690, bbox_AP50: 0.6960, bbox_AP75: 0.6050,
bbox_APs: 0.4560, bbox_APm: 0.6710, bbox_APl: 0.7480,
bbox_APr: 0.4820, bbox_APc: 0.5750, bbox_APf: 0.6020
bbox_mAP_copypaste: AP:0.569 AP50:0.696 AP75:0.605 APs:0.456 APm:0.671 APl:0.748 APr:0.482 APc:0.575 APf:0.602
```

kxqt commented 1 year ago

Thank you for your guidance. I will give it a try.

Additionally, do you have plans to release the ViT-L model for the LVIS benchmark? I would like to explore it further and report the results in my work.

TempleX98 commented 1 year ago

We may release the weights in several months.

kxqt commented 1 year ago

Thank you! I have succeeded in reproducing the reported AP of 56.9.

leaf1170124460 commented 1 year ago

Hi, @kxqt.

Could you provide your final three configs that reproduce AP=56.9? I cannot reproduce the result and only get 55.9, the same as you mentioned before.

Looking forward to your reply.

TempleX98 commented 1 year ago

> Hi, @kxqt.
>
> Could you provide your final three configs that reproduce AP=56.9? I cannot reproduce the result and only get 55.9, the same as you mentioned before.
>
> Looking forward to your reply.

You can try this config:

```python
img_norm_cfg = dict(
    mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True)
image_size = (1280, 1280)

load_pipeline = [
    dict(type='LoadImageFromFile'),
    dict(type='LoadAnnotations', with_bbox=True, with_mask=True),
    dict(
        type='Resize',
        img_scale=image_size,
        ratio_range=(0.1, 2.0),
        multiscale_mode='range',
        keep_ratio=True),
    dict(
        type='RandomCrop',
        crop_type='absolute_range',
        crop_size=image_size,
        recompute_bbox=True,
        allow_negative_crop=True),
    dict(type='FilterAnnotations', min_gt_bbox_wh=(1e-2, 1e-2)),
    dict(type='RandomFlip', flip_ratio=0.5),
    dict(type='Pad', size=image_size, pad_val=dict(img=(114, 114, 114))),
]
train_pipeline = [
    dict(type='CopyPaste', max_num_pasted=100),
    dict(type='Normalize', **img_norm_cfg),
    dict(type='DefaultFormatBundle'),
    dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels', 'gt_masks']),
]
test_pipeline = [
    dict(type='LoadImageFromFile'),
    dict(
        type='MultiScaleFlipAug',
        img_scale=image_size,
        flip=False,
        transforms=[
            dict(type='Resize', keep_ratio=True),
            dict(type='RandomFlip'),
            dict(type='Pad', size=image_size, pad_val=dict(img=(114, 114, 114))),
            dict(type='Normalize', **img_norm_cfg),
            dict(type='ImageToTensor', keys=['img']),
            dict(type='Collect', keys=['img'])
        ])
]
dataset_type = 'LVISV1Dataset'
data_root = 'data/lvis_v1/'
img_data_root = 'data/coco/'
data = dict(
    samples_per_gpu=1,
    workers_per_gpu=1,
    train=dict(
        type='MultiImageMixDataset',
        dataset=dict(
            type=dataset_type,
            ann_file=data_root + 'lvis_v1_train.json',
            img_prefix=img_data_root,
            filter_empty_gt=False,
            pipeline=load_pipeline),
        pipeline=train_pipeline),
    val=dict(pipeline=test_pipeline),
    test=dict(pipeline=test_pipeline))
```

leaf1170124460 commented 1 year ago

Great! Thanks for your reply. I will have a try.

iranroman commented 1 year ago

Hello,

Upon running the LVIS evaluation, I encountered this error:

[screenshot of the error traceback]

The error occurs because I do not have the training data. But why is the training data being loaded for model evaluation?

Maybe there's something I'm missing.

TempleX98 commented 1 year ago

@iranroman Hi, the LVIS validation set contains some COCO training images, so the COCO train2017 images need to be on disk even for evaluation.
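This can be checked directly from the annotation file; a small sketch, assuming the standard `lvis_v1_val.json` layout where each image entry carries a `coco_url`:

```python
import json
from collections import Counter

with open('data/lvis_v1/lvis_v1_val.json') as f:
    images = json.load(f)['images']

# each coco_url looks like
# 'http://images.cocodataset.org/train2017/000000123456.jpg';
# counting the split component shows both val2017 and train2017 images
print(Counter(img['coco_url'].split('/')[-2] for img in images))
```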