exiawsh / StreamPETR

[ICCV 2023] StreamPETR: Exploring Object-Centric Temporal Modeling for Efficient Multi-View 3D Object Detection

mAP is poor with sliding-window training #75

Closed Jacky-gsq closed 1 year ago

Jacky-gsq commented 1 year ago

Hi, when I train StreamPETR with sliding-window training, the mAP is 0.1266 and the NDS is 0.3104. These results are confusing. The config follows:

```python
queue_length = 6  # sliding window training, set seq_mode = False in dataset
num_frame_losses = 2  # faster convergence
collect_keys = ['lidar2img', 'intrinsics', 'extrinsics', 'timestamp',
                'img_timestamp', 'ego_pose', 'ego_pose_inv']
input_modality = dict(
    use_lidar=False, use_camera=True, use_radar=False,
    use_map=False, use_external=True)
model = dict(
    type='Petr3D',
    num_frame_head_grads=num_frame_losses,
    num_frame_backbone_grads=num_frame_losses,
    num_frame_losses=num_frame_losses,
    use_grid_mask=True,
    img_backbone=dict(
        pretrained='torchvision://resnet50',
        type='ResNet',
        depth=50,
        num_stages=4,
        out_indices=(2, 3),
        frozen_stages=-1,
        norm_cfg=dict(type='BN2d', requires_grad=False),
        norm_eval=True,
        with_cp=True,
        style='caffe',
        # style='pytorch'
        ),
img_neck=dict(
    type='CPFPN',  ###remove unused parameters 
    in_channels=[1024, 2048],
    out_channels=256,
    num_outs=2),
img_roi_head=dict(
    type='FocalHead',
    num_classes=10,
    in_channels=256,
    loss_cls2d=dict(
        type='QualityFocalLoss',
        use_sigmoid=True,
        beta=2.0,
        loss_weight=2.0),
    loss_centerness=dict(type='GaussianFocalLoss', reduction='mean', loss_weight=1.0),
    loss_bbox2d=dict(type='L1Loss', loss_weight=5.0),
    loss_iou2d=dict(type='GIoULoss', loss_weight=2.0),
    loss_centers2d=dict(type='L1Loss', loss_weight=10.0),
    train_cfg=dict(
    assigner2d=dict(
        type='HungarianAssigner2D',
        cls_cost=dict(type='FocalLossCost', weight=2.),
        reg_cost=dict(type='BBoxL1Cost', weight=5.0, box_format='xywh'),
        iou_cost=dict(type='IoUCost', iou_mode='giou', weight=2.0),
        centers2d_cost=dict(type='BBox3DL1Cost', weight=10.0)))
    ),
pts_bbox_head=dict(
    type='StreamPETRHead',
    num_classes=10,
    in_channels=256,
    num_query=644,
    memory_len=1024,
    topk_proposals=256,
    num_propagated=256,
    with_ego_pos=True,
    match_with_velo=False,
    scalar=10, ##noise groups
    noise_scale = 1.0, 
    dn_weight= 1.0, ##dn loss weight
    split = 0.75, ###positive rate
    LID=True,
    with_position=True,
    position_range=[-61.2, -61.2, -10.0, 61.2, 61.2, 10.0],
    code_weights = [2.0, 2.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0],
    transformer=dict(
        type='PETRTemporalTransformer',
        decoder=dict(
            type='PETRTransformerDecoder',
            return_intermediate=True,
            num_layers=6,
            transformerlayers=dict(
                type='PETRTemporalDecoderLayer',
                attn_cfgs=[
                    dict(
                        type='PETRMultiheadAttention', #fp16 for 2080Ti training (save GPU memory).
                        embed_dims=256,
                        num_heads=8,
                        dropout=0.1,
                        fp16=True),
                    dict(
                        type='PETRMultiheadFlashAttention',
                        embed_dims=256,
                        num_heads=8,
                        dropout=0.1),
                    ],
                feedforward_channels=2048,
                ffn_dropout=0.1,
                with_cp=False,  ###use checkpoint to save memory
                operation_order=('self_attn', 'norm', 'cross_attn', 'norm',
                                 'ffn', 'norm')),
        )),
    bbox_coder=dict(
        type='NMSFreeCoder',
        post_center_range=[-61.2, -61.2, -10.0, 61.2, 61.2, 10.0],
        pc_range=point_cloud_range,
        max_num=300,
        voxel_size=voxel_size,
        num_classes=10), 
    loss_cls=dict(
        type='FocalLoss',
        use_sigmoid=True,
        gamma=2.0,
        alpha=0.25,
        loss_weight=2.0),
    loss_bbox=dict(type='L1Loss', loss_weight=0.25),
    loss_iou=dict(type='GIoULoss', loss_weight=0.0),),
# model training and testing settings
train_cfg=dict(pts=dict(
    grid_size=[512, 512, 1],
    voxel_size=voxel_size,
    point_cloud_range=point_cloud_range,
    out_size_factor=4,
    assigner=dict(
        type='HungarianAssigner3D',
        cls_cost=dict(type='FocalLossCost', weight=2.0),
        reg_cost=dict(type='BBox3DL1Cost', weight=0.25),
        iou_cost=dict(type='IoUCost', weight=0.0), # Fake cost. This is just to make it compatible with DETR head. 
        pc_range=point_cloud_range),)))

dataset_type = 'CustomNuScenesDataset'
data_root = '/nuscenes/'

file_client_args = dict(backend='disk')

ida_aug_conf = {
    "resize_lim": (0.38, 0.55), "final_dim": (256, 704),
    "bot_pct_lim": (0.0, 0.0), "rot_lim": (0.0, 0.0),
    "H": 900, "W": 1600, "rand_flip": True,
}
train_pipeline = [
    dict(type='LoadMultiViewImageFromFiles', to_float32=True),
    dict(type='LoadAnnotations3D', with_bbox_3d=True, with_label_3d=True,
         with_bbox=True, with_label=True, with_bbox_depth=True),
    dict(type='ObjectRangeFilter', point_cloud_range=point_cloud_range),
    dict(type='ObjectNameFilter', classes=class_names),
    dict(type='ResizeCropFlipRotImage', data_aug_conf=ida_aug_conf, training=True),
    dict(type='GlobalRotScaleTransImage',
         rot_range=[-0.3925, 0.3925],
         translation_std=[0, 0, 0],
         scale_ratio_range=[0.95, 1.05],
         reverse_angle=True,
         training=True),
    dict(type='NormalizeMultiviewImage', **img_norm_cfg),
    dict(type='PadMultiViewImage', size_divisor=32),
    dict(type='PETRFormatBundle3D', class_names=class_names,
         collect_keys=collect_keys + ['prev_exists']),
    dict(type='Collect3D',
         keys=['gt_bboxes_3d', 'gt_labels_3d', 'img', 'gt_bboxes', 'gt_labels',
               'centers2d', 'depths', 'prev_exists'] + collect_keys,
         meta_keys=('filename', 'ori_shape', 'img_shape', 'pad_shape',
                    'scale_factor', 'flip', 'box_mode_3d', 'box_type_3d',
                    'img_norm_cfg', 'scene_token', 'gt_bboxes_3d', 'gt_labels_3d'))
]
test_pipeline = [
    dict(type='LoadMultiViewImageFromFiles', to_float32=True),
    dict(type='ResizeCropFlipRotImage', data_aug_conf=ida_aug_conf, training=False),
    dict(type='NormalizeMultiviewImage', **img_norm_cfg),
    dict(type='PadMultiViewImage', size_divisor=32),
    dict(type='MultiScaleFlipAug3D',
         img_scale=(1333, 800),
         pts_scale_ratio=1,
         flip=False,
         transforms=[
             dict(type='PETRFormatBundle3D', collect_keys=collect_keys,
                  class_names=class_names, with_label=False),
             dict(type='Collect3D', keys=['img'] + collect_keys,
                  meta_keys=('filename', 'ori_shape', 'img_shape', 'pad_shape',
                             'scale_factor', 'flip', 'box_mode_3d', 'box_type_3d',
                             'img_norm_cfg', 'scene_token'))
         ])
]

data = dict(
    samples_per_gpu=2,
    workers_per_gpu=4,
    train=dict(
        type=dataset_type,
        data_root=data_root,
        ann_file=data_root + 'nuscenes2d_temporal_infos_train.pkl',
        num_frame_losses=num_frame_losses,
        random_length=1,  # for sliding window training
        pipeline=train_pipeline,
        classes=class_names,
        modality=input_modality,
        collect_keys=collect_keys + ['img', 'prev_exists', 'img_metas'],
        queue_length=queue_length,
        test_mode=False,
        use_valid_flag=True,
        filter_empty_gt=False,
        box_type_3d='LiDAR'),
    val=dict(
        type=dataset_type,
        data_root=data_root,
        pipeline=test_pipeline,
        collect_keys=collect_keys + ['img', 'img_metas'],
        queue_length=queue_length,
        ann_file=data_root + 'nuscenes2d_temporal_infos_val.pkl',
        classes=class_names,
        modality=input_modality),
    test=dict(
        type=dataset_type,
        data_root=data_root,
        pipeline=test_pipeline,
        collect_keys=collect_keys + ['img', 'img_metas'],
        queue_length=queue_length,
        ann_file=data_root + 'nuscenes2d_temporal_infos_val.pkl',
        classes=class_names,
        modality=input_modality),
    shuffler_sampler=dict(type='DistributedGroupSampler'),
    nonshuffler_sampler=dict(type='DistributedSampler'))

optimizer = dict(
    type='AdamW',
    lr=4e-4,  # bs 8: 2e-4 || bs 16: 4e-4
    paramwise_cfg=dict(
        custom_keys={
            'img_backbone': dict(lr_mult=0.25),  # 0.25 only for Focal-PETR with R50-in1k pretrained weights
        }),
    weight_decay=0.01)

optimizer_config = dict(type='Fp16OptimizerHook', loss_scale='dynamic', grad_clip=dict(max_norm=35, norm_type=2))

# learning policy

lr_config = dict(
    policy='CosineAnnealing',
    warmup='linear',
    warmup_iters=500,
    warmup_ratio=1.0 / 3,
    min_lr_ratio=1e-3)

total_epochs = 24
evaluation = dict(interval=24, pipeline=test_pipeline)
find_unused_parameters = False  # when using gradient checkpointing, find_unused_parameters must be False
checkpoint_config = dict(interval=1, max_keep_ckpts=3)
runner = dict(type='EpochBasedRunner', max_epochs=total_epochs)
load_from = 'ckpts/resnet50_msra-5891d200.pth'
resume_from = None
```

exiawsh commented 1 year ago

@Jacky-gsq You have changed some settings of the backbone, e.g. the checkpoint. I haven't verified the caffe style. And what's your img_norm_cfg? The img_norm_cfg differs between the caffe style and the pytorch style.
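For reference, mmdetection-style configs conventionally pair the two backbone styles with different normalization values; below is a minimal sketch of the usual pairing (the exact values must match the pretrained checkpoint actually being loaded):

```python
# pytorch-style backbones (e.g. pretrained='torchvision://resnet50') expect
# RGB input normalized with the ImageNet mean and std:
img_norm_cfg = dict(
    mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True)

# caffe-style backbones (e.g. MSRA weights such as resnet50_msra-5891d200.pth)
# expect BGR input with mean subtraction only:
img_norm_cfg = dict(
    mean=[103.530, 116.280, 123.675], std=[1.0, 1.0, 1.0], to_rgb=False)
```

Mismatching the backbone style and the normalization (e.g. caffe-style weights with a pytorch-style img_norm_cfg) silently degrades training, which would be consistent with the low mAP reported above.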

Jacky-gsq commented 1 year ago

Thanks, I will train it again.

Jacky-gsq commented 1 year ago

The latest result is normal: mAP 0.4057, NDS 0.51111. Thanks a lot!

shield218 commented 1 year ago

> The latest result is normal: mAP 0.4057, NDS 0.51111. Thanks a lot!

I've encountered the same problem with a different config: it delivers fairly normal results with seq_mode=True, but the mAP is only 0.0488 with sliding-window training using 9 frames. I wonder what you edited to make it work. Is there something to modify during testing, or was it just a wrong setting during training?

exiawsh commented 1 year ago

@shield218 Hi. Would you please provide your config? I will check it.

shield218 commented 1 year ago

> @shield218 Hi. Would you please provide your config? I will check it.

Thank you! I just found that the original sequential mode doesn't work normally either; I got the ~0.4 mAP because I forgot to change the checkpoint from the pretrained weights downloaded from GitHub to the one I trained myself. Here are the configurations: the first is for sliding-window training, the second is the original stream training. I'm also trying to modify the batch size and learning rate, and will update this thread if any of it works.

1. sliding window

```python
_base_ = [
    '../../../mmdetection3d/configs/_base_/datasets/nus-3d.py',
    '../../../mmdetection3d/configs/_base_/default_runtime.py'
]
backbone_norm_cfg = dict(type='LN', requires_grad=True)
plugin = True
plugin_dir = 'projects/mmdet3d_plugin/'

# If point cloud range is changed, the models should also change their point
# cloud range accordingly

point_cloud_range = [-51.2, -51.2, -5.0, 51.2, 51.2, 3.0]
voxel_size = [0.2, 0.2, 8]
img_norm_cfg = dict(
    mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True)

# For nuScenes we usually do 10-class detection

class_names = [
    'car', 'truck', 'construction_vehicle', 'bus', 'trailer', 'barrier',
    'motorcycle', 'bicycle', 'pedestrian', 'traffic_cone'
]

num_gpus = 4
batch_size = 4
num_iters_per_epoch = 28130 // (num_gpus * batch_size)
num_epochs = 60

queue_length = 9
num_frame_losses = 1
collect_keys = ['lidar2img', 'intrinsics', 'extrinsics', 'timestamp',
                'img_timestamp', 'ego_pose', 'ego_pose_inv']
input_modality = dict(
    use_lidar=False, use_camera=True, use_radar=False,
    use_map=False, use_external=True)
model = dict(
    type='Petr3D',
    num_frame_head_grads=num_frame_losses,
    num_frame_backbone_grads=num_frame_losses,
    num_frame_losses=num_frame_losses,
    use_grid_mask=True,
    img_backbone=dict(
        init_cfg=dict(
            type='Pretrained',
            checkpoint="ckpts/cascade_mask_rcnn_r50_fpn_coco-20e_20e_nuim_20201009_124951-40963960.pth",
            prefix='backbone.'),
        type='ResNet',
        depth=50,
        num_stages=4,
        out_indices=(2, 3),
        frozen_stages=-1,
        norm_cfg=dict(type='BN2d', requires_grad=False),
        norm_eval=True,
        with_cp=True,
        style='pytorch'),
    img_neck=dict(
        type='CPFPN',  ###remove unused parameters
        in_channels=[1024, 2048],
        out_channels=256,
        num_outs=2),
    img_roi_head=dict(
        type='FocalHead',
        num_classes=10,
        in_channels=256,
        loss_cls2d=dict(type='QualityFocalLoss', use_sigmoid=True, beta=2.0, loss_weight=2.0),
        loss_centerness=dict(type='GaussianFocalLoss', reduction='mean', loss_weight=1.0),
        loss_bbox2d=dict(type='L1Loss', loss_weight=5.0),
        loss_iou2d=dict(type='GIoULoss', loss_weight=2.0),
        loss_centers2d=dict(type='L1Loss', loss_weight=10.0),
        train_cfg=dict(
            assigner2d=dict(
                type='HungarianAssigner2D',
                cls_cost=dict(type='FocalLossCost', weight=2.),
                reg_cost=dict(type='BBoxL1Cost', weight=5.0, box_format='xywh'),
                iou_cost=dict(type='IoUCost', iou_mode='giou', weight=2.0),
                centers2d_cost=dict(type='BBox3DL1Cost', weight=10.0)))),
    pts_bbox_head=dict(
        type='StreamPETRHead',
        num_classes=10,
        in_channels=256,
        num_query=300,
        memory_len=512,
        topk_proposals=128,
        num_propagated=128,
        with_ego_pos=True,
        match_with_velo=False,
        scalar=10,  ##noise groups
        noise_scale=1.0,
        dn_weight=1.0,  ##dn loss weight
        split=0.75,  ###positive rate
        LID=True,
        with_position=True,
        position_range=[-61.2, -61.2, -10.0, 61.2, 61.2, 10.0],
        code_weights=[2.0, 2.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0],
        transformer=dict(
            type='PETRTemporalTransformer',
            decoder=dict(
                type='PETRTransformerDecoder',
                return_intermediate=True,
                num_layers=6,
                transformerlayers=dict(
                    type='PETRTemporalDecoderLayer',
                    attn_cfgs=[
                        dict(type='MultiheadAttention', embed_dims=256,
                             num_heads=8, dropout=0.1),
                        dict(type='PETRMultiheadFlashAttention', embed_dims=256,
                             num_heads=8, dropout=0.1),
                    ],
                    feedforward_channels=2048,
                    ffn_dropout=0.1,
                    with_cp=True,  ###use checkpoint to save memory
                    operation_order=('self_attn', 'norm', 'cross_attn', 'norm',
                                     'ffn', 'norm')))),
        bbox_coder=dict(
            type='NMSFreeCoder',
            post_center_range=[-61.2, -61.2, -10.0, 61.2, 61.2, 10.0],
            pc_range=point_cloud_range,
            max_num=300,
            voxel_size=voxel_size,
            num_classes=10),
        loss_cls=dict(type='FocalLoss', use_sigmoid=True, gamma=2.0,
                      alpha=0.25, loss_weight=2.0),
        loss_bbox=dict(type='L1Loss', loss_weight=0.25),
        loss_iou=dict(type='GIoULoss', loss_weight=0.0)),

# model training and testing settings

train_cfg=dict(pts=dict(
    grid_size=[512, 512, 1],
    voxel_size=voxel_size,
    point_cloud_range=point_cloud_range,
    out_size_factor=4,
    assigner=dict(
        type='HungarianAssigner3D',
        cls_cost=dict(type='FocalLossCost', weight=2.0),
        reg_cost=dict(type='BBox3DL1Cost', weight=0.25),
        iou_cost=dict(type='IoUCost', weight=0.0), # Fake cost. This is just to make it compatible with DETR head. 
        pc_range=point_cloud_range),)))

dataset_type = 'CustomNuScenesDataset'
data_root = './data/nuscenes/'

file_client_args = dict(backend='disk')

ida_aug_conf = {
    "resize_lim": (0.38, 0.55), "final_dim": (256, 704),
    "bot_pct_lim": (0.0, 0.0), "rot_lim": (0.0, 0.0),
    "H": 900, "W": 1600, "rand_flip": True,
}
train_pipeline = [
    dict(type='LoadMultiViewImageFromFiles', to_float32=True),
    dict(type='LoadAnnotations3D', with_bbox_3d=True, with_label_3d=True,
         with_bbox=True, with_label=True, with_bbox_depth=True),
    dict(type='ObjectRangeFilter', point_cloud_range=point_cloud_range),
    dict(type='ObjectNameFilter', classes=class_names),
    dict(type='ResizeCropFlipRotImage', data_aug_conf=ida_aug_conf, training=True),
    dict(type='GlobalRotScaleTransImage',
         rot_range=[-0.3925, 0.3925],
         translation_std=[0, 0, 0],
         scale_ratio_range=[0.95, 1.05],
         reverse_angle=True,
         training=True),
    dict(type='NormalizeMultiviewImage', **img_norm_cfg),
    dict(type='PadMultiViewImage', size_divisor=32),
    dict(type='PETRFormatBundle3D', class_names=class_names,
         collect_keys=collect_keys + ['prev_exists']),
    dict(type='Collect3D',
         keys=['gt_bboxes_3d', 'gt_labels_3d', 'img', 'gt_bboxes', 'gt_labels',
               'centers2d', 'depths', 'prev_exists'] + collect_keys,
         meta_keys=('filename', 'ori_shape', 'img_shape', 'pad_shape',
                    'scale_factor', 'flip', 'box_mode_3d', 'box_type_3d',
                    'img_norm_cfg', 'scene_token', 'gt_bboxes_3d', 'gt_labels_3d'))
]
test_pipeline = [
    dict(type='LoadMultiViewImageFromFiles', to_float32=True),
    dict(type='ResizeCropFlipRotImage', data_aug_conf=ida_aug_conf, training=False),
    dict(type='NormalizeMultiviewImage', **img_norm_cfg),
    dict(type='PadMultiViewImage', size_divisor=32),
    dict(type='MultiScaleFlipAug3D',
         img_scale=(1333, 800),
         pts_scale_ratio=1,
         flip=False,
         transforms=[
             dict(type='PETRFormatBundle3D', collect_keys=collect_keys,
                  class_names=class_names, with_label=False),
             dict(type='Collect3D', keys=['img'] + collect_keys,
                  meta_keys=('filename', 'ori_shape', 'img_shape', 'pad_shape',
                             'scale_factor', 'flip', 'box_mode_3d', 'box_type_3d',
                             'img_norm_cfg', 'scene_token'))
         ])
]

data = dict(
    samples_per_gpu=batch_size,
    workers_per_gpu=4,
    train=dict(
        type=dataset_type,
        data_root=data_root,
        ann_file=data_root + 'nuscenes2d_temporal_infos_train.pkl',
        num_frame_losses=num_frame_losses,
        seq_split_num=2,  # streaming video training
        seq_mode=False,  # streaming video training
        pipeline=train_pipeline,
        classes=class_names,
        modality=input_modality,
        collect_keys=collect_keys + ['img', 'prev_exists', 'img_metas'],
        queue_length=queue_length,
        test_mode=False,
        use_valid_flag=True,
        filter_empty_gt=False,
        box_type_3d='LiDAR'),
    val=dict(
        type=dataset_type,
        pipeline=test_pipeline,
        collect_keys=collect_keys + ['img', 'img_metas'],
        queue_length=queue_length,
        ann_file=data_root + 'nuscenes2d_temporal_infos_val.pkl',
        classes=class_names,
        modality=input_modality),
    test=dict(
        type=dataset_type,
        pipeline=test_pipeline,
        collect_keys=collect_keys + ['img', 'img_metas'],
        queue_length=queue_length,
        ann_file=data_root + 'nuscenes2d_temporal_infos_val.pkl',
        classes=class_names,
        modality=input_modality),
    shuffler_sampler=dict(type='DistributedGroupSampler'),
    nonshuffler_sampler=dict(type='DistributedSampler'))

optimizer = dict(
    type='AdamW',
    lr=4e-4,  # bs 8: 2e-4 || bs 16: 4e-4
    paramwise_cfg=dict(
        custom_keys={
            'img_backbone': dict(lr_mult=0.1),  # set to 0.1 always better when apply 2D pretrained.
        }),
    weight_decay=0.01)

optimizer_config = dict(type='Fp16OptimizerHook', loss_scale='dynamic', grad_clip=dict(max_norm=35, norm_type=2))

# learning policy

lr_config = dict(
    policy='CosineAnnealing',
    warmup='linear',
    warmup_iters=500,
    warmup_ratio=1.0 / 3,
    min_lr_ratio=1e-3)

evaluation = dict(interval=num_iters_per_epoch * num_epochs, pipeline=test_pipeline)
find_unused_parameters = False  # when using gradient checkpointing, find_unused_parameters must be False
checkpoint_config = dict(interval=num_iters_per_epoch, max_keep_ckpts=100)
runner = dict(type='IterBasedRunner', max_iters=num_epochs * num_iters_per_epoch)
load_from = None
resume_from = None
```


2. stream training

```python
_base_ = [
    '../../../mmdetection3d/configs/_base_/datasets/nus-3d.py',
    '../../../mmdetection3d/configs/_base_/default_runtime.py'
]
backbone_norm_cfg = dict(type='LN', requires_grad=True)
plugin = True
plugin_dir = 'projects/mmdet3d_plugin/'

# If point cloud range is changed, the models should also change their point
# cloud range accordingly

point_cloud_range = [-51.2, -51.2, -5.0, 51.2, 51.2, 3.0]
voxel_size = [0.2, 0.2, 8]
img_norm_cfg = dict(
    mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True)

# For nuScenes we usually do 10-class detection

class_names = [
    'car', 'truck', 'construction_vehicle', 'bus', 'trailer', 'barrier',
    'motorcycle', 'bicycle', 'pedestrian', 'traffic_cone'
]

num_gpus = 4
batch_size = 4
num_iters_per_epoch = 28130 // (num_gpus * batch_size)
num_epochs = 60

queue_length = 9
num_frame_losses = 1
collect_keys = ['lidar2img', 'intrinsics', 'extrinsics', 'timestamp',
                'img_timestamp', 'ego_pose', 'ego_pose_inv']
input_modality = dict(
    use_lidar=False, use_camera=True, use_radar=False,
    use_map=False, use_external=True)
model = dict(
    type='Petr3D',
    num_frame_head_grads=num_frame_losses,
    num_frame_backbone_grads=num_frame_losses,
    num_frame_losses=num_frame_losses,
    use_grid_mask=True,
    img_backbone=dict(
        init_cfg=dict(
            type='Pretrained',
            checkpoint="ckpts/cascade_mask_rcnn_r50_fpn_coco-20e_20e_nuim_20201009_124951-40963960.pth",
            prefix='backbone.'),
        type='ResNet',
        depth=50,
        num_stages=4,
        out_indices=(2, 3),
        frozen_stages=-1,
        norm_cfg=dict(type='BN2d', requires_grad=False),
        norm_eval=True,
        with_cp=True,
        style='pytorch'),
    img_neck=dict(
        type='CPFPN',  ###remove unused parameters
        in_channels=[1024, 2048],
        out_channels=256,
        num_outs=2),
    img_roi_head=dict(
        type='FocalHead',
        num_classes=10,
        in_channels=256,
        loss_cls2d=dict(type='QualityFocalLoss', use_sigmoid=True, beta=2.0, loss_weight=2.0),
        loss_centerness=dict(type='GaussianFocalLoss', reduction='mean', loss_weight=1.0),
        loss_bbox2d=dict(type='L1Loss', loss_weight=5.0),
        loss_iou2d=dict(type='GIoULoss', loss_weight=2.0),
        loss_centers2d=dict(type='L1Loss', loss_weight=10.0),
        train_cfg=dict(
            assigner2d=dict(
                type='HungarianAssigner2D',
                cls_cost=dict(type='FocalLossCost', weight=2.),
                reg_cost=dict(type='BBoxL1Cost', weight=5.0, box_format='xywh'),
                iou_cost=dict(type='IoUCost', iou_mode='giou', weight=2.0),
                centers2d_cost=dict(type='BBox3DL1Cost', weight=10.0)))),
    pts_bbox_head=dict(
        type='StreamPETRHead',
        num_classes=10,
        in_channels=256,
        num_query=300,
        memory_len=512,
        topk_proposals=128,
        num_propagated=128,
        with_ego_pos=True,
        match_with_velo=False,
        scalar=10,  ##noise groups
        noise_scale=1.0,
        dn_weight=1.0,  ##dn loss weight
        split=0.75,  ###positive rate
        LID=True,
        with_position=True,
        position_range=[-61.2, -61.2, -10.0, 61.2, 61.2, 10.0],
        code_weights=[2.0, 2.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0],
        transformer=dict(
            type='PETRTemporalTransformer',
            decoder=dict(
                type='PETRTransformerDecoder',
                return_intermediate=True,
                num_layers=6,
                transformerlayers=dict(
                    type='PETRTemporalDecoderLayer',
                    attn_cfgs=[
                        dict(type='MultiheadAttention', embed_dims=256,
                             num_heads=8, dropout=0.1),
                        dict(type='PETRMultiheadFlashAttention', embed_dims=256,
                             num_heads=8, dropout=0.1),
                    ],
                    feedforward_channels=2048,
                    ffn_dropout=0.1,
                    with_cp=True,  ###use checkpoint to save memory
                    operation_order=('self_attn', 'norm', 'cross_attn', 'norm',
                                     'ffn', 'norm')))),
        bbox_coder=dict(
            type='NMSFreeCoder',
            post_center_range=[-61.2, -61.2, -10.0, 61.2, 61.2, 10.0],
            pc_range=point_cloud_range,
            max_num=300,
            voxel_size=voxel_size,
            num_classes=10),
        loss_cls=dict(type='FocalLoss', use_sigmoid=True, gamma=2.0,
                      alpha=0.25, loss_weight=2.0),
        loss_bbox=dict(type='L1Loss', loss_weight=0.25),
        loss_iou=dict(type='GIoULoss', loss_weight=0.0)),

# model training and testing settings

train_cfg=dict(pts=dict(
    grid_size=[512, 512, 1],
    voxel_size=voxel_size,
    point_cloud_range=point_cloud_range,
    out_size_factor=4,
    assigner=dict(
        type='HungarianAssigner3D',
        cls_cost=dict(type='FocalLossCost', weight=2.0),
        reg_cost=dict(type='BBox3DL1Cost', weight=0.25),
        iou_cost=dict(type='IoUCost', weight=0.0), # Fake cost. This is just to make it compatible with DETR head. 
        pc_range=point_cloud_range),)))

dataset_type = 'CustomNuScenesDataset'
data_root = './data/nuscenes/'

file_client_args = dict(backend='disk')

ida_aug_conf = {
    "resize_lim": (0.38, 0.55), "final_dim": (256, 704),
    "bot_pct_lim": (0.0, 0.0), "rot_lim": (0.0, 0.0),
    "H": 900, "W": 1600, "rand_flip": True,
}
train_pipeline = [
    dict(type='LoadMultiViewImageFromFiles', to_float32=True),
    dict(type='LoadAnnotations3D', with_bbox_3d=True, with_label_3d=True,
         with_bbox=True, with_label=True, with_bbox_depth=True),
    dict(type='ObjectRangeFilter', point_cloud_range=point_cloud_range),
    dict(type='ObjectNameFilter', classes=class_names),
    dict(type='ResizeCropFlipRotImage', data_aug_conf=ida_aug_conf, training=True),
    dict(type='GlobalRotScaleTransImage',
         rot_range=[-0.3925, 0.3925],
         translation_std=[0, 0, 0],
         scale_ratio_range=[0.95, 1.05],
         reverse_angle=True,
         training=True),
    dict(type='NormalizeMultiviewImage', **img_norm_cfg),
    dict(type='PadMultiViewImage', size_divisor=32),
    dict(type='PETRFormatBundle3D', class_names=class_names,
         collect_keys=collect_keys + ['prev_exists']),
    dict(type='Collect3D',
         keys=['gt_bboxes_3d', 'gt_labels_3d', 'img', 'gt_bboxes', 'gt_labels',
               'centers2d', 'depths', 'prev_exists'] + collect_keys,
         meta_keys=('filename', 'ori_shape', 'img_shape', 'pad_shape',
                    'scale_factor', 'flip', 'box_mode_3d', 'box_type_3d',
                    'img_norm_cfg', 'scene_token', 'gt_bboxes_3d', 'gt_labels_3d'))
]
test_pipeline = [
    dict(type='LoadMultiViewImageFromFiles', to_float32=True),
    dict(type='ResizeCropFlipRotImage', data_aug_conf=ida_aug_conf, training=False),
    dict(type='NormalizeMultiviewImage', **img_norm_cfg),
    dict(type='PadMultiViewImage', size_divisor=32),
    dict(type='MultiScaleFlipAug3D',
         img_scale=(1333, 800),
         pts_scale_ratio=1,
         flip=False,
         transforms=[
             dict(type='PETRFormatBundle3D', collect_keys=collect_keys,
                  class_names=class_names, with_label=False),
             dict(type='Collect3D', keys=['img'] + collect_keys,
                  meta_keys=('filename', 'ori_shape', 'img_shape', 'pad_shape',
                             'scale_factor', 'flip', 'box_mode_3d', 'box_type_3d',
                             'img_norm_cfg', 'scene_token'))
         ])
]

data = dict(
    samples_per_gpu=batch_size,
    workers_per_gpu=4,
    train=dict(
        type=dataset_type,
        data_root=data_root,
        ann_file=data_root + 'nuscenes2d_temporal_infos_train.pkl',
        num_frame_losses=num_frame_losses,
        seq_split_num=2,  # streaming video training
        seq_mode=True,  # streaming video training
        pipeline=train_pipeline,
        classes=class_names,
        modality=input_modality,
        collect_keys=collect_keys + ['img', 'prev_exists', 'img_metas'],
        queue_length=queue_length,
        test_mode=False,
        use_valid_flag=True,
        filter_empty_gt=False,
        box_type_3d='LiDAR'),
    val=dict(
        type=dataset_type,
        pipeline=test_pipeline,
        collect_keys=collect_keys + ['img', 'img_metas'],
        queue_length=queue_length,
        ann_file=data_root + 'nuscenes2d_temporal_infos_val.pkl',
        classes=class_names,
        modality=input_modality),
    test=dict(
        type=dataset_type,
        pipeline=test_pipeline,
        collect_keys=collect_keys + ['img', 'img_metas'],
        queue_length=queue_length,
        ann_file=data_root + 'nuscenes2d_temporal_infos_val.pkl',
        classes=class_names,
        modality=input_modality),
    shuffler_sampler=dict(type='InfiniteGroupEachSampleInBatchSampler'),
    nonshuffler_sampler=dict(type='DistributedSampler'))

optimizer = dict(
    type='AdamW',
    lr=4e-4,  # bs 8: 2e-4 || bs 16: 4e-4
    paramwise_cfg=dict(
        custom_keys={
            'img_backbone': dict(lr_mult=0.1),  # set to 0.1 always better when apply 2D pretrained.
        }),
    weight_decay=0.01)

optimizer_config = dict(type='Fp16OptimizerHook', loss_scale='dynamic', grad_clip=dict(max_norm=35, norm_type=2))

# learning policy

lr_config = dict(
    policy='CosineAnnealing',
    warmup='linear',
    warmup_iters=500,
    warmup_ratio=1.0 / 3,
    min_lr_ratio=1e-3)

evaluation = dict(interval=num_iters_per_epoch * num_epochs, pipeline=test_pipeline)
find_unused_parameters = False  # when using gradient checkpointing, find_unused_parameters must be False
checkpoint_config = dict(interval=num_iters_per_epoch, max_keep_ckpts=100)
runner = dict(type='IterBasedRunner', max_iters=num_epochs * num_iters_per_epoch)
load_from = None
resume_from = None
```

shield218 commented 11 months ago

@exiawsh Hi, I've tried many things, including increasing the batch size, changing the pretrained weights for the image backbone, and reducing the minimum learning rate ratio min_lr_ratio by an order of magnitude, and none of them works. Looking into the training logs, the one thing they have in common is a spike in all losses and in grad_norm around steps 3000-3500. That motivated me to try different learning rate schedules, which led to a successful experiment with a custom scheduler that cuts the learning rate by a factor of 10 at step 3200 and by a factor of 2 at steps 200000 and 400000, with the minimum learning rate kept no lower than 1e-5. BTW, the pretrained image backbone, which I don't think is the key to success given its poor performance with other configs aligned with the original version, is the ResNet-50 from FCOS3D trained on nuScenes. The results after 60 epochs are mAP: 0.3831, NDS: 0.4899.
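Such a multi-factor step schedule is not one of mmcv's built-in policies (StepLrUpdaterHook applies the same gamma at every milestone), so it would need a custom hook. Below is a minimal sketch against the mmcv 1.x LrUpdaterHook interface, using the milestones and factors described above; the hook name and defaults are hypothetical, not shield218's actual implementation:

```python
from mmcv.runner.hooks import HOOKS
from mmcv.runner.hooks.lr_updater import LrUpdaterHook


@HOOKS.register_module()
class MultiFactorStepLrUpdaterHook(LrUpdaterHook):
    """Hypothetical hook: cut the LR by a different factor at each milestone."""

    def __init__(self, milestones=(3200, 200000, 400000),
                 factors=(0.1, 0.5, 0.5), min_lr=1e-5, **kwargs):
        assert len(milestones) == len(factors)
        self.milestones = milestones
        self.factors = factors
        self.min_lr = min_lr
        super().__init__(**kwargs)

    def get_lr(self, runner, base_lr):
        # Apply every factor whose milestone has already been passed,
        # then clamp to the minimum learning rate.
        lr = base_lr
        for milestone, factor in zip(self.milestones, self.factors):
            if runner.iter >= milestone:
                lr *= factor
        return max(lr, self.min_lr)
```

Once the module is imported (e.g. from the plugin directory), it could be selected from the config with lr_config = dict(policy='MultiFactorStep', by_epoch=False, warmup='linear', warmup_iters=500, warmup_ratio=1.0 / 3), since mmcv resolves the policy name by appending 'LrUpdaterHook'.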

exiawsh commented 11 months ago

@shield218 Please try setting num_frame_losses from 2 to 1.

shield218 commented 11 months ago

> @shield218 Please try setting num_frame_losses from 2 to 1.

@exiawsh It's already 1 in all the aforementioned experiments. I'm now trying a smaller effective batch size (currently 32, for more stable iterations) to see whether the 6.8% mAP gap between my working configuration and the one given in the repository can be reduced.