mit-han-lab / bevfusion

[ICRA'23] BEVFusion: Multi-Task Multi-Sensor Fusion with Unified Bird's-Eye View Representation
https://bevfusion.mit.edu
Apache License 2.0

CAN'T REPRODUCE THE RESULT IN PAPER #594

Closed: Li-Whasaka closed this issue 5 months ago

Li-Whasaka commented 9 months ago

I ported BEVFusion to my own project, but the trained mAP is very low (it only reaches 0.59). I don't know what is going wrong; I have tried adjusting and modifying the config many times. Can anyone help me find the cause? Here are my configuration file and log file: 20240125_192959.log

_base_ = [
    # '../_base_/datasets/nus-3d.py',
    '../_base_/default_runtime.py'
]

voxel_size = [0.075, 0.075, 0.2]
point_cloud_range = [-54.0, -54.0, -5.0, 54.0, 54.0, 3.0]
class_names = [
    'car', 'truck', 'construction_vehicle', 'bus', 'trailer', 'barrier',
    'motorcycle', 'bicycle', 'pedestrian', 'traffic_cone'
]
metainfo = dict(classes=class_names)
input_modality = dict(use_lidar=True, use_camera=True)
backend_args = None
data_config = {
    'cams': ['CAM_FRONT_LEFT', 'CAM_FRONT', 'CAM_FRONT_RIGHT',
             'CAM_BACK_LEFT', 'CAM_BACK', 'CAM_BACK_RIGHT'],
    'Ncams': 6,
    'input_size': (256, 704),
    'src_size': (900, 1600),

    # Augmentation
    'resize': (-0.06, 0.11),
    'rot': (-5.4, 5.4),
    'flip': True,
    'crop_h': (0.0, 0.0),
    'resize_test': 0.04,
}
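# NOTE (assumption, based on BEVDet-style loader conventions): 'resize' and
# 'rot' above are random jitter ranges applied on top of the base
# input_size/src_size scale at train time, while 'resize_test' is a fixed
# scale offset used when is_train=False; the exact semantics live in
# LoadMultiViewImageFromFiles_BEVDet in the ported codebase.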

device = 'cuda'
model = dict(
    type='BEVFusion',
    encoders=dict(
        camera=dict(
            backbone=dict(
                type='SwinTransformer',
                embed_dims=96,
                depths=[2, 2, 6, 2],
                num_heads=[3, 6, 12, 24],
                window_size=7,
                mlp_ratio=4,
                qkv_bias=True,
                qk_scale=None,
                drop_rate=0.0,
                attn_drop_rate=0.0,
                drop_path_rate=0.2,
                patch_norm=True,
                out_indices=[1, 2, 3],
                with_cp=False,
                convert_weights=True,
                init_cfg=dict(
                    type='Pretrained',
                    checkpoint='pretrained/checkpoint/swint-nuimages-pretrained.pth'
                    # 'https://github.com/SwinTransformer/storage/releases/download/v1.0.0/swin_tiny_patch4_window7_224.pth'  # noqa: E501
                    )
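                # NOTE (assumption): the paper's numbers rely on the
                # nuImages-pretrained Swin-T weights referenced above; training
                # from the generic ImageNet checkpoint instead may converge to
                # a lower mAP.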

                # pretrained='torchvision://resnet50',
                # # pretrained='/home/lwx/Desktop/lwx/distill-bev/resnet50.pth',
                # type='ResNet',
                # depth=50,
                # num_stages=4,
                # out_indices=(1,2, 3),
                # frozen_stages=-1,
                # norm_cfg=dict(type='BN', requires_grad=True),
                # norm_eval=False,
                # with_cp=True,
                # style='pytorch'
            ),
            neck=dict(
                type='GeneralizedLSSFPN',
                in_channels=[192, 384, 768],
                # in_channels = [512, 1024, 2048],
                # in_channels=[1024, 2048],
                out_channels=256,
                start_level=0,
                num_outs=3,
                norm_cfg=dict(type='BN2d', requires_grad=True),
                act_cfg=dict(type='ReLU', inplace=True),
                upsample_cfg=dict(mode='bilinear', align_corners=False)
            ),
            vtransform=dict(
                type='DepthLSSTransformBEVFusion',
                in_channels=256,
                out_channels=80,
                image_size=[256, 704],
                # feature_size=[16 , 44],
                feature_size=[32, 88],
                xbound=[-54.0, 54.0, 0.3],
                ybound=[-54.0, 54.0, 0.3],
                zbound=[-10.0, 10.0, 20.0],
                dbound=[1.0, 60.0, 0.5],
                downsample=2
            ),
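            # NOTE (assumption): feature_size is expected to be image_size
            # divided by the camera FPN output stride (here 256/8 x 704/8
            # = 32 x 88).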
            # fusion_layer=dict(
            #     type='ConvFuser', in_channels=[80, 256], out_channels=256)
        ), 
        lidar=dict(
            voxelize=dict(
                max_num_points=10,
                point_cloud_range=point_cloud_range,
                voxel_size=voxel_size,
                max_voxels=[120000, 160000]
            ),
            backbone=dict(
                type='SparseEncoder',
                in_channels=5,
                output_channels=128,
                order=('conv', 'norm', 'act'),
                encoder_channels=((16, 16, 32), (32, 32, 64), (64, 64, 128), (128, 128)),
                encoder_paddings=((0, 0, 1), (0, 0, 1), (0, 0, (1, 1, 0)), (0, 0)),
                block_type='basicblock',
                sparse_shape=[1440, 1440, 41]
            )
        )
    ),
    fuser=dict(
        type='ConvFuser',
        in_channels=[80, 256],
        out_channels=256
    ),
    decoder=dict(
        backbone=dict(
            type='SECOND',
            in_channels=256,
            out_channels=[128, 256],
            layer_nums=[5, 5],
            layer_strides=[1, 2],
            norm_cfg=dict(type='BN', eps=0.001, momentum=0.01),
            conv_cfg=dict(type='Conv2d', bias=False)
        ),
        neck=dict(
            type='SECONDFPN',
            in_channels=[128, 256],
            out_channels=[256, 256],
            upsample_strides=[1, 2],
            norm_cfg=dict(type='BN', eps=0.001, momentum=0.01),
            upsample_cfg=dict(type='deconv', bias=False),
        ),
    ),
    heads=dict(
        map=None,
        object=dict(
            type='TransFusionHead',
            num_proposals=200,
            auxiliary=True,
            in_channels=512,
            hidden_channel=128,
            num_classes=10,
            nms_kernel_size=3,
            num_decoder_layers=1,
            num_heads=8,
            ffn_channel=256,
            dropout=0.1,
            bn_momentum=0.1,
            activation='relu',
            train_cfg=dict(
                dataset='nuScenes',
                point_cloud_range=[-54.0, -54.0, -5.0, 54.0, 54.0, 3.0],
                grid_size=[1440, 1440, 41],
                voxel_size=[0.075, 0.075, 0.2],
                out_size_factor=8,
                gaussian_overlap=0.1,
                min_radius=2,
                pos_weight=-1,
                code_weights=[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.2, 0.2],
                assigner=dict(
                    type='HungarianAssigner3D',
                    iou_calculator=dict(type='BboxOverlaps3D', coordinate='lidar'),
                    cls_cost=dict(
                        type='mmdet.FocalLossCost',
                        gamma=2.0,
                        alpha=0.25,
                        weight=0.15),
                    reg_cost=dict(type='BBoxBEVL1Cost', weight=0.25),
                    iou_cost=dict(type='IoU3DCost', weight=0.25))),
            test_cfg=dict(
                dataset='nuScenes',
                grid_size=[1440, 1440, 41],
                out_size_factor=8,
                voxel_size=[0.075, 0.075],
                pc_range=[-54.0, -54.0],
                nms_type=None
            ),
            common_heads=dict(
                center=[2, 2], height=[1, 2], dim=[3, 2], rot=[2, 2], vel=[2, 2]),
            bbox_coder=dict(
                type='TransFusionBBoxCoder',
                pc_range=[-54.0, -54.0],
                post_center_range=[-61.2, -61.2, -10.0, 61.2, 61.2, 10.0],
                score_threshold=0.0,
                out_size_factor=8,
                voxel_size=[0.075, 0.075],
                code_size=10),
            loss_cls=dict(
                type='mmdet.FocalLoss',
                use_sigmoid=True,
                gamma=2.0,
                alpha=0.25,
                reduction='mean',
                loss_weight=1.0),
            loss_heatmap=dict(
                type='mmdet.GaussianFocalLoss', reduction='mean', loss_weight=1.0),
            loss_bbox=dict(
                type='mmdet.L1Loss', reduction='mean', loss_weight=0.25)
        )
    ),
)

dataset_type = 'NuScenesDataset'
root = 'data/nuscenes/'
file_client_args = dict(backend='disk')

train_pipeline = [
    dict(
        # type='BEVLoadMultiViewImageFromFiles',
        # type='LoadMultiViewImageFromFiles_MITBF',
        type='LoadMultiViewImageFromFiles_BEVDet',
        is_train=True,
        # to_float32=True,
        data_config=data_config,
        # color_type='color',
        ),
    dict(
        type='LoadPointsFromFile_MITBF',
        coord_type='LIDAR',
        load_dim=5,
        use_dim=5,
        ),
    dict(
        type='LoadPointsFromMultiSweeps_MITBF',
        sweeps_num=9,
        load_dim=5,
        use_dim=5,
        pad_empty_sweeps=True,
        remove_close=True,
        ),
    dict(
        type='LoadAnnotations3D_MITBF',
        with_bbox_3d=True,
        with_label_3d=True,
        with_attr_label=False),
    # dict(
    #     type='ImageAug3D',
    #     final_dim=[256, 704],
    #     resize_lim=[0.38, 0.55],
    #     bot_pct_lim=[0.0, 0.0],
    #     rot_lim=[-5.4, 5.4],
    #     rand_flip=True,
    #     is_train=True),
    dict(
        type='GlobalRotScaleTrans',
        scale_ratio_range=[0.9, 1.1],
        rot_range=[-0.78539816, 0.78539816],
        translation_std=0.5,
        update_img2lidar=True),
    dict(
        type='RandomFlip3D',
        sync_2d=False,
        flip_ratio_bev_horizontal=0.5,
        flip_ratio_bev_vertical=0.5,
        update_img2lidar=True),
    dict(type='PointsRangeFilter', point_cloud_range=point_cloud_range),
    dict(type='ObjectRangeFilter', point_cloud_range=point_cloud_range),
    dict(
        type='ObjectNameFilter',
        classes=class_names),
    # Actually, 'GridMask' is not used here
    # dict(
    #     type='MITGridMask',
    #     use_h=True,
    #     use_w=True,
    #     max_epoch=2,
    #     rotate=1,
    #     offset=False,
    #     ratio=0.5,
    #     mode=1,
    #     prob=0.0,
    #     fixed_prob=True),
    # dict(type='PointShuffle'),
    dict(type='DefaultFormatBundle3D', class_names=class_names),
    dict(type='Collect3D', keys=['points', 'img_inputs', 'gt_bboxes_3d', 'gt_labels_3d'],
         meta_keys=(
            'cam2img', 'ori_cam2img', 'camera2ego', 'lidar2ego', 'lidar2cam',
            'lidar2img', 'cam2lidar', 'ori_lidar2img', 'img_aug_matrix',
            'box_type_3d', 'sample_idx', 'lidar_path', 'img_path',
            'transformation_3d_flow', 'pcd_rotation', 'pcd_scale_factor',
            'pcd_trans', 'lidar_aug_matrix', 'num_pts_feats'))
]

test_pipeline = [
    dict(
        type='LoadMultiViewImageFromFiles_BEVDet',
        is_train=True,
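        # NOTE: is_train=True here likely re-enables the random resize/rotate/
        # flip image augmentation at test time under BEVDet-style loading;
        # evaluating with random augmentation can noticeably depress mAP.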
        data_config=data_config,
        ),
    dict(
        type='LoadPointsFromFile_MITBF',
        coord_type='LIDAR',
        load_dim=5,
        use_dim=5,
        ),
    dict(
        type='LoadPointsFromMultiSweeps_MITBF',
        sweeps_num=9,
        load_dim=5,
        use_dim=5,
        pad_empty_sweeps=True,
        remove_close=True,
        ),
    dict(
        type='LoadAnnotations3D_MITBF',
        with_bbox_3d=True,
        with_label_3d=True,
        with_attr_label=False),
    dict(
        type='GlobalRotScaleTrans',
        scale_ratio_range=[1, 1],
        rot_range=[0, 0],
        translation_std=0,
        update_img2lidar=True),
    dict(
        type='RandomFlip3D',
        sync_2d=False,
        flip_ratio_bev_horizontal=0.0,
        flip_ratio_bev_vertical=0.0,
        update_img2lidar=True),
    dict(type='PointsRangeFilter', point_cloud_range=point_cloud_range),
    dict(type='ObjectRangeFilter', point_cloud_range=point_cloud_range),
    dict(
        type='ObjectNameFilter',
        classes=class_names),
    dict(type='DefaultFormatBundle3D', class_names=class_names),
    dict(
        type='Collect3D',
        keys=['img_inputs', 'points', 'gt_bboxes_3d', 'gt_labels_3d'],
        meta_keys=[
            'cam2img', 'ori_cam2img', 'camera2ego', 'lidar2ego', 'lidar2cam',
            'lidar2img', 'cam2lidar', 'ori_lidar2img', 'img_aug_matrix',
            'box_type_3d', 'sample_idx', 'lidar_path', 'img_path',
            'transformation_3d_flow', 'pcd_rotation', 'pcd_scale_factor',
            'pcd_trans', 'lidar_aug_matrix', 'num_pts_feats'
        ])
]

data_prefix = dict(
    pts='samples/LIDAR_TOP',
    CAM_FRONT='samples/CAM_FRONT',
    CAM_FRONT_LEFT='samples/CAM_FRONT_LEFT',
    CAM_FRONT_RIGHT='samples/CAM_FRONT_RIGHT',
    CAM_BACK='samples/CAM_BACK',
    CAM_BACK_RIGHT='samples/CAM_BACK_RIGHT',
    CAM_BACK_LEFT='samples/CAM_BACK_LEFT',
    sweeps='sweeps/LIDAR_TOP')

data = dict(
    samples_per_gpu=3,
    workers_per_gpu=8,
    train=dict(
        type='CBGSDataset_MITBF',
        dataset=dict(
            type=dataset_type,
            data_root=root,
            ann_file=root + 'nuscenes_infos_train_mitbf.pkl',
            pipeline=train_pipeline,
            classes=class_names,
            # map_classes=None,
            img_info_prototype='bevdet',
            modality=input_modality,
            test_mode=False,
            use_valid_flag=True,
            box_type_3d='LiDAR'
        )
    ),
    val=dict(
        type=dataset_type,
        data_root=root,
        ann_file=root + 'nuscenes_infos_val_mitbf.pkl',
        pipeline=test_pipeline,
        classes=class_names,
        # map_classes=None,
        img_info_prototype='bevdet',
        modality=input_modality,
        test_mode=False,
        use_valid_flag=True,
        box_type_3d='LiDAR'
    ),
    test=dict(
        type=dataset_type,
        data_root=root,
        ann_file=root + 'nuscenes_infos_val_mitbf.pkl',
        pipeline=test_pipeline,
        img_info_prototype='bevdet',
        classes=class_names,
        # map_classes=None,
        modality=input_modality,
        test_mode=False,
        use_valid_flag=True,
        box_type_3d='LiDAR',
    )
)

# Optimizer
lr = 2e-4
optimizer = dict(type='AdamW', lr=lr, weight_decay=0.01)
optimizer_config = dict(grad_clip=dict(max_norm=35, norm_type=2))
lr_config = dict(
    policy='step',
    warmup='linear',
    warmup_iters=500,
    warmup_ratio=0.001,
    step=[16, 22])
runner = dict(type='EpochBasedRunner', max_epochs=24)
evaluation = dict(interval=1)
checkpoint = dict(interval=1)
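
A quick sanity check when porting a config like this is to confirm that the hard-coded grid shapes are internally consistent with point_cloud_range and voxel_size. A minimal sketch (my own check, not code from either repository), assuming the usual SECOND/SparseEncoder convention of one extra cell along z:

point_cloud_range = [-54.0, -54.0, -5.0, 54.0, 54.0, 3.0]
voxel_size = [0.075, 0.075, 0.2]

# Grid implied by the range and voxel size.
nx = round((point_cloud_range[3] - point_cloud_range[0]) / voxel_size[0])  # 1440
ny = round((point_cloud_range[4] - point_cloud_range[1]) / voxel_size[1])  # 1440
nz = round((point_cloud_range[5] - point_cloud_range[2]) / voxel_size[2])  # 40

assert [nx, ny] == [1440, 1440]  # matches grid_size[:2] and sparse_shape[:2]
assert nz + 1 == 41              # sparse_shape z = nz + 1 under the assumed convention
print('head feature map:', nx // 8, 'x', ny // 8)  # out_size_factor=8 -> 180 x 180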
gerardmartin2 commented 7 months ago

@Li-Whasaka Have you been able to solve it?

Li-Whasaka commented 6 months ago

@Li-Whasaka Have you been able to solve it?

Not resolved. It may come down to differences in the underlying libraries, since the module library my project builds on is not the one from this repository.
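
For anyone hitting the same wall: a first step is to confirm that the underlying library versions match the ones this repository was developed against. A minimal check, assuming the port is built on the OpenMMLab stack:

# Print the installed OpenMMLab versions (assumes these packages underlie the
# ported project); silent behavioural differences between codebases often
# trace back to mismatched mmcv/mmdet/mmdet3d versions.
import mmcv
import mmdet
import mmdet3d

print('mmcv:', mmcv.__version__)
print('mmdet:', mmdet.__version__)
print('mmdet3d:', mmdet3d.__version__)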