Closed a2394797795 closed 1 year ago
Hi, Except for the experiment of ablation, we have trained for a longer time to achieve the best performance. The result reported on val set is the result of cbgs or the equivalent training duration. We train 100epoch for r50, and 60epoch for r101 because of fcos3d pre-training.
_base_ = [
'../../../mmdetection3d/configs/_base_/datasets/nus-3d.py',
'../../../mmdetection3d/configs/_base_/default_runtime.py'
]
backbone_norm_cfg = dict(type='LN', requires_grad=True)
plugin=True
plugin_dir='projects/mmdet3d_plugin/'
# If point cloud range is changed, the models should also change their point
# cloud range accordingly
point_cloud_range = [-51.2, -51.2, -5.0, 51.2, 51.2, 3.0]
voxel_size = [0.2, 0.2, 8]
img_norm_cfg = dict(
mean=[103.530, 116.280, 123.675], std=[1.0, 1.0, 1.0], to_rgb=False)
# For nuScenes we usually do 10-class detection
class_names = [
'car', 'truck', 'construction_vehicle', 'bus', 'trailer', 'barrier',
'motorcycle', 'bicycle', 'pedestrian', 'traffic_cone'
]
input_modality = dict(
use_lidar=False,
use_camera=True,
use_radar=False,
use_map=False,
use_external=True)
model = dict(
type='Petr3D',
use_grid_mask=True,
img_backbone=dict(
type='ResNet',
depth=50,
num_stages=4,
out_indices=(2, 3,),
frozen_stages=-1,
norm_cfg=dict(type='BN2d', requires_grad=False),
norm_eval=True,
style='caffe',
with_cp=True,
dcn=dict(type='DCNv2', deform_groups=1, fallback_on_stride=False),
stage_with_dcn=(False, False, True, True),
pretrained = 'ckpts/resnet50_msra-5891d200.pth',
),
img_neck=dict(
type='CPFPN',
in_channels=[1024, 2048],
out_channels=256,
num_outs=2),
pts_bbox_head=dict(
type='PETRv2DNHead',
num_classes=10,
in_channels=256,
num_query=900,
LID=True,
with_position=True,
with_multiview=True,
with_fpe=True,
with_time=True,
with_multi=True,
scalar = 10, ##noise groups
noise_scale = 1.0,
dn_weight= 1.0, ##dn loss weight
split = 0.75, ###positive rate
position_range=[-61.2, -61.2, -10.0, 61.2, 61.2, 10.0],
code_weights = [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0],
transformer=dict(
type='PETRDNTransformer',
decoder=dict(
type='PETRTransformerDecoder',
return_intermediate=True,
num_layers=6,
transformerlayers=dict(
type='PETRTransformerDecoderLayer',
attn_cfgs=[
dict(
type='MultiheadAttention',
embed_dims=256,
num_heads=8,
dropout=0.1),
dict(
type='PETRMultiheadAttention',
embed_dims=256,
num_heads=8,
dropout=0.1),
],
feedforward_channels=2048,
ffn_dropout=0.1,
with_cp=True, ###use checkpoint to save memory
operation_order=('self_attn', 'norm', 'cross_attn', 'norm',
'ffn', 'norm')),
)),
bbox_coder=dict(
type='NMSFreeCoder',
post_center_range=[-61.2, -61.2, -10.0, 61.2, 61.2, 10.0],
pc_range=point_cloud_range,
max_num=300,
voxel_size=voxel_size,
num_classes=10),
positional_encoding=dict(
type='SinePositionalEncoding3D', num_feats=128, normalize=True),
loss_cls=dict(
type='FocalLoss',
use_sigmoid=True,
gamma=2.0,
alpha=0.25,
loss_weight=2.0),
loss_bbox=dict(type='L1Loss', loss_weight=0.25),
loss_iou=dict(type='GIoULoss', loss_weight=0.0)),
# model training and testing settings
train_cfg=dict(pts=dict(
grid_size=[512, 512, 1],
voxel_size=voxel_size,
point_cloud_range=point_cloud_range,
out_size_factor=4,
assigner=dict(
type='HungarianAssigner3D',
cls_cost=dict(type='FocalLossCost', weight=2.0),
reg_cost=dict(type='BBox3DL1Cost', weight=0.25),
iou_cost=dict(type='IoUCost', weight=0.0), # Fake cost. This is just to make it compatible with DETR head.
pc_range=point_cloud_range))))
dataset_type = 'CustomNuScenesDataset'
data_root = '/data/Dataset/nuScenes/'
file_client_args = dict(backend='disk')
db_sampler = dict(
data_root=data_root,
info_path=data_root + 'nuscenes_dbinfos_train.pkl',
rate=1.0,
prepare=dict(
filter_by_difficulty=[-1],
filter_by_min_points=dict(
car=5,
truck=5,
bus=5,
trailer=5,
construction_vehicle=5,
traffic_cone=5,
barrier=5,
motorcycle=5,
bicycle=5,
pedestrian=5)),
classes=class_names,
sample_groups=dict(
car=2,
truck=3,
construction_vehicle=7,
bus=4,
trailer=6,
barrier=2,
motorcycle=6,
bicycle=6,
pedestrian=2,
traffic_cone=2),
points_loader=dict(
type='LoadPointsFromFile',
coord_type='LIDAR',
load_dim=5,
use_dim=[0, 1, 2, 3, 4],
file_client_args=file_client_args))
ida_aug_conf = {
"resize_lim": (0.47, 0.625),
"final_dim": (320, 800),
"bot_pct_lim": (0.0, 0.0),
"rot_lim": (0.0, 0.0),
"H": 900,
"W": 1600,
"rand_flip": True,
}
train_pipeline = [
dict(type='LoadMultiViewImageFromFiles', to_float32=True),
dict(type='LoadMultiViewImageFromMultiSweepsFiles', sweeps_num=1, to_float32=True, pad_empty_sweeps=True, test_mode=False, sweep_range=[3,27]),
dict(type='LoadAnnotations3D', with_bbox_3d=True, with_label_3d=True, with_attr_label=False),
dict(type='ObjectRangeFilter', point_cloud_range=point_cloud_range),
dict(type='ObjectNameFilter', classes=class_names),
dict(type='ResizeCropFlipImage', data_aug_conf = ida_aug_conf, training=True),
dict(type='GlobalRotScaleTransImage',
rot_range=[-0.3925, 0.3925],
translation_std=[0, 0, 0],
scale_ratio_range=[0.95, 1.05],
reverse_angle=True,
training=True
),
dict(type='NormalizeMultiviewImage', **img_norm_cfg),
dict(type='PadMultiViewImage', size_divisor=32),
dict(type='DefaultFormatBundle3D', class_names=class_names),
dict(type='Collect3D', keys=['gt_bboxes_3d', 'gt_labels_3d', 'img'],
meta_keys=('filename', 'ori_shape', 'img_shape', 'lidar2img', 'intrinsics', 'extrinsics',
'pad_shape', 'scale_factor', 'flip', 'box_mode_3d', 'box_type_3d',
'img_norm_cfg', 'sample_idx', 'timestamp', 'gt_bboxes_3d','gt_labels_3d'))
]
test_pipeline = [
dict(type='LoadMultiViewImageFromFiles', to_float32=True),
dict(type='LoadMultiViewImageFromMultiSweepsFiles', sweeps_num=1, to_float32=True, pad_empty_sweeps=True, sweep_range=[3,27]),
dict(type='ResizeCropFlipImage', data_aug_conf = ida_aug_conf, training=False),
dict(type='NormalizeMultiviewImage', **img_norm_cfg),
dict(type='PadMultiViewImage', size_divisor=32),
dict(
type='MultiScaleFlipAug3D',
img_scale=(1333, 800),
pts_scale_ratio=1,
flip=False,
transforms=[
dict(
type='DefaultFormatBundle3D',
class_names=class_names,
with_label=False),
dict(type='Collect3D', keys=['img'],
meta_keys=('filename', 'ori_shape', 'img_shape', 'lidar2img', 'intrinsics', 'extrinsics',
'pad_shape', 'scale_factor', 'flip', 'box_mode_3d', 'box_type_3d',
'img_norm_cfg', 'sample_idx', 'timestamp'))
])
]
data = dict(
samples_per_gpu=2,
workers_per_gpu=4,
train=dict(
type=dataset_type,
data_root=data_root,
ann_file=data_root + 'mmdet3d_nuscenes_30f_infos_train.pkl',
pipeline=train_pipeline,
classes=class_names,
modality=input_modality,
test_mode=False,
use_valid_flag=True,
# we use box_type_3d='LiDAR' in kitti and nuscenes dataset
# and box_type_3d='Depth' in sunrgbd and scannet dataset.
box_type_3d='LiDAR'),
val=dict(type=dataset_type, pipeline=test_pipeline, ann_file=data_root + 'mmdet3d_nuscenes_30f_infos_val.pkl', classes=class_names, modality=input_modality),
test=dict(type=dataset_type, load_interval=10, pipeline=test_pipeline, ann_file=data_root + 'mmdet3d_nuscenes_30f_infos_val.pkl', classes=class_names, modality=input_modality))
optimizer = dict(
type='AdamW',
lr=4e-4,
paramwise_cfg=dict(
custom_keys={
'img_backbone': dict(lr_mult=0.1),
}),
weight_decay=0.01)
optimizer_config = dict(type='Fp16OptimizerHook', loss_scale=512., grad_clip=dict(max_norm=35, norm_type=2))
# learning policy
lr_config = dict(
policy='CosineAnnealing',
warmup='linear',
warmup_iters=500,
warmup_ratio=1.0 / 3,
min_lr_ratio=1e-3,
)
total_epochs = 100
evaluation = dict(interval=100, pipeline=test_pipeline)
find_unused_parameters=False #### when use checkpoint, find_unused_parameters must be False
checkpoint_config = dict(interval=1, max_keep_ckpts=3)
runner = dict(type='EpochBasedRunner', max_epochs=total_epochs)
load_from=None
resume_from=None
##60epoch
# Evaluating bboxes of pts_bbox
# mAP: 0.3338
# mATE: 0.7818
# mASE: 0.2755
# mAOE: 0.5365
# mAVE: 0.4621
# mAAE: 0.1950
# NDS: 0.4418
# Eval time: 202.7s
# Per-class results:
# Object Class AP ATE ASE AOE AVE AAE
# car 0.540 0.535 0.153 0.087 0.375 0.184
# truck 0.276 0.770 0.218 0.144 0.367 0.207
# bus 0.325 0.941 0.216 0.097 1.031 0.288
# trailer 0.168 1.190 0.250 0.522 0.237 0.122
# construction_vehicle 0.089 1.002 0.460 1.227 0.123 0.373
# pedestrian 0.384 0.722 0.302 0.846 0.621 0.237
# motorcycle 0.307 0.735 0.263 0.709 0.744 0.143
# bicycle 0.296 0.650 0.276 1.062 0.198 0.006
# traffic_cone 0.524 0.549 0.327 nan nan nan
# barrier 0.429 0.723 0.289 0.135 nan nan
##100epoch
# Evaluating bboxes of pts_bbox
# mAP: 0.3431
# mATE: 0.7168
# mASE: 0.2773
# mAOE: 0.5288
# mAVE: 0.4628
# mAAE: 0.1890
# NDS: 0.4541
# Eval time: 200.5s
# Per-class results: Object Class AP ATE ASE AOE AVE AAE
# car 0.548 0.517 0.155 0.080 0.378 0.186
# truck 0.272 0.750 0.222 0.169 0.377 0.199
# bus 0.347 0.793 0.212 0.098 1.165 0.281
# trailer 0.159 1.091 0.250 0.596 0.233 0.086
# construction_vehicle 0.089 0.928 0.444 1.137 0.129 0.336
# pedestrian 0.368 0.712 0.309 0.874 0.637 0.234
# motorcycle 0.329 0.660 0.266 0.777 0.599 0.182
# bicycle 0.290 0.612 0.282 0.902 0.183 0.008
# traffic_cone 0.552 0.485 0.345 nan nan nan
# barrier 0.476 0.620 0.289 0.127 nan nan
##100epoch bs2
# Evaluating bboxes of pts_bbox
# mAP: 0.3497
# mATE: 0.7264
# mASE: 0.2769
# mAOE: 0.5048
# mAVE: 0.5025
# mAAE: 0.1808
# NDS: 0.4557
# Eval time: 199.2s
# Per-class results:
# Object Class AP ATE ASE AOE AVE AAE
# car 0.558 0.511 0.154 0.078 0.432 0.182
# truck 0.283 0.749 0.220 0.115 0.440 0.204
# bus 0.361 0.765 0.202 0.121 1.132 0.283
# trailer 0.153 1.138 0.264 0.600 0.260 0.049
# construction_vehicle 0.098 1.017 0.446 1.096 0.122 0.345
# pedestrian 0.380 0.696 0.311 0.885 0.631 0.235
# motorcycle 0.328 0.683 0.274 0.631 0.787 0.138
# bicycle 0.282 0.618 0.286 0.897 0.217 0.011
# traffic_cone 0.559 0.491 0.329 nan nan nan
# barrier 0.497 0.597 0.282 0.121 nan nan
Thank you for your kind reply.
I modified
projects/configs/denoise/petrv2_vovnet_gridmask_p4_800x320_dn.py
to generate a config file using ResNet-50DCN as backbone. After training, I got a lower performance compared to the paper:The reported performance is 0.456 NDS and 0.350 mAP in the original paper. Is there any difference in the config to cause the problem?
The modified config is: