@SpeedyGonzales949 Would you please provide your config? I need more details to reproduce the results.
Hi, @exiawsh! Thanks for writing back. Here is the config file. I do want to mention that I am not using flash attention.
_base_ = [
    '../../../mmdetection3d/configs/_base_/datasets/nus-3d.py',
    '../../../mmdetection3d/configs/_base_/default_runtime.py'
]
backbone_norm_cfg = dict(type='LN', requires_grad=True)
plugin=True
plugin_dir='projects/mmdet3d_plugin/'
# If point cloud range is changed, the models should also change their point
# cloud range accordingly
point_cloud_range = [-51.2, -51.2, -5.0, 51.2, 51.2, 3.0]
voxel_size = [0.2, 0.2, 8]
img_norm_cfg = dict(
mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True)
# For nuScenes we usually do 10-class detection
class_names = [
'car', 'truck', 'construction_vehicle', 'bus', 'trailer', 'barrier',
'motorcycle', 'bicycle', 'pedestrian', 'traffic_cone'
]
num_gpus = 4
batch_size = 2
num_iters_per_epoch = 28130 // (num_gpus * batch_size)
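# with 4 GPUs and a batch size of 2 per GPU, this is 28130 // 8 = 3516 iterations per epoch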
num_epochs = 24
queue_length = 1
num_frame_losses = 1
collect_keys=['lidar2img', 'intrinsics', 'extrinsics','timestamp', 'img_timestamp', 'ego_pose', 'ego_pose_inv']
input_modality = dict(
use_lidar=False,
use_camera=True,
use_radar=False,
use_map=False,
use_external=True)
model = dict(
type='Petr3D',
num_frame_head_grads=num_frame_losses,
num_frame_backbone_grads=num_frame_losses,
num_frame_losses=num_frame_losses,
use_grid_mask=True,
img_backbone=dict(
init_cfg=dict(
type='Pretrained', checkpoint="ckpts/cascade_mask_rcnn_r50_fpn_coco-20e_20e_nuim_20201009_124951-40963960.pth",
prefix='backbone.'),
type='ResNet',
depth=50,
num_stages=4,
out_indices=(2, 3),
frozen_stages=-1,
norm_cfg=dict(type='BN2d', requires_grad=False),
norm_eval=True,
with_cp=True,
style='pytorch'),
img_neck=dict(
type='CPFPN', ###remove unused parameters
in_channels=[1024, 2048],
out_channels=256,
num_outs=2),
img_roi_head=dict(
type='FocalHead',
num_classes=10,
in_channels=256,
loss_cls2d=dict(
type='QualityFocalLoss',
use_sigmoid=True,
beta=2.0,
loss_weight=2.0),
loss_centerness=dict(type='GaussianFocalLoss', reduction='mean', loss_weight=1.0),
loss_bbox2d=dict(type='L1Loss', loss_weight=5.0),
loss_iou2d=dict(type='GIoULoss', loss_weight=2.0),
loss_centers2d=dict(type='L1Loss', loss_weight=10.0),
train_cfg=dict(
assigner2d=dict(
type='HungarianAssigner2D',
cls_cost=dict(type='FocalLossCost', weight=2.),
reg_cost=dict(type='BBoxL1Cost', weight=5.0, box_format='xywh'),
iou_cost=dict(type='IoUCost', iou_mode='giou', weight=2.0),
centers2d_cost=dict(type='BBox3DL1Cost', weight=10.0)))
),
pts_bbox_head=dict(
type='StreamPETRHead',
num_classes=10,
in_channels=256,
num_query=300,
memory_len=512,
topk_proposals=128,
num_propagated=128,
with_ego_pos=True,
match_with_velo=False,
scalar=10, ##noise groups
noise_scale = 1.0,
dn_weight= 1.0, ##dn loss weight
split = 0.75, ###positive rate
LID=True,
with_position=True,
position_range=[-61.2, -61.2, -10.0, 61.2, 61.2, 10.0],
code_weights = [2.0, 2.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0],
transformer=dict(
type='PETRTemporalTransformer',
decoder=dict(
type='PETRTransformerDecoder',
return_intermediate=True,
num_layers=6,
transformerlayers=dict(
type='PETRTemporalDecoderLayer',
attn_cfgs=[
dict(
type='MultiheadAttention',
embed_dims=256,
num_heads=8,
dropout=0.1),
dict(
type='PETRMultiheadAttention',
embed_dims=256,
num_heads=8,
dropout=0.1),
],
feedforward_channels=2048,
ffn_dropout=0.1,
with_cp=True, ###use checkpoint to save memory
operation_order=('self_attn', 'norm', 'cross_attn', 'norm',
'ffn', 'norm')),
)),
bbox_coder=dict(
type='NMSFreeCoder',
post_center_range=[-61.2, -61.2, -10.0, 61.2, 61.2, 10.0],
pc_range=point_cloud_range,
max_num=300,
voxel_size=voxel_size,
num_classes=10),
loss_cls=dict(
type='FocalLoss',
use_sigmoid=True,
gamma=2.0,
alpha=0.25,
loss_weight=2.0),
loss_bbox=dict(type='L1Loss', loss_weight=0.25),
loss_iou=dict(type='GIoULoss', loss_weight=0.0),),
# model training and testing settings
train_cfg=dict(pts=dict(
grid_size=[512, 512, 1],
voxel_size=voxel_size,
point_cloud_range=point_cloud_range,
out_size_factor=4,
assigner=dict(
type='HungarianAssigner3D',
cls_cost=dict(type='FocalLossCost', weight=2.0),
reg_cost=dict(type='BBox3DL1Cost', weight=0.25),
iou_cost=dict(type='IoUCost', weight=0.0), # Fake cost. This is just to make it compatible with DETR head.
pc_range=point_cloud_range),)))
dataset_type = 'CustomNuScenesDataset'
data_root = './data/nuscenes/'
file_client_args = dict(backend='disk')
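# image-level augmentation settings consumed by ResizeCropFlipRotImage: random resize within resize_lim,
# then crop to final_dim (H, W) from the original 900x1600 images, with optional random horizontal flip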
ida_aug_conf = {
"resize_lim": (0.38, 0.55),
"final_dim": (256, 704),
"bot_pct_lim": (0.0, 0.0),
"rot_lim": (0.0, 0.0),
"H": 900,
"W": 1600,
"rand_flip": True,
}
train_pipeline = [
dict(type='LoadMultiViewImageFromFiles', to_float32=True),
dict(type='LoadAnnotations3D', with_bbox_3d=True, with_label_3d=True, with_bbox=True,
with_label=True, with_bbox_depth=True),
dict(type='ObjectRangeFilter', point_cloud_range=point_cloud_range),
dict(type='ObjectNameFilter', classes=class_names),
dict(type='ResizeCropFlipRotImage', data_aug_conf = ida_aug_conf, training=True),
dict(type='GlobalRotScaleTransImage',
rot_range=[-0.3925, 0.3925],
translation_std=[0, 0, 0],
scale_ratio_range=[0.95, 1.05],
reverse_angle=True,
training=True,
),
dict(type='NormalizeMultiviewImage', **img_norm_cfg),
dict(type='PadMultiViewImage', size_divisor=32),
dict(type='PETRFormatBundle3D', class_names=class_names, collect_keys=collect_keys + ['prev_exists']),
dict(type='Collect3D', keys=['gt_bboxes_3d', 'gt_labels_3d', 'img', 'gt_bboxes', 'gt_labels', 'centers2d', 'depths', 'prev_exists'] + collect_keys,
meta_keys=('filename', 'ori_shape', 'img_shape', 'pad_shape', 'scale_factor', 'flip', 'box_mode_3d', 'box_type_3d', 'img_norm_cfg', 'scene_token', 'gt_bboxes_3d','gt_labels_3d'))
]
test_pipeline = [
dict(type='LoadMultiViewImageFromFiles', to_float32=True),
dict(type='ResizeCropFlipRotImage', data_aug_conf = ida_aug_conf, training=False),
dict(type='NormalizeMultiviewImage', **img_norm_cfg),
dict(type='PadMultiViewImage', size_divisor=32),
dict(
type='MultiScaleFlipAug3D',
img_scale=(1333, 800),
pts_scale_ratio=1,
flip=False,
transforms=[
dict(
type='PETRFormatBundle3D',
collect_keys=collect_keys,
class_names=class_names,
with_label=False),
dict(type='Collect3D', keys=['img'] + collect_keys,
meta_keys=('filename', 'ori_shape', 'img_shape','pad_shape', 'scale_factor', 'flip', 'box_mode_3d', 'box_type_3d', 'img_norm_cfg', 'scene_token'))
])
]
data = dict(
samples_per_gpu=batch_size,
workers_per_gpu=4,
train=dict(
type=dataset_type,
data_root=data_root,
ann_file=data_root + 'nuscenes2d_temporal_infos_train.pkl',
num_frame_losses=num_frame_losses,
seq_split_num=2, # streaming video training
seq_mode=True, # streaming video training
pipeline=train_pipeline,
classes=class_names,
modality=input_modality,
collect_keys=collect_keys + ['img', 'prev_exists', 'img_metas'],
queue_length=queue_length,
test_mode=False,
use_valid_flag=True,
filter_empty_gt=False,
box_type_3d='LiDAR'),
val=dict(type=dataset_type, pipeline=test_pipeline, collect_keys=collect_keys + ['img', 'img_metas'], queue_length=queue_length, ann_file=data_root + 'nuscenes2d_temporal_infos_val.pkl', classes=class_names, modality=input_modality),
test=dict(type=dataset_type, pipeline=test_pipeline, collect_keys=collect_keys + ['img', 'img_metas'], queue_length=queue_length, ann_file=data_root + 'nuscenes2d_temporal_infos_val.pkl', classes=class_names, modality=input_modality),
shuffler_sampler=dict(type='InfiniteGroupEachSampleInBatchSampler'),
nonshuffler_sampler=dict(type='DistributedSampler')
)
optimizer = dict(
type='AdamW',
lr=2e-4, # bs 8: 2e-4 || bs 16: 4e-4
paramwise_cfg=dict(
custom_keys={
'img_backbone': dict(lr_mult=0.1), # set to 0.1 always better when apply 2D pretrained.
}),
weight_decay=0.01)
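# mixed-precision (fp16) training with dynamic loss scaling and gradient clipping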
optimizer_config = dict(type='Fp16OptimizerHook', loss_scale='dynamic', grad_clip=dict(max_norm=35, norm_type=2))
# learning policy
lr_config = dict(
policy='CosineAnnealing',
warmup='linear',
warmup_iters=500,
warmup_ratio=1.0 / 3,
min_lr_ratio=1e-3,
)
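# interval equals the total number of iterations, so evaluation runs only once, at the end of training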
evaluation = dict(interval=num_iters_per_epoch*num_epochs, pipeline=test_pipeline)
find_unused_parameters=False #### when use checkpoint, find_unused_parameters must be False
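# save a checkpoint once per epoch and keep only the last 3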
checkpoint_config = dict(interval=num_iters_per_epoch, max_keep_ckpts=3)
runner = dict(
type='IterBasedRunner', max_iters=num_epochs * num_iters_per_epoch)
load_from=None
resume_from=None
@SpeedyGonzales949 Hi, if you use our provided checkpoint, is the result abnormal?
Yes, I am using the provided checkpoint, this one: cascade_mask_rcnn_r50_fpn_coco-20e_20e_nuim_20201009_124951-40963960.pth. Or do you mean if I use your full model?
@SpeedyGonzales949 Yes, I want to check the full model performance. The backbone checkpoint is right.
If I evaluate your full model, I do get the same metrics as you on the nuScenes dataset.
How about changing the batch size to 4 per GPU and setting the learning rate to 4e-4?
Unfortunately, I don't have enough GPU memory for a batch size of 4.
You said you got the same results on the v2-99-900q model. Do "the same results" refer to 0 mAP or to 48 mAP, as ours?
If I evaluate the model (v2-99-900q) with the weights you provide for the full model, I do get the same metrics as yours (mAP = 48) on the nuScenes dataset. But if I train it myself, the mAP is 0. I also evaluated multiple checkpoints from different epochs, and the mAP is still 0.
And is loss_cls 0 for all models? You can send your project to my email (1027293308@qq.com) and tell me which config you used (the one giving 0 mAP). Because you changed some code (for flash attention), I need to check the code first.
@SpeedyGonzales949 And I just uploaded a new version with deformable attention; you can try this first, which saves more GPU memory and does not need flash attention. https://github.com/exiawsh/StreamPETR/blob/main/projects/configs/RepDETR3D/repdetr3d_vov_800_bs2_seq_24e.py
Well, for the flash attention module I just changed the dictionary for FlashAttention, as you said in the Train & Inference .md. The result is this (the full attn_cfgs list it produces is sketched below):
dict(
    type='PETRMultiheadAttention',
    embed_dims=256,
    num_heads=8,
    dropout=0.1,
    fp16=True),
]
Yes, the results of all models I tried (R50-428q and v2-99-900q) are mAP = 0. I also tried running on a single GPU, but the mAP is still 0. I do want to specify that when I train on a single GPU none of the losses are 0, but the mAP of the models after evaluation is still 0. I will now try the new config you provided.
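For reference, assuming the change only touches the second attention entry (my reading of the Train & Inference doc, not a verified config), the attn_cfgs list in pts_bbox_head then becomes:
attn_cfgs=[
    dict(
        type='MultiheadAttention',
        embed_dims=256,
        num_heads=8,
        dropout=0.1),
    dict(
        type='PETRMultiheadAttention',
        embed_dims=256,
        num_heads=8,
        dropout=0.1,
        fp16=True),
],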
Hi! Two errors appear when running with the new config; however, they do not stop the training.
2023-08-09 16:37:40,270 - mmdet - INFO - load checkpoint from local path: ckpts/fcos3d_vovnet_imgbackbone-remapped.pth
2023-08-09 16:37:40,708 - mmdet - WARNING - The model and loaded state dict do not match exactly
The second one is: error in ms_deformable_im2col_cuda: no kernel image is available for execution on the device. This one keeps printing in the background multiple times.
@SpeedyGonzales949 The first warning is normal in the training process. I'm not sure if the second one will affect the results. Are you sure your mmcv and PyTorch versions correspond to our requirements?
Hi! I am using a higher version of PyTorch, but the same versions of the mm-dependencies as you. I think this might be the problem. I am using mmcv 1.6.0 with PyTorch 1.13 and CUDA 11.6. I will now check whether changing the mmcv version resolves the problem. In the meantime, here are the dependencies I use:
sys.platform: linux
Python: 3.8.17 (default, Jul 5 2023, 21:04:15) [GCC 11.2.0]
PyTorch: 1.13.0
PyTorch compiling details: PyTorch built with:
- GCC 9.3
- C++ Version: 201402
- CUDA Runtime 11.6
- CuDNN 8.3.2 (built against CUDA 11.5)
- Magma 2.6.1
TorchVision: 0.14.0
OpenCV: 4.8.0
MMCV: 1.6.0
MMCV Compiler: GCC 10.2
MMCV CUDA Compiler: 11.6
MMDetection: 2.28.2
MMSegmentation: 0.30.0
MMDetection3D: 1.0.0rc6+bbdcb33
spconv2.0: False
I met the same problem. The main problem is the focal loss; you can try to use py_sigmoid_focal_loss instead of the C++ (CUDA extension) focal loss.
@dk-liang Do you mean this dict?
loss_cls=dict(
type='FocalLoss',
use_sigmoid=True,
gamma=2.0,
alpha=0.25,
loss_weight=2.0),
Yes, for focal_loss.py: I actually copied the .py file from the mmdet source and then disabled the C++ version of the focal loss.
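Roughly, the pure-Python version computes the same loss as the CUDA kernel. A minimal, self-contained sketch of py_sigmoid_focal_loss (not the exact mmdet code, just the idea) is:

import torch
import torch.nn.functional as F

def py_sigmoid_focal_loss(pred, target, gamma=2.0, alpha=0.25):
    # pred: raw logits with shape (N, num_classes); target: one-hot of the same shape
    pred_sigmoid = pred.sigmoid()
    target = target.type_as(pred)
    # pt is large for misclassified samples, so pt**gamma up-weights hard examples
    pt = (1 - pred_sigmoid) * target + pred_sigmoid * (1 - target)
    focal_weight = (alpha * target + (1 - alpha) * (1 - target)) * pt.pow(gamma)
    loss = F.binary_cross_entropy_with_logits(pred, target, reduction='none') * focal_weight
    return loss.mean()

# quick check on random data: 5 predictions over the 10 nuScenes classes
pred = torch.randn(5, 10, requires_grad=True)
labels = torch.randint(0, 10, (5,))
one_hot = F.one_hot(labels, num_classes=10).float()
print(py_sigmoid_focal_loss(pred, one_hot))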
Upgrading the mmcv version to match the PyTorch version did not work; training on the 1080 Ti still has loss 0. However, using a Tesla T4 GPU solved all the issues: there is no loss = 0 during training, and after evaluating the model, it really learned something. There was a slight issue with the Tesla T4, though:
[E ProcessGroupNCCL.cpp:587] [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(OpType=BROADCAST, Timeout(ms)=1800000) ran for 1808499 milliseconds before timing out.
[E ProcessGroupNCCL.cpp:587] [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(OpType=BROADCAST, Timeout(ms)=1800000) ran for 1808493 milliseconds before timing out.
But I managed to solve this issue using this command: export NCCL_P2P_DISABLE=1
I also tested this GPU with deformable attention and flash attention, and it still works.
Hello, @exiawsh! I am currently trying to train the same model for 3D object detection, but loss_cls is 0 from the start of training. Also, after evaluating the checkpoints, the model has not learned anything: mAP = 0. Do you know where the issue could be? Thank you so much for your help. I also tried to train with the V2-99-900q setting, but got the same results. As a setup, I used 4 GTX 1080 Ti GPUs and a batch size of 2.
2023-08-08 14:05:29,147 - mmcv - INFO - Reducer buckets have been rebuilt in this iteration.
2023-08-08 14:06:24,603 - mmdet - INFO - Iter [50/84384] lr: 7.973e-05, eta: 1 day, 6:26:08, time: 1.299, data_time: 0.110, memory: 3149, frame_0_loss_cls: 0.0000, frame_0_loss_bbox: 3.7671, frame_0_d0.loss_cls: 0.0000, frame_0_d0.loss_bbox: 3.7788, frame_0_d1.loss_cls: 0.0000, frame_0_d1.loss_bbox: 3.7760, frame_0_d2.loss_cls: 0.0000, frame_0_d2.loss_bbox: 3.7727, frame_0_d3.loss_cls: 0.0000, frame_0_d3.loss_bbox: 3.7728, frame_0_d4.loss_cls: 0.0000, frame_0_d4.loss_bbox: 3.7748, frame_0_dn_loss_cls: 0.0000, frame_0_dn_loss_bbox: 1.9941, frame_0_d0.dn_loss_cls: 0.0000, frame_0_d0.dn_loss_bbox: 2.0607, frame_0_d1.dn_loss_cls: 0.0000, frame_0_d1.dn_loss_bbox: 2.0653, frame_0_d2.dn_loss_cls: 0.0000, frame_0_d2.dn_loss_bbox: 2.0303, frame_0_d3.dn_loss_cls: 0.0000, frame_0_d3.dn_loss_bbox: 2.0818, frame_0_d4.dn_loss_cls: 0.0000, frame_0_d4.dn_loss_bbox: 1.9912, frame_0_enc_loss_cls: 0.5157, frame_0_enc_loss_bbox: 1.9015, frame_0_enc_loss_iou: 1.4707, frame_0_centers2d_losses: 0.5320, frame_0_centerness_losses: 1.5092, loss: 40.7948, grad_norm: inf
2023-08-08 14:07:19,864 - mmdet - INFO - Iter [100/84384] lr: 9.307e-05, eta: 1 day, 4:08:42, time: 1.105, data_time: 0.054, memory: 3395, frame_0_loss_cls: 0.0000, frame_0_loss_bbox: 3.6936, frame_0_d0.loss_cls: 0.0000, frame_0_d0.loss_bbox: 3.6956, frame_0_d1.loss_cls: 0.0000, frame_0_d1.loss_bbox: 3.6859, frame_0_d2.loss_cls: 0.0000, frame_0_d2.loss_bbox: 3.6789, frame_0_d3.loss_cls: 0.0000, frame_0_d3.loss_bbox: 3.6837, frame_0_d4.loss_cls: 0.0000, frame_0_d4.loss_bbox: 3.6854, frame_0_dn_loss_cls: 0.0000, frame_0_dn_loss_bbox: 1.9260, frame_0_d0.dn_loss_cls: 0.0000, frame_0_d0.dn_loss_bbox: 1.9484, frame_0_d1.dn_loss_cls: 0.0000, frame_0_d1.dn_loss_bbox: 1.9029, frame_0_d2.dn_loss_cls: 0.0000, frame_0_d2.dn_loss_bbox: 1.8973, frame_0_d3.dn_loss_cls: 0.0000, frame_0_d3.dn_loss_bbox: 1.8894, frame_0_d4.dn_loss_cls: 0.0000, frame_0_d4.dn_loss_bbox: 1.9043, frame_0_enc_loss_cls: 0.8143, frame_0_enc_loss_bbox: 0.5043, frame_0_enc_loss_iou: 1.0161, frame_0_centers2d_losses: 0.3659, frame_0_centerness_losses: 0.7476, loss: 37.0396, grad_norm: 57.3683
2023-08-08 14:08:14,213 - mmdet - INFO - Iter [150/84384] lr: 1.064e-04, eta: 1 day, 3:14:08, time: 1.088, data_time: 0.052, memory: 3395, frame_0_loss_cls: 0.0000, frame_0_loss_bbox: 3.4121, frame_0_d0.loss_cls: 0.0000, frame_0_d0.loss_bbox: 3.4036, frame_0_d1.loss_cls: 0.0000, frame_0_d1.loss_bbox: 3.3995, frame_0_d2.loss_cls: 0.0000, frame_0_d2.loss_bbox: 3.4013, frame_0_d3.loss_cls: 0.0000, frame_0_d3.loss_bbox: 3.4040, frame_0_d4.loss_cls: 0.0000, frame_0_d4.loss_bbox: 3.4070, frame_0_dn_loss_cls: 0.0000, frame_0_dn_loss_bbox: 1.8255, frame_0_d0.dn_loss_cls: 0.0000, frame_0_d0.dn_loss_bbox: 1.8361, frame_0_d1.dn_loss_cls: 0.0000, frame_0_d1.dn_loss_bbox: 1.8242, frame_0_d2.dn_loss_cls: 0.0000, frame_0_d2.dn_loss_bbox: 1.8139, frame_0_d3.dn_loss_cls: 0.0000, frame_0_d3.dn_loss_bbox: 1.8137, frame_0_d4.dn_loss_cls: 0.0000, frame_0_d4.dn_loss_bbox: 1.8213, frame_0_enc_loss_cls: 0.8254, frame_0_enc_loss_bbox: 0.4245, frame_0_enc_loss_iou: 0.9331, frame_0_centers2d_losses: 0.3189, frame_0_centerness_losses: 0.7648, loss: 34.6288, grad_norm: 63.2657