open-mmlab / mmdetection3d

OpenMMLab's next-generation platform for general 3D object detection.
https://mmdetection3d.readthedocs.io/en/latest/
Apache License 2.0

A problem when I train CenterPoint using the KITTI dataset #2728

Closed: randomfforest closed this issue 1 year ago

randomfforest commented 1 year ago

Prerequisite

Task

I'm using the official example scripts/configs for the officially supported tasks/models/datasets.

Branch

main branch https://github.com/open-mmlab/mmdetection3d

Environment

System environment:
    sys.platform: linux
    Python: 3.10.0 (default, Mar 3 2022, 09:58:08) [GCC 7.5.0]
    CUDA available: True
    numpy_random_seed: 545278729
    GPU 0: NVIDIA RTX A4000 Laptop GPU
    CUDA_HOME: /usr/local/cuda-11.7
    NVCC: Cuda compilation tools, release 11.7, V11.7.99
    GCC: gcc (Ubuntu 9.4.0-1ubuntu1~20.04.2) 9.4.0
    PyTorch: 2.0.1+cu117
    PyTorch compiling details: PyTorch built with:

Runtime environment:
    cudnn_benchmark: False
    mp_cfg: {'mp_start_method': 'fork', 'opencv_num_threads': 0}
    dist_cfg: {'backend': 'nccl'}
    seed: 545278729
    Distributed launcher: none
    Distributed training: False
    GPU number: 1

Reproduces the problem - code sample

voxel_size = [0.2, 0.2, 4]  # xg
point_cloud_range = [0, -40, -3, 70.4, 40, 1]
model = dict(
    type='CenterPoint',
    data_preprocessor=dict(
        type='Det3DDataPreprocessor',
        voxel=True,
        voxel_layer=dict(
            max_num_points=20,
            point_cloud_range=point_cloud_range,  # xg
            voxel_size=voxel_size,
            # max_voxels=(30000, 40000))),  # xg
            max_voxels=(16000, 40000))),
pts_voxel_encoder=dict(
    type='PillarFeatureNet',
    #in_channels=5,
    in_channels=4,      #xg
    point_cloud_range = point_cloud_range,   
    feat_channels=[64],
    with_distance=False,
    voxel_size=(0.2, 0.2, 4), #xg
    norm_cfg=dict(type='BN1d', eps=1e-3, momentum=0.01),
    legacy=False),
pts_middle_encoder=dict(
    type='PointPillarsScatter', in_channels=64, output_shape=(400, 352)), #xg
pts_backbone=dict(
    type='SECOND',
    in_channels=64,
    out_channels=[64, 128, 256],
    layer_nums=[3, 5, 5],
    layer_strides=[2, 2, 2],
    norm_cfg=dict(type='BN', eps=1e-3, momentum=0.01),
    conv_cfg=dict(type='Conv2d', bias=False)),
pts_neck=dict(
    type='SECONDFPN',
    in_channels=[64, 128, 256],
    out_channels=[128, 128, 128],
    upsample_strides=[0.5, 1, 2],
    # norm_cfg=dict(type='BN', eps=1e-3, momentum=0.01),  
    upsample_cfg=dict(type='deconv', bias=False),
    use_conv_for_no_stride=True),
pts_bbox_head=dict(
    type='CenterHead',
    in_channels=sum([128, 128, 128]),
    tasks=[
            dict(num_class=1, class_names=['car']),
            dict(num_class=1, class_names=['cyclist']),
            dict(num_class=1, class_names=['pedestrian']),
    ],  #xg
    common_heads=dict(
        # reg=(2, 2), height=(1, 2), dim=(3, 2), rot=(2, 2), vel=(2, 2)),  
        reg=(2, 2), height=(1, 2), dim=(3, 2), rot=(2, 2)),
    share_conv_channel=64,
    bbox_coder=dict(
        type='CenterPointBBoxCoder',
        # post_center_range=[-61.2, -61.2, -10.0, 61.2, 61.2, 10.0],
        post_center_range=point_cloud_range,
        # max_num=500,
        pc_range =point_cloud_range[:2],  #xg
        max_num=100,
        score_threshold=0.1,
        out_size_factor=4, 
        voxel_size=voxel_size[:2],
        # code_size=9,
        code_size=7),     #xg,
    separate_head=dict(
        type='SeparateHead', init_bias=-2.19, final_kernel=3),
    loss_cls=dict(type='mmdet.GaussianFocalLoss', reduction='mean'),
    loss_bbox=dict(
        type='mmdet.L1Loss', reduction='mean', loss_weight=0.25),
    norm_bbox=True),
# model training and testing settings
train_cfg=dict(
    pts=dict(
        # grid_size=[512, 512, 1],
        grid_size=[325, 400, 1],
        voxel_size=voxel_size,
        out_size_factor=4,
        point_cloud_range = point_cloud_range, #add
        dense_reg=1,
        gaussian_overlap=0.1,
        max_objs=500,
        min_radius=2,
        # code_weights=[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.2, 0.2]
        code_weights=[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0])),  #
test_cfg=dict(
    pts=dict(
        # post_center_limit_range=[-61.2, -61.2, -10.0, 61.2, 61.2, 10.0],
        post_center_limit_range=[0, -40, -3, 70.4, 40, 1],
        max_per_img=500,
        max_pool_nms=False,
        min_radius=[4, 12, 10, 1, 0.85, 0.175],
        score_threshold=0.1,
        # pc_range=[-51.2, -51.2],
        pc_range = [0, -40, -3, 70.4, 40, 1][:2],
        out_size_factor=4,
        voxel_size=voxel_size[:2],
        nms_type='rotate',
        # pre_max_size=1000,
        # post_max_size=83,
        pre_max_size=4000,
        post_max_size=500,
        nms_thr=0.2)))

dataset_type = 'KittiDataset'
data_root = 'data/kitti/'
class_names = ['Pedestrian', 'Cyclist', 'Car']

point_cloud_range = [0, -40, -3, 70.4, 40, 1]

input_modality = dict(use_lidar=True, use_camera=False)
metainfo = dict(classes=class_names)
db_sampler = dict(
    data_root=data_root,
    info_path=data_root + 'kitti_dbinfos_train.pkl',
    rate=1.0,
    prepare=dict(
        filter_by_difficulty=[-1],
        filter_by_min_points=dict(Car=5, Pedestrian=10, Cyclist=10)),
    classes=class_names,
    # sample_groups=dict(Car=12, Pedestrian=6, Cyclist=6),  # centerpoint-kitti
    sample_groups=dict(Car=12, Pedestrian=10, Cyclist=10),
    points_loader=dict(
        type='LoadPointsFromFile',
        coord_type='LIDAR',
        load_dim=4,
        use_dim=4,
        backend_args=backend_args),
    backend_args=backend_args)

train_pipeline = [
    dict(
        type='LoadPointsFromFile',
        coord_type='LIDAR',
        load_dim=4,  # x, y, z, intensity
        use_dim=4,
        backend_args=backend_args),
    # dict(type='LoadAnnotations3D', with_bbox_3d=True, with_label_3d=True),
    dict(type='ObjectSample', db_sampler=db_sampler),
    dict(
        type='ObjectNoise',
        num_try=100,
        translation_std=[1.0, 1.0, 0.5],
        global_rot_range=[0.0, 0.0],
        rot_range=[-0.78539816, 0.78539816]),
    dict(type='RandomFlip3D', flip_ratio_bev_horizontal=0.5),
    dict(
        type='GlobalRotScaleTrans',
        rot_range=[-0.78539816, 0.78539816],
        scale_ratio_range=[0.95, 1.05]),
    dict(type='PointsRangeFilter', point_cloud_range=point_cloud_range),
    dict(type='ObjectRangeFilter', point_cloud_range=point_cloud_range),
    dict(type='PointShuffle'),
    dict(
        type='Pack3DDetInputs',
        keys=['points', 'gt_bboxes_3d', 'gt_labels_3d'])
]
test_pipeline = [
    dict(
        type='LoadPointsFromFile',
        coord_type='LIDAR',
        load_dim=4,
        use_dim=4,
        backend_args=backend_args),
    dict(
        type='MultiScaleFlipAug3D',
        img_scale=(1333, 800),
        pts_scale_ratio=1,
        flip=False,
        transforms=[
            dict(
                type='GlobalRotScaleTrans',
                rot_range=[0, 0],
                scale_ratio_range=[1., 1.],
                translation_std=[0, 0, 0]),
            dict(type='RandomFlip3D'),
            dict(
                type='PointsRangeFilter',
                point_cloud_range=point_cloud_range)
        ]),
    dict(type='Pack3DDetInputs', keys=['points'])
]
eval_pipeline = [
    dict(
        type='LoadPointsFromFile',
        coord_type='LIDAR',
        load_dim=4,
        use_dim=4,
        backend_args=backend_args),
    dict(type='Pack3DDetInputs', keys=['points'])
]
train_dataloader = dict(
    batch_size=2,
    num_workers=4,
    persistent_workers=True,
    sampler=dict(type='DefaultSampler', shuffle=True),
    dataset=dict(
        type='RepeatDataset',
        times=2,
        dataset=dict(
            type=dataset_type,
            data_root=data_root,
            ann_file='kitti_infos_train.pkl',
            # data_prefix=dict(pts='training/velodyne_reduced'),
            pipeline=train_pipeline,
            modality=input_modality,
            test_mode=False,
            metainfo=metainfo,
            # we use box_type_3d='LiDAR' in kitti and nuscenes dataset
            # and box_type_3d='Depth' in sunrgbd and scannet dataset.
            box_type_3d='LiDAR',
            backend_args=backend_args)))

vis_backends = [dict(type='LocalVisBackend')]
visualizer = dict(
    type='Det3DLocalVisualizer', vis_backends=vis_backends, name='visualizer')
lr = 0.0018
optim_wrapper = dict(
    type='OptimWrapper',
    optimizer=dict(type='AdamW', lr=lr, betas=(0.95, 0.99), weight_decay=0.01),
    clip_grad=dict(max_norm=10, norm_type=2))
param_scheduler = [
    dict(
        type='CosineAnnealingLR',
        T_max=16,
        eta_min=lr * 10,
        begin=0,
        end=16,
        by_epoch=True,
        convert_to_iter_based=True),
    dict(
        type='CosineAnnealingLR',
        T_max=24,
        eta_min=lr * 1e-4,
        begin=16,
        end=40,
        by_epoch=True,
        convert_to_iter_based=True),
    dict(
        type='CosineAnnealingMomentum',
        T_max=16,
        eta_min=0.85 / 0.95,
        begin=0,
        end=16,
        by_epoch=True,
        convert_to_iter_based=True),
    dict(
        type='CosineAnnealingMomentum',
        T_max=24,
        eta_min=1,
        begin=16,
        end=40,
        by_epoch=True,
        convert_to_iter_based=True)
]
train_cfg = dict(by_epoch=True, max_epochs=40, val_interval=1)

val_cfg = dict()

test_cfg = dict()

auto_scale_lr = dict(enable=False, base_batch_size=48)
default_scope = 'mmdet3d'
default_hooks = dict(
    timer=dict(type='IterTimerHook'),
    logger=dict(type='LoggerHook', interval=50),
    param_scheduler=dict(type='ParamSchedulerHook'),
    checkpoint=dict(type='CheckpointHook', interval=2),
    sampler_seed=dict(type='DistSamplerSeedHook'),
    visualization=dict(type='Det3DVisualizationHook'))
env_cfg = dict(
    cudnn_benchmark=False,
    mp_cfg=dict(mp_start_method='fork', opencv_num_threads=0),
    dist_cfg=dict(backend='nccl'),
)
log_processor = dict(type='LogProcessor', window_size=50, by_epoch=True)
log_level = 'INFO'
load_from = None
resume = False

Reproduces the problem - command or script

python tools/train.py config/centerpoint/centerpoint_kitti-3d.py

centerpoint_head.py (changed):

def get_targets_single(self, gt_instances_3d: InstanceData) -> Tuple[Tensor]:
    """Generate training targets for a single sample.

    Args:
        gt_instances_3d (:obj:`InstanceData`): Gt_instances of
            single data sample. It usually includes
            ``bboxes_3d`` and ``labels_3d`` attributes.

    Returns:
        tuple[list[torch.Tensor]]: Tuple of target including
            the following results in order.

            - list[torch.Tensor]: Heatmap scores.
            - list[torch.Tensor]: Ground truth boxes.
            - list[torch.Tensor]: Indexes indicating the position
                of the valid boxes.
            - list[torch.Tensor]: Masks indicating which boxes
                are valid.
    """
    gt_labels_3d = gt_instances_3d.labels_3d
    gt_bboxes_3d = gt_instances_3d.bboxes_3d
    device = gt_labels_3d.device
    gt_bboxes_3d = torch.cat(
        (gt_bboxes_3d.gravity_center, gt_bboxes_3d.tensor[:, 3:]),
        dim=1).to(device)
    max_objs = self.train_cfg['max_objs'] * self.train_cfg['dense_reg']
    grid_size = torch.tensor(self.train_cfg['grid_size']).to(device)
    pc_range = torch.tensor(self.train_cfg['point_cloud_range'])
    voxel_size = torch.tensor(self.train_cfg['voxel_size'])
    gt_annotation_num = len(self.train_cfg['code_weights'])  # modified (was a fixed 10 for nuScenes)

    feature_map_size = grid_size[:2] // self.train_cfg['out_size_factor']

    # reorganize the gt_dict by tasks
    task_masks = []
    flag = 0
    for class_name in self.class_names:
        task_masks.append([
            torch.where(gt_labels_3d == class_name.index(i) + flag)
            for i in class_name
        ])
        flag += len(class_name)

    task_boxes = []
    task_classes = []
    flag2 = 0
    for idx, mask in enumerate(task_masks):
        task_box = []
        task_class = []
        for m in mask:
            task_box.append(gt_bboxes_3d[m])
            # 0 is background for each task, so we need to add 1 here.
            task_class.append(gt_labels_3d[m] + 1 - flag2)
        task_boxes.append(torch.cat(task_box, axis=0).to(device))
        task_classes.append(torch.cat(task_class).long().to(device))
        flag2 += len(mask)
    draw_gaussian = draw_heatmap_gaussian
    heatmaps, anno_boxes, inds, masks = [], [], [], []

    for idx, task_head in enumerate(self.task_heads):
        heatmap = gt_bboxes_3d.new_zeros(
            (len(self.class_names[idx]), feature_map_size[1],
             feature_map_size[0]))

        # anno_box = gt_bboxes_3d.new_zeros((max_objs, 10),
        #                                   dtype=torch.float32)

        anno_box = gt_bboxes_3d.new_zeros((max_objs, gt_annotation_num),
                                           dtype=torch.float32)

        ind = gt_labels_3d.new_zeros((max_objs), dtype=torch.int64)
        mask = gt_bboxes_3d.new_zeros((max_objs), dtype=torch.uint8)

        num_objs = min(task_boxes[idx].shape[0], max_objs)

        for k in range(num_objs):
            cls_id = task_classes[idx][k] - 1

            length = task_boxes[idx][k][3]
            width = task_boxes[idx][k][4]
            length = length / voxel_size[0] / self.train_cfg[
                'out_size_factor']
            width = width / voxel_size[1] / self.train_cfg[
                'out_size_factor']

            if width > 0 and length > 0:
                radius = gaussian_radius(
                    (width, length),
                    min_overlap=self.train_cfg['gaussian_overlap'])
                radius = max(self.train_cfg['min_radius'], int(radius))

                # be really careful for the coordinate system of
                # your box annotation.
                x, y, z = task_boxes[idx][k][0], task_boxes[idx][k][
                    1], task_boxes[idx][k][2]

                coor_x = (
                    x - pc_range[0]
                ) / voxel_size[0] / self.train_cfg['out_size_factor']
                coor_y = (
                    y - pc_range[1]
                ) / voxel_size[1] / self.train_cfg['out_size_factor']

                center = torch.tensor([coor_x, coor_y],
                                      dtype=torch.float32,
                                      device=device)
                center_int = center.to(torch.int32)

                # throw out not in range objects to avoid out of array
                # area when creating the heatmap
                if not (0 <= center_int[0] < feature_map_size[0]
                        and 0 <= center_int[1] < feature_map_size[1]):
                    continue

                draw_gaussian(heatmap[cls_id], center_int, radius)

                new_idx = k
                x, y = center_int[0], center_int[1]

                assert (y * feature_map_size[0] + x <
                        feature_map_size[0] * feature_map_size[1])

                ind[new_idx] = y * feature_map_size[0] + x
                mask[new_idx] = 1
                # TODO: support other outdoor dataset
                # vx, vy = task_boxes[idx][k][7:]
                rot = task_boxes[idx][k][6]
                box_dim = task_boxes[idx][k][3:6]
                if self.norm_bbox:
                    box_dim = box_dim.log()
                # anno_box[new_idx] = torch.cat([
                #     center - torch.tensor([x, y], device=device),
                #     z.unsqueeze(0), box_dim,
                #     torch.sin(rot).unsqueeze(0),
                #     torch.cos(rot).unsqueeze(0),
                #     vx.unsqueeze(0),
                #     vy.unsqueeze(0)
                # ])
                anno_elems = [
                    center - torch.tensor([x, y], device=device),
                    z.unsqueeze(0), box_dim,
                    torch.sin(rot).unsqueeze(0),
                    torch.cos(rot).unsqueeze(0)
                ]
                if gt_annotation_num == 10:
                    vx, vy = task_boxes[idx][k][7:10]
                    anno_elems += [vx.unsqueeze(0), vy.unsqueeze(0)]

                anno_box[new_idx] = torch.cat(anno_elems)

        heatmaps.append(heatmap)
        anno_boxes.append(anno_box)
        masks.append(mask)
        inds.append(ind)
    return heatmaps, anno_boxes, inds, masks

def loss(self, pts_feats: List[Tensor],
         batch_data_samples: List[Det3DDataSample], *args,
         **kwargs) -> Dict[str, Tensor]:
    """Forward function for point cloud branch.

    Args:
        pts_feats (list[torch.Tensor]): Features of point cloud branch
        batch_data_samples (List[:obj:`Det3DDataSample`]): The Data
            Samples. It usually includes information such as
            `gt_instance_3d`, .

    Returns:
        dict: Losses of each branch.
    """
    outs = self(pts_feats)
    batch_gt_instance_3d = []
    for data_sample in batch_data_samples:
        batch_gt_instance_3d.append(data_sample.gt_instances_3d)
    losses = self.loss_by_feat(outs, batch_gt_instance_3d)
    return losses

def loss_by_feat(self, preds_dicts: Tuple[List[dict]],
                 batch_gt_instances_3d: List[InstanceData], *args,
                 **kwargs):
    """Loss function for CenterHead.

    Args:
        preds_dicts (tuple[list[dict]]): Prediction results of
            multiple tasks. The outer tuple indicate  different
            tasks head, and the internal list indicate different
            FPN level.
        batch_gt_instances_3d (list[:obj:`InstanceData`]): Batch of
            gt_instances. It usually includes ``bboxes_3d`` and\
            ``labels_3d`` attributes.

    Returns:
        dict[str,torch.Tensor]: Loss of heatmap and bbox of each task.
    """

    heatmaps, anno_boxes, inds, masks = self.get_targets(
        batch_gt_instances_3d)
    loss_dict = dict()
    for task_id, preds_dict in enumerate(preds_dicts):
        # heatmap focal loss
        preds_dict[0]['heatmap'] = clip_sigmoid(preds_dict[0]['heatmap'])
        num_pos = heatmaps[task_id].eq(1).float().sum().item()
        loss_heatmap = self.loss_cls(
            preds_dict[0]['heatmap'],
            heatmaps[task_id],
            avg_factor=max(num_pos, 1))
        target_box = anno_boxes[task_id]
        # reconstruct the anno_box from multiple reg heads
        # Default keys assumed to exist for annotations with standard
        # KITTI-like 7 values  # modified: KITTI boxes use 7 values
        anno_box = [
            preds_dict[0]['reg'], preds_dict[0]['height'],
            preds_dict[0]['dim'], preds_dict[0]['rot']
        ]
        # Key assumed to exist for bbox annotations with 9 values
        if 'vel' in preds_dict[0]:
            anno_box.append(preds_dict[0]['vel'])
        preds_dict[0]['anno_box'] = torch.cat(anno_box, dim=1)

        # preds_dict[0]['anno_box'] = torch.cat(
        #     (preds_dict[0]['reg'], preds_dict[0]['height'],
        #      preds_dict[0]['dim'], preds_dict[0]['rot'],
        #      preds_dict[0]['vel']),
        #     dim=1)

        # Regression loss for dimension, offset, height, rotation
        ind = inds[task_id]
        num = masks[task_id].float().sum()
        pred = preds_dict[0]['anno_box'].permute(0, 2, 3, 1).contiguous()
        pred = pred.view(pred.size(0), -1, pred.size(3))
        pred = self._gather_feat(pred, ind)
        mask = masks[task_id].unsqueeze(2).expand_as(target_box).float()
        isnotnan = (~torch.isnan(target_box)).float()
        mask *= isnotnan

        code_weights = self.train_cfg.get('code_weights', None)
        bbox_weights = mask * mask.new_tensor(code_weights)
        loss_bbox = self.loss_bbox(
            pred, target_box, bbox_weights, avg_factor=(num + 1e-4))
        loss_dict[f'task{task_id}.loss_heatmap'] = loss_heatmap
        loss_dict[f'task{task_id}.loss_bbox'] = loss_bbox
    return loss_dict

Reproduces the problem - error message

Traceback (most recent call last):
  File "/home/ypx/mmdetection3d/tools/train.py", line 135, in <module>
    main()
  File "/home/ypx/mmdetection3d/tools/train.py", line 131, in main
    runner.train()
  File "/home/ypx/miniconda3/envs/mtdet/lib/python3.10/site-packages/mmengine/runner/runner.py", line 1745, in train
    model = self.train_loop.run()  # type: ignore
  File "/home/ypx/miniconda3/envs/mtdet/lib/python3.10/site-packages/mmengine/runner/loops.py", line 96, in run
    self.run_epoch()
  File "/home/ypx/miniconda3/envs/mtdet/lib/python3.10/site-packages/mmengine/runner/loops.py", line 112, in run_epoch
    self.run_iter(idx, data_batch)
  File "/home/ypx/miniconda3/envs/mtdet/lib/python3.10/site-packages/mmengine/runner/loops.py", line 128, in run_iter
    outputs = self.runner.model.train_step(
  File "/home/ypx/miniconda3/envs/mtdet/lib/python3.10/site-packages/mmengine/model/base_model/base_model.py", line 114, in train_step
    losses = self._run_forward(data, mode='loss')  # type: ignore
  File "/home/ypx/miniconda3/envs/mtdet/lib/python3.10/site-packages/mmengine/model/base_model/base_model.py", line 340, in _run_forward
    results = self(**data, mode=mode)
  File "/home/ypx/miniconda3/envs/mtdet/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1501, in _call_impl
    return forward_call(*args, **kwargs)
  File "/home/ypx/miniconda3/envs/mtdet/lib/python3.10/site-packages/mmdet3d-1.2.0-py3.10.egg/mmdet3d/models/detectors/base.py", line 75, in forward
    return self.loss(inputs, data_samples, **kwargs)
  File "/home/ypx/miniconda3/envs/mtdet/lib/python3.10/site-packages/mmdet3d-1.2.0-py3.10.egg/mmdet3d/models/detectors/mvx_two_stage.py", line 274, in loss
    losses_pts = self.pts_bbox_head.loss(pts_feats, batch_data_samples,
  File "/home/ypx/miniconda3/envs/mtdet/lib/python3.10/site-packages/mmdet3d-1.2.0-py3.10.egg/mmdet3d/models/dense_heads/centerpoint_head.py", line 622, in loss
    losses = self.loss_by_feat(outs, batch_gt_instance_3d)
  File "/home/ypx/miniconda3/envs/mtdet/lib/python3.10/site-packages/mmdet3d-1.2.0-py3.10.egg/mmdet3d/models/dense_heads/centerpoint_head.py", line 650, in loss_by_feat
    loss_heatmap = self.loss_cls(
  File "/home/ypx/miniconda3/envs/mtdet/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1501, in _call_impl
    return forward_call(*args, **kwargs)
  File "/home/ypx/mmdetection/mmdet/models/losses/gaussian_focal_loss.py", line 176, in forward
    loss_reg = self.loss_weight * gaussian_focal_loss(
  File "/home/ypx/mmdetection/mmdet/models/losses/utils.py", line 121, in wrapper
    loss = loss_func(pred, target, **kwargs)
  File "/home/ypx/mmdetection/mmdet/models/losses/gaussian_focal_loss.py", line 35, in gaussian_focal_loss
    pos_loss = -(pred + eps).log() * (1 - pred).pow(alpha) * pos_weights
RuntimeError: The size of tensor a (88) must match the size of tensor b (81) at non-singleton dimension 3

Additional information

I have changed centerpoint_head.py because of the differences between the KITTI and nuScenes datasets, but there is still a problem. The error seems to come from 'loss' in centerpoint_head.py, but I do not know how to solve it.
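For reference, a quick shape check based on the numbers in the config above (just an illustrative sketch, not code from mmdetection3d):

    # Illustrative sketch only: compare the heatmap widths implied by the
    # config values posted above.
    point_cloud_range = [0, -40, -3, 70.4, 40, 1]
    voxel_size = [0.2, 0.2, 4]
    out_size_factor = 4

    # Prediction side: PointPillarsScatter uses output_shape=(400, 352) and
    # the SECOND + SECONDFPN settings above keep an overall stride of 4, so
    # the predicted heatmap is 352 // 4 = 88 cells wide.
    pred_width = 352 // out_size_factor  # 88

    # Target side: get_targets_single builds the heatmap from
    # grid_size // out_size_factor, and train_cfg sets grid_size=[325, 400, 1],
    # giving 325 // 4 = 81 cells, which matches the error (88 vs 81).
    target_width = 325 // out_size_factor  # 81

    # Width derived directly from the range and voxel size instead:
    derived_width = round(
        (point_cloud_range[3] - point_cloud_range[0]) / voxel_size[0])  # 352
    print(pred_width, target_width, derived_width // out_size_factor)  # 88 81 88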

StevenZhangzhexu commented 6 months ago

Hi @randomfforest, have you solved the issue? Can you please share your method? Thanks.

chenwen60 commented 2 months ago

Hello, have you solved this problem?

clw5180 commented 1 month ago

Hello, have you solved this problem?

The dimensions of pred and gt are different. After some debugging I found the problem is with the pred dimension, and tracing it back, the sparse_shape value of the SparseEncoder in the config file should be the actual point cloud range extent divided by voxel_size, not that (41, 1024, 1024).
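In other words, something like this (a minimal sketch of the computation described above, using the values from this issue's config; variable names are only illustrative):

    # Derive the grid / sparse shape from the point cloud range and voxel
    # size instead of hard-coding it.
    point_cloud_range = [0, -40, -3, 70.4, 40, 1]
    voxel_size = [0.2, 0.2, 4]

    grid_size = [
        round((point_cloud_range[i + 3] - point_cloud_range[i]) / voxel_size[i])
        for i in range(3)
    ]
    print(grid_size)  # [352, 400, 1]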