Open schatur2 opened 1 year ago
We haven't tried these, so you can try it yourself. But note that unlike nuScenes, which has six surround-view images, KITTI only has two foreground images, which may make it difficult for BEVFusion.
Okay. Thank you @sunjiahao1999 for responding. If possible can you please check the following BEVFusion config for Kitti Dataset to see if everything looks good to you? When I am doing training using the following Kitti dataset config, I am getting the error "File "/data2/saket/tp_mmdetection3d-master/mmdetection3d/projects/BEVFusion/bevfusion/loading.py", line 135, in transform filename.append(cam_item['img_path']) KeyError: 'img_path'"
The above error is connected with def transform(self, results: dict) -> Optional[dict]: ("Call function to load multi-view image from files"). It does not have results dict. Is this problem connected with multi-view image? How can I solve the problem?
Thank you so much! Regards, Saket
Config: base = ['../../../configs/base/default_runtime.py'] custom_imports = dict( imports=['projects.BEVFusion.bevfusion'], allow_failed_imports=False)
voxel_size = [0.05, 0.05, 0.1] point_cloud_range = [0, -40, -3, 70.4, 40, 1] class_names = ['Pedestrian', 'Cyclist', 'Car']
metainfo = dict(classes=class_names) dataset_type = 'KittiDataset' data_root = '/data2/saket/KITTI/' data_prefix = dict(pts='training/velodyne_reduced', img='training/image_2') input_modality = dict(use_lidar=True, use_camera=True) backend_args = None
model = dict( type='BEVFusion', data_preprocessor=dict( type='Det3DDataPreprocessor', mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], bgr_to_rgb=False), img_backbone=dict( type='mmdet.SwinTransformer', embed_dims=96, depths=[2, 2, 6, 2], num_heads=[3, 6, 12, 24], window_size=7, mlp_ratio=4, qkv_bias=True, qk_scale=None, drop_rate=0.0, attn_drop_rate=0.0, drop_path_rate=0.2, patch_norm=True, out_indices=[1, 2, 3], with_cp=False, convert_weights=True, init_cfg=dict( type='Pretrained', checkpoint= # noqa: E251 'https://github.com/SwinTransformer/storage/releases/download/v1.0.0/swin_tiny_patch4_window7_224.pth' # noqa: E501 )), img_neck=dict( type='GeneralizedLSSFPN', in_channels=[192, 384, 768], out_channels=256, start_level=0, num_outs=3, norm_cfg=dict(type='BN2d', requires_grad=True), act_cfg=dict(type='ReLU', inplace=True), upsample_cfg=dict(mode='bilinear', align_corners=False)), view_transform=dict( type='DepthLSSTransform', in_channels=256, out_channels=80, image_size=[256, 704], feature_size=[32, 88], xbound=[-54.0, 54.0, 0.3], ybound=[-54.0, 54.0, 0.3], zbound=[-10.0, 10.0, 20.0], dbound=[1.0, 60.0, 0.5], downsample=2), fusion_layer=dict( type='ConvFuser', in_channels=[80, 256], out_channels=256))
train_pipeline = [ dict( type='BEVLoadMultiViewImageFromFiles', to_float32=True, color_type='color', backend_args=backend_args), dict( type='LoadPointsFromFile', coord_type='LIDAR', load_dim=5, use_dim=5, backend_args=backend_args), dict( type='LoadPointsFromMultiSweeps', sweeps_num=9, load_dim=5, use_dim=5, pad_empty_sweeps=True, remove_close=True, backend_args=backend_args), dict( type='LoadAnnotations3D', with_bbox_3d=True, with_label_3d=True, with_attr_label=False), dict( type='ImageAug3D', final_dim=[256, 704], resize_lim=[0.38, 0.55], bot_pct_lim=[0.0, 0.0], rot_lim=[-5.4, 5.4], rand_flip=True, is_train=True), dict( type='BEVFusionGlobalRotScaleTrans', scale_ratio_range=[0.9, 1.1], rot_range=[-0.78539816, 0.78539816], translation_std=0.5), dict(type='BEVFusionRandomFlip3D'), dict(type='PointsRangeFilter', point_cloud_range=point_cloud_range), dict(type='ObjectRangeFilter', point_cloud_range=point_cloud_range), dict( type='ObjectNameFilter', classes=['Pedestrian', 'Cyclist', 'Car']),
dict(
type='GridMask',
use_h=True,
use_w=True,
max_epoch=6,
rotate=1,
offset=False,
ratio=0.5,
mode=1,
prob=0.0,
fixed_prob=True),
dict(type='PointShuffle'),
dict(
type='Pack3DDetInputs',
keys=[
'points', 'img', 'gt_bboxes_3d', 'gt_labels_3d', 'gt_bboxes',
'gt_labels'
],
meta_keys=[
'cam2img', 'ori_cam2img', 'lidar2cam', 'lidar2img', 'cam2lidar',
'ori_lidar2img', 'img_aug_matrix', 'box_type_3d', 'sample_idx',
'lidar_path', 'img_path', 'transformation_3d_flow', 'pcd_rotation',
'pcd_scale_factor', 'pcd_trans', 'img_aug_matrix',
'lidar_aug_matrix'
])
]
test_pipeline = [ dict( type='BEVLoadMultiViewImageFromFiles', to_float32=True, color_type='color', backend_args=backend_args), dict( type='LoadPointsFromFile', coord_type='LIDAR', load_dim=5, use_dim=5, backend_args=backend_args), dict( type='LoadPointsFromMultiSweeps', sweeps_num=9, load_dim=5, use_dim=5, pad_empty_sweeps=True, remove_close=True, backend_args=backend_args), dict( type='ImageAug3D', final_dim=[256, 704], resize_lim=[0.48, 0.48], bot_pct_lim=[0.0, 0.0], rot_lim=[0.0, 0.0], rand_flip=False, is_train=False), dict( type='PointsRangeFilter', point_cloud_range=[-54.0, -54.0, -5.0, 54.0, 54.0, 3.0]), dict( type='Pack3DDetInputs', keys=['img', 'points', 'gt_bboxes_3d', 'gt_labels_3d'], meta_keys=[ 'cam2img', 'ori_cam2img', 'lidar2cam', 'lidar2img', 'cam2lidar', 'ori_lidar2img', 'img_aug_matrix', 'box_type_3d', 'sample_idx', 'lidar_path', 'img_path' ]) ]
train_dataloader = dict( batch_size=2, num_workers=2, persistent_workers=True, sampler=dict(type='DefaultSampler', shuffle=True), dataset=dict( type='RepeatDataset', times=2, dataset=dict( type=dataset_type, data_root=data_root, ann_file='kitti_infos_train.pkl', pipeline=train_pipeline, metainfo=metainfo, modality=input_modality, test_mode=False, data_prefix=data_prefix, filter_empty_gt=False,#use_valid_flag=True,
# and box_type_3d='Depth' in sunrgbd and scannet dataset.
box_type_3d='LiDAR')))
val_dataloader = dict( batch_size=1, num_workers=1, persistent_workers=True, drop_last=False, sampler=dict(type='DefaultSampler', shuffle=False), dataset=dict( type=dataset_type, data_root=data_root, ann_file='kitti_infos_val.pkl', pipeline=test_pipeline, metainfo=metainfo, modality=input_modality, data_prefix=data_prefix, test_mode=True, box_type_3d='LiDAR', backend_args=backend_args)) test_dataloader = val_dataloader
param_scheduler = [ dict( type='LinearLR', start_factor=0.33333333, by_epoch=False, begin=0, end=500), dict( type='CosineAnnealingLR', begin=0, T_max=6, end=6, by_epoch=True, eta_min_ratio=1e-4, convert_to_iter_based=True),
# During the first 8 epochs, momentum increases from 1 to 0.85 / 0.95
# during the next 12 epochs, momentum increases from 0.85 / 0.95 to 1
dict(
type='CosineAnnealingMomentum',
eta_min=0.85 / 0.95,
begin=0,
end=2.4,
by_epoch=True,
convert_to_iter_based=True),
dict(
type='CosineAnnealingMomentum',
eta_min=1,
begin=2.4,
end=6,
by_epoch=True,
convert_to_iter_based=True)
]
db_sampler = dict( data_root=data_root, info_path=data_root + 'kitti_dbinfos_train.pkl', rate=1.0, prepare=dict( filter_by_difficulty=[-1], filter_by_min_points=dict( car=5, truck=5, bus=5, trailer=5, construction_vehicle=5, traffic_cone=5, barrier=5, motorcycle=5, bicycle=5, pedestrian=5)), classes=class_names, sample_groups=dict( car=2, truck=3, construction_vehicle=7, bus=4, trailer=6, barrier=2, motorcycle=6, bicycle=6, pedestrian=2, traffic_cone=2), points_loader=dict( type='LoadPointsFromFile', coord_type='LIDAR', load_dim=5, use_dim=[0, 1, 2, 3, 4], backend_args=backend_args))
val_evaluator = dict( type='KittiMetric', ann_file=data_root + 'kitti_infos_val.pkl') test_evaluator = val_evaluator vis_backends = [dict(type='LocalVisBackend')] visualizer = dict( type='Det3DLocalVisualizer', vis_backends=vis_backends, name='visualizer') train_cfg = dict(by_epoch=True, max_epochs=6, val_interval=1) val_cfg = dict() test_cfg = dict()
optim_wrapper = dict( type='OptimWrapper', optimizer=dict(type='AdamW', lr=0.0002, weight_decay=0.01), clip_grad=dict(max_norm=35, norm_type=2))
enable
means enable scaling LR automaticallybase_batch_size
= (8 GPUs) x (4 samples per GPU).auto_scale_lr = dict(enable=False, base_batch_size=32)
default_hooks = dict( logger=dict(type='LoggerHook', interval=50), checkpoint=dict(type='CheckpointHook', interval=1)) del base.custom_hooks
@schatur2 maybe you're using an older version of infos.pkl
. Generate new infos.pkl
with tools/create_data.py
or download new infos.pkl
in here
@sunjiahao1999, I was earlier using an older version of infos.pkl and was getting error "TypeError: The annotations loaded from annotation file should be a dict, but got <class 'list'>!". I updated the infos.pkl with new versions then the problem was solved. Do you still think this could be because of the older version of infos.pkl or it can be because of a changed dataset in config?
Thank you so much! Regards, Saket
@sunjiahao1999, I was earlier using an older version of infos.pkl and was getting error "TypeError: The annotations loaded from annotation file should be a dict, but got <class 'list'>!". I updated the infos.pkl with new versions then the problem was solved. Do you still think this could be because of the older version of infos.pkl or it can be because of a changed dataset in config?
Thank you so much! Regards, Saket
@schatur2 Hello, did you succeed to train BEVFusion with this config ? Will you share your trained model ? If not, could you please tell me how much time takes to train the model on kitti
Thank you Lehna
@lehnasamia, I am trying to train BEVFusion model with the kitti_config config by solving the above key_cam problem.
I think it is a quite heavy work to transfer bevfusion to kitti. The structure of kitti_infos_train.pkl and nuscenes_infos_train.pkl are quite different, such as info[‘instances’] and info[‘images’]. So, in my opinion, to successfully train bevfusion on kitti, you should not only rewrite config.py but also the transformation function in loading.py. And now I am trying to do that.
Okay @liyih. Please let me know once you have things setup on kitti dataset.
Thank you so much! Regards, Saket
@schatur2 I have rewriten loading.py(projects/BEVFusion/bevfusion/loading_kitti.py) and config.py(projects/BEVFusion/configs/bev_kitti.py) and now the problem you mention before filename.append(cam_item['img_path']) KeyError: 'img_path'" has be solved. However, I meet the new problem as follow: File "mmdetection3d/projects/BEVFusion/bevfusion/depth_lss.py", line 299, in forward cur_coords = cur_img_aug_matrix[:, :3, :3].matmul(cur_coords) IndexError: too many indices for tensor of dimension 2 It looks like a normal question. But I don't want to do some changes in the model codes. By the way, I don't know if use nuScenes will meet the same question. Whether if the bug for the model code or I should rewrite the model. the loading.py are as follows:
import copy from typing import Optional
import mmcv import numpy as np from mmengine.fileio import get
from mmdet3d.datasets.transforms import LoadMultiViewImageFromFiles from mmdet3d.registry import TRANSFORMS
@TRANSFORMS.register_module() class BEVLoadKittiImageFromFiles(LoadMultiViewImageFromFiles): """Load multi channel images from a list of separate channel files.
``BEVLoadMultiViewImageFromFiles`` adds the following keys for the
convenience of view transforms in the forward:
- 'cam2lidar'
- 'lidar2img'
Args:
to_float32 (bool): Whether to convert the img to float32.
Defaults to False.
color_type (str): Color type of the file. Defaults to 'unchanged'.
backend_args (dict, optional): Arguments to instantiate the
corresponding backend. Defaults to None.
num_views (int): Number of view in a frame. Defaults to 5.
num_ref_frames (int): Number of frame in loading. Defaults to -1.
test_mode (bool): Whether is test mode in loading. Defaults to False.
set_default_scale (bool): Whether to set default scale.
Defaults to True.
"""
def transform(self, results: dict) -> Optional[dict]:
"""Call function to load multi-view image from files.
Args:
results (dict): Result dict containing multi-view image filenames.
Returns:
dict: The result dict containing the multi-view image data.
Added keys and values are described below.
- filename (str): Multi-view image filenames.
- img (np.ndarray): Multi-view image arrays.
- img_shape (tuple[int]): Shape of multi-view image arrays.
- ori_shape (tuple[int]): Shape of original image arrays.
- pad_shape (tuple[int]): Shape of padded image arrays.
- scale_factor (float): Scale factor.
- img_norm_cfg (dict): Normalization configuration of images.
"""
# TODO: consider split the multi-sweep part out of this pipeline
# Derive the mask and transform for loading of multi-sweep data
# Support multi-view images with different shapes
# TODO: record the origin shape and padded shape
filename, cam2img, lidar2cam, cam2lidar, lidar2img = [], [], [], [], []
filename.append(results['images']['CAM2']['img_path'])
lidar2cam.append(results['images']['CAM2']['lidar2cam'])
lidar2cam_array = np.array(results['images']['CAM2']['lidar2cam']).astype(
np.float32)
lidar2cam_rot = lidar2cam_array[:3, :3]
lidar2cam_trans = lidar2cam_array[:3, 3:4]
camera2lidar = np.eye(4)
camera2lidar[:3, :3] = lidar2cam_rot.T
camera2lidar[:3, 3:4] = -1 * np.matmul(
lidar2cam_rot.T, lidar2cam_trans.reshape(3, 1))
cam2lidar.append(camera2lidar)
cam2img_array = np.eye(4).astype(np.float32)
cam2img_array[:3, :3] = np.array(results['images']['CAM2']['cam2img'])[:3, :3].astype(np.float32)
cam2img.append(cam2img_array)
lidar2img.append(cam2img_array @ lidar2cam_array)
results['img_path'] = filename
results['cam2img'] = np.stack(cam2img, axis=0)
results['lidar2cam'] = np.stack(lidar2cam, axis=0)
results['cam2lidar'] = np.stack(cam2lidar, axis=0)
results['lidar2img'] = np.stack(lidar2img, axis=0)
results['ori_cam2img'] = copy.deepcopy(results['cam2img'])
# img is of shape (h, w, c, num_views)
# h and w can be different for different views
img_bytes = [
get(name, backend_args=self.backend_args) for name in filename
]
imgs = [
mmcv.imfrombytes(
img_byte,
flag=self.color_type,
backend='pillow',
channel_order='rgb') for img_byte in img_bytes
]
# handle the image with different shape
img_shapes = np.stack([img.shape for img in imgs], axis=0)
img_shape_max = np.max(img_shapes, axis=0)
img_shape_min = np.min(img_shapes, axis=0)
assert img_shape_min[-1] == img_shape_max[-1]
if not np.all(img_shape_max == img_shape_min):
pad_shape = img_shape_max[:2]
else:
pad_shape = None
if pad_shape is not None:
imgs = [
mmcv.impad(img, shape=pad_shape, pad_val=0) for img in imgs
]
img = np.stack(imgs, axis=-1)
if self.to_float32:
img = img.astype(np.float32)
results['filename'] = filename
# unravel to list, see `DefaultFormatBundle` in formating.py
# which will transpose each image separately and then stack into array
results['img'] = [img[..., i] for i in range(img.shape[-1])]
results['img_shape'] = img.shape[:2]
results['ori_shape'] = img.shape[:2]
# Set initial values for default meta_keys
results['pad_shape'] = img.shape[:2]
if self.set_default_scale:
results['scale_factor'] = 1.0
num_channels = 1 if len(img.shape) < 3 else img.shape[2]
results['img_norm_cfg'] = dict(
mean=np.zeros(num_channels, dtype=np.float32),
std=np.ones(num_channels, dtype=np.float32),
to_rgb=False)
results['num_views'] = self.num_views
results['num_ref_frames'] = self.num_ref_frames
return results
The config.py are as follows: base = ['../../../configs/base/schedules/cosine.py', '../../../configs/base/default_runtime.py']
custom_imports = dict( imports=['projects.BEVFusion.bevfusion'], allow_failed_imports=False)
voxel_size = [0.05, 0.05, 0.1] point_cloud_range = [0, -40, -3, 70.4, 40, 1]
model = dict( type='BEVFusion', data_preprocessor=dict( type='Det3DDataPreprocessor', pad_size_divisor=32, voxelize_cfg=dict( max_num_points=10, point_cloud_range=[0, -40, -3, 70.4, 40, 1], voxel_size=[0.05, 0.05, 0.1], max_voxels=[120000, 160000], voxelize_reduce=True), mean=[102.9801, 115.9465, 122.7717], std=[1.0, 1.0, 1.0], bgr_to_rgb=False), pts_voxel_encoder=dict(type='HardSimpleVFE', num_features=5), pts_middle_encoder=dict( type='BEVFusionSparseEncoder', in_channels=5, sparse_shape=[1440, 1440, 41], order=('conv', 'norm', 'act'), norm_cfg=dict(type='BN1d', eps=0.001, momentum=0.01), encoder_channels=((16, 16, 32), (32, 32, 64), (64, 64, 128), (128, 128)), encoder_paddings=((0, 0, 1), (0, 0, 1), (0, 0, (1, 1, 0)), (0, 0)), block_type='basicblock'), pts_backbone=dict( type='SECOND', in_channels=256, out_channels=[128, 256], layer_nums=[5, 5], layer_strides=[1, 2], norm_cfg=dict(type='BN', eps=0.001, momentum=0.01), conv_cfg=dict(type='Conv2d', bias=False)), pts_neck=dict( type='SECONDFPN', in_channels=[128, 256], out_channels=[256, 256], upsample_strides=[1, 2], norm_cfg=dict(type='BN', eps=0.001, momentum=0.01), upsample_cfg=dict(type='deconv', bias=False), use_conv_for_no_stride=True), bbox_head=dict( type='TransFusionHead', num_proposals=200, auxiliary=True, in_channels=512, hidden_channel=128, num_classes=10, nms_kernel_size=3, bn_momentum=0.1, num_decoder_layers=1, decoder_layer=dict( type='TransformerDecoderLayer', self_attn_cfg=dict(embed_dims=128, num_heads=8, dropout=0.1), cross_attn_cfg=dict(embed_dims=128, num_heads=8, dropout=0.1), ffn_cfg=dict( embed_dims=128, feedforward_channels=256, num_fcs=2, ffn_drop=0.1, act_cfg=dict(type='ReLU', inplace=True), ), norm_cfg=dict(type='LN'), pos_encoding_cfg=dict(input_channel=2, num_pos_feats=128)), train_cfg=dict( dataset='KittiDataset', point_cloud_range=[-54.0, -54.0, -5.0, 54.0, 54.0, 3.0], grid_size=[1440, 1440, 41], voxel_size=[0.075, 0.075, 0.2], out_size_factor=8, gaussian_overlap=0.1, min_radius=2, pos_weight=-1, code_weights=[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.2, 0.2], assigner=dict( type='HungarianAssigner3D', iou_calculator=dict(type='BboxOverlaps3D', coordinate='lidar'), cls_cost=dict( type='mmdet.FocalLossCost', gamma=2.0, alpha=0.25, weight=0.15), reg_cost=dict(type='BBoxBEVL1Cost', weight=0.25), iou_cost=dict(type='IoU3DCost', weight=0.25))), test_cfg=dict( dataset='KittiDataset', grid_size=[1440, 1440, 41], out_size_factor=8, voxel_size=[0.075, 0.075], pc_range=[-54.0, -54.0], nms_type=None), common_heads=dict( center=[2, 2], height=[1, 2], dim=[3, 2], rot=[2, 2], vel=[2, 2]), bbox_coder=dict( type='TransFusionBBoxCoder', pc_range=[-54.0, -54.0], post_center_range=[-61.2, -61.2, -10.0, 61.2, 61.2, 10.0], score_threshold=0.0, out_size_factor=8, voxel_size=[0.075, 0.075], code_size=10), loss_cls=dict( type='mmdet.FocalLoss', use_sigmoid=True, gamma=2.0, alpha=0.25, reduction='mean', loss_weight=1.0), loss_heatmap=dict( type='mmdet.GaussianFocalLoss', reduction='mean', loss_weight=1.0), loss_bbox=dict( type='mmdet.L1Loss', reduction='mean', loss_weight=0.25)), img_backbone=dict( type='mmdet.SwinTransformer', embed_dims=96, depths=[2, 2, 6, 2], num_heads=[3, 6, 12, 24], window_size=7, mlp_ratio=4, qkv_bias=True, qk_scale=None, drop_rate=0.0, attn_drop_rate=0.0, drop_path_rate=0.2, patch_norm=True, out_indices=[1, 2, 3], with_cp=False, convert_weights=True, init_cfg=dict( type='Pretrained', checkpoint= # noqa: E251 'https://github.com/SwinTransformer/storage/releases/download/v1.0.0/swin_tiny_patch4_window7_224.pth' # noqa: E501 )), img_neck=dict( type='GeneralizedLSSFPN', in_channels=[192, 384, 768], out_channels=256, start_level=0, num_outs=3, norm_cfg=dict(type='BN2d', requires_grad=True), act_cfg=dict(type='ReLU', inplace=True), upsample_cfg=dict(mode='bilinear', align_corners=False)), view_transform=dict( type='DepthLSSTransform', in_channels=256, out_channels=80, image_size=[256, 704], feature_size=[32, 88], xbound=[-54.0, 54.0, 0.3], ybound=[-54.0, 54.0, 0.3], zbound=[-10.0, 10.0, 20.0], dbound=[1.0, 60.0, 0.5], downsample=2), fusion_layer=dict( type='ConvFuser', in_channels=[80, 256], out_channels=256))
dataset_type = 'KittiDataset' data_root = 'data/kitti/' class_names = ['Pedestrian', 'Cyclist', 'Car'] metainfo = dict(classes=class_names) input_modality = dict(use_lidar=True, use_camera=True) backend_args = None
train_pipeline = [
dict(
type='BEVLoadKittiImageFromFiles',
to_float32=True,
color_type='color',
backend_args=backend_args),
dict(
type='LoadPointsFromFile',
coord_type='LIDAR',
load_dim=4,
use_dim=4,
backend_args=backend_args),
dict(type='LoadAnnotations3D', with_bbox_3d=True, with_label_3d=True),
# 少一个ImageAug3D的数据增强。
dict(
type='BEVFusionGlobalRotScaleTrans',
rot_range=[-0.78539816, 0.78539816],
scale_ratio_range=[0.95, 1.05],
translation_std=[0.2, 0.2, 0.2]),
dict(type='BEVFusionRandomFlip3D'),
dict(type='PointsRangeFilter', point_cloud_range=point_cloud_range),
dict(type='ObjectRangeFilter', point_cloud_range=point_cloud_range),
dict(
type='ObjectNameFilter',
classes=['Pedestrian', 'Cyclist', 'Car']),
dict(type='PointShuffle'),
dict(
type='Pack3DDetInputs',
keys=[
'points', 'img', 'gt_bboxes_3d', 'gt_labels_3d', 'gt_bboxes',
'gt_labels'
],
meta_keys=[
'cam2img', 'ori_cam2img', 'lidar2cam', 'lidar2img', 'cam2lidar',
'ori_lidar2img', 'img_aug_matrix', 'box_type_3d', 'sample_idx',
'lidar_path', 'img_path', 'transformation_3d_flow', 'pcd_rotation',
'pcd_scale_factor', 'pcd_trans', 'img_aug_matrix',
'lidar_aug_matrix'
])
]
test_pipeline = [ dict( type='BEVLoadKittiImageFromFiles', to_float32=True, color_type='color', backend_args=backend_args), dict( type='LoadPointsFromFile', coord_type='LIDAR', load_dim=4, use_dim=4, backend_args=backend_args),
dict(
type='PointsRangeFilter',
point_cloud_range=[0, -40, -3, 70.4, 40, 1]),
dict(
type='Pack3DDetInputs',
keys=['img', 'points', 'gt_bboxes_3d', 'gt_labels_3d'],
meta_keys=[
'cam2img', 'ori_cam2img', 'lidar2cam', 'lidar2img', 'cam2lidar',
'ori_lidar2img', 'img_aug_matrix', 'box_type_3d', 'sample_idx',
'lidar_path', 'img_path'
])
]
modality = dict(use_lidar=True, use_camera=True) train_dataloader = dict( batch_size=2, num_workers=2, sampler=dict(type='DefaultSampler', shuffle=True), dataset=dict( type='RepeatDataset', times=2, dataset=dict( type=dataset_type, data_root=data_root, modality=modality, ann_file='kitti_infos_train.pkl', data_prefix=dict( pts='training/velodyne_reduced', img='training/image_2'), pipeline=train_pipeline, filter_empty_gt=False, metainfo=metainfo,
# and box_type_3d='Depth' in sunrgbd and scannet dataset.
box_type_3d='LiDAR',
backend_args=backend_args)))
val_dataloader = dict( batch_size=1, num_workers=1, sampler=dict(type='DefaultSampler', shuffle=False), dataset=dict( type=dataset_type, data_root=data_root, modality=modality, ann_file='kitti_infos_val.pkl', data_prefix=dict( pts='training/velodyne_reduced', img='training/image_2'), pipeline=test_pipeline, metainfo=metainfo, test_mode=True, box_type_3d='LiDAR', backend_args=backend_args)) test_dataloader = dict( batch_size=1, num_workers=1, sampler=dict(type='DefaultSampler', shuffle=False), dataset=dict( type=dataset_type, data_root=data_root, ann_file='kitti_infos_val.pkl', modality=modality, data_prefix=dict( pts='training/velodyne_reduced', img='training/image_2'), pipeline=test_pipeline, metainfo=metainfo, test_mode=True, box_type_3d='LiDAR', backend_args=backend_args))
optim_wrapper = dict( optimizer=dict(weight_decay=0.01), clip_grad=dict(max_norm=35, norm_type=2), ) val_evaluator = dict( type='KittiMetric', ann_file='data/kitti/kitti_infos_val.pkl') test_evaluator = val_evaluator
vis_backends = [dict(type='LocalVisBackend')] visualizer = dict( type='Det3DLocalVisualizer', vis_backends=vis_backends, name='visualizer')
And you should add 'BEVLoadKittiImageFromFiles' into 'projects/BEVFusion/bevfusion/init.py' If you know how to solve my question, please contact me as soon. Best
Hi @liyih,
Great. Thank you. Looking at your errors it seems to me that this might be connected to the num_views in kitti dataset?
Regards, Saket
@schatur2 I have rewriten loading.py(projects/BEVFusion/bevfusion/loading_kitti.py) and config.py(projects/BEVFusion/configs/bev_kitti.py) and now the problem you mention before filename.append(cam_item['img_path']) KeyError: 'img_path'" has be solved. However, I meet the new problem as follow: File "mmdetection3d/projects/BEVFusion/bevfusion/depth_lss.py", line 299, in forward cur_coords = cur_img_aug_matrix[:, :3, :3].matmul(cur_coords) IndexError: too many indices for tensor of dimension 2 It looks like a normal question. But I don't want to do some changes in the model codes. By the way, I don't know if use nuScenes will meet the same question. Whether if the bug for the model code or I should rewrite the model. the loading.py are as follows:
Copyright (c) OpenMMLab. All rights reserved.
import copy from typing import Optional
import mmcv import numpy as np from mmengine.fileio import get
from mmdet3d.datasets.transforms import LoadMultiViewImageFromFiles from mmdet3d.registry import TRANSFORMS
@TRANSFORMS.register_module() class BEVLoadKittiImageFromFiles(LoadMultiViewImageFromFiles): """Load multi channel images from a list of separate channel files.
``BEVLoadMultiViewImageFromFiles`` adds the following keys for the convenience of view transforms in the forward: - 'cam2lidar' - 'lidar2img' Args: to_float32 (bool): Whether to convert the img to float32. Defaults to False. color_type (str): Color type of the file. Defaults to 'unchanged'. backend_args (dict, optional): Arguments to instantiate the corresponding backend. Defaults to None. num_views (int): Number of view in a frame. Defaults to 5. num_ref_frames (int): Number of frame in loading. Defaults to -1. test_mode (bool): Whether is test mode in loading. Defaults to False. set_default_scale (bool): Whether to set default scale. Defaults to True. """ def transform(self, results: dict) -> Optional[dict]: """Call function to load multi-view image from files. Args: results (dict): Result dict containing multi-view image filenames. Returns: dict: The result dict containing the multi-view image data. Added keys and values are described below. - filename (str): Multi-view image filenames. - img (np.ndarray): Multi-view image arrays. - img_shape (tuple[int]): Shape of multi-view image arrays. - ori_shape (tuple[int]): Shape of original image arrays. - pad_shape (tuple[int]): Shape of padded image arrays. - scale_factor (float): Scale factor. - img_norm_cfg (dict): Normalization configuration of images. """ # TODO: consider split the multi-sweep part out of this pipeline # Derive the mask and transform for loading of multi-sweep data # Support multi-view images with different shapes # TODO: record the origin shape and padded shape filename, cam2img, lidar2cam, cam2lidar, lidar2img = [], [], [], [], [] filename.append(results['images']['CAM2']['img_path']) lidar2cam.append(results['images']['CAM2']['lidar2cam']) lidar2cam_array = np.array(results['images']['CAM2']['lidar2cam']).astype( np.float32) lidar2cam_rot = lidar2cam_array[:3, :3] lidar2cam_trans = lidar2cam_array[:3, 3:4] camera2lidar = np.eye(4) camera2lidar[:3, :3] = lidar2cam_rot.T camera2lidar[:3, 3:4] = -1 * np.matmul( lidar2cam_rot.T, lidar2cam_trans.reshape(3, 1)) cam2lidar.append(camera2lidar) cam2img_array = np.eye(4).astype(np.float32) cam2img_array[:3, :3] = np.array(results['images']['CAM2']['cam2img'])[:3, :3].astype(np.float32) cam2img.append(cam2img_array) lidar2img.append(cam2img_array @ lidar2cam_array) results['img_path'] = filename results['cam2img'] = np.stack(cam2img, axis=0) results['lidar2cam'] = np.stack(lidar2cam, axis=0) results['cam2lidar'] = np.stack(cam2lidar, axis=0) results['lidar2img'] = np.stack(lidar2img, axis=0) results['ori_cam2img'] = copy.deepcopy(results['cam2img']) # img is of shape (h, w, c, num_views) # h and w can be different for different views img_bytes = [ get(name, backend_args=self.backend_args) for name in filename ] imgs = [ mmcv.imfrombytes( img_byte, flag=self.color_type, backend='pillow', channel_order='rgb') for img_byte in img_bytes ] # handle the image with different shape img_shapes = np.stack([img.shape for img in imgs], axis=0) img_shape_max = np.max(img_shapes, axis=0) img_shape_min = np.min(img_shapes, axis=0) assert img_shape_min[-1] == img_shape_max[-1] if not np.all(img_shape_max == img_shape_min): pad_shape = img_shape_max[:2] else: pad_shape = None if pad_shape is not None: imgs = [ mmcv.impad(img, shape=pad_shape, pad_val=0) for img in imgs ] img = np.stack(imgs, axis=-1) if self.to_float32: img = img.astype(np.float32) results['filename'] = filename # unravel to list, see `DefaultFormatBundle` in formating.py # which will transpose each image separately and then stack into array results['img'] = [img[..., i] for i in range(img.shape[-1])] results['img_shape'] = img.shape[:2] results['ori_shape'] = img.shape[:2] # Set initial values for default meta_keys results['pad_shape'] = img.shape[:2] if self.set_default_scale: results['scale_factor'] = 1.0 num_channels = 1 if len(img.shape) < 3 else img.shape[2] results['img_norm_cfg'] = dict( mean=np.zeros(num_channels, dtype=np.float32), std=np.ones(num_channels, dtype=np.float32), to_rgb=False) results['num_views'] = self.num_views results['num_ref_frames'] = self.num_ref_frames return results
The config.py are as follows: base = ['../../../configs/base/schedules/cosine.py', '../../../configs/base/default_runtime.py']
custom_imports = dict( imports=['projects.BEVFusion.bevfusion'], allow_failed_imports=False)
voxel_size = [0.05, 0.05, 0.1] point_cloud_range = [0, -40, -3, 70.4, 40, 1]
model = dict( type='BEVFusion', data_preprocessor=dict( type='Det3DDataPreprocessor', pad_size_divisor=32, voxelize_cfg=dict( max_num_points=10, point_cloud_range=[0, -40, -3, 70.4, 40, 1], voxel_size=[0.05, 0.05, 0.1], max_voxels=[120000, 160000], voxelize_reduce=True), mean=[102.9801, 115.9465, 122.7717], std=[1.0, 1.0, 1.0], bgr_to_rgb=False), pts_voxel_encoder=dict(type='HardSimpleVFE', num_features=5), pts_middle_encoder=dict( type='BEVFusionSparseEncoder', in_channels=5, sparse_shape=[1440, 1440, 41], order=('conv', 'norm', 'act'), norm_cfg=dict(type='BN1d', eps=0.001, momentum=0.01), encoder_channels=((16, 16, 32), (32, 32, 64), (64, 64, 128), (128, 128)), encoder_paddings=((0, 0, 1), (0, 0, 1), (0, 0, (1, 1, 0)), (0, 0)), block_type='basicblock'), pts_backbone=dict( type='SECOND', in_channels=256, out_channels=[128, 256], layer_nums=[5, 5], layer_strides=[1, 2], norm_cfg=dict(type='BN', eps=0.001, momentum=0.01), conv_cfg=dict(type='Conv2d', bias=False)), pts_neck=dict( type='SECONDFPN', in_channels=[128, 256], out_channels=[256, 256], upsample_strides=[1, 2], norm_cfg=dict(type='BN', eps=0.001, momentum=0.01), upsample_cfg=dict(type='deconv', bias=False), use_conv_for_no_stride=True), bbox_head=dict( type='TransFusionHead', num_proposals=200, auxiliary=True, in_channels=512, hidden_channel=128, num_classes=10, nms_kernel_size=3, bn_momentum=0.1, num_decoder_layers=1, decoder_layer=dict( type='TransformerDecoderLayer', self_attn_cfg=dict(embed_dims=128, num_heads=8, dropout=0.1), cross_attn_cfg=dict(embed_dims=128, num_heads=8, dropout=0.1), ffn_cfg=dict( embed_dims=128, feedforward_channels=256, num_fcs=2, ffn_drop=0.1, act_cfg=dict(type='ReLU', inplace=True), ), norm_cfg=dict(type='LN'), pos_encoding_cfg=dict(input_channel=2, num_pos_feats=128)), train_cfg=dict( dataset='KittiDataset', point_cloud_range=[-54.0, -54.0, -5.0, 54.0, 54.0, 3.0], grid_size=[1440, 1440, 41], voxel_size=[0.075, 0.075, 0.2], out_size_factor=8, gaussian_overlap=0.1, min_radius=2, pos_weight=-1, code_weights=[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.2, 0.2], assigner=dict( type='HungarianAssigner3D', iou_calculator=dict(type='BboxOverlaps3D', coordinate='lidar'), cls_cost=dict( type='mmdet.FocalLossCost', gamma=2.0, alpha=0.25, weight=0.15), reg_cost=dict(type='BBoxBEVL1Cost', weight=0.25), iou_cost=dict(type='IoU3DCost', weight=0.25))), test_cfg=dict( dataset='KittiDataset', grid_size=[1440, 1440, 41], out_size_factor=8, voxel_size=[0.075, 0.075], pc_range=[-54.0, -54.0], nms_type=None), common_heads=dict( center=[2, 2], height=[1, 2], dim=[3, 2], rot=[2, 2], vel=[2, 2]), bbox_coder=dict( type='TransFusionBBoxCoder', pc_range=[-54.0, -54.0], post_center_range=[-61.2, -61.2, -10.0, 61.2, 61.2, 10.0], score_threshold=0.0, out_size_factor=8, voxel_size=[0.075, 0.075], code_size=10), loss_cls=dict( type='mmdet.FocalLoss', use_sigmoid=True, gamma=2.0, alpha=0.25, reduction='mean', loss_weight=1.0), loss_heatmap=dict( type='mmdet.GaussianFocalLoss', reduction='mean', loss_weight=1.0), loss_bbox=dict( type='mmdet.L1Loss', reduction='mean', loss_weight=0.25)), img_backbone=dict( type='mmdet.SwinTransformer', embed_dims=96, depths=[2, 2, 6, 2], num_heads=[3, 6, 12, 24], window_size=7, mlp_ratio=4, qkv_bias=True, qk_scale=None, drop_rate=0.0, attn_drop_rate=0.0, drop_path_rate=0.2, patch_norm=True, out_indices=[1, 2, 3], with_cp=False, convert_weights=True, init_cfg=dict( type='Pretrained', checkpoint= # noqa: E251 'https://github.com/SwinTransformer/storage/releases/download/v1.0.0/swin_tiny_patch4_window7_224.pth' # noqa: E501 )), img_neck=dict( type='GeneralizedLSSFPN', in_channels=[192, 384, 768], out_channels=256, start_level=0, num_outs=3, norm_cfg=dict(type='BN2d', requires_grad=True), act_cfg=dict(type='ReLU', inplace=True), upsample_cfg=dict(mode='bilinear', align_corners=False)), view_transform=dict( type='DepthLSSTransform', in_channels=256, out_channels=80, image_size=[256, 704], feature_size=[32, 88], xbound=[-54.0, 54.0, 0.3], ybound=[-54.0, 54.0, 0.3], zbound=[-10.0, 10.0, 20.0], dbound=[1.0, 60.0, 0.5], downsample=2), fusion_layer=dict( type='ConvFuser', in_channels=[80, 256], out_channels=256))
dataset settings
dataset_type = 'KittiDataset' data_root = 'data/kitti/' class_names = ['Pedestrian', 'Cyclist', 'Car'] metainfo = dict(classes=class_names) input_modality = dict(use_lidar=True, use_camera=True) backend_args = None
train_pipeline = [
dict( type='BEVLoadKittiImageFromFiles', to_float32=True, color_type='color', backend_args=backend_args), dict( type='LoadPointsFromFile', coord_type='LIDAR', load_dim=4, use_dim=4, backend_args=backend_args), dict(type='LoadAnnotations3D', with_bbox_3d=True, with_label_3d=True), # 少一个ImageAug3D的数据增强。 dict( type='BEVFusionGlobalRotScaleTrans', rot_range=[-0.78539816, 0.78539816], scale_ratio_range=[0.95, 1.05], translation_std=[0.2, 0.2, 0.2]), dict(type='BEVFusionRandomFlip3D'), dict(type='PointsRangeFilter', point_cloud_range=point_cloud_range), dict(type='ObjectRangeFilter', point_cloud_range=point_cloud_range), dict( type='ObjectNameFilter', classes=['Pedestrian', 'Cyclist', 'Car']), dict(type='PointShuffle'), dict( type='Pack3DDetInputs', keys=[ 'points', 'img', 'gt_bboxes_3d', 'gt_labels_3d', 'gt_bboxes', 'gt_labels' ], meta_keys=[ 'cam2img', 'ori_cam2img', 'lidar2cam', 'lidar2img', 'cam2lidar', 'ori_lidar2img', 'img_aug_matrix', 'box_type_3d', 'sample_idx', 'lidar_path', 'img_path', 'transformation_3d_flow', 'pcd_rotation', 'pcd_scale_factor', 'pcd_trans', 'img_aug_matrix', 'lidar_aug_matrix' ])
]
test_pipeline = [ dict( type='BEVLoadKittiImageFromFiles', to_float32=True, color_type='color', backend_args=backend_args), dict( type='LoadPointsFromFile', coord_type='LIDAR', load_dim=4, use_dim=4, backend_args=backend_args), # 少一个ImageAug3D的数据增强。 dict( type='PointsRangeFilter', point_cloud_range=[0, -40, -3, 70.4, 40, 1]), dict( type='Pack3DDetInputs', keys=['img', 'points', 'gt_bboxes_3d', 'gt_labels_3d'], meta_keys=[ 'cam2img', 'ori_cam2img', 'lidar2cam', 'lidar2img', 'cam2lidar', 'ori_lidar2img', 'img_aug_matrix', 'box_type_3d', 'sample_idx', 'lidar_path', 'img_path' ]) ]
modality = dict(use_lidar=True, use_camera=True) train_dataloader = dict( batch_size=2, num_workers=2, sampler=dict(type='DefaultSampler', shuffle=True), dataset=dict( type='RepeatDataset', times=2, dataset=dict( type=dataset_type, data_root=data_root, modality=modality, ann_file='kitti_infos_train.pkl', data_prefix=dict( pts='training/velodyne_reduced', img='training/image_2'), pipeline=train_pipeline, filter_empty_gt=False, metainfo=metainfo, # we use box_type_3d='LiDAR' in kitti and nuscenes dataset # and box_type_3d='Depth' in sunrgbd and scannet dataset. box_type_3d='LiDAR', backend_args=backend_args)))
val_dataloader = dict( batch_size=1, num_workers=1, sampler=dict(type='DefaultSampler', shuffle=False), dataset=dict( type=dataset_type, data_root=data_root, modality=modality, ann_file='kitti_infos_val.pkl', data_prefix=dict( pts='training/velodyne_reduced', img='training/image_2'), pipeline=test_pipeline, metainfo=metainfo, test_mode=True, box_type_3d='LiDAR', backend_args=backend_args)) test_dataloader = dict( batch_size=1, num_workers=1, sampler=dict(type='DefaultSampler', shuffle=False), dataset=dict( type=dataset_type, data_root=data_root, ann_file='kitti_infos_val.pkl', modality=modality, data_prefix=dict( pts='training/velodyne_reduced', img='training/image_2'), pipeline=test_pipeline, metainfo=metainfo, test_mode=True, box_type_3d='LiDAR', backend_args=backend_args))
optim_wrapper = dict( optimizer=dict(weight_decay=0.01), clip_grad=dict(max_norm=35, norm_type=2), ) val_evaluator = dict( type='KittiMetric', ann_file='data/kitti/kitti_infos_val.pkl') test_evaluator = val_evaluator
vis_backends = [dict(type='LocalVisBackend')] visualizer = dict( type='Det3DLocalVisualizer', vis_backends=vis_backends, name='visualizer')
And you should add 'BEVLoadKittiImageFromFiles' into 'projects/BEVFusion/bevfusion/init.py' If you know how to solve my question, please contact me as soon. Best
Have you solved it?
@Lcl159 I find there is an easy way to solve this problem. In the class DepthLSSTransform author put the depth as priori to calculate the distribution of LSS, so it needs to inverse the augmenting process to calculate the right depth. However, in the original paper it doesn't add any depth as prior and there is no performance loss. Due to the bug appears in the inverse the augmenting process when using KITTI, I advise to use LSSTransform instead of DepthLSSTransform. You can change it easily in config. Best
I find there is an easy way to solve this problem. In the class DepthLSSTransform author put the depth as priori to calculate the distribution of LSS, so it needs to inverse the augmenting process to calculate the right depth. However, in the original paper it doesn't add any depth as prior and there is no performance loss. Due to the bug appears in the inverse the augmenting process when using KITTI, I advise to use LSSTransform instead of DepthLSSTransform. You can change it easily in config.
Wow, that's great. Can you give me your contact info?
Have you implemented bevfusion running on kitti?
@schatur2 I have rewriten loading.py(projects/BEVFusion/bevfusion/loading_kitti.py) and config.py(projects/BEVFusion/configs/bev_kitti.py) and now the problem you mention before filename.append(cam_item['img_path']) KeyError: 'img_path'" has be solved. However, I meet the new problem as follow: File "mmdetection3d/projects/BEVFusion/bevfusion/depth_lss.py", line 299, in forward cur_coords = cur_img_aug_matrix[:, :3, :3].matmul(cur_coords) IndexError: too many indices for tensor of dimension 2 It looks like a normal question. But I don't want to do some changes in the model codes. By the way, I don't know if use nuScenes will meet the same question. Whether if the bug for the model code or I should rewrite the model. the loading.py are as follows:
Copyright (c) OpenMMLab. All rights reserved.
import copy from typing import Optional
import mmcv import numpy as np from mmengine.fileio import get
from mmdet3d.datasets.transforms import LoadMultiViewImageFromFiles from mmdet3d.registry import TRANSFORMS
@TRANSFORMS.register_module() class BEVLoadKittiImageFromFiles(LoadMultiViewImageFromFiles): """Load multi channel images from a list of separate channel files.
``BEVLoadMultiViewImageFromFiles`` adds the following keys for the convenience of view transforms in the forward: - 'cam2lidar' - 'lidar2img' Args: to_float32 (bool): Whether to convert the img to float32. Defaults to False. color_type (str): Color type of the file. Defaults to 'unchanged'. backend_args (dict, optional): Arguments to instantiate the corresponding backend. Defaults to None. num_views (int): Number of view in a frame. Defaults to 5. num_ref_frames (int): Number of frame in loading. Defaults to -1. test_mode (bool): Whether is test mode in loading. Defaults to False. set_default_scale (bool): Whether to set default scale. Defaults to True. """ def transform(self, results: dict) -> Optional[dict]: """Call function to load multi-view image from files. Args: results (dict): Result dict containing multi-view image filenames. Returns: dict: The result dict containing the multi-view image data. Added keys and values are described below. - filename (str): Multi-view image filenames. - img (np.ndarray): Multi-view image arrays. - img_shape (tuple[int]): Shape of multi-view image arrays. - ori_shape (tuple[int]): Shape of original image arrays. - pad_shape (tuple[int]): Shape of padded image arrays. - scale_factor (float): Scale factor. - img_norm_cfg (dict): Normalization configuration of images. """ # TODO: consider split the multi-sweep part out of this pipeline # Derive the mask and transform for loading of multi-sweep data # Support multi-view images with different shapes # TODO: record the origin shape and padded shape filename, cam2img, lidar2cam, cam2lidar, lidar2img = [], [], [], [], [] filename.append(results['images']['CAM2']['img_path']) lidar2cam.append(results['images']['CAM2']['lidar2cam']) lidar2cam_array = np.array(results['images']['CAM2']['lidar2cam']).astype( np.float32) lidar2cam_rot = lidar2cam_array[:3, :3] lidar2cam_trans = lidar2cam_array[:3, 3:4] camera2lidar = np.eye(4) camera2lidar[:3, :3] = lidar2cam_rot.T camera2lidar[:3, 3:4] = -1 * np.matmul( lidar2cam_rot.T, lidar2cam_trans.reshape(3, 1)) cam2lidar.append(camera2lidar) cam2img_array = np.eye(4).astype(np.float32) cam2img_array[:3, :3] = np.array(results['images']['CAM2']['cam2img'])[:3, :3].astype(np.float32) cam2img.append(cam2img_array) lidar2img.append(cam2img_array @ lidar2cam_array) results['img_path'] = filename results['cam2img'] = np.stack(cam2img, axis=0) results['lidar2cam'] = np.stack(lidar2cam, axis=0) results['cam2lidar'] = np.stack(cam2lidar, axis=0) results['lidar2img'] = np.stack(lidar2img, axis=0) results['ori_cam2img'] = copy.deepcopy(results['cam2img']) # img is of shape (h, w, c, num_views) # h and w can be different for different views img_bytes = [ get(name, backend_args=self.backend_args) for name in filename ] imgs = [ mmcv.imfrombytes( img_byte, flag=self.color_type, backend='pillow', channel_order='rgb') for img_byte in img_bytes ] # handle the image with different shape img_shapes = np.stack([img.shape for img in imgs], axis=0) img_shape_max = np.max(img_shapes, axis=0) img_shape_min = np.min(img_shapes, axis=0) assert img_shape_min[-1] == img_shape_max[-1] if not np.all(img_shape_max == img_shape_min): pad_shape = img_shape_max[:2] else: pad_shape = None if pad_shape is not None: imgs = [ mmcv.impad(img, shape=pad_shape, pad_val=0) for img in imgs ] img = np.stack(imgs, axis=-1) if self.to_float32: img = img.astype(np.float32) results['filename'] = filename # unravel to list, see `DefaultFormatBundle` in formating.py # which will transpose each image separately and then stack into array results['img'] = [img[..., i] for i in range(img.shape[-1])] results['img_shape'] = img.shape[:2] results['ori_shape'] = img.shape[:2] # Set initial values for default meta_keys results['pad_shape'] = img.shape[:2] if self.set_default_scale: results['scale_factor'] = 1.0 num_channels = 1 if len(img.shape) < 3 else img.shape[2] results['img_norm_cfg'] = dict( mean=np.zeros(num_channels, dtype=np.float32), std=np.ones(num_channels, dtype=np.float32), to_rgb=False) results['num_views'] = self.num_views results['num_ref_frames'] = self.num_ref_frames return results
The config.py are as follows: base = ['../../../configs/base/schedules/cosine.py', '../../../configs/base/default_runtime.py']
custom_imports = dict( imports=['projects.BEVFusion.bevfusion'], allow_failed_imports=False)
voxel_size = [0.05, 0.05, 0.1] point_cloud_range = [0, -40, -3, 70.4, 40, 1]
model = dict( type='BEVFusion', data_preprocessor=dict( type='Det3DDataPreprocessor', pad_size_divisor=32, voxelize_cfg=dict( max_num_points=10, point_cloud_range=[0, -40, -3, 70.4, 40, 1], voxel_size=[0.05, 0.05, 0.1], max_voxels=[120000, 160000], voxelize_reduce=True), mean=[102.9801, 115.9465, 122.7717], std=[1.0, 1.0, 1.0], bgr_to_rgb=False), pts_voxel_encoder=dict(type='HardSimpleVFE', num_features=5), pts_middle_encoder=dict( type='BEVFusionSparseEncoder', in_channels=5, sparse_shape=[1440, 1440, 41], order=('conv', 'norm', 'act'), norm_cfg=dict(type='BN1d', eps=0.001, momentum=0.01), encoder_channels=((16, 16, 32), (32, 32, 64), (64, 64, 128), (128, 128)), encoder_paddings=((0, 0, 1), (0, 0, 1), (0, 0, (1, 1, 0)), (0, 0)), block_type='basicblock'), pts_backbone=dict( type='SECOND', in_channels=256, out_channels=[128, 256], layer_nums=[5, 5], layer_strides=[1, 2], norm_cfg=dict(type='BN', eps=0.001, momentum=0.01), conv_cfg=dict(type='Conv2d', bias=False)), pts_neck=dict( type='SECONDFPN', in_channels=[128, 256], out_channels=[256, 256], upsample_strides=[1, 2], norm_cfg=dict(type='BN', eps=0.001, momentum=0.01), upsample_cfg=dict(type='deconv', bias=False), use_conv_for_no_stride=True), bbox_head=dict( type='TransFusionHead', num_proposals=200, auxiliary=True, in_channels=512, hidden_channel=128, num_classes=10, nms_kernel_size=3, bn_momentum=0.1, num_decoder_layers=1, decoder_layer=dict( type='TransformerDecoderLayer', self_attn_cfg=dict(embed_dims=128, num_heads=8, dropout=0.1), cross_attn_cfg=dict(embed_dims=128, num_heads=8, dropout=0.1), ffn_cfg=dict( embed_dims=128, feedforward_channels=256, num_fcs=2, ffn_drop=0.1, act_cfg=dict(type='ReLU', inplace=True), ), norm_cfg=dict(type='LN'), pos_encoding_cfg=dict(input_channel=2, num_pos_feats=128)), train_cfg=dict( dataset='KittiDataset', point_cloud_range=[-54.0, -54.0, -5.0, 54.0, 54.0, 3.0], grid_size=[1440, 1440, 41], voxel_size=[0.075, 0.075, 0.2], out_size_factor=8, gaussian_overlap=0.1, min_radius=2, pos_weight=-1, code_weights=[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.2, 0.2], assigner=dict( type='HungarianAssigner3D', iou_calculator=dict(type='BboxOverlaps3D', coordinate='lidar'), cls_cost=dict( type='mmdet.FocalLossCost', gamma=2.0, alpha=0.25, weight=0.15), reg_cost=dict(type='BBoxBEVL1Cost', weight=0.25), iou_cost=dict(type='IoU3DCost', weight=0.25))), test_cfg=dict( dataset='KittiDataset', grid_size=[1440, 1440, 41], out_size_factor=8, voxel_size=[0.075, 0.075], pc_range=[-54.0, -54.0], nms_type=None), common_heads=dict( center=[2, 2], height=[1, 2], dim=[3, 2], rot=[2, 2], vel=[2, 2]), bbox_coder=dict( type='TransFusionBBoxCoder', pc_range=[-54.0, -54.0], post_center_range=[-61.2, -61.2, -10.0, 61.2, 61.2, 10.0], score_threshold=0.0, out_size_factor=8, voxel_size=[0.075, 0.075], code_size=10), loss_cls=dict( type='mmdet.FocalLoss', use_sigmoid=True, gamma=2.0, alpha=0.25, reduction='mean', loss_weight=1.0), loss_heatmap=dict( type='mmdet.GaussianFocalLoss', reduction='mean', loss_weight=1.0), loss_bbox=dict( type='mmdet.L1Loss', reduction='mean', loss_weight=0.25)), img_backbone=dict( type='mmdet.SwinTransformer', embed_dims=96, depths=[2, 2, 6, 2], num_heads=[3, 6, 12, 24], window_size=7, mlp_ratio=4, qkv_bias=True, qk_scale=None, drop_rate=0.0, attn_drop_rate=0.0, drop_path_rate=0.2, patch_norm=True, out_indices=[1, 2, 3], with_cp=False, convert_weights=True, init_cfg=dict( type='Pretrained', checkpoint= # noqa: E251 'https://github.com/SwinTransformer/storage/releases/download/v1.0.0/swin_tiny_patch4_window7_224.pth' # noqa: E501 )), img_neck=dict( type='GeneralizedLSSFPN', in_channels=[192, 384, 768], out_channels=256, start_level=0, num_outs=3, norm_cfg=dict(type='BN2d', requires_grad=True), act_cfg=dict(type='ReLU', inplace=True), upsample_cfg=dict(mode='bilinear', align_corners=False)), view_transform=dict( type='DepthLSSTransform', in_channels=256, out_channels=80, image_size=[256, 704], feature_size=[32, 88], xbound=[-54.0, 54.0, 0.3], ybound=[-54.0, 54.0, 0.3], zbound=[-10.0, 10.0, 20.0], dbound=[1.0, 60.0, 0.5], downsample=2), fusion_layer=dict( type='ConvFuser', in_channels=[80, 256], out_channels=256))
dataset settings
dataset_type = 'KittiDataset' data_root = 'data/kitti/' class_names = ['Pedestrian', 'Cyclist', 'Car'] metainfo = dict(classes=class_names) input_modality = dict(use_lidar=True, use_camera=True) backend_args = None
train_pipeline = [
dict( type='BEVLoadKittiImageFromFiles', to_float32=True, color_type='color', backend_args=backend_args), dict( type='LoadPointsFromFile', coord_type='LIDAR', load_dim=4, use_dim=4, backend_args=backend_args), dict(type='LoadAnnotations3D', with_bbox_3d=True, with_label_3d=True), # 少一个ImageAug3D的数据增强。 dict( type='BEVFusionGlobalRotScaleTrans', rot_range=[-0.78539816, 0.78539816], scale_ratio_range=[0.95, 1.05], translation_std=[0.2, 0.2, 0.2]), dict(type='BEVFusionRandomFlip3D'), dict(type='PointsRangeFilter', point_cloud_range=point_cloud_range), dict(type='ObjectRangeFilter', point_cloud_range=point_cloud_range), dict( type='ObjectNameFilter', classes=['Pedestrian', 'Cyclist', 'Car']), dict(type='PointShuffle'), dict( type='Pack3DDetInputs', keys=[ 'points', 'img', 'gt_bboxes_3d', 'gt_labels_3d', 'gt_bboxes', 'gt_labels' ], meta_keys=[ 'cam2img', 'ori_cam2img', 'lidar2cam', 'lidar2img', 'cam2lidar', 'ori_lidar2img', 'img_aug_matrix', 'box_type_3d', 'sample_idx', 'lidar_path', 'img_path', 'transformation_3d_flow', 'pcd_rotation', 'pcd_scale_factor', 'pcd_trans', 'img_aug_matrix', 'lidar_aug_matrix' ])
]
test_pipeline = [ dict( type='BEVLoadKittiImageFromFiles', to_float32=True, color_type='color', backend_args=backend_args), dict( type='LoadPointsFromFile', coord_type='LIDAR', load_dim=4, use_dim=4, backend_args=backend_args), # 少一个ImageAug3D的数据增强。 dict( type='PointsRangeFilter', point_cloud_range=[0, -40, -3, 70.4, 40, 1]), dict( type='Pack3DDetInputs', keys=['img', 'points', 'gt_bboxes_3d', 'gt_labels_3d'], meta_keys=[ 'cam2img', 'ori_cam2img', 'lidar2cam', 'lidar2img', 'cam2lidar', 'ori_lidar2img', 'img_aug_matrix', 'box_type_3d', 'sample_idx', 'lidar_path', 'img_path' ]) ]
modality = dict(use_lidar=True, use_camera=True) train_dataloader = dict( batch_size=2, num_workers=2, sampler=dict(type='DefaultSampler', shuffle=True), dataset=dict( type='RepeatDataset', times=2, dataset=dict( type=dataset_type, data_root=data_root, modality=modality, ann_file='kitti_infos_train.pkl', data_prefix=dict( pts='training/velodyne_reduced', img='training/image_2'), pipeline=train_pipeline, filter_empty_gt=False, metainfo=metainfo, # we use box_type_3d='LiDAR' in kitti and nuscenes dataset # and box_type_3d='Depth' in sunrgbd and scannet dataset. box_type_3d='LiDAR', backend_args=backend_args)))
val_dataloader = dict( batch_size=1, num_workers=1, sampler=dict(type='DefaultSampler', shuffle=False), dataset=dict( type=dataset_type, data_root=data_root, modality=modality, ann_file='kitti_infos_val.pkl', data_prefix=dict( pts='training/velodyne_reduced', img='training/image_2'), pipeline=test_pipeline, metainfo=metainfo, test_mode=True, box_type_3d='LiDAR', backend_args=backend_args)) test_dataloader = dict( batch_size=1, num_workers=1, sampler=dict(type='DefaultSampler', shuffle=False), dataset=dict( type=dataset_type, data_root=data_root, ann_file='kitti_infos_val.pkl', modality=modality, data_prefix=dict( pts='training/velodyne_reduced', img='training/image_2'), pipeline=test_pipeline, metainfo=metainfo, test_mode=True, box_type_3d='LiDAR', backend_args=backend_args))
optim_wrapper = dict( optimizer=dict(weight_decay=0.01), clip_grad=dict(max_norm=35, norm_type=2), ) val_evaluator = dict( type='KittiMetric', ann_file='data/kitti/kitti_infos_val.pkl') test_evaluator = val_evaluator
vis_backends = [dict(type='LocalVisBackend')] visualizer = dict( type='Det3DLocalVisualizer', vis_backends=vis_backends, name='visualizer')
And you should add 'BEVLoadKittiImageFromFiles' into 'projects/BEVFusion/bevfusion/init.py' If you know how to solve my question, please contact me as soon. Best
how do you reslove KeyError: 'img_path'"?
@schatur2 I have rewriten loading.py(projects/BEVFusion/bevfusion/loading_kitti.py) and config.py(projects/BEVFusion/configs/bev_kitti.py) and now the problem you mention before filename.append(cam_item['img_path']) KeyError: 'img_path'" has be solved. However, I meet the new problem as follow: File "mmdetection3d/projects/BEVFusion/bevfusion/depth_lss.py", line 299, in forward cur_coords = cur_img_aug_matrix[:, :3, :3].matmul(cur_coords) IndexError: too many indices for tensor of dimension 2 It looks like a normal question. But I don't want to do some changes in the model codes. By the way, I don't know if use nuScenes will meet the same question. Whether if the bug for the model code or I should rewrite the model. the loading.py are as follows:
Copyright (c) OpenMMLab. All rights reserved.
import copy from typing import Optional import mmcv import numpy as np from mmengine.fileio import get from mmdet3d.datasets.transforms import LoadMultiViewImageFromFiles from mmdet3d.registry import TRANSFORMS @TRANSFORMS.register_module() class BEVLoadKittiImageFromFiles(LoadMultiViewImageFromFiles): """Load multi channel images from a list of separate channel files.
``BEVLoadMultiViewImageFromFiles`` adds the following keys for the convenience of view transforms in the forward: - 'cam2lidar' - 'lidar2img' Args: to_float32 (bool): Whether to convert the img to float32. Defaults to False. color_type (str): Color type of the file. Defaults to 'unchanged'. backend_args (dict, optional): Arguments to instantiate the corresponding backend. Defaults to None. num_views (int): Number of view in a frame. Defaults to 5. num_ref_frames (int): Number of frame in loading. Defaults to -1. test_mode (bool): Whether is test mode in loading. Defaults to False. set_default_scale (bool): Whether to set default scale. Defaults to True. """ def transform(self, results: dict) -> Optional[dict]: """Call function to load multi-view image from files. Args: results (dict): Result dict containing multi-view image filenames. Returns: dict: The result dict containing the multi-view image data. Added keys and values are described below. - filename (str): Multi-view image filenames. - img (np.ndarray): Multi-view image arrays. - img_shape (tuple[int]): Shape of multi-view image arrays. - ori_shape (tuple[int]): Shape of original image arrays. - pad_shape (tuple[int]): Shape of padded image arrays. - scale_factor (float): Scale factor. - img_norm_cfg (dict): Normalization configuration of images. """ # TODO: consider split the multi-sweep part out of this pipeline # Derive the mask and transform for loading of multi-sweep data # Support multi-view images with different shapes # TODO: record the origin shape and padded shape filename, cam2img, lidar2cam, cam2lidar, lidar2img = [], [], [], [], [] filename.append(results['images']['CAM2']['img_path']) lidar2cam.append(results['images']['CAM2']['lidar2cam']) lidar2cam_array = np.array(results['images']['CAM2']['lidar2cam']).astype( np.float32) lidar2cam_rot = lidar2cam_array[:3, :3] lidar2cam_trans = lidar2cam_array[:3, 3:4] camera2lidar = np.eye(4) camera2lidar[:3, :3] = lidar2cam_rot.T camera2lidar[:3, 3:4] = -1 * np.matmul( lidar2cam_rot.T, lidar2cam_trans.reshape(3, 1)) cam2lidar.append(camera2lidar) cam2img_array = np.eye(4).astype(np.float32) cam2img_array[:3, :3] = np.array(results['images']['CAM2']['cam2img'])[:3, :3].astype(np.float32) cam2img.append(cam2img_array) lidar2img.append(cam2img_array @ lidar2cam_array) results['img_path'] = filename results['cam2img'] = np.stack(cam2img, axis=0) results['lidar2cam'] = np.stack(lidar2cam, axis=0) results['cam2lidar'] = np.stack(cam2lidar, axis=0) results['lidar2img'] = np.stack(lidar2img, axis=0) results['ori_cam2img'] = copy.deepcopy(results['cam2img']) # img is of shape (h, w, c, num_views) # h and w can be different for different views img_bytes = [ get(name, backend_args=self.backend_args) for name in filename ] imgs = [ mmcv.imfrombytes( img_byte, flag=self.color_type, backend='pillow', channel_order='rgb') for img_byte in img_bytes ] # handle the image with different shape img_shapes = np.stack([img.shape for img in imgs], axis=0) img_shape_max = np.max(img_shapes, axis=0) img_shape_min = np.min(img_shapes, axis=0) assert img_shape_min[-1] == img_shape_max[-1] if not np.all(img_shape_max == img_shape_min): pad_shape = img_shape_max[:2] else: pad_shape = None if pad_shape is not None: imgs = [ mmcv.impad(img, shape=pad_shape, pad_val=0) for img in imgs ] img = np.stack(imgs, axis=-1) if self.to_float32: img = img.astype(np.float32) results['filename'] = filename # unravel to list, see `DefaultFormatBundle` in formating.py # which will transpose each image separately and then stack into array results['img'] = [img[..., i] for i in range(img.shape[-1])] results['img_shape'] = img.shape[:2] results['ori_shape'] = img.shape[:2] # Set initial values for default meta_keys results['pad_shape'] = img.shape[:2] if self.set_default_scale: results['scale_factor'] = 1.0 num_channels = 1 if len(img.shape) < 3 else img.shape[2] results['img_norm_cfg'] = dict( mean=np.zeros(num_channels, dtype=np.float32), std=np.ones(num_channels, dtype=np.float32), to_rgb=False) results['num_views'] = self.num_views results['num_ref_frames'] = self.num_ref_frames return results
The config.py are as follows: base = ['../../../configs/base/schedules/cosine.py', '../../../configs/base/default_runtime.py'] custom_imports = dict( imports=['projects.BEVFusion.bevfusion'], allow_failed_imports=False) voxel_size = [0.05, 0.05, 0.1] point_cloud_range = [0, -40, -3, 70.4, 40, 1] model = dict( type='BEVFusion', data_preprocessor=dict( type='Det3DDataPreprocessor', pad_size_divisor=32, voxelize_cfg=dict( max_num_points=10, point_cloud_range=[0, -40, -3, 70.4, 40, 1], voxel_size=[0.05, 0.05, 0.1], max_voxels=[120000, 160000], voxelize_reduce=True), mean=[102.9801, 115.9465, 122.7717], std=[1.0, 1.0, 1.0], bgr_to_rgb=False), pts_voxel_encoder=dict(type='HardSimpleVFE', num_features=5), pts_middle_encoder=dict( type='BEVFusionSparseEncoder', in_channels=5, sparse_shape=[1440, 1440, 41], order=('conv', 'norm', 'act'), norm_cfg=dict(type='BN1d', eps=0.001, momentum=0.01), encoder_channels=((16, 16, 32), (32, 32, 64), (64, 64, 128), (128, 128)), encoder_paddings=((0, 0, 1), (0, 0, 1), (0, 0, (1, 1, 0)), (0, 0)), block_type='basicblock'), pts_backbone=dict( type='SECOND', in_channels=256, out_channels=[128, 256], layer_nums=[5, 5], layer_strides=[1, 2], norm_cfg=dict(type='BN', eps=0.001, momentum=0.01), conv_cfg=dict(type='Conv2d', bias=False)), pts_neck=dict( type='SECONDFPN', in_channels=[128, 256], out_channels=[256, 256], upsample_strides=[1, 2], norm_cfg=dict(type='BN', eps=0.001, momentum=0.01), upsample_cfg=dict(type='deconv', bias=False), use_conv_for_no_stride=True), bbox_head=dict( type='TransFusionHead', num_proposals=200, auxiliary=True, in_channels=512, hidden_channel=128, num_classes=10, nms_kernel_size=3, bn_momentum=0.1, num_decoder_layers=1, decoder_layer=dict( type='TransformerDecoderLayer', self_attn_cfg=dict(embed_dims=128, num_heads=8, dropout=0.1), cross_attn_cfg=dict(embed_dims=128, num_heads=8, dropout=0.1), ffn_cfg=dict( embed_dims=128, feedforward_channels=256, num_fcs=2, ffn_drop=0.1, act_cfg=dict(type='ReLU', inplace=True), ), norm_cfg=dict(type='LN'), pos_encoding_cfg=dict(input_channel=2, num_pos_feats=128)), train_cfg=dict( dataset='KittiDataset', point_cloud_range=[-54.0, -54.0, -5.0, 54.0, 54.0, 3.0], grid_size=[1440, 1440, 41], voxel_size=[0.075, 0.075, 0.2], out_size_factor=8, gaussian_overlap=0.1, min_radius=2, pos_weight=-1, code_weights=[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.2, 0.2], assigner=dict( type='HungarianAssigner3D', iou_calculator=dict(type='BboxOverlaps3D', coordinate='lidar'), cls_cost=dict( type='mmdet.FocalLossCost', gamma=2.0, alpha=0.25, weight=0.15), reg_cost=dict(type='BBoxBEVL1Cost', weight=0.25), iou_cost=dict(type='IoU3DCost', weight=0.25))), test_cfg=dict( dataset='KittiDataset', grid_size=[1440, 1440, 41], out_size_factor=8, voxel_size=[0.075, 0.075], pc_range=[-54.0, -54.0], nms_type=None), common_heads=dict( center=[2, 2], height=[1, 2], dim=[3, 2], rot=[2, 2], vel=[2, 2]), bbox_coder=dict( type='TransFusionBBoxCoder', pc_range=[-54.0, -54.0], post_center_range=[-61.2, -61.2, -10.0, 61.2, 61.2, 10.0], score_threshold=0.0, out_size_factor=8, voxel_size=[0.075, 0.075], code_size=10), loss_cls=dict( type='mmdet.FocalLoss', use_sigmoid=True, gamma=2.0, alpha=0.25, reduction='mean', loss_weight=1.0), loss_heatmap=dict( type='mmdet.GaussianFocalLoss', reduction='mean', loss_weight=1.0), loss_bbox=dict( type='mmdet.L1Loss', reduction='mean', loss_weight=0.25)), img_backbone=dict( type='mmdet.SwinTransformer', embed_dims=96, depths=[2, 2, 6, 2], num_heads=[3, 6, 12, 24], window_size=7, mlp_ratio=4, qkv_bias=True, qk_scale=None, drop_rate=0.0, attn_drop_rate=0.0, drop_path_rate=0.2, patch_norm=True, out_indices=[1, 2, 3], with_cp=False, convert_weights=True, init_cfg=dict( type='Pretrained', checkpoint= # noqa: E251 'https://github.com/SwinTransformer/storage/releases/download/v1.0.0/swin_tiny_patch4_window7_224.pth' # noqa: E501 )), img_neck=dict( type='GeneralizedLSSFPN', in_channels=[192, 384, 768], out_channels=256, start_level=0, num_outs=3, norm_cfg=dict(type='BN2d', requires_grad=True), act_cfg=dict(type='ReLU', inplace=True), upsample_cfg=dict(mode='bilinear', align_corners=False)), view_transform=dict( type='DepthLSSTransform', in_channels=256, out_channels=80, image_size=[256, 704], feature_size=[32, 88], xbound=[-54.0, 54.0, 0.3], ybound=[-54.0, 54.0, 0.3], zbound=[-10.0, 10.0, 20.0], dbound=[1.0, 60.0, 0.5], downsample=2), fusion_layer=dict( type='ConvFuser', in_channels=[80, 256], out_channels=256))
dataset settings
dataset_type = 'KittiDataset' data_root = 'data/kitti/' class_names = ['Pedestrian', 'Cyclist', 'Car'] metainfo = dict(classes=class_names) input_modality = dict(use_lidar=True, use_camera=True) backend_args = None train_pipeline = [
dict( type='BEVLoadKittiImageFromFiles', to_float32=True, color_type='color', backend_args=backend_args), dict( type='LoadPointsFromFile', coord_type='LIDAR', load_dim=4, use_dim=4, backend_args=backend_args), dict(type='LoadAnnotations3D', with_bbox_3d=True, with_label_3d=True), # 少一个ImageAug3D的数据增强。 dict( type='BEVFusionGlobalRotScaleTrans', rot_range=[-0.78539816, 0.78539816], scale_ratio_range=[0.95, 1.05], translation_std=[0.2, 0.2, 0.2]), dict(type='BEVFusionRandomFlip3D'), dict(type='PointsRangeFilter', point_cloud_range=point_cloud_range), dict(type='ObjectRangeFilter', point_cloud_range=point_cloud_range), dict( type='ObjectNameFilter', classes=['Pedestrian', 'Cyclist', 'Car']), dict(type='PointShuffle'), dict( type='Pack3DDetInputs', keys=[ 'points', 'img', 'gt_bboxes_3d', 'gt_labels_3d', 'gt_bboxes', 'gt_labels' ], meta_keys=[ 'cam2img', 'ori_cam2img', 'lidar2cam', 'lidar2img', 'cam2lidar', 'ori_lidar2img', 'img_aug_matrix', 'box_type_3d', 'sample_idx', 'lidar_path', 'img_path', 'transformation_3d_flow', 'pcd_rotation', 'pcd_scale_factor', 'pcd_trans', 'img_aug_matrix', 'lidar_aug_matrix' ])
] test_pipeline = [ dict( type='BEVLoadKittiImageFromFiles', to_float32=True, color_type='color', backend_args=backend_args), dict( type='LoadPointsFromFile', coord_type='LIDAR', load_dim=4, use_dim=4, backend_args=backend_args), # 少一个ImageAug3D的数据增强。 dict( type='PointsRangeFilter', point_cloud_range=[0, -40, -3, 70.4, 40, 1]), dict( type='Pack3DDetInputs', keys=['img', 'points', 'gt_bboxes_3d', 'gt_labels_3d'], meta_keys=[ 'cam2img', 'ori_cam2img', 'lidar2cam', 'lidar2img', 'cam2lidar', 'ori_lidar2img', 'img_aug_matrix', 'box_type_3d', 'sample_idx', 'lidar_path', 'img_path' ]) ] modality = dict(use_lidar=True, use_camera=True) train_dataloader = dict( batch_size=2, num_workers=2, sampler=dict(type='DefaultSampler', shuffle=True), dataset=dict( type='RepeatDataset', times=2, dataset=dict( type=dataset_type, data_root=data_root, modality=modality, ann_file='kitti_infos_train.pkl', data_prefix=dict( pts='training/velodyne_reduced', img='training/image_2'), pipeline=train_pipeline, filter_empty_gt=False, metainfo=metainfo, # we use box_type_3d='LiDAR' in kitti and nuscenes dataset # and box_type_3d='Depth' in sunrgbd and scannet dataset. box_type_3d='LiDAR', backend_args=backend_args))) val_dataloader = dict( batch_size=1, num_workers=1, sampler=dict(type='DefaultSampler', shuffle=False), dataset=dict( type=dataset_type, data_root=data_root, modality=modality, ann_file='kitti_infos_val.pkl', data_prefix=dict( pts='training/velodyne_reduced', img='training/image_2'), pipeline=test_pipeline, metainfo=metainfo, test_mode=True, box_type_3d='LiDAR', backend_args=backend_args)) test_dataloader = dict( batch_size=1, num_workers=1, sampler=dict(type='DefaultSampler', shuffle=False), dataset=dict( type=dataset_type, data_root=data_root, ann_file='kitti_infos_val.pkl', modality=modality, data_prefix=dict( pts='training/velodyne_reduced', img='training/image_2'), pipeline=test_pipeline, metainfo=metainfo, test_mode=True, box_type_3d='LiDAR', backend_args=backend_args)) optim_wrapper = dict( optimizer=dict(weight_decay=0.01), clip_grad=dict(max_norm=35, norm_type=2), ) val_evaluator = dict( type='KittiMetric', ann_file='data/kitti/kitti_infos_val.pkl') test_evaluator = val_evaluator vis_backends = [dict(type='LocalVisBackend')] visualizer = dict( type='Det3DLocalVisualizer', vis_backends=vis_backends, name='visualizer') And you should add 'BEVLoadKittiImageFromFiles' into 'projects/BEVFusion/bevfusion/init.py' If you know how to solve my question, please contact me as soon. Best
how do you reslove KeyError: 'img_path'"?
ok i have already finished it !
@liuyuansheng624 你好,你成功在kitti上训练了bevfusion嘛?我没试成功,你能发我一份配置文件到wuhu835@gmail.com嘛? 万分感谢!
13731082126微信联系
发自我的iPhone
------------------ Original ------------------ From: Alranen @.> Date: Mon,Mar 4,2024 11:39 AM To: open-mmlab/mmdetection3d @.> Cc: liuys @.>, Mention @.> Subject: Re: [open-mmlab/mmdetection3d] How to train model with BEVFusion onKitti Dataset instead of nuScenes dataset? (Issue #2617)
yes i can please wechat 13731082126
发自我的iPhone
------------------ Original ------------------ From: Dahjung Chung @.> Date: Thu,Mar 7,2024 3:55 PM To: open-mmlab/mmdetection3d @.> Cc: liuys @.>, Mention @.> Subject: Re: [open-mmlab/mmdetection3d] How to train model with BEVFusion onKitti Dataset instead of nuScenes dataset? (Issue #2617)
@schatur2 I have rewriten loading.py(projects/BEVFusion/bevfusion/loading_kitti.py) and config.py(projects/BEVFusion/configs/bev_kitti.py) and now the problem you mention before filename.append(cam_item['img_path']) KeyError: 'img_path'" has be solved. However, I meet the new problem as follow: File "mmdetection3d/projects/BEVFusion/bevfusion/depth_lss.py", line 299, in forward cur_coords = cur_img_aug_matrix[:, :3, :3].matmul(cur_coords) IndexError: too many indices for tensor of dimension 2 It looks like a normal question. But I don't want to do some changes in the model codes. By the way, I don't know if use nuScenes will meet the same question. Whether if the bug for the model code or I should rewrite the model. the loading.py are as follows:
Copyright (c) OpenMMLab. All rights reserved.
import copy from typing import Optional
import mmcv import numpy as np from mmengine.fileio import get
from mmdet3d.datasets.transforms import LoadMultiViewImageFromFiles from mmdet3d.registry import TRANSFORMS
@TRANSFORMS.register_module() class BEVLoadKittiImageFromFiles(LoadMultiViewImageFromFiles): """Load multi channel images from a list of separate channel files.
``BEVLoadMultiViewImageFromFiles`` adds the following keys for the convenience of view transforms in the forward: - 'cam2lidar' - 'lidar2img' Args: to_float32 (bool): Whether to convert the img to float32. Defaults to False. color_type (str): Color type of the file. Defaults to 'unchanged'. backend_args (dict, optional): Arguments to instantiate the corresponding backend. Defaults to None. num_views (int): Number of view in a frame. Defaults to 5. num_ref_frames (int): Number of frame in loading. Defaults to -1. test_mode (bool): Whether is test mode in loading. Defaults to False. set_default_scale (bool): Whether to set default scale. Defaults to True. """ def transform(self, results: dict) -> Optional[dict]: """Call function to load multi-view image from files. Args: results (dict): Result dict containing multi-view image filenames. Returns: dict: The result dict containing the multi-view image data. Added keys and values are described below. - filename (str): Multi-view image filenames. - img (np.ndarray): Multi-view image arrays. - img_shape (tuple[int]): Shape of multi-view image arrays. - ori_shape (tuple[int]): Shape of original image arrays. - pad_shape (tuple[int]): Shape of padded image arrays. - scale_factor (float): Scale factor. - img_norm_cfg (dict): Normalization configuration of images. """ # TODO: consider split the multi-sweep part out of this pipeline # Derive the mask and transform for loading of multi-sweep data # Support multi-view images with different shapes # TODO: record the origin shape and padded shape filename, cam2img, lidar2cam, cam2lidar, lidar2img = [], [], [], [], [] filename.append(results['images']['CAM2']['img_path']) lidar2cam.append(results['images']['CAM2']['lidar2cam']) lidar2cam_array = np.array(results['images']['CAM2']['lidar2cam']).astype( np.float32) lidar2cam_rot = lidar2cam_array[:3, :3] lidar2cam_trans = lidar2cam_array[:3, 3:4] camera2lidar = np.eye(4) camera2lidar[:3, :3] = lidar2cam_rot.T camera2lidar[:3, 3:4] = -1 * np.matmul( lidar2cam_rot.T, lidar2cam_trans.reshape(3, 1)) cam2lidar.append(camera2lidar) cam2img_array = np.eye(4).astype(np.float32) cam2img_array[:3, :3] = np.array(results['images']['CAM2']['cam2img'])[:3, :3].astype(np.float32) cam2img.append(cam2img_array) lidar2img.append(cam2img_array @ lidar2cam_array) results['img_path'] = filename results['cam2img'] = np.stack(cam2img, axis=0) results['lidar2cam'] = np.stack(lidar2cam, axis=0) results['cam2lidar'] = np.stack(cam2lidar, axis=0) results['lidar2img'] = np.stack(lidar2img, axis=0) results['ori_cam2img'] = copy.deepcopy(results['cam2img']) # img is of shape (h, w, c, num_views) # h and w can be different for different views img_bytes = [ get(name, backend_args=self.backend_args) for name in filename ] imgs = [ mmcv.imfrombytes( img_byte, flag=self.color_type, backend='pillow', channel_order='rgb') for img_byte in img_bytes ] # handle the image with different shape img_shapes = np.stack([img.shape for img in imgs], axis=0) img_shape_max = np.max(img_shapes, axis=0) img_shape_min = np.min(img_shapes, axis=0) assert img_shape_min[-1] == img_shape_max[-1] if not np.all(img_shape_max == img_shape_min): pad_shape = img_shape_max[:2] else: pad_shape = None if pad_shape is not None: imgs = [ mmcv.impad(img, shape=pad_shape, pad_val=0) for img in imgs ] img = np.stack(imgs, axis=-1) if self.to_float32: img = img.astype(np.float32) results['filename'] = filename # unravel to list, see `DefaultFormatBundle` in formating.py # which will transpose each image separately and then stack into array results['img'] = [img[..., i] for i in range(img.shape[-1])] results['img_shape'] = img.shape[:2] results['ori_shape'] = img.shape[:2] # Set initial values for default meta_keys results['pad_shape'] = img.shape[:2] if self.set_default_scale: results['scale_factor'] = 1.0 num_channels = 1 if len(img.shape) < 3 else img.shape[2] results['img_norm_cfg'] = dict( mean=np.zeros(num_channels, dtype=np.float32), std=np.ones(num_channels, dtype=np.float32), to_rgb=False) results['num_views'] = self.num_views results['num_ref_frames'] = self.num_ref_frames return results
The config.py are as follows: base = ['../../../configs/base/schedules/cosine.py', '../../../configs/base/default_runtime.py']
custom_imports = dict( imports=['projects.BEVFusion.bevfusion'], allow_failed_imports=False)
voxel_size = [0.05, 0.05, 0.1] point_cloud_range = [0, -40, -3, 70.4, 40, 1]
model = dict( type='BEVFusion', data_preprocessor=dict( type='Det3DDataPreprocessor', pad_size_divisor=32, voxelize_cfg=dict( max_num_points=10, point_cloud_range=[0, -40, -3, 70.4, 40, 1], voxel_size=[0.05, 0.05, 0.1], max_voxels=[120000, 160000], voxelize_reduce=True), mean=[102.9801, 115.9465, 122.7717], std=[1.0, 1.0, 1.0], bgr_to_rgb=False), pts_voxel_encoder=dict(type='HardSimpleVFE', num_features=5), pts_middle_encoder=dict( type='BEVFusionSparseEncoder', in_channels=5, sparse_shape=[1440, 1440, 41], order=('conv', 'norm', 'act'), norm_cfg=dict(type='BN1d', eps=0.001, momentum=0.01), encoder_channels=((16, 16, 32), (32, 32, 64), (64, 64, 128), (128, 128)), encoder_paddings=((0, 0, 1), (0, 0, 1), (0, 0, (1, 1, 0)), (0, 0)), block_type='basicblock'), pts_backbone=dict( type='SECOND', in_channels=256, out_channels=[128, 256], layer_nums=[5, 5], layer_strides=[1, 2], norm_cfg=dict(type='BN', eps=0.001, momentum=0.01), conv_cfg=dict(type='Conv2d', bias=False)), pts_neck=dict( type='SECONDFPN', in_channels=[128, 256], out_channels=[256, 256], upsample_strides=[1, 2], norm_cfg=dict(type='BN', eps=0.001, momentum=0.01), upsample_cfg=dict(type='deconv', bias=False), use_conv_for_no_stride=True), bbox_head=dict( type='TransFusionHead', num_proposals=200, auxiliary=True, in_channels=512, hidden_channel=128, num_classes=10, nms_kernel_size=3, bn_momentum=0.1, num_decoder_layers=1, decoder_layer=dict( type='TransformerDecoderLayer', self_attn_cfg=dict(embed_dims=128, num_heads=8, dropout=0.1), cross_attn_cfg=dict(embed_dims=128, num_heads=8, dropout=0.1), ffn_cfg=dict( embed_dims=128, feedforward_channels=256, num_fcs=2, ffn_drop=0.1, act_cfg=dict(type='ReLU', inplace=True), ), norm_cfg=dict(type='LN'), pos_encoding_cfg=dict(input_channel=2, num_pos_feats=128)), train_cfg=dict( dataset='KittiDataset', point_cloud_range=[-54.0, -54.0, -5.0, 54.0, 54.0, 3.0], grid_size=[1440, 1440, 41], voxel_size=[0.075, 0.075, 0.2], out_size_factor=8, gaussian_overlap=0.1, min_radius=2, pos_weight=-1, code_weights=[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.2, 0.2], assigner=dict( type='HungarianAssigner3D', iou_calculator=dict(type='BboxOverlaps3D', coordinate='lidar'), cls_cost=dict( type='mmdet.FocalLossCost', gamma=2.0, alpha=0.25, weight=0.15), reg_cost=dict(type='BBoxBEVL1Cost', weight=0.25), iou_cost=dict(type='IoU3DCost', weight=0.25))), test_cfg=dict( dataset='KittiDataset', grid_size=[1440, 1440, 41], out_size_factor=8, voxel_size=[0.075, 0.075], pc_range=[-54.0, -54.0], nms_type=None), common_heads=dict( center=[2, 2], height=[1, 2], dim=[3, 2], rot=[2, 2], vel=[2, 2]), bbox_coder=dict( type='TransFusionBBoxCoder', pc_range=[-54.0, -54.0], post_center_range=[-61.2, -61.2, -10.0, 61.2, 61.2, 10.0], score_threshold=0.0, out_size_factor=8, voxel_size=[0.075, 0.075], code_size=10), loss_cls=dict( type='mmdet.FocalLoss', use_sigmoid=True, gamma=2.0, alpha=0.25, reduction='mean', loss_weight=1.0), loss_heatmap=dict( type='mmdet.GaussianFocalLoss', reduction='mean', loss_weight=1.0), loss_bbox=dict( type='mmdet.L1Loss', reduction='mean', loss_weight=0.25)), img_backbone=dict( type='mmdet.SwinTransformer', embed_dims=96, depths=[2, 2, 6, 2], num_heads=[3, 6, 12, 24], window_size=7, mlp_ratio=4, qkv_bias=True, qk_scale=None, drop_rate=0.0, attn_drop_rate=0.0, drop_path_rate=0.2, patch_norm=True, out_indices=[1, 2, 3], with_cp=False, convert_weights=True, init_cfg=dict( type='Pretrained', checkpoint= # noqa: E251 'https://github.com/SwinTransformer/storage/releases/download/v1.0.0/swin_tiny_patch4_window7_224.pth' # noqa: E501 )), img_neck=dict( type='GeneralizedLSSFPN', in_channels=[192, 384, 768], out_channels=256, start_level=0, num_outs=3, norm_cfg=dict(type='BN2d', requires_grad=True), act_cfg=dict(type='ReLU', inplace=True), upsample_cfg=dict(mode='bilinear', align_corners=False)), view_transform=dict( type='DepthLSSTransform', in_channels=256, out_channels=80, image_size=[256, 704], feature_size=[32, 88], xbound=[-54.0, 54.0, 0.3], ybound=[-54.0, 54.0, 0.3], zbound=[-10.0, 10.0, 20.0], dbound=[1.0, 60.0, 0.5], downsample=2), fusion_layer=dict( type='ConvFuser', in_channels=[80, 256], out_channels=256))
dataset settings
dataset_type = 'KittiDataset' data_root = 'data/kitti/' class_names = ['Pedestrian', 'Cyclist', 'Car'] metainfo = dict(classes=class_names) input_modality = dict(use_lidar=True, use_camera=True) backend_args = None
train_pipeline = [
dict( type='BEVLoadKittiImageFromFiles', to_float32=True, color_type='color', backend_args=backend_args), dict( type='LoadPointsFromFile', coord_type='LIDAR', load_dim=4, use_dim=4, backend_args=backend_args), dict(type='LoadAnnotations3D', with_bbox_3d=True, with_label_3d=True), # 少一个ImageAug3D的数据增强。 dict( type='BEVFusionGlobalRotScaleTrans', rot_range=[-0.78539816, 0.78539816], scale_ratio_range=[0.95, 1.05], translation_std=[0.2, 0.2, 0.2]), dict(type='BEVFusionRandomFlip3D'), dict(type='PointsRangeFilter', point_cloud_range=point_cloud_range), dict(type='ObjectRangeFilter', point_cloud_range=point_cloud_range), dict( type='ObjectNameFilter', classes=['Pedestrian', 'Cyclist', 'Car']), dict(type='PointShuffle'), dict( type='Pack3DDetInputs', keys=[ 'points', 'img', 'gt_bboxes_3d', 'gt_labels_3d', 'gt_bboxes', 'gt_labels' ], meta_keys=[ 'cam2img', 'ori_cam2img', 'lidar2cam', 'lidar2img', 'cam2lidar', 'ori_lidar2img', 'img_aug_matrix', 'box_type_3d', 'sample_idx', 'lidar_path', 'img_path', 'transformation_3d_flow', 'pcd_rotation', 'pcd_scale_factor', 'pcd_trans', 'img_aug_matrix', 'lidar_aug_matrix' ])
]
test_pipeline = [ dict( type='BEVLoadKittiImageFromFiles', to_float32=True, color_type='color', backend_args=backend_args), dict( type='LoadPointsFromFile', coord_type='LIDAR', load_dim=4, use_dim=4, backend_args=backend_args), # 少一个ImageAug3D的数据增强。 dict( type='PointsRangeFilter', point_cloud_range=[0, -40, -3, 70.4, 40, 1]), dict( type='Pack3DDetInputs', keys=['img', 'points', 'gt_bboxes_3d', 'gt_labels_3d'], meta_keys=[ 'cam2img', 'ori_cam2img', 'lidar2cam', 'lidar2img', 'cam2lidar', 'ori_lidar2img', 'img_aug_matrix', 'box_type_3d', 'sample_idx', 'lidar_path', 'img_path' ]) ]
modality = dict(use_lidar=True, use_camera=True) train_dataloader = dict( batch_size=2, num_workers=2, sampler=dict(type='DefaultSampler', shuffle=True), dataset=dict( type='RepeatDataset', times=2, dataset=dict( type=dataset_type, data_root=data_root, modality=modality, ann_file='kitti_infos_train.pkl', data_prefix=dict( pts='training/velodyne_reduced', img='training/image_2'), pipeline=train_pipeline, filter_empty_gt=False, metainfo=metainfo, # we use box_type_3d='LiDAR' in kitti and nuscenes dataset # and box_type_3d='Depth' in sunrgbd and scannet dataset. box_type_3d='LiDAR', backend_args=backend_args)))
val_dataloader = dict( batch_size=1, num_workers=1, sampler=dict(type='DefaultSampler', shuffle=False), dataset=dict( type=dataset_type, data_root=data_root, modality=modality, ann_file='kitti_infos_val.pkl', data_prefix=dict( pts='training/velodyne_reduced', img='training/image_2'), pipeline=test_pipeline, metainfo=metainfo, test_mode=True, box_type_3d='LiDAR', backend_args=backend_args)) test_dataloader = dict( batch_size=1, num_workers=1, sampler=dict(type='DefaultSampler', shuffle=False), dataset=dict( type=dataset_type, data_root=data_root, ann_file='kitti_infos_val.pkl', modality=modality, data_prefix=dict( pts='training/velodyne_reduced', img='training/image_2'), pipeline=test_pipeline, metainfo=metainfo, test_mode=True, box_type_3d='LiDAR', backend_args=backend_args))
optim_wrapper = dict( optimizer=dict(weight_decay=0.01), clip_grad=dict(max_norm=35, norm_type=2), ) val_evaluator = dict( type='KittiMetric', ann_file='data/kitti/kitti_infos_val.pkl') test_evaluator = val_evaluator
vis_backends = [dict(type='LocalVisBackend')] visualizer = dict( type='Det3DLocalVisualizer', vis_backends=vis_backends, name='visualizer')
And you should add 'BEVLoadKittiImageFromFiles' into 'projects/BEVFusion/bevfusion/init.py' If you know how to solve my question, please contact me as soon. Best
Hello,Have you solved it?
Model/Dataset/Scheduler description
Hello Everyone,
I am training mvxnet model with the default config file available in the repository, which is for the point_fusion and kitti dataset. I wanted to try other fusion methods with mvxnet model.
I see BEVFusion code added in the repository with the nuScenes dataset. What config settings I would need to change for adapting BEVFusion for Camera Lidar3d fusion with the Kitti dataset?
Thank you so much for your help! Regards, Saket
Open source status
Provide useful links for the implementation
No response