[Bug] KeyError: 'VoxelNet is not in the mmengine::model registry. Please check whether the value of `VoxelNet` is correct or it was registered as expected. More details can be found at https://mmengine.readthedocs.io/en/latest/advanced_tutorials/config.html#import-the-custom-module' #2992
Operating System and Python Version:
OS: Linux
Python: 3.10.14
CUDA and GPU Information:
CUDA Available: True
GPU: NVIDIA GeForce RTX 3050 Laptop GPU
CUDA_HOME: /usr
NVCC: Cuda compilation tools, release 11.5, V11.5.119
Compiler and Build Tools:
GCC: 11.4.0
PyTorch: 1.13.1
PyTorch Compiling Details:
Built with GCC 9.3
CUDA Runtime 11.7
CuDNN 8.5
Intel MKL and MKL-DNN enabled
Related Libraries and Versions:
TorchVision: 0.14.1
OpenCV: 4.10.0
MMEngine: 0.10.1
MMDetection: 3.2.0
MMDetection3D: 1.4.0+962f093
Spconv2.0: False
import torch
import torch.nn.functional as F
from torch import nn
import math
import copy
from utils import box_ops
from utils.misc import (NestedTensor, nested_tensor_from_tensor_list,
accuracy, get_world_size, interpolate,
is_dist_avail_and_initialized, inverse_sigmoid)
from .backbone import build_backbone
from .matcher import build_matcher
from .depthaware_transformer import build_depthaware_transformer
from .depth_predictor import DepthPredictor
from .depth_predictor.ddn_loss import DDNLoss
from lib.losses.focal_loss import sigmoid_focal_loss
from .dn_components import prepare_for_dn, dn_post_process, compute_dn_loss
from mmdet.registry import MODELS
@MODELS.register_module()
class MonoDETR(nn.Module):
    """This is the MonoDETR module that performs monocular 3D object detection."""

    def __init__(self, backbone, depthaware_transformer, depth_predictor, num_classes, num_queries, num_feature_levels,
                 aux_loss=True, with_box_refine=False, two_stage=False, init_box=False, use_dab=False, group_num=11, two_stage_dino=False):
        """ Initializes the model.
        Parameters:
            backbone: torch module of the backbone to be used. See backbone.py
            depthaware_transformer: depth-aware transformer architecture. See depth_aware_transformer.py
            depth_predictor: module that predicts the depth map and depth positional embeddings
            num_classes: number of object classes
            num_queries: number of object queries, ie detection slot. This is the maximal number of objects
                         DETR can detect in a single image. For KITTI, we recommend 50 queries.
            num_feature_levels: number of multi-scale feature levels fed to the transformer
            aux_loss: True if auxiliary decoding losses (loss at each decoder layer) are to be used.
            with_box_refine: iterative bounding box refinement
            two_stage: two-stage MonoDETR
            init_box: zero-initialize the final bbox-regression layer
            use_dab: use DAB-style queries (separate content embedding + reference anchor)
            group_num: number of query groups used during training (group-wise one-to-many assignment)
            two_stage_dino: DINO-style two-stage variant
        """
        # NOTE: the method MUST be named __init__ (the pasted snippet had
        # `def init` / `super().init()` because markdown stripped the dunder
        # underscores); without __init__, nn.Module is never initialized and
        # MODELS.build() cannot construct this class.
        super().__init__()
        self.num_queries = num_queries
        self.depthaware_transformer = depthaware_transformer
        self.depth_predictor = depth_predictor
        hidden_dim = depthaware_transformer.d_model
        self.hidden_dim = hidden_dim
        self.num_feature_levels = num_feature_levels
        self.two_stage_dino = two_stage_dino
        self.label_enc = nn.Embedding(num_classes + 1, hidden_dim - 1)  # for indicator

        # prediction heads
        self.class_embed = nn.Linear(hidden_dim, num_classes)
        # focal-loss style bias init so that initial foreground probability ~= prior_prob
        prior_prob = 0.01
        bias_value = -math.log((1 - prior_prob) / prior_prob)
        self.class_embed.bias.data = torch.ones(num_classes) * bias_value

        self.bbox_embed = MLP(hidden_dim, hidden_dim, 6, 3)      # 3d center (2) + 2d box l/r/t/b (4)
        self.dim_embed_3d = MLP(hidden_dim, hidden_dim, 3, 2)    # 3D size (h, w, l)
        self.angle_embed = MLP(hidden_dim, hidden_dim, 24, 2)    # multi-bin orientation
        self.depth_embed = MLP(hidden_dim, hidden_dim, 2, 2)     # depth and deviation
        self.use_dab = use_dab

        if init_box:
            nn.init.constant_(self.bbox_embed.layers[-1].weight.data, 0)
            nn.init.constant_(self.bbox_embed.layers[-1].bias.data, 0)

        if not two_stage:
            if two_stage_dino:
                self.query_embed = None
            if not use_dab:
                self.query_embed = nn.Embedding(num_queries * group_num, hidden_dim * 2)
            else:
                self.tgt_embed = nn.Embedding(num_queries * group_num, hidden_dim)
                self.refpoint_embed = nn.Embedding(num_queries * group_num, 6)

        # per-level 1x1 projections mapping backbone channels to hidden_dim;
        # extra pyramid levels (beyond the backbone outputs) are created by
        # strided 3x3 convs stacked on top of the last backbone feature map
        if num_feature_levels > 1:
            num_backbone_outs = len(backbone.strides)
            input_proj_list = []
            for out_idx in range(num_backbone_outs):
                in_channels = backbone.num_channels[out_idx]
                input_proj_list.append(nn.Sequential(
                    nn.Conv2d(in_channels, hidden_dim, kernel_size=1),
                    nn.GroupNorm(32, hidden_dim),
                ))
            for _ in range(num_feature_levels - num_backbone_outs):
                input_proj_list.append(nn.Sequential(
                    nn.Conv2d(in_channels, hidden_dim, kernel_size=3, stride=2, padding=1),
                    nn.GroupNorm(32, hidden_dim),
                ))
                in_channels = hidden_dim
            self.input_proj = nn.ModuleList(input_proj_list)
        else:
            self.input_proj = nn.ModuleList([
                nn.Sequential(
                    nn.Conv2d(backbone.num_channels[0], hidden_dim, kernel_size=1),
                    nn.GroupNorm(32, hidden_dim),
                )])

        self.backbone = backbone
        self.aux_loss = aux_loss
        self.with_box_refine = with_box_refine
        self.two_stage = two_stage
        self.num_classes = num_classes

        if self.two_stage_dino:
            _class_embed = nn.Linear(hidden_dim, num_classes)
            _bbox_embed = MLP(hidden_dim, hidden_dim, 6, 3)
            # init the two embed layers
            prior_prob = 0.01
            bias_value = -math.log((1 - prior_prob) / prior_prob)
            _class_embed.bias.data = torch.ones(num_classes) * bias_value
            nn.init.constant_(_bbox_embed.layers[-1].weight.data, 0)
            nn.init.constant_(_bbox_embed.layers[-1].bias.data, 0)
            self.depthaware_transformer.enc_out_bbox_embed = copy.deepcopy(_bbox_embed)
            self.depthaware_transformer.enc_out_class_embed = copy.deepcopy(_class_embed)

        for proj in self.input_proj:
            nn.init.xavier_uniform_(proj[0].weight, gain=1)
            nn.init.constant_(proj[0].bias, 0)

        # if two-stage, the last class_embed and bbox_embed is for region proposal generation
        num_pred = (depthaware_transformer.decoder.num_layers + 1) if two_stage else depthaware_transformer.decoder.num_layers
        if with_box_refine:
            # independent heads per decoder layer (deep copies)
            self.class_embed = _get_clones(self.class_embed, num_pred)
            self.bbox_embed = _get_clones(self.bbox_embed, num_pred)
            nn.init.constant_(self.bbox_embed[0].layers[-1].bias.data[2:], -2.0)
            # hack implementation for iterative bounding box refinement
            self.depthaware_transformer.decoder.bbox_embed = self.bbox_embed
            self.dim_embed_3d = _get_clones(self.dim_embed_3d, num_pred)
            self.depthaware_transformer.decoder.dim_embed = self.dim_embed_3d
            self.angle_embed = _get_clones(self.angle_embed, num_pred)
            self.depth_embed = _get_clones(self.depth_embed, num_pred)
        else:
            # shared heads across decoder layers (same module repeated)
            nn.init.constant_(self.bbox_embed.layers[-1].bias.data[2:], -2.0)
            self.class_embed = nn.ModuleList([self.class_embed for _ in range(num_pred)])
            self.bbox_embed = nn.ModuleList([self.bbox_embed for _ in range(num_pred)])
            self.dim_embed_3d = nn.ModuleList([self.dim_embed_3d for _ in range(num_pred)])
            self.angle_embed = nn.ModuleList([self.angle_embed for _ in range(num_pred)])
            self.depth_embed = nn.ModuleList([self.depth_embed for _ in range(num_pred)])
            self.depthaware_transformer.decoder.bbox_embed = None
        if two_stage:
            # hack implementation for two-stage
            self.depthaware_transformer.decoder.class_embed = self.class_embed
            for box_embed in self.bbox_embed:
                nn.init.constant_(box_embed.layers[-1].bias.data[2:], 0.0)

    def forward(self, images, calibs, targets, img_sizes, dn_args=None):
        """ The forward expects a NestedTensor, which consists of:
               - samples.tensor: batched images, of shape [batch_size x 3 x H x W]
               - samples.mask: a binary mask of shape [batch_size x H x W], containing 1 on padded pixels

            Returns a dict with 'pred_logits', 'pred_boxes', 'pred_3d_dim', 'pred_depth',
            'pred_angle', 'pred_depth_map_logits' and, optionally, 'aux_outputs'/'enc_outputs'.
        """
        features, pos = self.backbone(images)

        srcs = []
        masks = []
        for l, feat in enumerate(features):
            src, mask = feat.decompose()
            srcs.append(self.input_proj[l](src))
            masks.append(mask)
            assert mask is not None

        # synthesize any extra feature levels beyond the backbone outputs
        if self.num_feature_levels > len(srcs):
            _len_srcs = len(srcs)
            for l in range(_len_srcs, self.num_feature_levels):
                if l == _len_srcs:
                    src = self.input_proj[l](features[-1].tensors)
                else:
                    src = self.input_proj[l](srcs[-1])
                m = torch.zeros(src.shape[0], src.shape[2], src.shape[3]).to(torch.bool).to(src.device)
                mask = F.interpolate(m[None].float(), size=src.shape[-2:]).to(torch.bool)[0]
                pos_l = self.backbone[1](NestedTensor(src, mask)).to(src.dtype)
                srcs.append(src)
                masks.append(mask)
                pos.append(pos_l)

        if self.two_stage:
            query_embeds = None
        elif self.use_dab:
            if self.training:
                tgt_embed = self.tgt_embed.weight          # nq, 256
                refanchor = self.refpoint_embed.weight     # nq, 4
                query_embeds = torch.cat((tgt_embed, refanchor), dim=1)
            else:
                # only use one group in inference
                tgt_embed = self.tgt_embed.weight[:self.num_queries]
                refanchor = self.refpoint_embed.weight[:self.num_queries]
                query_embeds = torch.cat((tgt_embed, refanchor), dim=1)
        elif self.two_stage_dino:
            query_embeds = None
        else:
            if self.training:
                query_embeds = self.query_embed.weight
            else:
                # only use one group in inference
                query_embeds = self.query_embed.weight[:self.num_queries]

        # depth prediction operates on the second feature level (index 1)
        pred_depth_map_logits, depth_pos_embed, weighted_depth, depth_pos_embed_ip = self.depth_predictor(srcs, masks[1], pos[1])

        hs, init_reference, inter_references, inter_references_dim, enc_outputs_class, enc_outputs_coord_unact = self.depthaware_transformer(
            srcs, masks, pos, query_embeds, depth_pos_embed, depth_pos_embed_ip)  # , attn_mask)

        outputs_coords = []
        outputs_classes = []
        outputs_3d_dims = []
        outputs_depths = []
        outputs_angles = []

        for lvl in range(hs.shape[0]):
            if lvl == 0:
                reference = init_reference
            else:
                reference = inter_references[lvl - 1]
            reference = inverse_sigmoid(reference)
            tmp = self.bbox_embed[lvl](hs[lvl])
            if reference.shape[-1] == 6:
                tmp += reference
            else:
                assert reference.shape[-1] == 2
                tmp[..., :2] += reference

            # 3d center + 2d box
            outputs_coord = tmp.sigmoid()
            outputs_coords.append(outputs_coord)

            # classes
            outputs_class = self.class_embed[lvl](hs[lvl])
            outputs_classes.append(outputs_class)

            # 3D sizes
            size3d = inter_references_dim[lvl]
            outputs_3d_dims.append(size3d)

            # depth_geo: geometric depth from 3D height, 2D box height and focal length
            box2d_height_norm = outputs_coord[:, :, 4] + outputs_coord[:, :, 5]
            box2d_height = torch.clamp(box2d_height_norm * img_sizes[:, 1: 2], min=1.0)
            depth_geo = size3d[:, :, 0] / box2d_height * calibs[:, 0, 0].unsqueeze(1)

            # depth_reg: directly regressed depth (+ its deviation/sigma)
            depth_reg = self.depth_embed[lvl](hs[lvl])

            # depth_map: sample the predicted depth map at the projected 3D centers
            outputs_center3d = ((outputs_coord[..., :2] - 0.5) * 2).unsqueeze(2).detach()
            depth_map = F.grid_sample(
                weighted_depth.unsqueeze(1),
                outputs_center3d,
                mode='bilinear',
                align_corners=True).squeeze(1)

            # depth average + sigma
            depth_ave = torch.cat([((1. / (depth_reg[:, :, 0: 1].sigmoid() + 1e-6) - 1.) + depth_geo.unsqueeze(-1) + depth_map) / 3,
                                   depth_reg[:, :, 1: 2]], -1)
            outputs_depths.append(depth_ave)

            # angles
            outputs_angle = self.angle_embed[lvl](hs[lvl])
            outputs_angles.append(outputs_angle)

        outputs_coord = torch.stack(outputs_coords)
        outputs_class = torch.stack(outputs_classes)
        outputs_3d_dim = torch.stack(outputs_3d_dims)
        outputs_depth = torch.stack(outputs_depths)
        outputs_angle = torch.stack(outputs_angles)

        out = {'pred_logits': outputs_class[-1], 'pred_boxes': outputs_coord[-1]}
        out['pred_3d_dim'] = outputs_3d_dim[-1]
        out['pred_depth'] = outputs_depth[-1]
        out['pred_angle'] = outputs_angle[-1]
        out['pred_depth_map_logits'] = pred_depth_map_logits

        if self.aux_loss:
            out['aux_outputs'] = self._set_aux_loss(
                outputs_class, outputs_coord, outputs_3d_dim, outputs_angle, outputs_depth)

        if self.two_stage:
            enc_outputs_coord = enc_outputs_coord_unact.sigmoid()
            out['enc_outputs'] = {'pred_logits': enc_outputs_class, 'pred_boxes': enc_outputs_coord}

        return out  # , mask_dict

    @torch.jit.unused
    def _set_aux_loss(self, outputs_class, outputs_coord, outputs_3d_dim, outputs_angle, outputs_depth):
        # this is a workaround to make torchscript happy, as torchscript
        # doesn't support dictionary with non-homogeneous values, such
        # as a dict having both a Tensor and a list.
        return [{'pred_logits': a, 'pred_boxes': b,
                 'pred_3d_dim': c, 'pred_angle': d, 'pred_depth': e}
                for a, b, c, d, e in zip(outputs_class[:-1], outputs_coord[:-1],
                                         outputs_3d_dim[:-1], outputs_angle[:-1], outputs_depth[:-1])]
%%writefile mmdet3d/models/__init__.py
# Copyright (c) OpenMMLab. All rights reserved.
# NOTE: the `*` in every star import below was swallowed by markdown in the
# pasted snippet; without it each line is a SyntaxError, the models package
# fails to import, and NO model (VoxelNet, MonoDETR, ...) gets registered —
# which is exactly the reported registry KeyError.
from mmdet3d.models.layers.fusion_layers import *  # noqa: F401,F403
from .backbones import *  # noqa: F401,F403
from .data_preprocessors import *  # noqa: F401,F403
from .decode_heads import *  # noqa: F401,F403
from .dense_heads import *  # noqa: F401,F403
from .detectors import *  # noqa: F401,F403
from .layers import *  # noqa: F401,F403
from .losses import *  # noqa: F401,F403
from .middle_encoders import *  # noqa: F401,F403
from .necks import *  # noqa: F401,F403
from .roi_heads import *  # noqa: F401,F403
from .segmentors import *  # noqa: F401,F403
from .test_time_augs import *  # noqa: F401,F403
from .utils import *  # noqa: F401,F403
from .voxel_encoders import *  # noqa: F401,F403
from .monodetr import MonoDETR  # noqa: F401
Prerequisite
Task
I have modified the scripts/configs, or I'm working on my own tasks/models/datasets.
Branch
main branch https://github.com/open-mmlab/mmdetection3d
Environment
Operating System and Python Version: OS: Linux Python: 3.10.14 CUDA and GPU Information: CUDA Available: True GPU: NVIDIA GeForce RTX 3050 Laptop GPU CUDA_HOME: /usr NVCC: Cuda compilation tools, release 11.5, V11.5.119 Compiler and Build Tools:
Related Libraries and Versions: TorchVision: 0.14.1 OpenCV: 4.10.0 MMEngine: 0.10.1 MMDetection: 3.2.0 MMDetection3D: 1.4.0+962f093 Spconv2.0: False
Reproduces the problem - code sample
%%writefile configs/monodetr/MonoDETR.py
model = dict( type='VoxelNet', data_preprocessor=dict( type='Det3DDataPreprocessor', voxel=True, voxel_layer=dict( max_num_points=32, point_cloud_range=[0, -39.68, -3, 69.12, 39.68, 1], voxel_size=[0.16, 0.16, 4], max_voxels=(16000, 40000))),
Reproduces the problem - command or script
python tools/train.py configs/monodetr/MonoDETR.py
Reproduces the problem - error message
KeyError: 'VoxelNet is not in the mmengine::model registry. Please check whether the value of
VoxelNet
is correct or it was registered as expected. More details can be found at https://mmengine.readthedocs.io/en/latest/advanced_tutorials/config.html#import-the-custom-module'

Additional information
%%writefile mmdet3d/models/monodetr.py
import torch import torch.nn.functional as F from torch import nn import math import copy
from utils import box_ops from utils.misc import (NestedTensor, nested_tensor_from_tensor_list, accuracy, get_world_size, interpolate, is_dist_avail_and_initialized, inverse_sigmoid)
from .backbone import build_backbone from .matcher import build_matcher from .depthaware_transformer import build_depthaware_transformer from .depth_predictor import DepthPredictor from .depth_predictor.ddn_loss import DDNLoss from lib.losses.focal_loss import sigmoid_focal_loss from .dn_components import prepare_for_dn, dn_post_process, compute_dn_loss from mmdet.registry import MODELS
@MODELS.register_module() class MonoDETR(nn.Module): """ This is the MonoDETR module that performs monocular 3D object detection """ def __init__(self, backbone, depthaware_transformer, depth_predictor, num_classes, num_queries, num_feature_levels, aux_loss=True, with_box_refine=False, two_stage=False, init_box=False, use_dab=False, group_num=11, two_stage_dino=False): """ Initializes the model. Parameters: backbone: torch module of the backbone to be used. See backbone.py depthaware_transformer: depth-aware transformer architecture. See depth_aware_transformer.py num_classes: number of object classes num_queries: number of object queries, ie detection slot. This is the maximal number of objects DETR can detect in a single image. For KITTI, we recommend 50 queries. aux_loss: True if auxiliary decoding losses (loss at each decoder layer) are to be used. with_box_refine: iterative bounding box refinement two_stage: two-stage MonoDETR """ super().__init__()
%%writefile mmdet3d/models/__init__.py
Copyright (c) OpenMMLab. All rights reserved.
from mmdet3d.models.layers.fusion_layers import * # noqa: F401,F403 from .backbones import * # noqa: F401,F403 from .data_preprocessors import * # noqa: F401,F403 from .decode_heads import * # noqa: F401,F403 from .dense_heads import * # noqa: F401,F403 from .detectors import * # noqa: F401,F403 from .layers import * # noqa: F401,F403 from .losses import * # noqa: F401,F403 from .middle_encoders import * # noqa: F401,F403 from .necks import * # noqa: F401,F403 from .roi_heads import * # noqa: F401,F403 from .segmentors import * # noqa: F401,F403 from .test_time_augs import * # noqa: F401,F403 from .utils import * # noqa: F401,F403 from .voxel_encoders import * # noqa: F401,F403 from .monodetr import MonoDETR