Open Byte247 opened 6 months ago
Can you provide more details about your reproduced network, especially the backbone structure, configuration parameters and corresponding code implementation?
Sure, here is my backbone code:
def conv3x3(in_planes, out_planes, stride=1, groups=1, dilation=1):
    """Build a bias-free 3x3 ``nn.Conv2d``.

    The padding is set equal to ``dilation``, so with ``stride=1`` the
    spatial size of the input is preserved.
    """
    conv_kwargs = dict(
        kernel_size=3,
        stride=stride,
        padding=dilation,
        dilation=dilation,
        groups=groups,
        bias=False,
    )
    return nn.Conv2d(in_planes, out_planes, **conv_kwargs)
def conv1x1(in_planes, out_planes, stride=1):
    """Build a bias-free 1x1 ``nn.Conv2d`` (pointwise channel projection)."""
    projection = nn.Conv2d(
        in_planes,
        out_planes,
        kernel_size=1,
        stride=stride,
        bias=False,
    )
    return projection
class BasicBlock(nn.Module):
    """Residual block made of two 3x3 convolutions and an additive shortcut.

    ``expansion`` is the ratio between the block's output channels and
    ``planes``; it is 1 for this block.

    Args:
        inplanes: Number of input channels.
        planes: Number of channels produced by both convolutions.
        stride: Stride of the first convolution.
        downsample: Optional module applied to the input to build the
            shortcut; when ``None`` the input itself is the shortcut.
        groups: Must be 1.
        base_width: Must be 64.
        dilation: Must not exceed 1.
        norm_layer: Callable building the normalisation layer from a channel
            count; defaults to ``nn.BatchNorm2d``.

    Raises:
        ValueError: If ``groups != 1`` or ``base_width != 64``.
        NotImplementedError: If ``dilation > 1``.
    """

    expansion = 1

    def __init__(self, inplanes, planes, stride=1, downsample=None, groups=1,
                 base_width=64, dilation=1, norm_layer=None):
        super().__init__()
        make_norm = nn.BatchNorm2d if norm_layer is None else norm_layer

        # Reject unsupported configurations before any layer is built.
        if groups != 1 or base_width != 64:
            raise ValueError('BasicBlock only supports groups=1 and base_width=64')
        if dilation > 1:
            raise NotImplementedError("Dilation > 1 not supported in BasicBlock")

        # When stride != 1, both conv1 and the downsample module reduce the
        # spatial resolution of their input.
        self.conv1 = conv3x3(inplanes, planes, stride)
        self.bn1 = make_norm(planes)
        self.relu = nn.ReLU(inplace=True)
        self.conv2 = conv3x3(planes, planes)
        self.bn2 = make_norm(planes)
        self.downsample = downsample
        self.stride = stride

    def forward(self, x):
        """Return ``relu(bn2(conv2(relu(bn1(conv1(x))))) + shortcut(x))``."""
        branch = self.relu(self.bn1(self.conv1(x)))
        branch = self.bn2(self.conv2(branch))

        # The shortcut is the raw input unless a downsample module was given.
        shortcut = x if self.downsample is None else self.downsample(x)

        branch += shortcut
        return self.relu(branch)
@BACKBONES.register_module
class PointResNet34(nn.Module):
    """
    ResNet-34 model from
    `"Deep Residual Learning for Image Recognition" <https://arxiv.org/pdf/1512.03385.pdf>`_
    Adjusted for use as a 3D backbone, so different block structure according to FastPillars: https://arxiv.org/abs/2302.02367

    The network is a stride-2 7x7 stem convolution, an optional stride-2 max
    pool, and four residual stages with 64/128/256/512 base channels.  The
    forward pass returns the outputs of all four stages, deepest first.

    Args:
        block: Residual block class used for every stage.
        layers: Number of blocks in each of the four stages.
        in_channels: Number of channels of the input feature map.
        zero_init_residual: If true, the weight of ``bn2`` in every
            ``BasicBlock`` is initialised to zero.
        groups: Forwarded to every block.
        width_per_group: Forwarded to every block as ``base_width``.
        replace_stride_with_dilation: ``None`` or three booleans, one per
            strided stage, selecting dilation instead of stride 2.
        norm_layer: Callable building a normalisation layer from a channel
            count; defaults to ``nn.BatchNorm2d``.
        name: Not used by this class.
        first_max_pool: Whether the max pool after the stem is created and
            applied.
        **kwargs: Accepted and ignored.

    Raises:
        ValueError: If ``replace_stride_with_dilation`` does not have exactly
            three elements.
    """

    def __init__(self, block= BasicBlock, layers=[6,6,3,1], in_channels = 64, zero_init_residual=False,
                 groups=1, width_per_group=64, replace_stride_with_dilation=None,
                 norm_layer=None, name="PointResNet34", first_max_pool = True, **kwargs):
        super().__init__()
        make_norm = nn.BatchNorm2d if norm_layer is None else norm_layer
        self._norm_layer = make_norm
        self.first_max_pool = first_max_pool

        # Running state consumed and updated by _make_layer.
        self.inplanes = 64
        self.dilation = 1
        self.in_channels = in_channels

        if replace_stride_with_dilation is None:
            # each element in the tuple indicates if we should replace
            # the 2x2 stride with a dilated convolution instead
            replace_stride_with_dilation = [False, False, False]
        if len(replace_stride_with_dilation) != 3:
            raise ValueError("replace_stride_with_dilation should be None "
                             "or a 3-element tuple, got {}".format(replace_stride_with_dilation))
        dilate_stage2, dilate_stage3, dilate_stage4 = replace_stride_with_dilation

        self.groups = groups
        self.base_width = width_per_group

        # Stem.
        self.conv1 = nn.Conv2d(self.in_channels, self.inplanes, kernel_size=7,
                               stride=2, padding=3, bias=False)
        self.bn1 = make_norm(self.inplanes)
        self.relu = nn.ReLU(inplace=True)
        if first_max_pool:
            self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)

        # Residual stages; every stage after the first uses stride 2 (or
        # dilation when requested).
        self.layer1 = self._make_layer(block, 64, layers[0])
        self.layer2 = self._make_layer(block, 128, layers[1], stride=2,
                                       dilate=dilate_stage2)
        self.layer3 = self._make_layer(block, 256, layers[2], stride=2,
                                       dilate=dilate_stage3)
        self.layer4 = self._make_layer(block, 512, layers[3], stride=2,
                                       dilate=dilate_stage4)

        self._init_weights(zero_init_residual)

    def _init_weights(self, zero_init_residual):
        """Kaiming-initialise convolutions and reset normalisation layers."""
        for module in self.modules():
            if isinstance(module, nn.Conv2d):
                nn.init.kaiming_normal_(module.weight, mode='fan_out', nonlinearity='relu')
                continue
            if isinstance(module, (nn.BatchNorm2d, nn.GroupNorm)):
                nn.init.constant_(module.weight, 1)
                nn.init.constant_(module.bias, 0)

        if not zero_init_residual:
            return
        # Zero-initialize the last BN in each residual branch,
        # so that the residual branch starts with zeros, and each residual block behaves like an identity.
        # This improves the model by 0.2~0.3% according to https://arxiv.org/abs/1706.02677
        for module in self.modules():
            if isinstance(module, BasicBlock):
                nn.init.constant_(module.bn2.weight, 0)

    def _make_layer(self, block, planes, blocks, stride=1, dilate=False):
        """Build one stage of ``blocks`` residual blocks.

        Only the first block receives ``stride`` and a projection shortcut;
        ``self.inplanes`` (and ``self.dilation`` when ``dilate`` is true) are
        updated for the stages built afterwards.
        """
        make_norm = self._norm_layer
        first_block_dilation = self.dilation
        if dilate:
            # Trade the stride for dilation in this stage.
            self.dilation *= stride
            stride = 1

        out_planes = planes * block.expansion
        shortcut = None
        if stride != 1 or self.inplanes != out_planes:
            # The identity path needs a projection when resolution or width changes.
            shortcut = nn.Sequential(
                conv1x1(self.inplanes, out_planes, stride),
                make_norm(out_planes),
            )

        stage = [
            block(self.inplanes, planes, stride, shortcut, self.groups,
                  self.base_width, first_block_dilation, make_norm)
        ]
        self.inplanes = out_planes
        stage.extend(
            block(self.inplanes, planes, groups=self.groups,
                  base_width=self.base_width, dilation=self.dilation,
                  norm_layer=make_norm)
            for _ in range(1, blocks)
        )
        return nn.Sequential(*stage)

    def _forward_impl(self, x):
        """Run stem and stages; return ``[layer4, layer3, layer2, layer1]`` outputs."""
        # See note [TorchScript super()]
        x = self.relu(self.bn1(self.conv1(x)))
        if self.first_max_pool:
            x = self.maxpool(x)

        feat_stage2 = self.layer1(x)
        feat_stage3 = self.layer2(feat_stage2)
        feat_stage4 = self.layer3(feat_stage3)
        feat_stage5 = self.layer4(feat_stage4)
        return [feat_stage5, feat_stage4, feat_stage3, feat_stage2]

    def forward(self, x):
        """Delegate to :meth:`_forward_impl`."""
        return self._forward_impl(x)
And here is my config:
import itertools
import logging
from det3d.utils.config_tool import get_downsample_factor
# Detection tasks: one entry per task, each listing the classes that task
# predicts. num_class equals the length of class_names in every entry
# (10 classes in total).
tasks = [
dict(num_class=1, class_names=["car"]),
dict(num_class=2, class_names=["truck", "construction_vehicle"]),
dict(num_class=2, class_names=["bus", "trailer"]),
dict(num_class=1, class_names=["barrier"]),
dict(num_class=2, class_names=["motorcycle", "bicycle"]),
dict(num_class=2, class_names=["pedestrian", "traffic_cone"]),
]
# Flatten the per-task class lists into a single list, in task order.
class_names = list(itertools.chain(*[t["class_names"] for t in tasks]))
# training and testing settings
target_assigner = dict(
tasks=tasks,
)
# model settings
# Detector layout: reader -> backbone -> neck -> bbox_head.
model = dict(
type="FastPillars",
pretrained=None,
reader=dict(
type="CustomPillarFeatureNet", # just PillarFeatureNet followed by PointPillarsScatter to fit the config architecture
num_filters=[64, 64],
num_input_features=5,
with_distance=False,
# x/y extent is 54 - (-54) = 108 m; at 0.15 m per pillar that is 720 pillars per axis.
voxel_size=(0.15, 0.15, 8), # Paper: "set the pillar size as 0.15m"
pc_range=(-54, -54, -5.0, 54, 54, 3.0),
),
# NOTE(review): ds_factor is not a named parameter of PointResNet34.__init__ as posted
# in this thread (it falls into **kwargs there); presumably it is read by
# get_downsample_factor -- confirm against det3d.utils.config_tool.
backbone=dict(type="PointResNet34",first_max_pool=False, ds_factor=8), # Remove first downsample operation according to https://github.com/StiphyJay/FastPillars/issues/10
neck=dict(
type="RPNV3",
layer_nums=[5, 5],
ds_layer_strides=[1, 2],
ds_num_filters=[256, 256],
us_layer_strides=[1, 2],
us_num_filters=[256, 256], # increase from 128x128 to better match 3x 128 filters in CenterPoint PP centerhead
num_input_features=[256,512], #num features in the feature maps block 4 and 5
logger=logging.getLogger("RPN"),
),
bbox_head=dict(
type="CenterHead",
# 256 + 256 = 512, i.e. the sum of us_num_filters above
# (presumably the neck concatenates its upsampled branches -- verify in RPNV3).
in_channels=sum([256, 256]),
tasks=tasks,
dataset='nuscenes',
weight=0.25,
code_weights=[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.2, 0.2, 1.0, 1.0],
common_heads={'reg': (2, 2), 'height': (1, 2), 'dim':(3, 2), 'rot':(2, 2), 'vel': (2, 2)}, # (output_channel, num_conv)
),
)
# Training-target assignment settings.
# out_size_factor is derived from the model config by get_downsample_factor
# (helper not shown here; presumably it multiplies the configured strides/ds_factor -- confirm).
assigner = dict(
target_assigner=target_assigner,
out_size_factor= get_downsample_factor(model),
gaussian_overlap=0.1,
max_objs=500,
min_radius=2,
)
train_cfg = dict(assigner=assigner)
# Inference / post-processing settings.
# pc_range and voxel_size repeat the x/y entries used by the reader and the
# voxel generator (-54, -54 and 0.15, 0.15); keep them in sync when changing either.
test_cfg = dict(
post_center_limit_range=[-61.2, -61.2, -10.0, 61.2, 61.2, 10.0],
max_per_img=500,
nms=dict(
nms_pre_max_size=1000,
nms_post_max_size=83,
nms_iou_threshold=0.2,
),
score_threshold=0.1,
pc_range=[-54, -54],
out_size_factor=get_downsample_factor(model),
voxel_size=[0.15, 0.15]
)
# dataset settings
dataset_type = "NuScenesDataset"
nsweeps = 10
data_root = "/data/nuScenes"
# Ground-truth database sampling ("GT-AUG") configuration.
# sample_groups holds one single-key dict per class; the values are
# presumably the number of database objects to sample per scene -- confirm in det3d.
db_sampler = dict(
type="GT-AUG",
enable=True,
db_info_path="/data/nuScenes/dbinfos_train_10sweeps_withvelo.pkl",
sample_groups=[
dict(car=2),
dict(truck=3),
dict(construction_vehicle=7),
dict(bus=4),
dict(trailer=6),
dict(barrier=2),
dict(motorcycle=6),
dict(bicycle=6),
dict(pedestrian=2),
dict(traffic_cone=2),
],
db_prep_steps=[
# Same minimum (5) for every class.
dict(
filter_by_min_num_points=dict(
car=5,
truck=5,
bus=5,
trailer=5,
construction_vehicle=5,
traffic_cone=5,
barrier=5,
motorcycle=5,
bicycle=5,
pedestrian=5,
)
),
dict(filter_by_difficulty=[-1],),
],
global_random_rotation_range_per_object=[0, 0],
rate=1.0,
)
# Training-time preprocessing: point shuffling, global rotation/scale noise
# and the database sampler defined above.
train_preprocessor = dict(
mode="train",
shuffle_points=True,
global_rot_noise=[-0.3925, 0.3925],
global_scale_noise=[0.95, 1.05],
db_sampler=db_sampler,
class_names=class_names,
)
# Validation preprocessing: no shuffling and no augmentation entries.
val_preprocessor = dict(
mode="val",
shuffle_points=False,
)
# Voxelisation settings; range and voxel_size repeat the values given to model["reader"].
# max_voxel_num has two entries (presumably [train, test] limits -- confirm in the Voxelization step).
voxel_generator = dict(
range=[-54, -54, -5.0, 54, 54, 3.0],
voxel_size=[0.15, 0.15, 8],
max_points_in_voxel=20,
max_voxel_num=[30000, 60000],
)
# Training pipeline: load points and annotations, preprocess with
# train_preprocessor, voxelise, assign labels, reformat.
train_pipeline = [
dict(type="LoadPointCloudFromFile", dataset=dataset_type),
dict(type="LoadPointCloudAnnotations", with_bbox=True),
dict(type="Preprocess", cfg=train_preprocessor),
dict(type="Voxelization", cfg=voxel_generator),
dict(type="AssignLabel", cfg=train_cfg["assigner"]),
dict(type="Reformat"),
]
# Test pipeline: identical step types; only the Preprocess cfg differs
# (val_preprocessor instead of train_preprocessor).
test_pipeline = [
dict(type="LoadPointCloudFromFile", dataset=dataset_type),
dict(type="LoadPointCloudAnnotations", with_bbox=True),
dict(type="Preprocess", cfg=val_preprocessor),
dict(type="Voxelization", cfg=voxel_generator),
dict(type="AssignLabel", cfg=train_cfg["assigner"]),
dict(type="Reformat"),
]
# Annotation/info files for each split.
train_anno = "/data/nuScenes/infos_train_10sweeps_withvelo_filter_True.pkl"
val_anno = "/data/nuScenes/infos_val_10sweeps_withvelo_filter_True.pkl"
test_anno = "/data/nuScenes/test_set/infos_test_10sweeps_withvelo.pkl"
# Per-split dataset definitions. info_path and ann_file point at the same file in every split.
# NOTE(review): only the val split sets test_mode=True; the test split does not -- confirm this is intended.
data = dict(
samples_per_gpu=8,
workers_per_gpu=8,
train=dict(
type=dataset_type,
root_path=data_root,
info_path=train_anno,
ann_file=train_anno,
nsweeps=nsweeps,
class_names=class_names,
pipeline=train_pipeline,
),
val=dict(
type=dataset_type,
root_path=data_root,
info_path=val_anno,
test_mode=True,
ann_file=val_anno,
nsweeps=nsweeps,
class_names=class_names,
pipeline=test_pipeline,
),
test=dict(
type=dataset_type,
root_path=data_root,
info_path=test_anno,
ann_file=test_anno,
nsweeps=nsweeps,
class_names=class_names,
pipeline=test_pipeline,
),
)
# Gradient clipping by L2 norm (norm_type=2) with a maximum norm of 35.
optimizer_config = dict(grad_clip=dict(max_norm=35, norm_type=2))
# optimizer
optimizer = dict(
type="adam", amsgrad=0.0, wd=0.01, fixed_wd=True, moving_average=False,
)
# One-cycle learning-rate schedule with peak lr_max=0.001
# (exact meaning of moms/div_factor/pct_start is defined by the det3d scheduler -- not shown here).
lr_config = dict(
type="one_cycle", lr_max=0.001, moms=[0.95, 0.85], div_factor=1.0, pct_start=0.4,
)
# Checkpoint interval of 1 (presumably in epochs -- confirm).
checkpoint_config = dict(interval=1)
# yapf:disable
log_config = dict(
interval=5,
hooks=[
dict(type="TextLoggerHook"),
#dict(type='TensorboardLoggerHook')
],
)
# yapf:enable
# runtime settings
total_epochs = 20
# Device indices 0..7.
device_ids = range(8)
dist_params = dict(backend="nccl", init_method="env://")
log_level = "INFO"
# Work directory named after this config file: the text after the last '/' in
# __file__ with the final three characters (presumably the ".py" suffix) removed.
work_dir = './work_dirs/{}/'.format(__file__[__file__.rfind('/') + 1:-3])
load_from = None
resume_from = None
workflow = [('train', 1)]
Sure, here is my backbone code:
def conv3x3(in_planes, out_planes, stride=1, groups=1, dilation=1): """3x3 convolution with padding""" return nn.Conv2d(in_planes, out_planes, kernel_size=3, stride=stride, padding=dilation, groups=groups, bias=False, dilation=dilation) def conv1x1(in_planes, out_planes, stride=1): """1x1 convolution""" return nn.Conv2d(in_planes, out_planes, kernel_size=1, stride=stride, bias=False) class BasicBlock(nn.Module): expansion = 1 def __init__(self, inplanes, planes, stride=1, downsample=None, groups=1, base_width=64, dilation=1, norm_layer=None): super(BasicBlock, self).__init__() if norm_layer is None: norm_layer = nn.BatchNorm2d if groups != 1 or base_width != 64: raise ValueError('BasicBlock only supports groups=1 and base_width=64') if dilation > 1: raise NotImplementedError("Dilation > 1 not supported in BasicBlock") # Both self.conv1 and self.downsample layers downsample the input when stride != 1 self.conv1 = conv3x3(inplanes, planes, stride) self.bn1 = norm_layer(planes) self.relu = nn.ReLU(inplace=True) self.conv2 = conv3x3(planes, planes) self.bn2 = norm_layer(planes) self.downsample = downsample self.stride = stride def forward(self, x): identity = x out = self.conv1(x) out = self.bn1(out) out = self.relu(out) out = self.conv2(out) out = self.bn2(out) if self.downsample is not None: identity = self.downsample(x) out += identity out = self.relu(out) return out @BACKBONES.register_module class PointResNet34(nn.Module): """ ResNet-34 model from `"Deep Residual Learning for Image Recognition" <https://arxiv.org/pdf/1512.03385.pdf>`_ Adjusted for use as a 3D backbone, so different block structure according to FastPillars: https://arxiv.org/abs/2302.02367 """ def __init__(self, block= BasicBlock, layers=[6,6,3,1], in_channels = 64, zero_init_residual=False, groups=1, width_per_group=64, replace_stride_with_dilation=None, norm_layer=None, name="PointResNet34", first_max_pool = True, **kwargs): super(PointResNet34, self).__init__() if norm_layer is None: norm_layer 
= nn.BatchNorm2d self._norm_layer = norm_layer self.first_max_pool = first_max_pool self.inplanes = 64 self.dilation = 1 self.in_channels = in_channels if replace_stride_with_dilation is None: # each element in the tuple indicates if we should replace # the 2x2 stride with a dilated convolution instead replace_stride_with_dilation = [False, False, False] if len(replace_stride_with_dilation) != 3: raise ValueError("replace_stride_with_dilation should be None " "or a 3-element tuple, got {}".format(replace_stride_with_dilation)) self.groups = groups self.base_width = width_per_group self.conv1 = nn.Conv2d(self.in_channels, self.inplanes, kernel_size=7, stride=2, padding=3, bias=False) self.bn1 = norm_layer(self.inplanes) self.relu = nn.ReLU(inplace=True) if first_max_pool: self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1) self.layer1 = self._make_layer(block, 64, layers[0]) self.layer2 = self._make_layer(block, 128, layers[1], stride=2, dilate=replace_stride_with_dilation[0]) self.layer3 = self._make_layer(block, 256, layers[2], stride=2, dilate=replace_stride_with_dilation[1]) self.layer4 = self._make_layer(block, 512, layers[3], stride=2, dilate=replace_stride_with_dilation[2]) for m in self.modules(): if isinstance(m, nn.Conv2d): nn.init.kaiming_normal_(m.weight, mode='fan_out', nonlinearity='relu') elif isinstance(m, (nn.BatchNorm2d, nn.GroupNorm)): nn.init.constant_(m.weight, 1) nn.init.constant_(m.bias, 0) # Zero-initialize the last BN in each residual branch, # so that the residual branch starts with zeros, and each residual block behaves like an identity. 
# This improves the model by 0.2~0.3% according to https://arxiv.org/abs/1706.02677 if zero_init_residual: for m in self.modules(): if isinstance(m, BasicBlock): nn.init.constant_(m.bn2.weight, 0) def _make_layer(self, block, planes, blocks, stride=1, dilate=False): norm_layer = self._norm_layer downsample = None previous_dilation = self.dilation if dilate: self.dilation *= stride stride = 1 if stride != 1 or self.inplanes != planes * block.expansion: downsample = nn.Sequential( conv1x1(self.inplanes, planes * block.expansion, stride), norm_layer(planes * block.expansion), ) layers = [] layers.append(block(self.inplanes, planes, stride, downsample, self.groups, self.base_width, previous_dilation, norm_layer)) self.inplanes = planes * block.expansion for _ in range(1, blocks): layers.append(block(self.inplanes, planes, groups=self.groups, base_width=self.base_width, dilation=self.dilation, norm_layer=norm_layer)) return nn.Sequential(*layers) def _forward_impl(self, x): # See note [TorchScript super()] x = self.conv1(x) x = self.bn1(x) x = self.relu(x) if self.first_max_pool: x = self.maxpool(x) stage_2_out = self.layer1(x) stage_3_out = self.layer2(stage_2_out) stage_4_out = self.layer3(stage_3_out) stage_5_out = self.layer4(stage_4_out) return [stage_5_out, stage_4_out, stage_3_out, stage_2_out] def forward(self, x): return self._forward_impl(x)
And here is my config:
import itertools import logging from det3d.utils.config_tool import get_downsample_factor tasks = [ dict(num_class=1, class_names=["car"]), dict(num_class=2, class_names=["truck", "construction_vehicle"]), dict(num_class=2, class_names=["bus", "trailer"]), dict(num_class=1, class_names=["barrier"]), dict(num_class=2, class_names=["motorcycle", "bicycle"]), dict(num_class=2, class_names=["pedestrian", "traffic_cone"]), ] class_names = list(itertools.chain(*[t["class_names"] for t in tasks])) # training and testing settings target_assigner = dict( tasks=tasks, ) # model settings model = dict( type="FastPillars", pretrained=None, reader=dict( type="CustomPillarFeatureNet", # just PillarFeatureNet followed by PointPillarsScatter to fit the config architecture num_filters=[64, 64], num_input_features=5, with_distance=False, voxel_size=(0.15, 0.15, 8), # Paper: "set the pillar size as 0.15m" pc_range=(-54, -54, -5.0, 54, 54, 3.0), ), backbone=dict(type="PointResNet34",first_max_pool=False, ds_factor=8), # Remove first downsample operation according to https://github.com/StiphyJay/FastPillars/issues/10 neck=dict( type="RPNV3", layer_nums=[5, 5], ds_layer_strides=[1, 2], ds_num_filters=[256, 256], us_layer_strides=[1, 2], us_num_filters=[256, 256], # increase from 128x128 to better match 3x 128 filters in CenterPoint PP centerhead num_input_features=[256,512], #num features in the feature maps block 4 and 5 logger=logging.getLogger("RPN"), ), bbox_head=dict( type="CenterHead", in_channels=sum([256, 256]), tasks=tasks, dataset='nuscenes', weight=0.25, code_weights=[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.2, 0.2, 1.0, 1.0], common_heads={'reg': (2, 2), 'height': (1, 2), 'dim':(3, 2), 'rot':(2, 2), 'vel': (2, 2)}, # (output_channel, num_conv) ), ) assigner = dict( target_assigner=target_assigner, out_size_factor= get_downsample_factor(model), gaussian_overlap=0.1, max_objs=500, min_radius=2, ) train_cfg = dict(assigner=assigner) test_cfg = dict( post_center_limit_range=[-61.2, -61.2, 
-10.0, 61.2, 61.2, 10.0], max_per_img=500, nms=dict( nms_pre_max_size=1000, nms_post_max_size=83, nms_iou_threshold=0.2, ), score_threshold=0.1, pc_range=[-54, -54], out_size_factor=get_downsample_factor(model), voxel_size=[0.15, 0.15] ) # dataset settings dataset_type = "NuScenesDataset" nsweeps = 10 data_root = "/data/nuScenes" db_sampler = dict( type="GT-AUG", enable=True, db_info_path="/data/nuScenes/dbinfos_train_10sweeps_withvelo.pkl", sample_groups=[ dict(car=2), dict(truck=3), dict(construction_vehicle=7), dict(bus=4), dict(trailer=6), dict(barrier=2), dict(motorcycle=6), dict(bicycle=6), dict(pedestrian=2), dict(traffic_cone=2), ], db_prep_steps=[ dict( filter_by_min_num_points=dict( car=5, truck=5, bus=5, trailer=5, construction_vehicle=5, traffic_cone=5, barrier=5, motorcycle=5, bicycle=5, pedestrian=5, ) ), dict(filter_by_difficulty=[-1],), ], global_random_rotation_range_per_object=[0, 0], rate=1.0, ) train_preprocessor = dict( mode="train", shuffle_points=True, global_rot_noise=[-0.3925, 0.3925], global_scale_noise=[0.95, 1.05], db_sampler=db_sampler, class_names=class_names, ) val_preprocessor = dict( mode="val", shuffle_points=False, ) voxel_generator = dict( range=[-54, -54, -5.0, 54, 54, 3.0], voxel_size=[0.15, 0.15, 8], max_points_in_voxel=20, max_voxel_num=[30000, 60000], ) train_pipeline = [ dict(type="LoadPointCloudFromFile", dataset=dataset_type), dict(type="LoadPointCloudAnnotations", with_bbox=True), dict(type="Preprocess", cfg=train_preprocessor), dict(type="Voxelization", cfg=voxel_generator), dict(type="AssignLabel", cfg=train_cfg["assigner"]), dict(type="Reformat"), ] test_pipeline = [ dict(type="LoadPointCloudFromFile", dataset=dataset_type), dict(type="LoadPointCloudAnnotations", with_bbox=True), dict(type="Preprocess", cfg=val_preprocessor), dict(type="Voxelization", cfg=voxel_generator), dict(type="AssignLabel", cfg=train_cfg["assigner"]), dict(type="Reformat"), ] train_anno = 
"/data/nuScenes/infos_train_10sweeps_withvelo_filter_True.pkl" val_anno = "/data/nuScenes/infos_val_10sweeps_withvelo_filter_True.pkl" test_anno = "/data/nuScenes/test_set/infos_test_10sweeps_withvelo.pkl" data = dict( samples_per_gpu=8, workers_per_gpu=8, train=dict( type=dataset_type, root_path=data_root, info_path=train_anno, ann_file=train_anno, nsweeps=nsweeps, class_names=class_names, pipeline=train_pipeline, ), val=dict( type=dataset_type, root_path=data_root, info_path=val_anno, test_mode=True, ann_file=val_anno, nsweeps=nsweeps, class_names=class_names, pipeline=test_pipeline, ), test=dict( type=dataset_type, root_path=data_root, info_path=test_anno, ann_file=test_anno, nsweeps=nsweeps, class_names=class_names, pipeline=test_pipeline, ), ) optimizer_config = dict(grad_clip=dict(max_norm=35, norm_type=2)) # optimizer optimizer = dict( type="adam", amsgrad=0.0, wd=0.01, fixed_wd=True, moving_average=False, ) lr_config = dict( type="one_cycle", lr_max=0.001, moms=[0.95, 0.85], div_factor=1.0, pct_start=0.4, ) checkpoint_config = dict(interval=1) # yapf:disable log_config = dict( interval=5, hooks=[ dict(type="TextLoggerHook"), #dict(type='TensorboardLoggerHook') ], ) # yapf:enable # runtime settings total_epochs = 20 device_ids = range(8) dist_params = dict(backend="nccl", init_method="env://") log_level = "INFO" work_dir = './work_dirs/{}/'.format(__file__[__file__.rfind('/') + 1:-3]) load_from = None resume_from = None workflow = [('train', 1)]
Thank you for sharing. I will review your reimplementation and get back to you as soon as possible.
I also tested on the KITTI dataset, but running the backbone ([6, 6, 3, 1]) fails with an "out of memory" error. The input to the backbone has the shape [4, 32, 1600, 1408]. How did you solve this problem?
Look forward to your reply!
I found that the computation in the backbone is very heavy, and I am not sure what the input shape for the backbone should be.
I also tested on the KITTI dataset, but running the backbone ([6, 6, 3, 1]) fails with an "out of memory" error. The input to the backbone has the shape [4, 32, 1600, 1408]. How did you solve this problem?
Look forward to your reply!
Thank you sincerely for your reply!
I have another question.
What is the number of MAPE layers in your experiments? Or is it just one?
And what are the "max_points_in_voxel" and "max_voxel_num" values you used for the nuScenes dataset in your experiments?
In my reimplementation of FastPillars (CenterPoint-pillar), I use:
- NAME: transform_points_to_voxels VOXEL_SIZE: [0.15, 0.15, 8.0] MAX_POINTS_PER_VOXEL: 20 MAX_NUMBER_OF_VOXELS: { 'train': 30000, 'test': 60000 }
Are these parameters suitable?
Thank you sincerely for your reply! I have another question. What is the number of MAPE layers in your experiments? Or is it just one? And what are the "max_points_in_voxel" and "max_voxel_num" values you used for the nuScenes dataset in your experiments? In my reimplementation of FastPillars (CenterPoint-pillar), I use:
- NAME: transform_points_to_voxels VOXEL_SIZE: [0.15, 0.15, 8.0] MAX_POINTS_PER_VOXEL: 20 MAX_NUMBER_OF_VOXELS: { 'train': 30000, 'test': 60000 }
Are these parameters suitable?
I trained on nuScenes, but the results are abnormal.
epoch_1.pth val dataset mAP=0.04 NDS=0.157
Could you help me check whether there is a problem with my implementation settings?
my implementation:
MAPE
Based on https://github.com/VISION-SJTU/PillarNet/blob/master/det3d/models/readers/pillar_encoder.py class PillarFeatureNet
add MAPE layer as the PFNLayer (one layer)
open the comment in the https://github.com/VISION-SJTU/PillarNet/blob/master/det3d/datasets/pipelines/preprocess.py 195-260 for using the PillarFeatureNet
backbone
ResNet18 :
removing the first pooling.
layer1-layer4 channel:64,128,256,512
input channel=64
config based on the https://github.com/VISION-SJTU/PillarNet/blob/master/configs/nusc/pillarnet/nusc_centerpoint_pillarnet_flip.py
# NOTE(review): this snippet was pasted with its line breaks collapsed and some
# markup lost. It has been re-flowed; underscores that the markdown renderer
# consumed (`for _ in`, `nms_iou_threshold`, `nms_post_max_size`,
# `nms_pre_max_size`) and the missing closing parentheses of `neck` and `model`
# have been restored. Parts the author elided ("........") are still elided.
import itertools
import logging

from det3d.utils.config_tool import get_downsample_factor

DOUBLE_FLIP = False

# One entry per detection task; every task uses stride 8.
tasks = [
    dict(stride=8, class_names=["car"]),
    dict(stride=8, class_names=["truck", "construction_vehicle"]),
    dict(stride=8, class_names=["bus", "trailer"]),
    dict(stride=8, class_names=["barrier"]),
    dict(stride=8, class_names=["motorcycle", "bicycle"]),
    dict(stride=8, class_names=["pedestrian", "traffic_cone"]),
]
# Flatten the per-task class lists into a single list, in task order.
class_names = list(itertools.chain(*[t["class_names"] for t in tasks]))

target_assigner = dict(
    tasks=tasks,
)
pillar_size = 0.15
pc_range = [-54, -54, -5.0, 54, 54, 3.0]

model = dict(
    type="FastPillars",
    pretrained=None,
    reader=dict(
        type="PillarFeatureNet_MAPE",
        voxel_size=(0.15, 0.15, 8),
        num_filters=(64, ),
        pc_range=pc_range,
        with_distance=False,
        num_input_features=5,
    ),
    backbone=dict(type="PointResNet18_512", ds_factor=8),
    neck=dict(
        type="RPNV2",
        layer_nums=[5, 5],
        ds_layer_strides=[1, 2],
        ds_num_filters=[256, 256],
        us_layer_strides=[1, 2],
        us_num_filters=[128, 128],
        num_input_features=[256, 512],  # [256, 256]
        logger=logging.getLogger("RPN"),
    ),  # NOTE(review): closing parenthesis restored; it was missing in the paste
    bbox_head=dict(
        type="CenterHead",
        in_channels=256,
        tasks=tasks,
        dataset='nuscenes',
        weight=0.25,
        code_weights=[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.2, 0.2, 1.0, 1.0],
        common_heads={'reg': (2, 2), 'height': (1, 2), 'dim': (3, 2),
                      'rot': (2, 2), 'vel': (2, 2), 'iou': (1, 2)},
        share_conv_channel=64,
        dcn_head=False,
        iou_reg='DIoU',
    ),
)  # NOTE(review): closing parenthesis restored; it was missing in the paste

assigner = dict(
    target_assigner=target_assigner,
    out_size_factor=get_downsample_factor(model),
    dense_reg=1,
    gaussian_overlap=0.1,
    max_objs=500,
    min_radius=2,
    pc_range=pc_range,
    voxel_size=[pillar_size, pillar_size, 8],
)
train_cfg = dict(assigner=assigner)

# Per-class post-processing values (one entry for each of the 10 classes).
rectifier = [0.5 for i in range(10)]
nms_iou_threshold = [0.2 for _ in range(10)]
nms_post_max_size = [83 for _ in range(10)]
nms_pre_max_size = [1000 for _ in range(10)]

test_cfg = dict(
    post_center_limit_range=[-61.2, -61.2, -10.0, 61.2, 61.2, 10.0],
    max_per_img=500,
    nms=dict(
        use_rotate_nms=True,
        use_multi_class_nms=False,
        nms_pre_max_size=nms_pre_max_size,
        nms_post_max_size=nms_post_max_size,
        nms_iou_threshold=nms_iou_threshold,
    ),
    score_threshold=0.1,
    rectifier=rectifier,
    pc_range=pc_range[:2],
    out_size_factor=get_downsample_factor(model),
    voxel_size=[pillar_size, pillar_size, 8],
)

# NOTE(review): db_sampler is referenced here but its definition was not
# included in the pasted snippet.
train_preprocessor = dict(
    mode="train",
    shuffle_points=True,
    global_rot_noise=[-0.78539816, 0.78539816],
    global_scale_noise=[0.9, 1.1],
    global_translate_std=0.5,
    db_sampler=db_sampler,
    class_names=class_names,
)

val_preprocessor = dict(
    mode="val",
    shuffle_points=False,
)

voxel_generator = dict(
    range=[-54, -54, -5.0, 54, 54, 3.0],
    voxel_size=[0.15, 0.15, 8],
    max_points_in_voxel=20,
    max_voxel_num=[120000, 160000],
)

dataset_type = "NuScenesDataset"
nsweeps = 10
data_root = "data/nuScenes"

# data settings
data = dict(
    samples_per_gpu=8,
    workers_per_gpu=6,
    # ........ (remaining entries were elided in the original post)
)

optimizer_config = dict(grad_clip=dict(max_norm=35, norm_type=2))

optimizer = dict(
    type="adam", amsgrad=0.0, wd=0.01, fixed_wd=True, moving_average=False,
)
lr_config = dict(
    type="one_cycle", lr_max=0.001, moms=[0.95, 0.85], div_factor=1.0, pct_start=0.4,
)
MAPE is
class MAPELayer(nn.Module):
    """Pillar feature layer that averages max pooling and attention pooling.

    Per-point features are produced by a 1x1 convolution + batch norm + ReLU.
    They are then reduced over the points of each voxel in two ways: an
    element-wise maximum, and a sum weighted by a softmax attention score
    computed from the raw input by a second 1x1 convolution.  The output is
    the mean of the two reductions.

    Args:
        in_channels: Number of features per input point.
        out_channels: Number of features per voxel in the output.
        norm_cfg: Accepted for interface compatibility; not used.
    """

    def __init__(self,
                 in_channels,
                 out_channels,
                 norm_cfg=None):
        super().__init__()
        self.conv_layer1 = nn.Conv2d(in_channels, out_channels, kernel_size=1)
        self.bn1 = nn.BatchNorm2d(out_channels, eps=1e-3, momentum=0.01)
        self.conv_layer2 = nn.Conv2d(in_channels, out_channels, kernel_size=1)

    def forward(self, inputs):
        """Encode the points of every voxel into one feature vector.

        Args:
            inputs: Tensor laid out as
                ``(voxel_count, max_point_per_voxel, num_point_features)``,
                optionally with a leading batch dimension.

        Returns:
            Tensor of shape ``(batch, voxel_count, out_channels, 1)``; the
            batch size is 1 when the input was 3-dimensional.
        """
        if inputs.ndim == 3:
            inputs = inputs.unsqueeze(0)

        # Move the feature axis to the channel position expected by Conv2d.
        # This must be a permute: a ``view`` to the target shape would only
        # reinterpret the underlying memory and mix features of different
        # points and voxels instead of transposing the axes.
        # (batch, num_point_features, voxel_count, max_point_per_voxel)
        inputs = inputs.permute(0, 3, 1, 2)

        # (batch, out_channels, voxel_count, max_point_per_voxel)
        x = F.relu(self.bn1(self.conv_layer1(inputs)))

        # (batch, out_channels, voxel_count, 1)
        max_feature = torch.max(x, dim=-1, keepdim=True)[0]

        # Attention weights sum to 1 over the points of each voxel.
        # (batch, out_channels, voxel_count, max_point_per_voxel)
        attention_score = F.softmax(self.conv_layer2(inputs), dim=-1)

        # (batch, out_channels, voxel_count, 1)
        avg_feature = torch.sum(x * attention_score, dim=-1, keepdim=True)

        feature = (avg_feature + max_feature) / 2.0

        # (batch, voxel_count, out_channels, 1)
        return feature.transpose(1, 2)
Hello,
Since the code is not published yet, I rebuilt your architecture following your paper. To see the effect of your new backbone, I used the official CenterPoint implementation (https://github.com/tianweiy/CenterPoint/blob/master/configs/nusc/pp/nusc_centerpoint_pp_02voxel_two_pfn_10sweep.py) and added your backbone after the PointPillarsScatter and before the PillarNet neck (RPNV3: https://github.com/VISION-SJTU/PillarNet/blob/master/det3d/models/necks/rpn.py ). I selected a voxel size of 0.15, as outlined in your paper, for the base CenterPoint as well as for FastPillars.
While some categories do show slight differences, the overall NDS and mAP scores do not differ by a significant amount. CenterPoint with the PP backbone achieves 64.42 NDS and 56.39 mAP, and FastPillars achieves 64.97 NDS and 56.71 mAP. Both values are reported on the validation set after training for 20 epochs with the fade strategy. I did not use your MAPE module or the structural reparametrization yet, so this is just the effect of adding the new backbone. I did not use any test-time augmentation.
I think individual categories, e.g. the Car category, do look better with the additional backbone, but the overall score does not change much. Did you encounter something similar? Is that just a problem with the nuScenes evaluation method?
Results screenshots: FastPillars:
CenterPoint: