StiphyJay / FastPillars

FastPillars: A Deployment-friendly Pillar-based 3D Detector

Backbone effect on nuScenes validation dataset #12

Open Byte247 opened 6 months ago

Byte247 commented 6 months ago

Hello,

since the code is not published yet, I rebuilt your architecture following the paper. To isolate the effect of your new backbone, I took the official CenterPoint implementation (https://github.com/tianweiy/CenterPoint/blob/master/configs/nusc/pp/nusc_centerpoint_pp_02voxel_two_pfn_10sweep.py) and inserted your backbone after the PointPillarsScatter and before the PillarNet neck (RPNV3: https://github.com/VISION-SJTU/PillarNet/blob/master/det3d/models/necks/rpn.py). I use a voxel size of 0.15, as outlined in your paper, for both the base CenterPoint and FastPillars.
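
For reference, this is roughly how the modules are chained in my detector's forward pass (a minimal, illustrative sketch in Det3D style; the method body below is my own paraphrase, not the authors' code):

def forward(self, data):
    # 1. PillarFeatureNet + PointPillarsScatter -> dense BEV pseudo-image
    bev = self.reader(data)          # (B, 64, H, W), one cell per 0.15 m pillar
    # 2. the reimplemented FastPillars backbone (PointResNet34 below)
    stages = self.backbone(bev)      # [stage5, stage4, stage3, stage2]
    # 3. PillarNet's RPNV3 neck fuses the two deepest stages
    x = self.neck(stages)
    # 4. CenterPoint head predicts per-task heatmaps and box regressions
    return self.bbox_head(x)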

While some categories show slight differences, the overall NDS and mAP scores do not differ by a significant amount: CenterPoint with the PP backbone achieves 64.42 NDS and 56.39 mAP, and FastPillars 64.97 NDS and 56.71 mAP. Both values are reported on the validation set, trained for 20 epochs with the fade strategy. I did not use your MAPE module or the structural reparametrization yet, so this is just the effect of adding the new backbone. I did not use any test-time augmentation.
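
In table form:

| Model | NDS | mAP |
| --- | --- | --- |
| CenterPoint (PP backbone) | 64.42 | 56.39 |
| CenterPoint + FastPillars backbone | 64.97 | 56.71 |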

I think individual categories, e.g. the Car category, do look better with the additional backbone, but the overall score does not change much. Did you encounter something similar? Is that just an artifact of the nuScenes evaluation method?
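
(For context on why per-class gains can wash out: nuScenes computes NDS = (1/10) * [5 * mAP + sum over the five TP error metrics of (1 - min(1, mTP))], and mAP itself averages over 10 classes and four center-distance thresholds, so an improvement concentrated in a few classes moves the aggregate scores only slightly.)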

Results screenshots (evaluation output for FastPillars and for CenterPoint): [images]

StiphyJay commented 6 months ago

Can you provide more details about your reproduced network, especially the backbone structure, configuration parameters and corresponding code implementation?

Byte247 commented 6 months ago

Sure, here is my backbone code:

import torch.nn as nn

from det3d.models.registry import BACKBONES  # registry used by @BACKBONES below (path as in the Det3D/CenterPoint codebase)


def conv3x3(in_planes, out_planes, stride=1, groups=1, dilation=1):
    """3x3 convolution with padding"""
    return nn.Conv2d(in_planes, out_planes, kernel_size=3, stride=stride,
                     padding=dilation, groups=groups, bias=False, dilation=dilation)

def conv1x1(in_planes, out_planes, stride=1):
    """1x1 convolution"""
    return nn.Conv2d(in_planes, out_planes, kernel_size=1, stride=stride, bias=False)

class BasicBlock(nn.Module):

    expansion = 1

    def __init__(self, inplanes, planes, stride=1, downsample=None, groups=1,
                 base_width=64, dilation=1, norm_layer=None):
        super(BasicBlock, self).__init__()
        if norm_layer is None:
            norm_layer = nn.BatchNorm2d
        if groups != 1 or base_width != 64:
            raise ValueError('BasicBlock only supports groups=1 and base_width=64')
        if dilation > 1:
            raise NotImplementedError("Dilation > 1 not supported in BasicBlock")
        # Both self.conv1 and self.downsample layers downsample the input when stride != 1
        self.conv1 = conv3x3(inplanes, planes, stride)
        self.bn1 = norm_layer(planes)
        self.relu = nn.ReLU(inplace=True)
        self.conv2 = conv3x3(planes, planes)
        self.bn2 = norm_layer(planes)
        self.downsample = downsample
        self.stride = stride

    def forward(self, x):
        identity = x

        out = self.conv1(x)
        out = self.bn1(out)
        out = self.relu(out)

        out = self.conv2(out)
        out = self.bn2(out)

        if self.downsample is not None:
            identity = self.downsample(x)

        out += identity
        out = self.relu(out)

        return out

@BACKBONES.register_module
class PointResNet34(nn.Module):
    """
    ResNet-34 model from
    `"Deep Residual Learning for Image Recognition" <https://arxiv.org/pdf/1512.03385.pdf>`_
    Adjusted for use as a 3D backbone, so different block structure according to FastPillars: https://arxiv.org/abs/2302.02367
    """
    def __init__(self, block=BasicBlock, layers=[6, 6, 3, 1], in_channels=64, zero_init_residual=False,
                 groups=1, width_per_group=64, replace_stride_with_dilation=None,
                 norm_layer=None, name="PointResNet34", first_max_pool=True, **kwargs):
        super(PointResNet34, self).__init__()
        if norm_layer is None:
            norm_layer = nn.BatchNorm2d
        self._norm_layer = norm_layer
        self.first_max_pool = first_max_pool

        self.inplanes = 64
        self.dilation = 1
        self.in_channels = in_channels
        if replace_stride_with_dilation is None:
            # each element in the tuple indicates if we should replace
            # the 2x2 stride with a dilated convolution instead
            replace_stride_with_dilation = [False, False, False]
        if len(replace_stride_with_dilation) != 3:
            raise ValueError("replace_stride_with_dilation should be None "
                             "or a 3-element tuple, got {}".format(replace_stride_with_dilation))
        self.groups = groups
        self.base_width = width_per_group
        self.conv1 = nn.Conv2d(self.in_channels, self.inplanes, kernel_size=7, stride=2, padding=3,
                               bias=False)
        self.bn1 = norm_layer(self.inplanes)
        self.relu = nn.ReLU(inplace=True)
        if first_max_pool:
            self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)

        self.layer1 = self._make_layer(block, 64, layers[0])
        self.layer2 = self._make_layer(block, 128, layers[1], stride=2,
                                       dilate=replace_stride_with_dilation[0])
        self.layer3 = self._make_layer(block, 256, layers[2], stride=2,
                                       dilate=replace_stride_with_dilation[1])
        self.layer4 = self._make_layer(block, 512, layers[3], stride=2,
                                       dilate=replace_stride_with_dilation[2])

        for m in self.modules():
            if isinstance(m, nn.Conv2d):
                nn.init.kaiming_normal_(m.weight, mode='fan_out', nonlinearity='relu')
            elif isinstance(m, (nn.BatchNorm2d, nn.GroupNorm)):
                nn.init.constant_(m.weight, 1)
                nn.init.constant_(m.bias, 0)

        # Zero-initialize the last BN in each residual branch,
        # so that the residual branch starts with zeros, and each residual block behaves like an identity.
        # This improves the model by 0.2~0.3% according to https://arxiv.org/abs/1706.02677
        if zero_init_residual:
            for m in self.modules():
                if isinstance(m, BasicBlock):
                    nn.init.constant_(m.bn2.weight, 0)

    def _make_layer(self, block, planes, blocks, stride=1, dilate=False):
        norm_layer = self._norm_layer
        downsample = None
        previous_dilation = self.dilation
        if dilate:
            self.dilation *= stride
            stride = 1
        if stride != 1 or self.inplanes != planes * block.expansion:
            downsample = nn.Sequential(
                conv1x1(self.inplanes, planes * block.expansion, stride),
                norm_layer(planes * block.expansion),
            )

        layers = []
        layers.append(block(self.inplanes, planes, stride, downsample, self.groups,
                            self.base_width, previous_dilation, norm_layer))
        self.inplanes = planes * block.expansion
        for _ in range(1, blocks):
            layers.append(block(self.inplanes, planes, groups=self.groups,
                                base_width=self.base_width, dilation=self.dilation,
                                norm_layer=norm_layer))

        return nn.Sequential(*layers)

    def _forward_impl(self, x):
        # See note [TorchScript super()]

        x = self.conv1(x)
        x = self.bn1(x)
        x = self.relu(x)
        if self.first_max_pool:
            x = self.maxpool(x)

        stage_2_out = self.layer1(x)            # stride 2 w.r.t. the BEV input (x2 more if first_max_pool=True)
        stage_3_out = self.layer2(stage_2_out)  # stride 4
        stage_4_out = self.layer3(stage_3_out)  # stride 8
        stage_5_out = self.layer4(stage_4_out)  # stride 16

        # deepest stage first; the RPNV3 neck reads [256, 512] channels from stages 4 and 5
        return [stage_5_out, stage_4_out, stage_3_out, stage_2_out]

    def forward(self, x):
        return self._forward_impl(x)

And here is my config:

import itertools
import logging

from det3d.utils.config_tool import get_downsample_factor

tasks = [
    dict(num_class=1, class_names=["car"]),
    dict(num_class=2, class_names=["truck", "construction_vehicle"]),
    dict(num_class=2, class_names=["bus", "trailer"]),
    dict(num_class=1, class_names=["barrier"]),
    dict(num_class=2, class_names=["motorcycle", "bicycle"]),
    dict(num_class=2, class_names=["pedestrian", "traffic_cone"]),
]

class_names = list(itertools.chain(*[t["class_names"] for t in tasks]))

# training and testing settings
target_assigner = dict(
    tasks=tasks,
)

# model settings
model = dict(
    type="FastPillars",
    pretrained=None,
    reader=dict(
        type="CustomPillarFeatureNet", # just PillarFeatureNet followed by PointPillarsScatter to fit the config architecture
        num_filters=[64, 64],
        num_input_features=5,
        with_distance=False,
        voxel_size=(0.15, 0.15, 8), # Paper: "set the pillar size as 0.15m"
        pc_range=(-54, -54, -5.0, 54, 54, 3.0), 
    ),

    backbone=dict(type="PointResNet34", first_max_pool=False, ds_factor=8),  # remove the first downsample operation, per https://github.com/StiphyJay/FastPillars/issues/10

    neck=dict(
        type="RPNV3",
        layer_nums=[5, 5],
        ds_layer_strides=[1, 2],
        ds_num_filters=[256, 256],
        us_layer_strides=[1, 2],
        us_num_filters=[256, 256],  # increased from [128, 128] to better match the 3x128 filters in the CenterPoint-PP center head
        num_input_features=[256, 512],  # channel counts of the stage-4 and stage-5 feature maps
        logger=logging.getLogger("RPN"),
    ),

    bbox_head=dict(
        type="CenterHead",
        in_channels=sum([256, 256]),
        tasks=tasks,
        dataset='nuscenes',
        weight=0.25,
        code_weights=[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.2, 0.2, 1.0, 1.0],
        common_heads={'reg': (2, 2), 'height': (1, 2), 'dim':(3, 2), 'rot':(2, 2), 'vel': (2, 2)}, # (output_channel, num_conv)
    ),
)

assigner = dict(
    target_assigner=target_assigner,
    out_size_factor= get_downsample_factor(model),
    gaussian_overlap=0.1,
    max_objs=500,
    min_radius=2,
)

train_cfg = dict(assigner=assigner)

test_cfg = dict(
    post_center_limit_range=[-61.2, -61.2, -10.0, 61.2, 61.2, 10.0],
    max_per_img=500,
    nms=dict(
        nms_pre_max_size=1000,
        nms_post_max_size=83,
        nms_iou_threshold=0.2,
    ),
    score_threshold=0.1,
    pc_range=[-54, -54],
    out_size_factor=get_downsample_factor(model),
    voxel_size=[0.15, 0.15]
)

# dataset settings
dataset_type = "NuScenesDataset"
nsweeps = 10
data_root = "/data/nuScenes"

db_sampler = dict(
    type="GT-AUG",
    enable=True,
    db_info_path="/data/nuScenes/dbinfos_train_10sweeps_withvelo.pkl",
    sample_groups=[
        dict(car=2),
        dict(truck=3),
        dict(construction_vehicle=7),
        dict(bus=4),
        dict(trailer=6),
        dict(barrier=2),
        dict(motorcycle=6),
        dict(bicycle=6),
        dict(pedestrian=2),
        dict(traffic_cone=2),
    ],
    db_prep_steps=[
        dict(
            filter_by_min_num_points=dict(
                car=5,
                truck=5,
                bus=5,
                trailer=5,
                construction_vehicle=5,
                traffic_cone=5,
                barrier=5,
                motorcycle=5,
                bicycle=5,
                pedestrian=5,
            )
        ),
        dict(filter_by_difficulty=[-1],),
    ],
    global_random_rotation_range_per_object=[0, 0],
    rate=1.0,
)

train_preprocessor = dict(
    mode="train",
    shuffle_points=True,
    global_rot_noise=[-0.3925, 0.3925],
    global_scale_noise=[0.95, 1.05],
    db_sampler=db_sampler,
    class_names=class_names,
)

val_preprocessor = dict(
    mode="val",
    shuffle_points=False,
)

voxel_generator = dict(
    range=[-54, -54, -5.0, 54, 54, 3.0],
    voxel_size=[0.15, 0.15, 8],
    max_points_in_voxel=20,
    max_voxel_num=[30000, 60000],
)

train_pipeline = [
    dict(type="LoadPointCloudFromFile", dataset=dataset_type),
    dict(type="LoadPointCloudAnnotations", with_bbox=True),
    dict(type="Preprocess", cfg=train_preprocessor),
    dict(type="Voxelization", cfg=voxel_generator),
    dict(type="AssignLabel", cfg=train_cfg["assigner"]),
    dict(type="Reformat"),
]
test_pipeline = [
    dict(type="LoadPointCloudFromFile", dataset=dataset_type),
    dict(type="LoadPointCloudAnnotations", with_bbox=True),
    dict(type="Preprocess", cfg=val_preprocessor),
    dict(type="Voxelization", cfg=voxel_generator),
    dict(type="AssignLabel", cfg=train_cfg["assigner"]),
    dict(type="Reformat"),
]

train_anno = "/data/nuScenes/infos_train_10sweeps_withvelo_filter_True.pkl"
val_anno = "/data/nuScenes/infos_val_10sweeps_withvelo_filter_True.pkl"
test_anno = "/data/nuScenes/test_set/infos_test_10sweeps_withvelo.pkl"

data = dict(
    samples_per_gpu=8,
    workers_per_gpu=8,
    train=dict(
        type=dataset_type,
        root_path=data_root,
        info_path=train_anno,
        ann_file=train_anno,
        nsweeps=nsweeps,
        class_names=class_names,
        pipeline=train_pipeline,
    ),
    val=dict(
        type=dataset_type,
        root_path=data_root,
        info_path=val_anno,
        test_mode=True,
        ann_file=val_anno,
        nsweeps=nsweeps,
        class_names=class_names,
        pipeline=test_pipeline,
    ),
    test=dict(
        type=dataset_type,
        root_path=data_root,
        info_path=test_anno,
        ann_file=test_anno,
        nsweeps=nsweeps,
        class_names=class_names,
        pipeline=test_pipeline,
    ),
)

optimizer_config = dict(grad_clip=dict(max_norm=35, norm_type=2))
# optimizer
optimizer = dict(
    type="adam", amsgrad=0.0, wd=0.01, fixed_wd=True, moving_average=False,
)
lr_config = dict(
    type="one_cycle", lr_max=0.001, moms=[0.95, 0.85], div_factor=1.0, pct_start=0.4,
)

checkpoint_config = dict(interval=1)
# yapf:disable
log_config = dict(
    interval=5,
    hooks=[
        dict(type="TextLoggerHook"),
        #dict(type='TensorboardLoggerHook')
    ],
)
# yapf:enable
# runtime settings
total_epochs = 20
device_ids = range(8)
dist_params = dict(backend="nccl", init_method="env://")
log_level = "INFO"
work_dir = './work_dirs/{}/'.format(__file__[__file__.rfind('/') + 1:-3])
load_from = None
resume_from = None 
workflow = [('train', 1)]
StiphyJay commented 6 months ago

> _(quoted Byte247's backbone code and config from the previous comment)_

Thank you for sharing. I will review your reimplementation and get back to you as soon as possible.

c122-ode commented 2 months ago

I also tested on the KITTI dataset, but the backbone ([6, 6, 3, 1]) runs out of memory ("out of memory" error). The input to the backbone has shape [4, 32, 1600, 1408]. How did you solve this problem?

Looking forward to your reply!

c122-ode commented 2 months ago

I found the computation in the backbone to be very heavy. I am also not sure what input shape the backbone expects.
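
Based on the nuScenes config earlier in this thread, I would expect the backbone input (after PointPillarsScatter) to look like this (my own arithmetic, assuming the 108 m range and 0.15 m pillars from that config):

batch_size = 4
grid = int((54 - (-54)) / 0.15)           # 720 BEV cells per side
bev_shape = (batch_size, 64, grid, grid)  # PillarFeatureNet emits 64 channels -> (4, 64, 720, 720)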

StiphyJay commented 2 months ago

> I also tested on the KITTI dataset, but the backbone ([6, 6, 3, 1]) runs out of memory ("out of memory" error). The input to the backbone has shape [4, 32, 1600, 1408]. How did you solve this problem?
>
> Looking forward to your reply!

1. On the nuScenes dataset the pillar size is 0.15 m, which is 2x the PillarNet setting (0.075 m).
2. Dense convolutions carry a heavy computation burden; I train the model on an A100 80 GB GPU.
3. I didn't test FastPillars on the KITTI dataset due to its limited data.
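
A quick back-of-the-envelope comparison of the two BEV grids, using only numbers from this thread:

nusc_cells = (108 / 0.15) ** 2   # 720 * 720 = 518,400 BEV cells (nuScenes, 0.15 m pillars)
kitti_cells = 1600 * 1408        # 2,252,800 cells (the [4, 32, 1600, 1408] input above)
print(kitti_cells / nusc_cells)  # ~4.3x more cells, hence roughly 4x the dense-conv memory/compute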
c122-ode commented 2 months ago

Thank you sincerely for your reply! I have another question: how many MAPE layers do you use in your experiments, or just one? And what "max_points_in_voxel" and "max_voxel_num" did you use for the nuScenes dataset? In my reimplementation of FastPillars (CenterPoint-pillar) I use:

- NAME: transform_points_to_voxels
  VOXEL_SIZE: [0.15, 0.15, 8.0]
  MAX_POINTS_PER_VOXEL: 20
  MAX_NUMBER_OF_VOXELS: {'train': 30000, 'test': 60000}

Are these parameters suitable?

StiphyJay commented 2 months ago

> Thank you sincerely for your reply! I have another question: how many MAPE layers do you use in your experiments, or just one? And what "max_points_in_voxel" and "max_voxel_num" did you use for the nuScenes dataset? [...] Are these parameters suitable?

1. Just one layer in MAPE.
2. "max_points_in_voxel" is 20, and the "max_voxel_num" setting follows this config: https://github.com/open-mmlab/OpenPCDet/blob/8cacccec11db6f59bf6934600c9a175dae254806/tools/cfgs/nuscenes_models/cbgs_pillar0075_res2d_centerpoint.yaml#L58
c122-ode commented 2 months ago

I trained on nuScenes, but the results are abnormal: after one epoch (epoch_1.pth) the validation set gives mAP = 0.04 and NDS = 0.157. Could you help me check whether there is a problem with my implementation settings?

My implementation:

- MAPE: based on class PillarFeatureNet in https://github.com/VISION-SJTU/PillarNet/blob/master/det3d/models/readers/pillar_encoder.py, with the MAPE layer added as the PFNLayer (one layer); I uncommented lines 195-260 of https://github.com/VISION-SJTU/PillarNet/blob/master/det3d/datasets/pipelines/preprocess.py to use the PillarFeatureNet.
- Backbone: ResNet18 with the first pooling removed; layer1-layer4 channels are 64, 128, 256, 512; input channels = 64.
- Config: based on https://github.com/VISION-SJTU/PillarNet/blob/master/configs/nusc/pillarnet/nusc_centerpoint_pillarnet_flip.py

import itertools
import logging

from det3d.utils.config_tool import get_downsample_factor

DOUBLE_FLIP = False

tasks = [
    dict(stride=8, class_names=["car"]),
    dict(stride=8, class_names=["truck", "construction_vehicle"]),
    dict(stride=8, class_names=["bus", "trailer"]),
    dict(stride=8, class_names=["barrier"]),
    dict(stride=8, class_names=["motorcycle", "bicycle"]),
    dict(stride=8, class_names=["pedestrian", "traffic_cone"]),
]

class_names = list(itertools.chain(*[t["class_names"] for t in tasks]))

target_assigner = dict(
    tasks=tasks,
)

pillar_size = 0.15
pc_range = [-54, -54, -5.0, 54, 54, 3.0]

model = dict(
    type="FastPillars",
    pretrained=None,
    reader=dict(
        type="PillarFeatureNet_MAPE",
        voxel_size=(0.15, 0.15, 8),
        num_filters=(64,),
        pc_range=pc_range,
        with_distance=False,
        num_input_features=5,
    ),
    backbone=dict(type="PointResNet18_512", ds_factor=8),
    neck=dict(
        type="RPNV2",
        layer_nums=[5, 5],
        ds_layer_strides=[1, 2],
        ds_num_filters=[256, 256],
        us_layer_strides=[1, 2],
        us_num_filters=[128, 128],
        num_input_features=[256, 512],  # [256, 256]
        logger=logging.getLogger("RPN"),
    ),
    bbox_head=dict(
        type="CenterHead",
        in_channels=256,
        tasks=tasks,
        dataset='nuscenes',
        weight=0.25,
        code_weights=[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.2, 0.2, 1.0, 1.0],
        common_heads={'reg': (2, 2), 'height': (1, 2), 'dim': (3, 2),
                      'rot': (2, 2), 'vel': (2, 2), 'iou': (1, 2)},
        share_conv_channel=64,
        dcn_head=False,
        iou_reg='DIoU',
    ),
)

assigner = dict(
    target_assigner=target_assigner,
    out_size_factor=get_downsample_factor(model),
    dense_reg=1,
    gaussian_overlap=0.1,
    max_objs=500,
    min_radius=2,
    pc_range=pc_range,
    voxel_size=[pillar_size, pillar_size, 8],
)

train_cfg = dict(assigner=assigner)

rectifier = [0.5 for _ in range(10)]
nms_iou_threshold = [0.2 for _ in range(10)]
nms_post_max_size = [83 for _ in range(10)]
nms_pre_max_size = [1000 for _ in range(10)]

test_cfg = dict(
    post_center_limit_range=[-61.2, -61.2, -10.0, 61.2, 61.2, 10.0],
    max_per_img=500,
    nms=dict(
        use_rotate_nms=True,
        use_multi_class_nms=False,
        nms_pre_max_size=nms_pre_max_size,
        nms_post_max_size=nms_post_max_size,
        nms_iou_threshold=nms_iou_threshold,
    ),
    score_threshold=0.1,
    rectifier=rectifier,
    pc_range=pc_range[:2],
    out_size_factor=get_downsample_factor(model),
    voxel_size=[pillar_size, pillar_size, 8],
)

train_preprocessor = dict(
    mode="train",
    shuffle_points=True,
    global_rot_noise=[-0.78539816, 0.78539816],
    global_scale_noise=[0.9, 1.1],
    global_translate_std=0.5,
    db_sampler=db_sampler,  # db_sampler defined as in the PillarNet base config (omitted here)
    class_names=class_names,
)

val_preprocessor = dict(
    mode="val",
    shuffle_points=False,
)

voxel_generator = dict(
    range=[-54, -54, -5.0, 54, 54, 3.0],
    voxel_size=[0.15, 0.15, 8],
    max_points_in_voxel=20,
    max_voxel_num=[120000, 160000],
    double_flip=DOUBLE_FLIP,
)

# dataset settings
dataset_type = "NuScenesDataset"
nsweeps = 10
data_root = "data/nuScenes"

data = dict(
    samples_per_gpu=8,
    workers_per_gpu=6,
    ........
)

optimizer_config = dict(grad_clip=dict(max_norm=35, norm_type=2))

optimizer = dict(
    type="adam", amsgrad=0.0, wd=0.01, fixed_wd=True, moving_average=False,
)
lr_config = dict(
    type="one_cycle", lr_max=0.001, moms=[0.95, 0.85], div_factor=1.0, pct_start=0.4,
)

c122-ode commented 2 months ago

My MAPE layer is:

import torch
import torch.nn as nn
import torch.nn.functional as F


class MAPELayer(nn.Module):

    def __init__(self, in_channels, out_channels, norm_cfg=None):
        super().__init__()
        self.conv_layer1 = nn.Conv2d(in_channels, out_channels, kernel_size=1)
        self.bn1 = nn.BatchNorm2d(out_channels, eps=1e-3, momentum=0.01)
        self.conv_layer2 = nn.Conv2d(in_channels, out_channels, kernel_size=1)

    def forward(self, inputs):
        if inputs.ndim == 3:
            # (voxel_count, max_point_per_voxel, num_point_features)
            # -> (1, num_point_features, voxel_count, max_point_per_voxel);
            # permute (not view) so the feature dimension is actually moved
            inputs = inputs.unsqueeze(0).permute(0, 3, 1, 2).contiguous()
        x = self.conv_layer1(inputs)
        x = self.bn1(x)
        x = F.relu(x)
        # max pooling over the points in each pillar:
        # (1, out_channels, voxel_count, 1)
        max_feature = torch.max(x, dim=-1, keepdim=True)[0]
        # attentive pooling: softmax over the points in each pillar
        # (1, out_channels, voxel_count, max_point_per_voxel)
        attention_score = F.softmax(self.conv_layer2(inputs), dim=-1)
        avg_feature = x * attention_score
        # (1, out_channels, voxel_count, 1)
        avg_feature = torch.sum(avg_feature, dim=-1, keepdim=True)
        feature = (avg_feature + max_feature) / 2.0
        # (1, voxel_count, out_channels, 1)
        feature = feature.transpose(1, 2)
        return feature
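
A quick shape check with made-up sizes (30000 pillars, 20 points per pillar, 5 point features), just to show what the layer returns:

mape = MAPELayer(in_channels=5, out_channels=64)
pillars = torch.rand(30000, 20, 5)  # (voxel_count, max_point_per_voxel, num_point_features)
out = mape(pillars)
print(out.shape)                    # torch.Size([1, 30000, 64, 1])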