Hello :) First of all, congrats on your paper + code. It looks super cool. However, I tried using your model with a ResNet-101 as the image backbone and a final image resolution close to the original value. I also tried to detect objects within 100 m and beyond. Unfortunately, this model does not seem to converge... The purple line in the picture is your ResNet-50 with resolution 256x704 (I didn't change the config), while the orange line is my network (described above). Here is also the config of the experiment. Please note the orange line was created using the optimizer self.optimizer_config = dict(type="AdamW", lr=2e-4, weight_decay=1e-4). Changing the optimizer didn't change anything here: `class CRNLightningModel(BEVDepthLightningModel): def __init__(self, *args, **kwargs) -> None: self.return_image = True self.return_depth = True self.return_radar_pv = True ################################################

self.optimizer_config = dict(type="AdamW", lr=2e-4, weight_decay=1e-4) ### org

    # Experiment configuration for the ResNet-101 / extended-range variant:
    # builds the image-backbone, radar-backbone, fuser and head config dicts,
    # hands them to the parent constructor and then instantiates
    # CameraRadarNetDet with them.
    #
    # Learning rate is 100x lower (2e-6 vs 2e-4) and weight decay 10x lower
    # (1e-5 vs 1e-4) than the optimizer line tagged "org" in this report.
    self.optimizer_config = dict(type="AdamW", lr=2e-6, weight_decay=1e-5)
    ################################################
    # BEV extent doubled relative to the commented-out +/-51.2 bounds below,
    # i.e. +/-102.4 on both x and y.
    x_y_bound = [-51.2 * 2, 51.2 * 2]
    # Network input image size, presumably (height, width); the second entry
    # is reused below as the x extent of the radar point-cloud range.
    final_dim = [896, 1600]
    self.backbone_img_conf = {
        # "x_bound": [-51.2, 51.2, 0.8],
        # "y_bound": [-51.2, 51.2, 0.8],
        # If the third entry is the cell size, 204.8 / 0.8 = 256 cells per
        # axis, which equals fuser_conf["bev_shape"] below.
        "x_bound": x_y_bound + [0.8],  # voxel x bounds
        "y_bound": x_y_bound + [0.8],  # voxel y bounds
        "z_bound": [-5, 3, 8],
        # (117.2 - 2.0) / 0.8 = 144 depth steps, versus 70 for the
        # commented-out [2.0, 58.0, 0.8] setting.
        "d_bound": [
            2.0,
            117.2,
            0.8,
        ],  # frustum depth bounds min,max, depth_step_size
        # "d_bound": [2.0, 58.0, 0.8],
        # "final_dim": (256, 704),
        "final_dim": tuple(final_dim),
        "downsample_factor": 32,
        # NOTE(review): strides/dilations differ from the commented-out
        # all-ones alternatives; confirm the resulting feature stride agrees
        # with the downsample_factor of 32 declared above.
        "img_backbone_conf": dict(
            type="ResNet",
            depth=101,
            frozen_stages=0,
            out_indices=[0, 1, 2, 3],
            dilations=(2, 2, 1, 1),
            #dilations=(1, 1, 1, 1),
            strides=(2, 2, 2, 2),
            #strides=(1, 1, 1, 1),
            norm_eval=False,
            init_cfg=dict(type="Pretrained", checkpoint="torchvision://resnet101"),
        ),
        "img_neck_conf": dict(
            type="SECONDFPN",
            in_channels=[256, 512, 1024, 2048],
            upsample_strides=[0.25, 0.50, 1, 2],
            out_channels=[128, 128, 128, 128],
            # out_channels=[256, 256, 256, 256],
        ),
        # in_channels=512 equals the sum of the four img_neck out_channels
        # (4 x 128); presumably the neck outputs are concatenated - confirm.
        "depth_net_conf": dict(in_channels=512, mid_channels=256),
        "radar_view_transform": True,
        "camera_aware": False,
        # "camera_aware": True,  # Model camera intrinsic into DepthNet
        #"output_channels": 80,
        "output_channels": 128,
    }
    ################################################
    # Range used to voxelize the radar points: x spans 0..final_dim[1]
    # (1600) and y spans 2.0..d_bound max (117.2).
    # point_cloud_range_backbone=[0, 2.0, 0, 704, 58.0, 2],    #[x_min, y_min, z_min, x_max, y_max, z_max]
    point_cloud_range_backbone = [
        0,
        2.0,
        0,
        final_dim[1],
        self.backbone_img_conf["d_bound"][1],
        2,
    ]  # [x_min, y_min, z_min, x_max, y_max, z_max]
    self.backbone_pts_conf = {
        # With voxel_size [8, 0.4, 2] the range above yields
        # 1600 / 8 = 200 cells along x and (117.2 - 2.0) / 0.4 = 288 along y.
        "pts_voxel_layer": dict(
            max_num_points=8,
            voxel_size=[8, 0.4, 2],
            point_cloud_range=point_cloud_range_backbone,  # [x_min, y_min, z_min, x_max, y_max, z_max]
            max_voxels=(768, 1024),
        ),
        "pts_voxel_encoder": dict(
            type="PillarFeatureNet",
            in_channels=5,
            feat_channels=[32, 64],
            with_distance=False,
            with_cluster_center=False,
            with_voxel_center=True,
            voxel_size=[8, 0.4, 2],
            point_cloud_range=point_cloud_range_backbone,
            norm_cfg=dict(type="BN1d", eps=1e-3, momentum=0.01),
            legacy=True,
        ),
        # NOTE(review): output_shape is (288, 100), but the range/voxel
        # arithmetic above gives 200 cells along x. The commented-out
        # (288, 200) matches that arithmetic, just as the original
        # (140, 88) matches 56.0 / 0.4 and 704 / 8. Confirm which value the
        # scatter layer needs; a mismatch here is a candidate cause for the
        # reported non-convergence.
        "pts_middle_encoder": dict(
            # type="PointPillarsScatter", in_channels=64, output_shape=(140, 88)
            type="PointPillarsScatter",
            in_channels=64,
            output_shape=(288, 100)
            # type="PointPillarsScatter",
            # in_channels=64,
            # output_shape=(288, 200),
        ),
        "pts_backbone": dict(
            type="SECOND",
            in_channels=64,
            out_channels=[64, 128, 256],
            layer_nums=[3, 5, 5],
            layer_strides=[1, 2, 2],
            norm_cfg=dict(type="BN", eps=1e-3, momentum=0.01),
            conv_cfg=dict(type="Conv2d", bias=True, padding_mode="reflect"),
        ),
        "pts_neck": dict(
            type="SECONDFPN",
            in_channels=[64, 128, 256],
            out_channels=[128, 128, 128],
            upsample_strides=[0.5, 1, 2],
            norm_cfg=dict(type="BN", eps=1e-3, momentum=0.01),
            upsample_cfg=dict(type="deconv", bias=False),
            use_conv_for_no_stride=True,
        ),
        "occupancy_init": 0.01,
        # "out_channels_pts": 80,
        "out_channels_pts": 128,
    }
    ################################################
    # Fuser settings: img_dims / pts_dims equal the 128 output channels
    # configured for the image and radar branches above.
    self.fuser_conf = {
        #"img_dims": 80,
        #"pts_dims": 80,
        "embed_dims": 128,
        # "num_layers": 6,
        # "num_heads": 4,
        # "bev_shape": (128, 128),
        "img_dims":128,
        "pts_dims":128,
        #"embed_dims":256,
        "num_layers": 6,
        "num_heads": 4,
        "bev_shape": (256, 256),
    }
    ################################################
    out_size_factor = 2

    # Head range reuses the doubled x/y bounds: 204.8 per axis in x and y.
    # point_cloud_range_head = [-51.2, -51.2, -5, 51.2, 51.2, 3]
    point_cloud_range_head = [
        x_y_bound[0],
        x_y_bound[0],
        -5,
        x_y_bound[1],
        x_y_bound[1],
        3,
    ]
    # post_center_range = [-61.2, -61.2, -10.0, 61.2, 61.2, 10.0]
    post_center_range = [-61.2 * 2, -61.2 * 2, -10.0, 61.2 * 2, 61.2 * 2, 10.0]

    self.head_conf = {
        "bev_backbone_conf": dict(
            type="ResNet",
            #in_channels=256,
            in_channels=128,
            depth=18,
            num_stages=3,
            strides=(1, 2, 2),
            dilations=(1, 1, 1),
            out_indices=[0, 1, 2],
            norm_eval=False,
            base_channels=160
            # base_channels=360,
        ),
        # in_channels: the first entry equals the BEV backbone input width
        # (128); the remaining three are base_channels (160) x 1, 2, 4.
        "bev_neck_conf": dict(
            type="SECONDFPN",
            in_channels=[128, 160, 320, 640],
            #upsample_strides=[1, 2, 4, 8],
            #in_channels=[256, 160, 320, 640],
            #in_channels=[256, 360, 720, 1440],
            upsample_strides=[1, 2, 4, 8],
            out_channels=[64, 64, 64, 64],
        ),
        "tasks": [
            dict(num_class=1, class_names=["car"]),
            dict(num_class=2, class_names=["truck", "construction_vehicle"]),
            dict(num_class=2, class_names=["bus", "trailer"]),
            dict(num_class=1, class_names=["barrier"]),
            dict(num_class=2, class_names=["motorcycle", "bicycle"]),
            dict(num_class=2, class_names=["pedestrian", "traffic_cone"]),
        ],
        "common_heads": dict(
            reg=(2, 2), height=(1, 2), dim=(3, 2), rot=(2, 2), vel=(2, 2)
        ),
        "bbox_coder": dict(
            type="CenterPointBBoxCoder",
            # post_center_range=[-61.2, -61.2, -10.0, 61.2, 61.2, 10.0],
            post_center_range=post_center_range,
            max_num=500,
            score_threshold=0.01,
            out_size_factor=out_size_factor,
            voxel_size=[0.2, 0.2, 8],
            pc_range=point_cloud_range_head,
            code_size=9,
        ),
        # NOTE(review): grid_size 512 x voxel_size 0.2 spans 102.4 per axis,
        # but point_cloud_range_head now spans 204.8 per axis. Likewise
        # voxel_size 0.2 x out_size_factor 2 = 0.4 per output cell, while the
        # BEV grid above uses 0.8 cells (256 per axis). Presumably
        # voxel_size / grid_size / out_size_factor must stay consistent with
        # the range in bbox_coder, train_cfg and test_cfg - confirm, as this
        # is a candidate cause for the reported non-convergence.
        "train_cfg": dict(
            point_cloud_range=point_cloud_range_head,
            grid_size=[512, 512, 1],
            voxel_size=[0.2, 0.2, 8],
            out_size_factor=out_size_factor,
            dense_reg=1,
            gaussian_overlap=0.1,
            max_objs=500,
            min_radius=2,
            # Ten weights, matching the 2 + 1 + 3 + 2 + 2 first entries of
            # the common_heads tuples above.
            code_weights=[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0],
        ),
        "test_cfg": dict(
            post_center_limit_range=post_center_range,
            max_per_img=500,
            max_pool_nms=False,
            min_radius=[4, 12, 10, 1, 0.85, 0.175],
            score_threshold=0.01,
            out_size_factor=out_size_factor,
            voxel_size=[0.2, 0.2, 8],
            nms_type="circle",
            pre_max_size=1000,
            post_max_size=200,
            nms_thr=0.2,
        ),
        # 256 = 4 x 64, the sum of bev_neck_conf out_channels.
        "in_channels": 256,  # Equal to bev_neck output_channels.
        "loss_cls": dict(type="GaussianFocalLoss", reduction="mean"),
        "loss_bbox": dict(type="L1Loss", reduction="mean", loss_weight=0.25),
        "gaussian_overlap": 0.1,
        "min_radius": 2,
    }
    ################################################
    # Image-data augmentation settings. Resize and rotation limits are
    # pinned to single values here; "final_dim" repeats the (896, 1600)
    # literal instead of reusing the final_dim variable defined above, so
    # the two must be kept in sync by hand.
    self.ida_aug_conf = {
        "resize_lim": (1, 1),
        #"resize_lim": (0.386, 0.55),
        # "final_dim": (256, 704),
        "final_dim": (896, 1600),
        "rot_lim": (0.0, 0.0),
        "H": 900,
        "W": 1600,
        "rand_flip": True,
        "bot_pct_lim": (0.0, 0.0),
        "cams": [
            "CAM_FRONT_LEFT",
            "CAM_FRONT",
            "CAM_FRONT_RIGHT",
            "CAM_BACK_LEFT",
            "CAM_BACK",
            "CAM_BACK_RIGHT",
        ],
        "Ncams": 6,
    }
    # BEV-space augmentation settings (rotation, scale and flip ratios).
    self.bda_aug_conf = {
        "rot_ratio": 1.0,
        "rot_lim": (-22.5, 22.5),
        "scale_lim": (0.9, 1.1),
        "flip_dx_ratio": 0.5,
        "flip_dy_ratio": 0.5,
    }

    # Radar augmentation settings; presumably sweep count, number of sweeps
    # used and point drop ratio - confirm against the dataset loader.
    self.rda_aug_conf = {
        "N_sweeps": 6,
        "N_use": 5,
        "drop_ratio": 0.1,
    }
    ################################################
    # Forward the configs to the parent constructor. backbone_pts_conf and
    # fuser_conf are not passed here; they are only used when building
    # CameraRadarNetDet below.
    super().__init__(
        backbone_img_conf=self.backbone_img_conf,
        head_conf=self.head_conf,
        ida_aug_conf=self.ida_aug_conf,
        bda_aug_conf=self.bda_aug_conf,
        rda_aug_conf=self.rda_aug_conf,
        return_image=self.return_image,
        return_depth=self.return_depth,
        return_radar_pv=self.return_radar_pv,
        optimizer_config=self.optimizer_config,
        *args,
        **kwargs
    )
    ################################################
    # Assigned after the parent constructor has run, so these values replace
    # any same-named attributes the parent may have set.
    self.key_idxes = [-2, -4, -6]
    self.model = CameraRadarNetDet(
        self.backbone_img_conf,
        self.backbone_pts_conf,
        self.fuser_conf,
        self.head_conf,
    )

` Does anybody have an idea why it's not converging?

Kind regards

Stefan

youngskkim / CRN

Resnet 101 with depth estimation >= 100m of distance does not converge #8

self.optimizer_config = dict(type="AdamW", lr=2e-4, weight_decay=1e-4) ### org