youngskkim / CRN

[ICCV'23] Official implementation of CRN: Camera Radar Net for Accurate, Robust, Efficient 3D Perception
MIT License
99 stars 15 forks source link

Resnet 101 with depth estimation >= 100m of distance does not converge #8

Open 87royalts87 opened 6 months ago

87royalts87 commented 6 months ago

Hello :) First of all, congrats on your paper + code. It looks super cool. However, I tried using your model with a ResNet-101 as the image backbone and a final image resolution close to the original image size. I also tried to detect objects at distances of 100 m and beyond. Unfortunately, this model does not seem to converge... [image] The purple line in the picture is your ResNet-50 with resolution 256x704 (I didn't change the config), while the orange line is my network (described above). Here is also the config of the experiment. Please note the orange line was created using the optimizer self.optimizer_config = dict(type="AdamW", lr=2e-4, weight_decay=1e-4). Changing the optimizer didn't change anything. Here is the config: `class CRNLightningModel(BEVDepthLightningModel): def __init__(self, *args, **kwargs) -> None: self.return_image = True self.return_depth = True self.return_radar_pv = True ################################################

self.optimizer_config = dict(type="AdamW", lr=2e-4, weight_decay=1e-4) ### org

    self.optimizer_config = dict(type="AdamW", lr=2e-6, weight_decay=1e-5)
    ################################################
    x_y_bound = [-51.2 * 2, 51.2 * 2]
    final_dim = [896, 1600]
    self.backbone_img_conf = {
        # "x_bound": [-51.2, 51.2, 0.8],
        # "y_bound": [-51.2, 51.2, 0.8],
        "x_bound": x_y_bound + [0.8],  # voxel x bounds
        "y_bound": x_y_bound + [0.8],  # voxel y bounds
        "z_bound": [-5, 3, 8],
        "d_bound": [
            2.0,
            117.2,
            0.8,
        ],  # frsutum depth bounds min,max, depth_step_size
        # "d_bound": [2.0, 58.0, 0.8],
        # "final_dim": (256, 704),
        "final_dim": tuple(final_dim),
        "downsample_factor": 32,
        "img_backbone_conf": dict(
            type="ResNet",
            depth=101,
            frozen_stages=0,
            out_indices=[0, 1, 2, 3],
            dilations=(2, 2, 1, 1),
            #dilations=(1, 1, 1, 1),
            strides=(2, 2, 2, 2),
            #strides=(1, 1, 1, 1),
            norm_eval=False,
            init_cfg=dict(type="Pretrained", checkpoint="torchvision://resnet101"),
        ),
        "img_neck_conf": dict(
            type="SECONDFPN",
            in_channels=[256, 512, 1024, 2048],
            upsample_strides=[0.25, 0.50, 1, 2],
            out_channels=[128, 128, 128, 128],
            # out_channels=[256, 256, 256, 256],
        ),
        "depth_net_conf": dict(in_channels=512, mid_channels=256),
        "radar_view_transform": True,
        "camera_aware": False,
        # "camera_aware": True,  # Model camera intrinsic into DepthNet
        #"output_channels": 80,
        "output_channels": 128,
    }
    ################################################
    # point_cloud_range_backbone=[0, 2.0, 0, 704, 58.0, 2],    #[x_min, y_min, z_min, x_max, y_max, z_max]
    point_cloud_range_backbone = [
        0,
        2.0,
        0,
        final_dim[1],
        self.backbone_img_conf["d_bound"][1],
        2,
    ]  # [x_min, y_min, z_min, x_max, y_max, z_max]
    self.backbone_pts_conf = {
        "pts_voxel_layer": dict(
            max_num_points=8,
            voxel_size=[8, 0.4, 2],
            point_cloud_range=point_cloud_range_backbone,  # [x_min, y_min, z_min, x_max, y_max, z_max]
            max_voxels=(768, 1024),
        ),
        "pts_voxel_encoder": dict(
            type="PillarFeatureNet",
            in_channels=5,
            feat_channels=[32, 64],
            with_distance=False,
            with_cluster_center=False,
            with_voxel_center=True,
            voxel_size=[8, 0.4, 2],
            point_cloud_range=point_cloud_range_backbone,
            norm_cfg=dict(type="BN1d", eps=1e-3, momentum=0.01),
            legacy=True,
        ),
        "pts_middle_encoder": dict(
            # type="PointPillarsScatter", in_channels=64, output_shape=(140, 88)
            type="PointPillarsScatter",
            in_channels=64,
            output_shape=(288, 100)
            # type="PointPillarsScatter",
            # in_channels=64,
            # output_shape=(288, 200),
        ),
        "pts_backbone": dict(
            type="SECOND",
            in_channels=64,
            out_channels=[64, 128, 256],
            layer_nums=[3, 5, 5],
            layer_strides=[1, 2, 2],
            norm_cfg=dict(type="BN", eps=1e-3, momentum=0.01),
            conv_cfg=dict(type="Conv2d", bias=True, padding_mode="reflect"),
        ),
        "pts_neck": dict(
            type="SECONDFPN",
            in_channels=[64, 128, 256],
            out_channels=[128, 128, 128],
            upsample_strides=[0.5, 1, 2],
            norm_cfg=dict(type="BN", eps=1e-3, momentum=0.01),
            upsample_cfg=dict(type="deconv", bias=False),
            use_conv_for_no_stride=True,
        ),
        "occupancy_init": 0.01,
        # "out_channels_pts": 80,
        "out_channels_pts": 128,
    }
    ################################################
    self.fuser_conf = {
        #"img_dims": 80,
        #"pts_dims": 80,
        "embed_dims": 128,
        # "num_layers": 6,
        # "num_heads": 4,
        # "bev_shape": (128, 128),
        "img_dims":128,
        "pts_dims":128,
        #"embed_dims":256,
        "num_layers": 6,
        "num_heads": 4,
        "bev_shape": (256, 256),
    }
    ################################################
    out_size_factor = 2

    # point_cloud_range_head = [-51.2, -51.2, -5, 51.2, 51.2, 3]
    point_cloud_range_head = [
        x_y_bound[0],
        x_y_bound[0],
        -5,
        x_y_bound[1],
        x_y_bound[1],
        3,
    ]
    # post_center_range = [-61.2, -61.2, -10.0, 61.2, 61.2, 10.0]
    post_center_range = [-61.2 * 2, -61.2 * 2, -10.0, 61.2 * 2, 61.2 * 2, 10.0]

    self.head_conf = {
        "bev_backbone_conf": dict(
            type="ResNet",
            #in_channels=256,
            in_channels=128,
            depth=18,
            num_stages=3,
            strides=(1, 2, 2),
            dilations=(1, 1, 1),
            out_indices=[0, 1, 2],
            norm_eval=False,
            base_channels=160
            # base_channels=360,
        ),
        "bev_neck_conf": dict(
            type="SECONDFPN",
            in_channels=[128, 160, 320, 640],
            #upsample_strides=[1, 2, 4, 8],
            #in_channels=[256, 160, 320, 640],
            #in_channels=[256, 360, 720, 1440],
            upsample_strides=[1, 2, 4, 8],
            out_channels=[64, 64, 64, 64],
        ),
        "tasks": [
            dict(num_class=1, class_names=["car"]),
            dict(num_class=2, class_names=["truck", "construction_vehicle"]),
            dict(num_class=2, class_names=["bus", "trailer"]),
            dict(num_class=1, class_names=["barrier"]),
            dict(num_class=2, class_names=["motorcycle", "bicycle"]),
            dict(num_class=2, class_names=["pedestrian", "traffic_cone"]),
        ],
        "common_heads": dict(
            reg=(2, 2), height=(1, 2), dim=(3, 2), rot=(2, 2), vel=(2, 2)
        ),
        "bbox_coder": dict(
            type="CenterPointBBoxCoder",
            # post_center_range=[-61.2, -61.2, -10.0, 61.2, 61.2, 10.0],
            post_center_range=post_center_range,
            max_num=500,
            score_threshold=0.01,
            out_size_factor=out_size_factor,
            voxel_size=[0.2, 0.2, 8],
            pc_range=point_cloud_range_head,
            code_size=9,
        ),
        "train_cfg": dict(
            point_cloud_range=point_cloud_range_head,
            grid_size=[512, 512, 1],
            voxel_size=[0.2, 0.2, 8],
            out_size_factor=out_size_factor,
            dense_reg=1,
            gaussian_overlap=0.1,
            max_objs=500,
            min_radius=2,
            code_weights=[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0],
        ),
        "test_cfg": dict(
            post_center_limit_range=post_center_range,
            max_per_img=500,
            max_pool_nms=False,
            min_radius=[4, 12, 10, 1, 0.85, 0.175],
            score_threshold=0.01,
            out_size_factor=out_size_factor,
            voxel_size=[0.2, 0.2, 8],
            nms_type="circle",
            pre_max_size=1000,
            post_max_size=200,
            nms_thr=0.2,
        ),
        "in_channels": 256,  # Equal to bev_neck output_channels.
        "loss_cls": dict(type="GaussianFocalLoss", reduction="mean"),
        "loss_bbox": dict(type="L1Loss", reduction="mean", loss_weight=0.25),
        "gaussian_overlap": 0.1,
        "min_radius": 2,
    }
    ################################################
    self.ida_aug_conf = {
        "resize_lim": (1, 1),
        #"resize_lim": (0.386, 0.55),
        # "final_dim": (256, 704),
        "final_dim": (896, 1600),
        "rot_lim": (0.0, 0.0),
        "H": 900,
        "W": 1600,
        "rand_flip": True,
        "bot_pct_lim": (0.0, 0.0),
        "cams": [
            "CAM_FRONT_LEFT",
            "CAM_FRONT",
            "CAM_FRONT_RIGHT",
            "CAM_BACK_LEFT",
            "CAM_BACK",
            "CAM_BACK_RIGHT",
        ],
        "Ncams": 6,
    }
    self.bda_aug_conf = {
        "rot_ratio": 1.0,
        "rot_lim": (-22.5, 22.5),
        "scale_lim": (0.9, 1.1),
        "flip_dx_ratio": 0.5,
        "flip_dy_ratio": 0.5,
    }

    self.rda_aug_conf = {
        "N_sweeps": 6,
        "N_use": 5,
        "drop_ratio": 0.1,
    }
    ################################################
    super().__init__(
        backbone_img_conf=self.backbone_img_conf,
        head_conf=self.head_conf,
        ida_aug_conf=self.ida_aug_conf,
        bda_aug_conf=self.bda_aug_conf,
        rda_aug_conf=self.rda_aug_conf,
        return_image=self.return_image,
        return_depth=self.return_depth,
        return_radar_pv=self.return_radar_pv,
        optimizer_config=self.optimizer_config,
        *args,
        **kwargs
    )
    ################################################
    self.key_idxes = [-2, -4, -6]
    self.model = CameraRadarNetDet(
        self.backbone_img_conf,
        self.backbone_pts_conf,
        self.fuser_conf,
        self.head_conf,
    )

` Does anybody have an idea why it's not converging?

Kind regards

Stefan

youngskkim commented 5 months ago

Hi, sorry for the late reply.
If I understand your modification correctly, you doubled point_cloud_range without changing voxel_size. As a result, the BEV grid size doubles as well (from [512, 512] to [1024, 1024]). You have to change self.head_conf = {"train_cfg": dict(grid_size=[512, 512, 1])} accordingly.

jaser-rayeh commented 4 months ago

@87royalts87 Could you tell me how you were able to run the model checkpoint?