I'm doing real-time inference with a pre-trained pillar-based CenterPoint model; the point cloud is received from a ROS bag frame by frame. I modified the code directly from demos/pcd_demo.py. The inference time for each frame is about 150 ms. The timing code is here:
torch.cuda.synchronize()
start_time = time.time()
print("START TIME")
with torch.no_grad():
    result = self.model(return_loss=False, rescale=False, **self.data)
torch.cuda.synchronize()
print('inference time:', time.time() - start_time)
My device is an RTX 3070. I would expect the inference time per frame to be about 30~50 ms. Could you please give me some clue about the problem? Thanks in advance!
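For completeness, here is a warm-up-aware variant of the same measurement I could use (just a sketch, not in my script; it reuses the model/data names from the snippet above, and skips the first frames because torch.backends.cudnn.benchmark = True makes them pay one-off autotuning costs):

# Sketch only: warm-up-aware timing with torch.cuda.Event, reusing the
# model/data names from the snippet above.
def time_inference(model, data, warmup=10, iters=50):
    with torch.no_grad():
        for _ in range(warmup):  # warm-up passes are not timed
            model(return_loss=False, rescale=False, **data)
        torch.cuda.synchronize()
        start = torch.cuda.Event(enable_timing=True)
        end = torch.cuda.Event(enable_timing=True)
        start.record()
        for _ in range(iters):
            model(return_loss=False, rescale=False, **data)
        end.record()
        torch.cuda.synchronize()
    return start.elapsed_time(end) / iters  # average milliseconds per frame

# e.g.: print('avg ms/frame:', time_inference(self.model, self.data))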
The whole code is attached below for reference:
# Imports reconstructed for completeness (module paths assume ROS 1 and
# mmdetection3d 0.x; adjust to your environment):
import time
import numpy as np
import ros_numpy
import rospy
import torch
from argparse import ArgumentParser
from copy import deepcopy
from jsk_recognition_msgs.msg import BoundingBox, BoundingBoxArray
from mmcv.parallel import collate, scatter
from mmdet3d.apis import init_model
from mmdet3d.core.bbox import get_box_type
from mmdet3d.datasets.pipelines import Compose
from pyquaternion import Quaternion
from sensor_msgs.msg import PointCloud2, PointField

def get_xyz_points(cloud_array, remove_nans=True, dtype=np.float32):
    '''
    Convert a structured cloud array into an (N, 4) float array; note the
    axis remap to (z, -y, x) and the intensity scaled to [0, 1].
    '''
    if remove_nans:
        mask = np.isfinite(cloud_array['x']) & np.isfinite(cloud_array['y']) & np.isfinite(cloud_array['z'])
        cloud_array = cloud_array[mask]
    points = np.zeros(cloud_array.shape + (4,), dtype=dtype)
    points[..., 0] = cloud_array['z']
    points[..., 1] = -cloud_array['y']
    points[..., 2] = cloud_array['x']
    points[..., 3] = cloud_array['intensity'] / 255
    return points
def xyz_array_to_pointcloud2(points_sum, stamp=None, frame_id=None):
    '''
    Create a sensor_msgs.PointCloud2 from an (N, 3) array of points.
    '''
    msg = PointCloud2()
    if stamp:
        msg.header.stamp = stamp
    if frame_id:
        msg.header.frame_id = frame_id
    msg.height = 1
    msg.width = points_sum.shape[0]
    msg.fields = [
        PointField('x', 0, PointField.FLOAT32, 1),
        PointField('y', 4, PointField.FLOAT32, 1),
        PointField('z', 8, PointField.FLOAT32, 1)
        # PointField('i', 12, PointField.FLOAT32, 1)
    ]
    msg.is_bigendian = False
    msg.point_step = 12
    msg.row_step = msg.point_step * points_sum.shape[0]  # bytes per row, not point count
    msg.is_dense = int(np.isfinite(points_sum).all())
    msg.data = np.asarray(points_sum, np.float32).tobytes()  # tostring() is deprecated
    return msg
def yaw2quaternion(yaw: float) -> Quaternion:
    return Quaternion(axis=[0, 0, 1], radians=yaw)
class Processor_ROS:
    def __init__(self, args):
        self.args = args

    def initialize(self):
        # build the model from a config file and a checkpoint file
        torch.backends.cudnn.benchmark = True
        self.model = init_model(self.args.config, self.args.checkpoint, device=self.args.device)
        cfg = self.model.cfg
        self.device = next(self.model.parameters()).device  # model device
        # build the data pipeline
        test_pipeline = deepcopy(cfg.data.test.pipeline)
        test_pipeline = Compose(test_pipeline)
        box_type_3d, box_mode_3d = get_box_type(cfg.data.test.box_type_3d)
        data = dict(
            pts_filename="/Innovusion/Data/Waymo/testing/bin/0000000.bin",
            box_type_3d=box_type_3d,
            box_mode_3d=box_mode_3d,
            # for ScanNet demo we need axis_align_matrix
            ann_info=dict(axis_align_matrix=np.eye(4)),
            sweeps=[],
            # set timestamp = 0
            timestamp=[0],
            img_fields=[],
            bbox3d_fields=[],
            pts_mask_fields=[],
            pts_seg_fields=[],
            bbox_fields=[],
            mask_fields=[],
            seg_fields=[])
        data = test_pipeline(data)
        self.data = collate([data], samples_per_gpu=1)
    def run(self, points):
        # data['img_metas'] = None
        self.data['points'] = [[points]]
        if next(self.model.parameters()).is_cuda:
            # scatter to specified GPU
            self.data = scatter(self.data, [self.device.index])[0]
        else:
            # this is a workaround to avoid the bug of MMDataParallel
            self.data['img_metas'] = self.data['img_metas'][0].data
            self.data['points'] = self.data['points'][0].data
        # forward the model
        torch.cuda.synchronize()
        start_time = time.time()
        with torch.no_grad():
            result = self.model(return_loss=False, rescale=False, **self.data)
        torch.cuda.synchronize()
        print('inference time:', time.time() - start_time)
        boxes_lidar = result[0]['pts_bbox']['boxes_3d'].tensor.numpy()
        scores = result[0]['pts_bbox']['scores_3d'].numpy()
        types = result[0]['pts_bbox']['labels_3d'].numpy()
        if self.args.score_thr > 0:
            inds = scores > self.args.score_thr
            boxes_lidar = boxes_lidar[inds]  # keep boxes aligned with filtered scores/labels
            scores = scores[inds]
            types = types[inds]
        return boxes_lidar, scores, types
def rslidar_callback(msg):
    arr_bbox = BoundingBoxArray()
    msg_cloud = ros_numpy.point_cloud2.pointcloud2_to_array(msg)
    np_points = get_xyz_points(msg_cloud, True)
    # pad a zero fifth feature column so the points match the model's 5-dim input
    dim_5 = np.zeros((np_points.shape[0], 1)).astype(np.float32)
    np_points = np.column_stack((np_points, dim_5))
    points = torch.from_numpy(np_points)
    boxes_lidar, scores, types = proc_1.run(points)
    print("len scores:", len(scores))
    if scores.size != 0:
        for i in range(scores.size):
            bbox = BoundingBox()
            bbox.header.frame_id = msg.header.frame_id
            bbox.header.stamp = rospy.Time.now()
            q = yaw2quaternion(float(-boxes_lidar[i][6]))
            # map box pose back from the model's remapped axes to the sensor frame
            bbox.pose.orientation.x = q[3]
            bbox.pose.orientation.y = -q[2]
            bbox.pose.orientation.z = q[1]
            bbox.pose.orientation.w = q[0]
            bbox.pose.position.x = float(boxes_lidar[i][2])
            bbox.pose.position.y = -float(boxes_lidar[i][1])
            bbox.pose.position.z = float(boxes_lidar[i][0])
            bbox.dimensions.x = float(boxes_lidar[i][5])
            bbox.dimensions.y = float(boxes_lidar[i][4])
            bbox.dimensions.z = float(boxes_lidar[i][3])
            bbox.value = scores[i]
            bbox.label = int(types[i])
            arr_bbox.boxes.append(bbox)
    arr_bbox.header.frame_id = msg.header.frame_id
    arr_bbox.header.stamp = msg.header.stamp
    if len(arr_bbox.boxes) != 0:
        pub_arr_bbox.publish(arr_bbox)
        arr_bbox.boxes = []
    else:
        arr_bbox.boxes = []
        pub_arr_bbox.publish(arr_bbox)
def main():
    global proc_1
    global pub_arr_bbox
    parser = ArgumentParser()
    # parser.add_argument('pcd', help='Point cloud file')
    parser.add_argument('--config', default="/Innovusion/Projects/mmdetection3d/checkpoints/centerpoint_pillar_waymo_one_baseline/centerpoint_032pillar_second_secfpn_circlenms_4x8_cyclic_20e_waymo.py", help='Config file')
    parser.add_argument('--checkpoint', default="/Innovusion/Projects/mmdetection3d/checkpoints/centerpoint_pillar_waymo_one_baseline/epoch_30.pth", help='Checkpoint file')
    parser.add_argument(
        '--device', default='cuda:0', help='Device used for inference')
    parser.add_argument(
        '--score-thr', type=float, default=0.0, help='bbox score threshold')
    args = parser.parse_args()
    proc_1 = Processor_ROS(args)
    proc_1.initialize()
    rospy.init_node('mmdetection3d_ros_node')
    sub_lidar_topic = ["/velodyne_points",
                       "/top/rslidar_points",
                       "/points_raw",
                       "/lidar_protector/merged_cloud",
                       "/merged_cloud",
                       "/lidar_top",
                       "/roi_pclouds",
                       "/livox/lidar",
                       "/iv_points",
                       "/SimOneSM_PointCloud_0"]
    sub_ = rospy.Subscriber(sub_lidar_topic[8], PointCloud2, rslidar_callback, queue_size=1, buff_size=2**24)
    pub_arr_bbox = rospy.Publisher("pp_boxes", BoundingBoxArray, queue_size=1)
    print("[+] mmdetection3d ros_node has started!")
    rospy.spin()

if __name__ == '__main__':
    main()
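In case it helps to narrow things down, here is a sketch of how I could profile a single forward pass to see where the 150 ms goes (voxelization vs. the network vs. post-processing such as NMS). It assumes PyTorch >= 1.8.1 for torch.profiler and reuses the model/data names from run() above:

# Sketch only: operator-level breakdown of one forward pass (CPU vs. CUDA time).
from torch.profiler import ProfilerActivity, profile

def profile_once(model, data):
    with torch.no_grad(), profile(
            activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA]) as prof:
        model(return_loss=False, rescale=False, **data)
    print(prof.key_averages().table(sort_by='cuda_time_total', row_limit=15))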