Sense-GVT / Fast-BEV

Fast-BEV: A Fast and Strong Bird’s-Eye View Perception Baseline

How can I successfully export ONNX for use with TensorRT? #22

Closed thfylsty closed 1 year ago

thfylsty commented 1 year ago

Thank you very much for your answer.

ymlab commented 1 year ago

The implementation of onnx_export will be released in the near future.

cyn-liu commented 1 year ago

@ymlab Hi, what is the progress on the onnx_export implementation? I look forward to your reply, thanks. I want to validate the FPS of the model after TensorRT acceleration.

thfylsty commented 1 year ago

> @ymlab Hi, what is the progress on the onnx_export implementation? I want to validate the FPS of the model after TensorRT acceleration.

I have completed exporttoonnx.py and accelerated it with TensorRT; the results are almost consistent with what the author reported. There are TensorRT plugins in the author's code, you just need to register them in TensorRT.

Just refer to the following file: script/view_tranform_cuda/cpp/src/cuda_accelerated_functions_cu_project_v3.cu

The author exports two ONNX models (2D and 3D) and uses project.cu for the projection calculation between them. You can also export a single ONNX model and register the projection as a plugin; I tried this successfully.
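
For reference, registering a compiled plugin library with TensorRT from Python can look roughly like the sketch below (TensorRT 8-era API); the library path is a placeholder, not a file name from this repository, so adapt it to wherever the author's plugin code is built.

import ctypes
import tensorrt as trt

# Placeholder path to the compiled plugin shared library.
PLUGIN_LIB = "build/libfastbev_plugin.so"

logger = trt.Logger(trt.Logger.WARNING)

# Loading the shared library lets its plugin creators register themselves;
# init_libnvinfer_plugins additionally registers TensorRT's built-in plugins.
ctypes.CDLL(PLUGIN_LIB)
trt.init_libnvinfer_plugins(logger, "")

# The projection plugin should now appear in the plugin registry.
print([c.name for c in trt.get_plugin_registry().plugin_creator_list])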

cyn-liu commented 1 year ago

> @ymlab Hi, what is the progress on the onnx_export implementation? I want to validate the FPS of the model after TensorRT acceleration.

> I have completed exporttoonnx.py and accelerated it with TensorRT; the results are almost consistent with what the author reported. There are TensorRT plugins in the author's code, you just need to register them in TensorRT.

@thfylsty Thanks for your reply. Could you share your exporttoonnx.py and the code you use to accelerate it with TensorRT?

thfylsty commented 1 year ago

I removed the temporal sequence and only used 3 cameras. The code is for reference only.

The author has also completed this part of the code, though it may not be completely polished yet; I modified it based on his export code.

The most important thing is to feed the image in as "input" and get the 2D part right. The "project" step is handled by a TensorRT plugin, followed by the 3D part. Finally, decode the results in TensorRT.

# Imports assumed for this snippet; DETECTORS, FastBEV, get_points and
# backproject_inplace come from the Fast-BEV codebase this class lives next to.
import math

import numpy as np
import torch


class TestPlugin(torch.autograd.Function):

    @staticmethod
    def symbolic(g, input):
        # Export a generic "Plugin" node; the TensorRT plugin named "Project2Dto3D" handles it at runtime.
        return g.op("Plugin", input, name_s="Project2Dto3D", info_s="")

    # This forward only exists so the ONNX export can trace through the op; its body
    # hardly matters, it just has to return a tensor with the plugin's output shape.
    @staticmethod
    def forward(ctx, input):
        return torch.zeros((1, 64, 200, 200, 4))

@DETECTORS.register_module()
class FastBEVTRT(FastBEV):
    def __init__(self,**kwargs):
        super().__init__(**kwargs)

        n_images = self.camera_num
        stride = 4

        # img_extrinsic = np.ones([n_images,4,4]) # camnum x 4 x 4 np
        # img_intrinsic = np.ones([4,4]) # camnum x 4 x 4 np

        img_extrinsic = np.array([[[0.0293,-0.99,0.0061,-0.57],[0.005,-0.00598,-0.999,0.78],[0.99,0.029,0.0055,0.675],[0,0,0,1]]]) # camnum x 4 x 4 np
        img_intrinsic = np.array([[[454.477,0,662.663,0],[0,453.804,352.785,0],[0,0,1,0],[0,0,0,1]]]) #  4 x 4 np
        img_extrinsic = img_extrinsic.repeat(n_images, 0)
        img_intrinsic = img_intrinsic.repeat(n_images, 0)
        for i in range(n_images):
            img_extrinsic[i] *= (1+i/10)

        point_cloud_range = np.array([-50, -50, -5, 50, 50, 3])
        origin = (point_cloud_range[:3] + point_cloud_range[3:]) / 2.

        img_meta = {"lidar2img":{"extrinsic":img_extrinsic,"intrinsic":img_intrinsic,"origin":origin},}
        img_shape=[512,512]
        self.img_shape = img_shape
        self.img_meta = img_meta
        self.fea_shape = [img_shape[0] // stride, img_shape[1] // stride]
        self.height, self.width = img_shape[0] // stride, img_shape[1] // stride
        self.extrinsic_noise = 0  # read back by project_torch() below
        self.device = "cpu"

        self.upsample_list = []
        scale = [2,4,8]
        for i in range(3):
            self.upsample_list.append(torch.nn.Upsample(size=None, scale_factor=scale[i], mode='bilinear', align_corners=False))

    def onnx_export_2d(self, img):
        # Backbone + FPN neck; the three coarser feature maps are upsampled
        # to the finest resolution and fused into a single feature map.
        x = self.backbone(img)
        feas = list(self.neck(x))
        for i in range(3):
            feas[i + 1] = self.upsample_list[i](feas[i + 1])
        x = torch.cat(feas, dim=1)
        x = self.neck_fuse(x)

        return x

    def onnx_export_3d(self, x, _=None):
        x = self.neck_3d(x)  # [1, 64, 200, 200, 4]
        cls_score, bbox_pred, dir_cls_preds = self.bbox_head(x)
        cls_score = cls_score[0].sigmoid()
        # Concatenate the head outputs into a single tensor for ONNX export.
        x = torch.cat((cls_score, bbox_pred[0], dir_cls_preds[0]), dim=1)
        return x

    def project_torch(self,features):
        stride_i = math.ceil(self.img_shape[-1] / features.shape[-1])  # P4 880 / 32 = 27.5
        projection = self._compute_projection(
            self.img_meta, stride_i, noise=self.extrinsic_noise).to(features.device)

        n_voxels, voxel_size = self.n_voxels[0], self.voxel_size[0]

        points = get_points(  # [3, vx, vy, vz]
            n_voxels=torch.tensor(n_voxels),
            voxel_size=torch.tensor(voxel_size),
            origin=torch.tensor(self.img_meta["lidar2img"]["origin"]),
        ).to(features.device)

        volume = backproject_inplace(
            features[:, :, :self.height, :self.width], points, projection)  # [c,
        volume = volume.unsqueeze(0)
        return volume

    def forward(
        self,
        img
    ):
        # Squeeze out the batch dimension; in Fast-BEV the batch dimension is used to hold the cameras.
        img = img.squeeze(0)    ## torch.Size([1, cn, 3, 512, 512]) --> torch.Size([cn, 3, 512, 512])
        # print("x fea",img.sum(),img.mean(),img.max(),img.min())

        x = self.onnx_export_2d(img) # out: n 64 1/4 1/4size   --> torch.Size([3, 64, 128, 128]) [3,  128, 128,64]
        # print("x fea",x.sum(),x.mean(),x.max(),x.min())

        ## For TensorRT: enable this path when exporting/running with TRT
        x = x.permute(0, 2, 3, 1)
        x = TestPlugin.apply(x) # out: 64 voxel[x y z] --> torch.Size([1, 64, 200, 200, 4])

        # x = self.forward_trt(x)   # out: 64 voxel[x y z] --> torch.Size([1, 64, 200, 200, 4])
        ## For PyTorch: enable this path instead to check that the TRT result matches
        # x = self.project_torch(x)   # out: 64 voxel[x y z] --> torch.Size([1, 64, 200, 200, 4])
        # print("x fea",x.sum(),x.abs().sum(),x.mean(),x.max(),x.min())

        x = self.onnx_export_3d(x)  #out: 1 192   --> torch.Size([1, 192, 100, 100])
        # print("x fea",x.sum(),x.mean(),x.max(),x.min())

        return x
thfylsty commented 1 year ago

Inference time for different numbers of cameras, without the temporal sequence, on a Jetson AGX Xavier:

cam-num   time (ms)
1         25.889
3         34.464
6         46.967
8         55.560
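
For context, latencies like these can be measured with the TensorRT Python runtime roughly as in the sketch below (TensorRT 8-era API); this assumes an already-built engine file (the name fastbev_plugin_n3.engine is a placeholder) and that the plugin library has been loaded as shown earlier, so it illustrates the idea rather than the exact script used here.

import time
import numpy as np
import pycuda.autoinit  # creates a CUDA context
import pycuda.driver as cuda
import tensorrt as trt

logger = trt.Logger(trt.Logger.WARNING)
with open("fastbev_plugin_n3.engine", "rb") as f:  # placeholder engine path
    engine = trt.Runtime(logger).deserialize_cuda_engine(f.read())
context = engine.create_execution_context()

# Allocate one device buffer per binding (the input image and the output tensor).
bindings = []
for i in range(engine.num_bindings):
    shape = engine.get_binding_shape(i)
    dtype = np.dtype(trt.nptype(engine.get_binding_dtype(i)))
    bindings.append(int(cuda.mem_alloc(trt.volume(shape) * dtype.itemsize)))

# Warm up, then time synchronous executions.
for _ in range(10):
    context.execute_v2(bindings)
runs = 100
start = time.perf_counter()
for _ in range(runs):
    context.execute_v2(bindings)
print("mean latency: %.3f ms" % ((time.perf_counter() - start) / runs * 1e3))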

thfylsty commented 1 year ago

@cyn-liu the export.py

Note that cfg.model.type = cfg.model.type + 'TRT' and n_image = 8.


# Imports assumed from the standard mmdet3d/mmcv test script this is adapted from;
# adjust them to your codebase. parse_args() is the usual argument parser (not shown here).
import os

import torch
from mmcv import Config
from mmcv.cnn import fuse_conv_bn
from mmcv.runner import init_dist
from mmdet.apis import set_random_seed
from mmdet.datasets import replace_ImageToTensor
from mmdet3d.models import build_model


def main():
    args = parse_args()

    assert args.out or args.eval or args.format_only or args.show \
        or args.show_dir, \
        ('Please specify at least one operation (save/eval/format/show the '
         'results / save the results) with the argument "--out", "--eval"'
         ', "--format-only", "--show" or "--show-dir"')

    if args.eval and args.format_only:
        raise ValueError('--eval and --format_only cannot be both specified')

    if args.out is not None and not args.out.endswith(('.pkl', '.pickle')):
        raise ValueError('The output file must be a pkl file.')
    print(args.config)
    cfg = Config.fromfile(args.config)
    if args.cfg_options is not None:
        cfg.merge_from_dict(args.cfg_options)
    # import modules from string list.
    if cfg.get('custom_imports', None):
        from mmcv.utils import import_modules_from_strings
        import_modules_from_strings(**cfg['custom_imports'])
    # set cudnn_benchmark
    if cfg.get('cudnn_benchmark', False):
        torch.backends.cudnn.benchmark = True

    cfg.model.pretrained = None
    # in case the test dataset is concatenated
    samples_per_gpu = 1
    if isinstance(cfg.data.test, dict):
        cfg.data.test.test_mode = True
        samples_per_gpu = cfg.data.test.pop('samples_per_gpu', 1)
        if samples_per_gpu > 1:
            # Replace 'ImageToTensor' to 'DefaultFormatBundle'
            cfg.data.test.pipeline = replace_ImageToTensor(
                cfg.data.test.pipeline)
    elif isinstance(cfg.data.test, list):
        for ds_cfg in cfg.data.test:
            ds_cfg.test_mode = True
        samples_per_gpu = max(
            [ds_cfg.pop('samples_per_gpu', 1) for ds_cfg in cfg.data.test])
        if samples_per_gpu > 1:
            for ds_cfg in cfg.data.test:
                ds_cfg.pipeline = replace_ImageToTensor(ds_cfg.pipeline)

    # init distributed env first, since logger depends on the dist info.
    if args.launcher == 'none':
        distributed = False
    else:
        distributed = True
        init_dist(args.launcher, **cfg.dist_params)

    # set random seeds
    if args.seed is not None:
        set_random_seed(args.seed, deterministic=args.deterministic)

    # build the model and load checkpoint
    if args.vis:
        nms_thr = 0.0001
        try:
            cfg.model.test_cfg.nms_thr = nms_thr
        except Exception:
            print('### imvoxelnet except in train.py ###')
            cfg.test_cfg.nms_thr = nms_thr

    if args.extrinsic_noise > 0:
        for i in range(3):
            print('### test camera extrinsic robustness ###')
        cfg.model.extrinsic_noise = args.extrinsic_noise

    cfg.model.train_cfg = None

    cfg.model.type = cfg.model.type + 'TRT'

    model = build_model(cfg.model, test_cfg=cfg.get('test_cfg'))
    model_path = "weights/pth/export.pth"
    from mmcv.runner import save_checkpoint,load_checkpoint

    if os.path.exists(model_path):
        load_checkpoint(model,model_path)
        # model = torch.load(model_path)
    else:
        save_checkpoint(model,model_path)
        # torch.save(model_path,model_path)
    # fp16_cfg = cfg.get('fp16', None)
    # if fp16_cfg is not None:
    #     wrap_fp16_model(model)
    # load_checkpoint(model, args.checkpoint, map_location='cpu')
    if args.fuse_conv_bn:
        model = fuse_conv_bn(model)

    n_image = 8
    img = torch.ones([1,n_image, 3, 512, 512])
    # x = model(img)
    model.eval()

    onnx_path = "weights/onnxs/fastbev_plugin_n"+str(n_image)+".onnx"
    with torch.no_grad():
        torch.onnx.export(
            model,
            (img),
            onnx_path,
            verbose=True,
            opset_version=11,
            input_names=[
                'input'
            ],
            output_names=["output"],
            enable_onnx_checker=False
        )

    from onnxsim import simplify
    import onnx
    model = onnx.load(onnx_path)
    # Optionally simplify the model for easier visualization.
    # model_simp, check = simplify(model, skip_unknown=False)
    # Add tensor shape information via shape inference and save the graph back.
    onnx.save(onnx.shape_inference.infer_shapes(model), onnx_path)
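
Once the ONNX file is written, a TensorRT engine can be built with the Python API along the lines of the sketch below (TensorRT 8-era API). This is a minimal sketch, assuming the projection plugin library has been loaded (see the registration snippet earlier in the thread) and that your parser/plugin setup can resolve the custom "Plugin" node; treat it as a starting point rather than a drop-in script.

import tensorrt as trt

logger = trt.Logger(trt.Logger.WARNING)
builder = trt.Builder(logger)
network = builder.create_network(
    1 << int(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH))
parser = trt.OnnxParser(network, logger)

onnx_path = "weights/onnxs/fastbev_plugin_n8.onnx"  # path produced by the export script above
with open(onnx_path, "rb") as f:
    if not parser.parse(f.read()):
        for i in range(parser.num_errors):
            print(parser.get_error(i))
        raise RuntimeError("failed to parse the ONNX model")

config = builder.create_builder_config()
config.max_workspace_size = 1 << 30    # 1 GiB; adjust for your device
config.set_flag(trt.BuilderFlag.FP16)  # optional, if the device supports it

engine = builder.build_engine(network, config)
with open("fastbev_plugin_n8.engine", "wb") as f:  # placeholder engine name
    f.write(engine.serialize())
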
cyn-liu commented 1 year ago

@thfylsty Thank you very much for sharing. Looking at the inference times you shared for different camera numbers, I have a question: can you change the number of cameras for inference with the model shared by the author (6 cameras by default, nuScenes dataset), or did you retrain the model?

xll426 commented 1 year ago

Thank you very much for your answer.

Could you publish your code? Thank you.

thfylsty commented 1 year ago

@xll426 I will update (soon) my export and deployment version (once I finish cleaning it up), but note that I removed the temporal inference from the author's code. Also, the TensorRT plugin and the ONNX export are really just modifications and reworkings of the author's code. If you want the original experience, I recommend using the author's code directly.

PyTorch and ONNX export: https://github.com/thfylsty/FastBEV

TensorRT acceleration: https://github.com/thfylsty/FastBEV-TensorRT

1. Upgraded the MM framework to the latest major version
2. ONNX export is supported
3. Removed the temporal sequence (because I don't use it)
4. The number of cameras can be changed
5. Deployment with TensorRT

(Too lazy to write this in English ~v~)

Yutong-gannis commented 1 year ago

@thfylsty I noticed that your repository has been deleted. Do you still plan to open-source it later?

thfylsty commented 1 year ago

> @thfylsty I noticed that your repository has been deleted. Do you still plan to open-source it later?

There are still some bugs, the compatibility with public datasets is poor, and the evaluation function is not finished. The deployment code also cannot be applied to public datasets. Releasing it would be too misleading, so sorry, I didn't want to keep that half-finished source code open.

Yutong-gannis commented 1 year ago

@thfylsty OK, thank you.

thfylsty commented 1 year ago

> @thfylsty OK, thank you.

The CUDA source code the author open-sourced can basically be used as-is; my version is also based on his.