open-mmlab / mmdetection

OpenMMLab Detection Toolbox and Benchmark
https://mmdetection.readthedocs.io
Apache License 2.0

How do I post-process the model outputs? #6081

Closed: wojiazaiyugang closed this issue 3 years ago

wojiazaiyugang commented 3 years ago

Thanks to all the developers. My theoretical understanding of deep learning is not very deep; I'm currently building some object-detection features and want to use mmdet as a tool. I have a question: I'd like to use mmdet to train a detection model on my own dataset and then run detection with it. My training config is

_base_ = '../yolox/yolox_l_8x8_300e_coco.py'

model = dict(
    bbox_head=dict(num_classes=1)
)

dataset_type = 'BasketballDetection'
# data_root = '/mnt/nfs-storage/yujiannan/data/bas_data/train_data/'
data_root = '/home/senseport0/data/train_data/'

data = dict(
    samples_per_gpu=4,
    workers_per_gpu=1,
    train=dict(
        dataset=dict(
            type=dataset_type,
            ann_file=data_root + "train_21.3.10_train.json" or "train_21.8.16_train.json",
            img_prefix=data_root)),
    val=dict(
        type=dataset_type,
        img_prefix=data_root,
        ann_file=data_root + "train_21.3.10_val.json" or "val_21.8.16.json"),
    test=dict(
        type=dataset_type,
        img_prefix=data_root,
        ann_file=data_root + "train_21.3.10_val.json" or "val_21.8.16.json"))

optimizer = dict(type='SGD', lr=2e-5, momentum=0.9, weight_decay=5e-4)
evaluation = dict(interval=1, metric=['mAP'])
checkpoint_config = dict(interval=1)
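
For completeness, the custom BasketballDetection dataset type referenced above is registered with mmdet separately, roughly like this (a minimal sketch; I subclass CocoDataset assuming a COCO-style annotation file, and the class name here is illustrative):

from mmdet.datasets import CocoDataset
from mmdet.datasets.builder import DATASETS

@DATASETS.register_module()
class BasketballDetection(CocoDataset):
    # single illustrative class; adjust to the actual annotation categories
    CLASSES = ('basketball',)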

The config is based on YOLOX. Training produced some .pth files, and I want to deploy the trained model with TensorRT on other machines (Ubuntu 18). After reading some references, I decided to convert the .pth file to .onnx and then convert that to a .trt file on the target machine for deployment. My code for loading the trt file and running inference is

import os
from typing import List, Tuple
from dataclasses import dataclass

import cv2
import torch
import torchvision
import numpy as np
import pycuda.driver as cuda
import pycuda.autoinit  # do not remove: creates and initializes the CUDA context
import tensorrt as trt

class HostDeviceMem(object):
    def __init__(self, host_mem, device_mem):
        self.host = host_mem
        self.device = device_mem

    def __str__(self):
        return "Host:\n" + str(self.host) + "\nDevice:\n" + str(self.device)

    def __repr__(self):
        return self.__str__()

@dataclass
class DetectResult:
    """
    检测结果
    """
    bbox: Tuple[int, int, int, int]  # bbox框 左上角xy和右下角xy
    score: float  # 分数

class BasketballDetector:

    def __init__(self, onnx_file: str, engine_file: str):
        self.input_size = (640, 640)  # network input size (height, width)
        self.infer_image_shape = ()  # shape of the image currently being processed
        self.trt_logger = trt.Logger()
        self.engine = self.get_engine(onnx_file, engine_file)
        self.context = self.engine.create_execution_context()
        self.inputs, self.outputs, self.bindings, self.stream = self.allocate_buffers()

    def get_engine(self, onnx_file: str, engine_file: str):
        """
        获取engine
        :param onnx_file:
        :param engine_file:
        :return:
        """
        if os.path.exists(engine_file):
            print(f"读取trt engine {engine_file}")
            return trt.Runtime(self.trt_logger).deserialize_cuda_engine(open(engine_file, "rb").read())
        else:
            print(f"trt engine {engine_file}不存在,开始在线转换 {onnx_file}")
            explicit_batch = 1 << int(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH)
            with trt.Builder(self.trt_logger) as builder, \
                    builder.create_network(explicit_batch) as network, \
                    trt.OnnxParser(network, self.trt_logger) as parser:

                builder.max_workspace_size = 1 << 30  # 1 GiB of scratch memory for tactic selection
                builder.max_batch_size = 1
                if builder.platform_has_fast_fp16:
                    builder.fp16_mode = True  # use FP16 kernels where the GPU supports them
                with open(onnx_file, 'rb') as model_file:
                    if not parser.parse(model_file.read()):
                        print('Failed to parse the ONNX file')
                        for err in range(parser.num_errors):
                            print(parser.get_error(err))
                        return None
                network.get_input(0).shape = [1, 3, *self.input_size]  # pin a static 1x3x640x640 input
                engine = builder.build_cuda_engine(network)
                if engine is None:
                    print('Failed to build engine')
                    return None
                with open(engine_file, 'wb') as f:
                    f.write(engine.serialize())
                return engine

    def allocate_buffers(self):
        inputs = []
        outputs = []
        bindings = []
        stream = cuda.Stream()
        for binding in self.engine:
            size = trt.volume(self.engine.get_binding_shape(binding)) * self.engine.max_batch_size
            dtype = trt.nptype(self.engine.get_binding_dtype(binding))
            # Allocate host and device buffers
            host_mem = cuda.pagelocked_empty(size, dtype)
            device_mem = cuda.mem_alloc(host_mem.nbytes)
            # Append the device buffer to device bindings.
            bindings.append(int(device_mem))
            # Append to the appropriate list.
            if self.engine.binding_is_input(binding):
                inputs.append(HostDeviceMem(host_mem, device_mem))
            else:
                outputs.append(HostDeviceMem(host_mem, device_mem))
        return inputs, outputs, bindings, stream

    def do_inference(self):
        # Transfer input data to the GPU.
        [cuda.memcpy_htod_async(inp.device, inp.host, self.stream) for inp in self.inputs]
        # Run inference. The network was built with an explicit batch dimension,
        # so the v2 execution API is used.
        self.context.execute_async_v2(bindings=self.bindings, stream_handle=self.stream.handle)
        # Transfer predictions back from the GPU.
        [cuda.memcpy_dtoh_async(out.host, out.device, self.stream) for out in self.outputs]
        # Synchronize the stream
        self.stream.synchronize()
        # Return only the host outputs.
        return [out.host for out in self.outputs]

    def decode_outputs(self, outputs):
        """
        trt输出解码
        :return:
        """
        grids = []
        strides = []
        dtype = torch.FloatTensor
        # feature-map sizes for a 640x640 input at strides 8, 16 and 32
        for (hsize, wsize), stride in zip([torch.Size([80, 80]), torch.Size([40, 40]), torch.Size([20, 20])],
                                          [8, 16, 32]):
            yv, xv = torch.meshgrid([torch.arange(hsize), torch.arange(wsize)])
            grid = torch.stack((xv, yv), 2).view(1, -1, 2)
            grids.append(grid)
            shape = grid.shape[:2]
            strides.append(torch.full((*shape, 1), stride))
        grids = torch.cat(grids, dim=1).type(dtype)
        strides = torch.cat(strides, dim=1).type(dtype)
        outputs[..., :2] = (outputs[..., :2] + grids) * strides
        outputs[..., 2:4] = torch.exp(outputs[..., 2:4]) * strides
        return outputs

    def pre_processing(self, img: np.ndarray):
        """
        图像预处理
        :param img:
        :return:
        """
        padded_img = np.ones((self.input_size[0], self.input_size[1], 3), dtype=np.uint8) * 114
        r = min(self.input_size[0] / self.infer_image_shape[0], self.input_size[1] / self.infer_image_shape[1])
        resized_img = cv2.resize(
            img,
            (int(self.infer_image_shape[1] * r), int(self.infer_image_shape[0] * r)),
            interpolation=cv2.INTER_LINEAR,
        ).astype(np.uint8)
        padded_img[: int(self.infer_image_shape[0] * r), : int(self.infer_image_shape[1] * r)] = resized_img
        padded_img = padded_img.transpose((2, 0, 1))
        padded_img = np.ascontiguousarray(padded_img, dtype=np.float32)
        return padded_img

    def post_processing(self, prediction, num_classes, conf_thre=0.7, nms_thre=0.45, class_agnostic=False):
        box_corner = prediction.new(prediction.shape)
        box_corner[:, :, 0] = prediction[:, :, 0] - prediction[:, :, 2] / 2
        box_corner[:, :, 1] = prediction[:, :, 1] - prediction[:, :, 3] / 2
        box_corner[:, :, 2] = prediction[:, :, 0] + prediction[:, :, 2] / 2
        box_corner[:, :, 3] = prediction[:, :, 1] + prediction[:, :, 3] / 2
        prediction[:, :, :4] = box_corner[:, :, :4]
        output = [None for _ in range(len(prediction))]
        for i, image_pred in enumerate(prediction):

            # If none are remaining => process next image
            if not image_pred.size(0):
                continue
            # Get score and class with highest confidence
            class_conf, class_pred = torch.max(image_pred[:, 5: 5 + num_classes], 1, keepdim=True)
            conf_mask = (image_pred[:, 4] * class_conf.squeeze() >= conf_thre).squeeze()
            # Detections ordered as (x1, y1, x2, y2, obj_conf, class_conf, class_pred)
            detections = torch.cat((image_pred[:, :5], class_conf, class_pred.float()), 1)
            detections = detections[conf_mask]
            if not detections.size(0):
                continue
            if class_agnostic:
                nms_out_index = torchvision.ops.nms(
                    detections[:, :4],
                    detections[:, 4] * detections[:, 5],
                    nms_thre,
                )
            else:
                nms_out_index = torchvision.ops.batched_nms(
                    detections[:, :4],
                    detections[:, 4] * detections[:, 5],
                    detections[:, 6],
                    nms_thre,
                )
            detections = detections[nms_out_index]
            if output[i] is None:
                output[i] = detections
            else:
                output[i] = torch.cat((output[i], detections))
        return output

    def infer(self, image) -> List[DetectResult]:
        self.infer_image_shape = image.shape
        image = self.pre_processing(image)
        np.copyto(self.inputs[0].host, image.flatten())
        trt_outputs = self.do_inference()
        trt_outputs = torch.from_numpy(trt_outputs[0])
        # 8400 = 80*80 + 40*40 + 20*20 grid cells; 6 = 4 box values + objectness + 1 class score
        trt_outputs.resize_(1, 8400, 6)
        trt_outputs = self.decode_outputs(trt_outputs)
        trt_outputs = self.post_processing(prediction=trt_outputs,
                                           num_classes=1,
                                           conf_thre=0.3,
                                           nms_thre=0.3,
                                           class_agnostic=True)
        if trt_outputs[0] is None:
            return []
        results = trt_outputs[0].numpy()
        # input_image = cv2.resize(input_image, input_size)
        ratio = min(self.input_size[0] / self.infer_image_shape[0], self.input_size[1] / self.infer_image_shape[1])
        detect_results = []
        for result in results:
            bbox = list(map(int, result[:4] / ratio))
            score = float(result[4] * result[5])
            detect_results.append(DetectResult(bbox=bbox, score=score))
        return detect_results

if __name__ == '__main__':
    import datetime

    image = cv2.imread("assets/dog.jpg")
    print(image.shape)
    basketball_detector = BasketballDetector("checkpoints/tmp.onnx",
                                             "checkpoints/tmp.trt")
    for i in range(10):
        s = datetime.datetime.now()
        print(image.shape)
        detect_results = basketball_detector.infer(image)
        print((datetime.datetime.now() - s).total_seconds() * 1000)
    for detect_result in detect_results:
        if detect_result.score < 0.3:  # scores are in [0, 1]
            continue
        print(detect_result)
        cv2.rectangle(image, (detect_result.bbox[0], detect_result.bbox[1]),
                      (detect_result.bbox[2], detect_result.bbox[3]), (0, 0, 255), 2)
        cv2.putText(image, str(detect_result.score), (detect_result.bbox[0], detect_result.bbox[1] - 10),
                    cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 0), 2)
        # print(bbox, result)
    cv2.imwrite("assets/output.jpg", image)

Initially, the command I used to convert the .pth checkpoint to ONNX was

ONNX_BACKEND=MMCVTensorRT python tools/deployment/pytorch2onnx.py configs/basketball_detection/yolox_l.py --simplify log/epoch_1.pth --shape 640 640

That produced tmp.onnx. I then used the Python tensorrt API to convert the ONNX to TRT and tried to run inference, which failed with the following error: [error screenshot]

My understanding is that mmdet may contain some operation that TensorRT does not support, so I changed the command to

ONNX_BACKEND=MMCVTensorRT python tools/deployment/pytorch2onnx.py configs/basketball_detection/yolox_l.py --simplify log/epoch_1.pth --shape 640 640 --skip-postprocess

i.e. I added --skip-postprocess; the resulting ONNX this time converted to TRT successfully.
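
As a sanity check on what --skip-postprocess leaves in the graph, the raw ONNX outputs can be listed with the onnx package (a small diagnostic sketch; output names and shapes depend on the export):

import onnx

model = onnx.load("checkpoints/tmp.onnx")
for out in model.graph.output:
    # dim_value is 0 for symbolic dimensions, in which case dim_param holds the name
    dims = [d.dim_value if d.dim_value > 0 else d.dim_param
            for d in out.type.tensor_type.shape.dim]
    print(out.name, dims)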

In my inference code, the image pre-processing and post-processing were ported from the YOLOX project, where they work without problems. With mmdet, however, the line trt_outputs = self.do_inference() in the infer function returns a list: [output screenshots]
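
To see which head output each element of that list corresponds to, the engine bindings can be dumped (a small diagnostic sketch against the BasketballDetector class above, using the TensorRT binding API):

detector = BasketballDetector("checkpoints/tmp.onnx", "checkpoints/tmp.trt")
for i in range(detector.engine.num_bindings):
    kind = "input" if detector.engine.binding_is_input(i) else "output"
    print(i, kind, detector.engine.get_binding_name(i), detector.engine.get_binding_shape(i))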

I don't know how to convert the TRT output into the format I need (a set of bboxes with confidence scores), so I'd like to know how to turn the TRT output into meaningful values. I'd also like to ask whether my current pipeline is reasonable for this use case, and whether there is a more elegant solution.

My environment is

root@b77705740e4a:/workspace/mmdetection# python mmdet/utils/collect_env.py 
sys.platform: linux
Python: 3.7.7 (default, May  7 2020, 21:25:33) [GCC 7.3.0]
CUDA available: True
GPU 0: GeForce RTX 2080 Ti
CUDA_HOME: /usr/local/cuda
NVCC: Cuda compilation tools, release 10.1, V10.1.243
GCC: gcc (Ubuntu 7.4.0-1ubuntu1~18.04.1) 7.4.0
PyTorch: 1.6.0
PyTorch compiling details: PyTorch built with:
  - GCC 7.3
  - C++ Version: 201402
  - Intel(R) Math Kernel Library Version 2020.0.1 Product Build 20200208 for Intel(R) 64 architecture applications
  - Intel(R) MKL-DNN v1.5.0 (Git Hash e2ac1fac44c5078ca927cb9b90e1b3066a0b2ed0)
  - OpenMP 201511 (a.k.a. OpenMP 4.5)
  - NNPACK is enabled
  - CPU capability usage: AVX2
  - CUDA Runtime 10.1
  - NVCC architecture flags: -gencode;arch=compute_37,code=sm_37;-gencode;arch=compute_50,code=sm_50;-gencode;arch=compute_60,code=sm_60;-gencode;arch=compute_61,code=sm_61;-gencode;arch=compute_70,code=sm_70;-gencode;arch=compute_75,code=sm_75;-gencode;arch=compute_37,code=compute_37
  - CuDNN 7.6.3
  - Magma 2.5.2
  - Build settings: BLAS=MKL, BUILD_TYPE=Release, CXX_FLAGS= -Wno-deprecated -fvisibility-inlines-hidden -DUSE_PTHREADPOOL -fopenmp -DNDEBUG -DUSE_FBGEMM -DUSE_QNNPACK -DUSE_PYTORCH_QNNPACK -DUSE_XNNPACK -DUSE_VULKAN_WRAPPER -O2 -fPIC -Wno-narrowing -Wall -Wextra -Werror=return-type -Wno-missing-field-initializers -Wno-type-limits -Wno-array-bounds -Wno-unknown-pragmas -Wno-sign-compare -Wno-unused-parameter -Wno-unused-variable -Wno-unused-function -Wno-unused-result -Wno-unused-local-typedefs -Wno-strict-overflow -Wno-strict-aliasing -Wno-error=deprecated-declarations -Wno-stringop-overflow -Wno-error=pedantic -Wno-error=redundant-decls -Wno-error=old-style-cast -fdiagnostics-color=always -faligned-new -Wno-unused-but-set-variable -Wno-maybe-uninitialized -fno-math-errno -fno-trapping-math -Werror=format -Wno-stringop-overflow, PERF_WITH_AVX=1, PERF_WITH_AVX2=1, PERF_WITH_AVX512=1, USE_CUDA=ON, USE_EXCEPTION_PTR=1, USE_GFLAGS=OFF, USE_GLOG=OFF, USE_MKL=ON, USE_MKLDNN=ON, USE_MPI=OFF, USE_NCCL=ON, USE_NNPACK=ON, USE_OPENMP=ON, USE_STATIC_DISPATCH=OFF, 

TorchVision: 0.7.0
OpenCV: 4.5.3
MMCV: 1.3.8
MMCV Compiler: GCC 7.3
MMCV CUDA Compiler: 10.1
MMDetection: 2.16.0+998fd70
jshilong commented 3 years ago

Could you give some help? @RunningLeon @RangiLyu

wojiazaiyugang commented 3 years ago

I also trained an SSD detector, again exported with --skip-postprocess to ONNX and then to TRT, and found that the TRT output is a list of 12 arrays, i.e. the output size differs from model to model. Where can I look to see how these outputs are produced, and how do I turn them into meaningful values?

RunningLeon commented 3 years ago

@wojiazaiyugang You may consider end2end deployment with mmcv-full compiled through MMCV_TRT=1 MMCV_ORT=1 MMCV_OPS=1 pip install -e .
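
With mmcv-full built this way, the exported end-to-end engine (NMS included) can be driven through mmcv's TensorRT wrapper. A minimal sketch, where the engine path and the dets/labels output names are assumptions based on mmdet's ONNX export convention (the wrapper class is spelled TRTWraper in mmcv 1.3.x):

import torch
from mmcv.tensorrt import TRTWraper, load_tensorrt_plugin  # renamed TRTWrapper in later mmcv

load_tensorrt_plugin()  # registers mmcv's custom TRT ops (e.g. the NMS plugin)
trt_model = TRTWraper("end2end.trt", input_names=["input"], output_names=["dets", "labels"])
with torch.no_grad():
    outputs = trt_model({"input": torch.randn(1, 3, 640, 640).cuda()})
dets = outputs["dets"]      # (1, num_dets, 5): x1, y1, x2, y2, score
labels = outputs["labels"]  # (1, num_dets)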

wojiazaiyugang commented 3 years ago

I considered deploying with the mm stack, but gave it up because it requires installing the complete mm environment on the deployment machine, which is cumbersome. Once converted to ONNX, deployment only requires copying the onnx file to the server, and it is converted automatically when first used, which is more convenient.

RunningLeon commented 3 years ago

In that case, you may have to write the decoding and post-processing yourself. You can refer to the Python code in MMDetection.

wojiazaiyugang commented 3 years ago

Yes. Which part of the code should I refer to? Is there a complete post-processing example anywhere?

RunningLeon commented 3 years ago

@RangiLyu Could you take a look at this?

wojiazaiyugang commented 3 years ago

I'm still waiting for a helpful answer....

RangiLyu commented 3 years ago

If you use --skip-postprocess, you have to implement the post-processing part in your deployment code yourself. For example, if you use YOLOX, you need to refer to the get_bboxes function in https://github.com/open-mmlab/mmdetection/blob/fe396572839f30351aa862d2b2702f57698cefea/mmdet/models/dense_heads/yolox_head.py
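
For reference, a condensed sketch of that decoding for YOLOX: with --skip-postprocess the engine exposes per-level cls_score, bbox_pred and objectness maps, which can be flattened into the (1, 8400, 5 + num_classes) layout that the decode_outputs/post_processing code earlier in this thread expects. Whether sigmoid has already been applied inside the exported graph depends on where the export cuts the model, so treat the sigmoid calls below as an assumption:

import torch

def flatten_yolox_outputs(cls_scores, bbox_preds, objectnesses):
    # cls_scores[i]: (1, C, H, W), bbox_preds[i]: (1, 4, H, W), objectnesses[i]: (1, 1, H, W)
    # for the three feature levels (strides 8, 16 and 32).
    flat = []
    for cls, box, obj in zip(cls_scores, bbox_preds, objectnesses):
        level = torch.cat([box, obj.sigmoid(), cls.sigmoid()], dim=1)  # (1, 5 + C, H, W)
        flat.append(level.flatten(2).permute(0, 2, 1))                 # (1, H*W, 5 + C)
    return torch.cat(flat, dim=1)  # (1, 8400, 5 + C) for a 640x640 input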

jshilong commented 3 years ago

Feel free to reopen the issue if there are any further questions.