Linaom1214 / TensorRT-For-YOLO-Series

tensorrt for yolo series (YOLOv10,YOLOv9,YOLOv8,YOLOv7,YOLOv6,YOLOX,YOLOv5), nms plugin support
915 stars 155 forks source link

使用这一份代码,推理速度提高了,例子中使用您的export.py导出的trt模型 #43

Closed YFforever2022 closed 2 years ago

YFforever2022 commented 2 years ago

https://colab.research.google.com/github/WongKinYiu/yolov7/blob/main/tools/YOLOv7trt.ipynb import cv2 import torch import random import time import numpy as np import tensorrt as trt from PIL import Image from pathlib import Path from collections import OrderedDict,namedtuple

w = './yolov7-tiny-nms.trt' device = torch.device('cuda:0') img = cv2.imread('src/1.jpg')

[08/23/2022-17:20:35] [TRT] [I] [MemUsageChange] Init CUDA: CPU +252, GPU +0, now: CPU 10507, GPU 1049 (MiB) [08/23/2022-17:20:35] [TRT] [I] Loaded engine size: 27 MiB [08/23/2022-17:20:35] [TRT] [I] [MemUsageChange] Init cuBLAS/cuBLASLt: CPU +132, GPU +46, now: CPU 10678, GPU 1122 (MiB) [08/23/2022-17:20:35] [TRT] [I] [MemUsageChange] Init cuDNN: CPU +108, GPU +36, now: CPU 10786, GPU 1158 (MiB) [08/23/2022-17:20:35] [TRT] [I] [MemUsageChange] TensorRT-managed allocation in engine deserialization: CPU +0, GPU +27, now: CPU 0, GPU 27 (MiB) [08/23/2022-17:20:35] [TRT] [I] [MemUsageChange] Init cuBLAS/cuBLASLt: CPU +0, GPU +8, now: CPU 10761, GPU 1172 (MiB) [08/23/2022-17:20:35] [TRT] [I] [MemUsageChange] Init cuDNN: CPU +0, GPU +8, now: CPU 10761, GPU 1180 (MiB) [08/23/2022-17:20:35] [TRT] [I] [MemUsageChange] TensorRT-managed allocation in IExecutionContext creation: CPU +0, GPU +65, now: CPU 0, GPU 92 (MiB) Cost 4.830199999999785 ms

GTX1080 torch 1.10.1+cu102 torchvision 0.11.2+cu102 tensorrt 8.4.1.5 opencv-python 4.5.1.48

YFforever2022 commented 2 years ago

目前看来脱离不了torch,使用torch必然会使总体项目文件变大

Linaom1214 commented 2 years ago

目前看来脱离不了torch,使用torch必然会使总体项目文件变大

我们的初衷就是为了部署的代码和依赖更少, 问题还是在于数据在cpu和gpu之间的拷贝增加了耗时吧,三木君这个把很多操作都放在了GPU上, 目前已经在优化了,希望下个版本速度有提升

YFforever2022 commented 2 years ago

目前看来脱离不了torch,使用torch必然会使总体项目文件变大

我们的初衷就是为了部署的代码和依赖更少, 问题还是在于数据在cpu和gpu之间的拷贝增加了耗时吧,三木君这个把很多操作都放在了GPU上, 目前已经在优化了,希望下个版本速度有提升

好的,没有问题了,期待您的新版本

YFforever2022 commented 2 years ago
import cv2
import torch
import random
import time
import numpy as np
import tensorrt as trt
from PIL import Image
from pathlib import Path
from collections import OrderedDict,namedtuple

# Path of the serialized TensorRT engine file that is read and deserialized below.
w = './yolov7-tiny-nms.trt'
# Device on which every binding buffer (and later the input tensor) is placed.
device = torch.device('cuda:0')

# Infer TensorRT Engine
# Record for one engine binding: its name, numpy dtype, shape, the torch tensor
# that backs it, and that tensor's raw address as an int.
Binding = namedtuple('Binding', ('name', 'dtype', 'shape', 'data', 'ptr'))
logger = trt.Logger(trt.Logger.INFO)
# Register the TensorRT plugin library before deserializing.
# NOTE(review): presumably required because the engine embeds an NMS plugin — confirm.
trt.init_libnvinfer_plugins(logger, namespace="")
with open(w, 'rb') as f, trt.Runtime(logger) as runtime:
    model = runtime.deserialize_cuda_engine(f.read())
# Insertion order follows the engine's binding indices; the ordered addresses are
# later passed as a list to context.execute_v2, so this order must be preserved.
bindings = OrderedDict()
for index in range(model.num_bindings):
    name = model.get_binding_name(index)
    dtype = trt.nptype(model.get_binding_dtype(index))
    shape = tuple(model.get_binding_shape(index))
    # Allocate an uninitialised buffer of the binding's shape/dtype and move it to `device`.
    data = torch.from_numpy(np.empty(shape, dtype=np.dtype(dtype))).to(device)
    bindings[name] = Binding(name, dtype, shape, data, int(data.data_ptr()))
# name -> buffer address. The 'images' entry is overwritten before each
# inference with the address of the current input tensor.
binding_addrs = OrderedDict((n, d.ptr) for n, d in bindings.items())
context = model.create_execution_context()

def letterbox(im, new_shape=(640, 640), color=(114, 114, 114), auto=True, scaleup=True, stride=32):
    """Resize ``im`` with its aspect ratio preserved, then pad it with ``color``.

    Args:
        im: Image array whose first two axes are (height, width).
        new_shape: Target (height, width); a single int means a square target.
        color: Constant border value handed to ``cv2.copyMakeBorder``.
        auto: When true, pad only up to the next multiple of ``stride``
            instead of filling the whole target shape.
        scaleup: When false, the image is never enlarged (scale capped at 1.0).
        stride: Modulus used for the padding when ``auto`` is true.

    Returns:
        Tuple ``(image, ratio, (dw, dh))``: the resized and padded image, the
        scale factor that was applied, and the per-side horizontal/vertical
        padding (may be fractional).
    """
    src_h, src_w = im.shape[:2]
    if isinstance(new_shape, int):
        new_shape = (new_shape, new_shape)
    target_h, target_w = new_shape[0], new_shape[1]

    # Largest scale at which both dimensions still fit inside the target.
    r = min(target_h / src_h, target_w / src_w)
    if not scaleup:
        # Shrink-only mode: never enlarge the source image.
        r = min(r, 1.0)

    # Size of the resized image before padding, in cv2's (width, height) order.
    new_unpad = (int(round(src_w * r)), int(round(src_h * r)))
    dw = target_w - new_unpad[0]
    dh = target_h - new_unpad[1]

    if auto:
        # Keep only the padding needed to reach a multiple of the stride.
        dw = np.mod(dw, stride)
        dh = np.mod(dh, stride)

    # Split the total padding evenly between the two opposite sides.
    dw = dw / 2
    dh = dh / 2

    if (src_w, src_h) != new_unpad:
        im = cv2.resize(im, new_unpad, interpolation=cv2.INTER_LINEAR)

    # The -/+0.1 nudges make an odd total padding split as floor/ceil.
    top = int(round(dh - 0.1))
    bottom = int(round(dh + 0.1))
    left = int(round(dw - 0.1))
    right = int(round(dw + 0.1))
    im = cv2.copyMakeBorder(im, top, bottom, left, right, cv2.BORDER_CONSTANT, value=color)
    return im, r, (dw, dh)

def postprocess(boxes, r, dwdh):
    """Map boxes from letterboxed-image coordinates back to the source image.

    The padding offsets are subtracted and the result is divided by the resize
    ratio. The input tensor is left untouched; a new tensor is returned, so
    views into shared buffers (e.g. the engine's output buffer) are not
    modified as a side effect.

    Args:
        boxes: Tensor whose last dimension is 4, laid out as (x1, y1, x2, y2).
            May be a single box of shape (4,) or a batch of shape (N, 4).
        r: Scale factor that was applied when the image was letterboxed.
        dwdh: Pair ``(dw, dh)`` of per-side horizontal/vertical padding.

    Returns:
        A new tensor of the same shape as ``boxes`` in source-image coordinates.
    """
    # Integer boxes cannot hold fractional padding, so fall back to float32 there.
    offset_dtype = boxes.dtype if boxes.is_floating_point() else torch.float32
    # (dw, dh) repeated twice lines up with (x1, y1, x2, y2). Built directly on
    # the boxes' device to avoid an extra host-to-device copy.
    offsets = torch.tensor(tuple(dwdh) * 2, dtype=offset_dtype, device=boxes.device)
    return (boxes - offsets) / r

# Class labels; the position in this list is the class id used in the detections.
names = [
    'person', 'bicycle', 'car', 'motorcycle', 'airplane', 'bus', 'train', 'truck',
    'boat', 'traffic light', 'fire hydrant', 'stop sign', 'parking meter', 'bench',
    'bird', 'cat', 'dog', 'horse', 'sheep', 'cow', 'elephant', 'bear', 'zebra',
    'giraffe', 'backpack', 'umbrella', 'handbag', 'tie', 'suitcase', 'frisbee',
    'skis', 'snowboard', 'sports ball', 'kite', 'baseball bat', 'baseball glove',
    'skateboard', 'surfboard', 'tennis racket', 'bottle', 'wine glass', 'cup',
    'fork', 'knife', 'spoon', 'bowl', 'banana', 'apple', 'sandwich', 'orange',
    'broccoli', 'carrot', 'hot dog', 'pizza', 'donut', 'cake', 'chair', 'couch',
    'potted plant', 'bed', 'dining table', 'toilet', 'tv', 'laptop', 'mouse',
    'remote', 'keyboard', 'cell phone', 'microwave', 'oven', 'toaster', 'sink',
    'refrigerator', 'book', 'clock', 'vase', 'scissors', 'teddy bear',
    'hair drier', 'toothbrush',
]
# One random three-channel colour (each channel in 0..255) per class label.
colors = {label: [random.randint(0, 255) for _ in range(3)] for label in names}

def detect_pic(img):
    """Run the TensorRT engine on one image and return its detections as text.

    The image is converted from BGR to RGB, letterboxed, scaled to [0, 1] and
    executed through the module-level ``context``. Boxes are mapped back to
    source-image coordinates with ``postprocess``.

    Args:
        img: Image array as returned by ``cv2.imread`` (height x width x 3,
            interpreted as BGR).

    Returns:
        A string with one line per detection in the form
        ``class_id,x,y,w,h,confidence`` where ``x, y`` is the first box corner
        and ``w, h`` the box size. Every line ends with ``"\\r\\n"``; the
        string is empty when nothing was detected.
    """
    rgb = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
    padded, ratio, dwdh = letterbox(rgb, auto=False)

    # HWC -> CHW, add a leading batch axis, and make the memory contiguous
    # because the engine is given a raw pointer to this data.
    blob = np.ascontiguousarray(np.expand_dims(padded.transpose((2, 0, 1)), 0))
    tensor = torch.from_numpy(blob.astype(np.float32)).to(device)
    tensor /= 255

    # Point the input binding at this tensor. `tensor` stays referenced until
    # the execute call below has returned, so the address remains valid.
    binding_addrs['images'] = int(tensor.data_ptr())
    context.execute_v2(list(binding_addrs.values()))

    # Number of valid detections for the single image in the batch.
    count = int(bindings['num_dets'].data[0][0])
    boxes = bindings['det_boxes'].data[0, :count]
    scores = bindings['det_scores'].data[0, :count]
    classes = bindings['det_classes'].data[0, :count]

    # Post-process all boxes at once and convert each output to Python values
    # with a single .tolist() per tensor, instead of several conversions per
    # detection inside the loop.
    boxes = postprocess(boxes, ratio, dwdh).round().int().tolist()
    scores = scores.tolist()
    classes = classes.tolist()

    lines = []
    for (x1, y1, x2, y2), score, cls in zip(boxes, scores, classes):
        box_w = abs(x2 - x1)
        box_h = abs(y2 - y1)
        lines.append(f'{int(cls)},{x1},{y1},{box_w},{box_h},{float(score)}\r\n')
    return ''.join(lines)

# Warm-up: execute the engine 10 times on a random input before timing.
# NOTE(review): presumably this keeps one-time initialisation cost out of the
# measurements below — confirm.
tmp = torch.randn(1, 3, 640, 640).to(device)
binding_addrs['images'] = int(tmp.data_ptr())
for _ in range(10):
    context.execute_v2(list(binding_addrs.values()))

img2 = cv2.imread('src/1.jpg')
if img2 is None:
    # cv2.imread returns None instead of raising when the file cannot be read.
    raise FileNotFoundError("could not read image 'src/1.jpg'")

# Time the full detect_pic() call (pre-processing, inference, post-processing).
# perf_counter is a monotonic, high-resolution clock, which matters for
# intervals of only a few milliseconds.
for _ in range(10):
    t1 = time.perf_counter()
    a = detect_pic(img2)
    t2 = time.perf_counter()
    print(f'Cost {(t2 - t1) * 1000} ms')
    print(a)

原来的代码,在不计入图片加载耗时的情况下,我的计算机推理耗时是4-5ms。重新整理得到的方法 detect_pic(),耗时变成了13ms左右。这说明大量的时间花在了cv2图片处理上。