Closed YFforever2022 closed 2 years ago
目前看来脱离不了torch,使用torch必然会使总体项目文件变大
目前看来脱离不了torch,使用torch必然会使总体项目文件变大
我们的初衷就是为了部署的代码和依赖更少, 问题还是在于数据在cpu和gpu之间的拷贝增加了耗时吧,三木君这个把很多操作都放在了GPU上, 目前已经在优化了,希望下个版本速度有提升
目前看来脱离不了torch,使用torch必然会使总体项目文件变大
我们的初衷就是为了部署的代码和依赖更少, 问题还是在于数据在cpu和gpu之间的拷贝增加了耗时吧,三木君这个把很多操作都放在了GPU上, 目前已经在优化了,希望下个版本速度有提升
好的,没有问题了,期待您的新版本
import cv2
import torch
import random
import time
import numpy as np
import tensorrt as trt
from PIL import Image
from pathlib import Path
from collections import OrderedDict,namedtuple
# Serialized TensorRT engine to load and the CUDA device that holds all buffers.
w = './yolov7-tiny-nms.trt'
device = torch.device('cuda:0')

# Infer TensorRT Engine
# One record per engine binding: its name, numpy dtype, shape, the torch tensor
# backing it on `device`, and that tensor's raw device address.
Binding = namedtuple('Binding', ('name', 'dtype', 'shape', 'data', 'ptr'))
logger = trt.Logger(trt.Logger.INFO)
trt.init_libnvinfer_plugins(logger, namespace="")

with open(w, 'rb') as f, trt.Runtime(logger) as runtime:
    model = runtime.deserialize_cuda_engine(f.read())

# Allocate a device tensor for every binding the engine reports, in binding order.
bindings = OrderedDict()
for index in range(model.num_bindings):
    name = model.get_binding_name(index)
    dtype = trt.nptype(model.get_binding_dtype(index))
    shape = tuple(model.get_binding_shape(index))
    host_buffer = np.empty(shape, dtype=np.dtype(dtype))
    data = torch.from_numpy(host_buffer).to(device)
    bindings[name] = Binding(
        name=name,
        dtype=dtype,
        shape=shape,
        data=data,
        ptr=int(data.data_ptr()),
    )

# name -> device address, kept in binding order; its values are what gets
# passed to `execute_v2`.
binding_addrs = OrderedDict(
    (binding.name, binding.ptr) for binding in bindings.values()
)
context = model.create_execution_context()
def letterbox(im, new_shape=(640, 640), color=(114, 114, 114), auto=True, scaleup=True, stride=32):
    """Resize an image keeping its aspect ratio and pad it to ``new_shape``.

    Args:
        im: Image array whose first two dimensions are (height, width).
        new_shape: Target (height, width); a single int means a square target.
        color: Constant border color used for the padding.
        auto: When true, the padding is reduced modulo ``stride`` so only the
            minimum stride-aligned rectangle is produced.
        scaleup: When false, the image is only ever shrunk, never enlarged.
        stride: Stride used by the ``auto`` padding reduction.

    Returns:
        ``(im, r, (dw, dh))``: the padded image, the scale ratio applied, and
        the padding added on each side along width and height.
    """
    src_h, src_w = im.shape[:2]
    if isinstance(new_shape, int):
        new_shape = (new_shape, new_shape)
    dst_h, dst_w = new_shape[0], new_shape[1]

    # Scale ratio (new / old), limited by the tighter of the two dimensions.
    r = min(dst_h / src_h, dst_w / src_w)
    if not scaleup:
        r = min(r, 1.0)

    # Size after scaling, in (width, height) order as cv2.resize expects.
    new_unpad = (int(round(src_w * r)), int(round(src_h * r)))

    # Total padding still required along width and height.
    dw = dst_w - new_unpad[0]
    dh = dst_h - new_unpad[1]
    if auto:
        dw, dh = np.mod(dw, stride), np.mod(dh, stride)

    # Split the padding evenly between the two sides.
    dw, dh = dw / 2, dh / 2

    if (src_w, src_h) != new_unpad:
        im = cv2.resize(im, new_unpad, interpolation=cv2.INTER_LINEAR)

    # The +/-0.1 nudges decide which side receives the extra pixel when the
    # per-side padding is a half-integer.
    top = int(round(dh - 0.1))
    bottom = int(round(dh + 0.1))
    left = int(round(dw - 0.1))
    right = int(round(dw + 0.1))
    im = cv2.copyMakeBorder(im, top, bottom, left, right, cv2.BORDER_CONSTANT, value=color)
    return im, r, (dw, dh)
def postprocess(boxes, r, dwdh):
    """Map boxes from letterboxed-image coordinates back to the source image.

    The padding offsets are subtracted and the result is divided by the
    resize ratio. ``boxes`` is modified in place and also returned.

    Args:
        boxes: Float tensor whose last dimension holds four coordinates; a
            single box of shape ``(4,)`` or a batch of shape ``(N, 4)`` both
            work through broadcasting.
        r: Scale ratio that was applied when letterboxing.
        dwdh: Two-element sequence ``(dw, dh)`` with the per-side padding.
            Tuples, lists and NumPy arrays are accepted.

    Returns:
        The same ``boxes`` tensor, updated in place.
    """
    # Repeat (dw, dh) to (dw, dh, dw, dh). Converting to a tuple first keeps
    # `* 2` a sequence repetition even for NumPy arrays, where `* 2` would
    # otherwise double the values instead of repeating them.
    offsets = torch.tensor(tuple(dwdh) * 2).to(boxes.device)
    boxes -= offsets
    boxes /= r
    return boxes
# Class labels, positioned so that a label's list index is its class id
# (this appears to be the 80-class COCO label set).
names = [
    'person', 'bicycle', 'car', 'motorcycle', 'airplane', 'bus', 'train', 'truck',
    'boat', 'traffic light', 'fire hydrant', 'stop sign', 'parking meter', 'bench',
    'bird', 'cat', 'dog', 'horse', 'sheep', 'cow', 'elephant', 'bear', 'zebra',
    'giraffe', 'backpack', 'umbrella', 'handbag', 'tie', 'suitcase', 'frisbee',
    'skis', 'snowboard', 'sports ball', 'kite', 'baseball bat', 'baseball glove',
    'skateboard', 'surfboard', 'tennis racket', 'bottle', 'wine glass', 'cup',
    'fork', 'knife', 'spoon', 'bowl', 'banana', 'apple', 'sandwich', 'orange',
    'broccoli', 'carrot', 'hot dog', 'pizza', 'donut', 'cake', 'chair', 'couch',
    'potted plant', 'bed', 'dining table', 'toilet', 'tv', 'laptop', 'mouse',
    'remote', 'keyboard', 'cell phone', 'microwave', 'oven', 'toaster', 'sink',
    'refrigerator', 'book', 'clock', 'vase', 'scissors', 'teddy bear',
    'hair drier', 'toothbrush',
]

# One random three-channel color (each channel in 0..255) per label.
colors = {
    label: [random.randint(0, 255) for _ in range(3)]
    for label in names
}
def detect_pic(img):
    """Run the TensorRT engine on one image and return the detections as text.

    The image is converted from BGR to RGB, letterboxed, scaled to the 0..1
    range and handed to the engine through the ``images`` binding. The
    detected boxes are mapped back to source-image coordinates.

    Args:
        img: Image array in BGR channel order (e.g. from ``cv2.imread``).

    Returns:
        A string with one CRLF-terminated line per detection, formatted as
        ``class_id,x,y,w,h,confidence``. ``x`` and ``y`` are the first two
        box values (presumably the top-left corner) and ``w``/``h`` are the
        absolute differences to the last two. Empty when nothing is detected.
    """
    # cvtColor returns a new array, so the caller's image is never modified.
    image = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
    image, ratio, dwdh = letterbox(image, auto=False)

    # HWC -> 1 x C x H x W, made contiguous before its address is given to TensorRT.
    image = image.transpose((2, 0, 1))
    image = np.expand_dims(image, 0)
    image = np.ascontiguousarray(image)
    im = torch.from_numpy(image.astype(np.float32)).to(device)
    im /= 255

    binding_addrs['images'] = int(im.data_ptr())
    context.execute_v2(list(binding_addrs.values()))

    count = int(bindings['num_dets'].data[0][0])
    boxes = bindings['det_boxes'].data[0, :count]
    scores = bindings['det_scores'].data[0, :count]
    classes = bindings['det_classes'].data[0, :count]

    # Undo the letterbox transform for all boxes at once and move each result
    # tensor to the host a single time, instead of converting and copying
    # per detection inside the loop.
    boxes = postprocess(boxes, ratio, dwdh).round().int()
    box_rows = boxes.tolist()
    confidences = scores.tolist()
    class_ids = classes.tolist()

    lines = []
    for (x1, y1, x2, y2), conf, class_id in zip(box_rows, confidences, class_ids):
        width = abs(x2 - x1)
        height = abs(y2 - y1)
        lines.append(f'{int(class_id)},{x1},{y1},{width},{height},{conf}\r\n')
    return ''.join(lines)
# Warm up the engine with 10 inferences on random input so that one-time
# initialization cost is not included in the timings below.
for _ in range(10):
    tmp = torch.randn(1, 3, 640, 640).to(device)
    binding_addrs['images'] = int(tmp.data_ptr())
    context.execute_v2(list(binding_addrs.values()))

img2 = cv2.imread('src/1.jpg')
# cv2.imread signals an unreadable or missing file by returning None rather
# than raising, so fail here with a clear message.
if img2 is None:
    raise FileNotFoundError("could not read image 'src/1.jpg'")

# Time 10 full detect_pic() calls (preprocessing + inference + postprocessing).
# perf_counter is monotonic and high-resolution, which matters when the
# intervals being measured are only a few milliseconds long.
for _ in range(10):
    t1 = time.perf_counter()
    a = detect_pic(img2)
    t2 = time.perf_counter()
    print(f'Cost {(t2 - t1) * 1000} ms')
print(a)
原来的代码,不计入图片加载的耗时,在我的计算机上推理耗时是4-5ms。重新整理成方法 detect_pic() 之后,耗时变成了13ms左右。这说明大量的时间花在了cv2图片处理上。
https://colab.research.google.com/github/WongKinYiu/yolov7/blob/main/tools/YOLOv7trt.ipynb import cv2 import torch import random import time import numpy as np import tensorrt as trt from PIL import Image from pathlib import Path from collections import OrderedDict,namedtuple
w = './yolov7-tiny-nms.trt' device = torch.device('cuda:0') img = cv2.imread('src/1.jpg')
[08/23/2022-17:20:35] [TRT] [I] [MemUsageChange] Init CUDA: CPU +252, GPU +0, now: CPU 10507, GPU 1049 (MiB) [08/23/2022-17:20:35] [TRT] [I] Loaded engine size: 27 MiB [08/23/2022-17:20:35] [TRT] [I] [MemUsageChange] Init cuBLAS/cuBLASLt: CPU +132, GPU +46, now: CPU 10678, GPU 1122 (MiB) [08/23/2022-17:20:35] [TRT] [I] [MemUsageChange] Init cuDNN: CPU +108, GPU +36, now: CPU 10786, GPU 1158 (MiB) [08/23/2022-17:20:35] [TRT] [I] [MemUsageChange] TensorRT-managed allocation in engine deserialization: CPU +0, GPU +27, now: CPU 0, GPU 27 (MiB) [08/23/2022-17:20:35] [TRT] [I] [MemUsageChange] Init cuBLAS/cuBLASLt: CPU +0, GPU +8, now: CPU 10761, GPU 1172 (MiB) [08/23/2022-17:20:35] [TRT] [I] [MemUsageChange] Init cuDNN: CPU +0, GPU +8, now: CPU 10761, GPU 1180 (MiB) [08/23/2022-17:20:35] [TRT] [I] [MemUsageChange] TensorRT-managed allocation in IExecutionContext creation: CPU +0, GPU +65, now: CPU 0, GPU 92 (MiB) Cost 4.830199999999785 ms
GTX1080 torch 1.10.1+cu102 torchvision 0.11.2+cu102 tensorrt 8.4.1.5 opencv-python 4.5.1.48