PaddlePaddle / FastDeploy

⚡️An Easy-to-use and Fast Deep Learning Model Deployment Toolkit for ☁️Cloud 📱Mobile and 📹Edge. Including Image, Video, Text and Audio 20+ main stream scenarios and 150+ SOTA models with end-to-end optimization, multi-platform and multi-framework support.
https://www.paddlepaddle.org.cn/fastdeploy
Apache License 2.0
3k stars 465 forks source link

yolov5推理速度比torch原生慢一些 #902

Open yz2yz opened 1 year ago

yz2yz commented 1 year ago

完整脚本和测试图片,网盘: 链接:https://pan.baidu.com/s/1wK9psssA7mL71bof4RJeSQ 提取码:7a3p


环境

问题描述

针对相同的yolov5导出的onnx模型,先预加载运行一次,然后再运行10次计算平均时间, torch的推理时间约为0.0132秒,fastdeploy的推理时间约为0.023秒。请问下,该如何配置优化。

fastdeploy测试脚本:

''' 车牌对象识别 ''' import os import cv2 import psutil from PIL import Image import fastdeploy as fd from common import is_en_fp16, set_en_fp16, get_gpuid, get_most_idle_gpu, plot_one_box, merge_iou import numpy as np import time

# Cached detector instance; stays None until load_model() has built it.
resnet_model = None

# Confidence and IoU thresholds for detection (0.3 each).
conf_thres = 0.3
iou_thres = 0.3

# ONNX model to load and the file used for the TensorRT cache.
weights = 'weights/plates.onnx'
trtfile = 'weights/plates.trt'

# Class-id -> label lookup and the colour handed to plot_one_box.
names = ['plate']
sel_color = (12, 16, 255)

def load_model():
    """Build the FastDeploy YOLOv5 detector once and cache it in ``resnet_model``.

    Calls made after the detector exists return immediately. The model is
    configured to run on a GPU through the TensorRT backend, with FP16 enabled
    when ``is_en_fp16()`` says so, and with a 1280x1280 preprocessing size.
    The module-level ``conf_thres`` / ``iou_thres`` are applied to the model's
    postprocessor so that they actually take effect.
    """
    global resnet_model
    if resnet_model:
        return

    # Runtime configuration for inference.
    option = fd.RuntimeOption()

    # Device selection: when get_gpuid() returns -1, use get_most_idle_gpu().
    gpuid = get_gpuid()
    if gpuid == -1:
        gpuid = get_most_idle_gpu()
    print(f'chk_plates use_gpuid:{gpuid}')
    option.use_gpu(gpuid)

    # Backend selection: TensorRT, using `trtfile` as its cache file.
    option.use_trt_backend()
    option.set_trt_cache_file(trtfile)
    if is_en_fp16():
        option.enable_trt_fp16()
    # Optional tuning that is intentionally left disabled:
    # option.set_trt_input_shape('images', (1, 3, 640, 640), (1, 3, 1280, 1280), (1, 3, 1280, 1280))
    # option.set_trt_max_workspace_size(1 << 28)  # 256 MB

    # The second argument is the params file; it is passed as an empty string.
    model = fd.vision.detection.YOLOv5(weights, '', option)
    model.preprocessor.size = [1280, 1280]

    # Without these two lines conf_thres / iou_thres are never used and the
    # model keeps the postprocessor's own threshold values.
    model.postprocessor.conf_threshold = conf_thres
    model.postprocessor.nms_threshold = iou_thres

    resnet_model = model

def get_plate_rects(img0: np.ndarray, draw=False):
    """Run the cached detector on one image and return its boxes, largest first.

    Args:
        img0: Image to analyse. A ``PIL.Image.Image`` is also accepted and is
            converted with ``np.array`` first.
        draw: When true, every returned box is drawn onto the image with
            ``plot_one_box``.

    Returns:
        ``(objs, image)`` where ``objs`` is whatever ``merge_iou`` returns for
        the list of ``(area, class_id, box, score)`` tuples sorted by area in
        descending order, and ``image`` is the (possibly converted) input.
    """
    load_model()

    frame = np.array(img0) if isinstance(img0, Image.Image) else img0
    detections = resnet_model.predict(frame)

    # box is indexed as [x1, y1, x2, y2]; the area is the sort key.
    candidates = [
        ((box[3] - box[1]) * (box[2] - box[0]), label_id, box, score)
        for label_id, score, box in zip(
            detections.label_ids, detections.scores, detections.boxes
        )
    ]
    candidates.sort(key=lambda item: item[0], reverse=True)
    merged = merge_iou(candidates)

    if draw:
        for _area, label_id, rect, score in merged:
            caption = f'{names[label_id]} {score:.2f}'
            plot_one_box(rect, frame, label=caption, color=sel_color, line_thickness=2)

    return merged, frame

if __name__ == '__main__':
    # Benchmark entry point: one warm-up detection, then the average latency
    # of `cnt` further detections, then the resident memory of the process.
    set_en_fp16(True)

    fname = 'imgs/b5125c60-4dcd-4cdf-998a-d418591e041a.jpg'
    # Other sample images: 'imgs/123456.jpg',
    # 'imgs/fd55869c-5f4d-4bf5-a3dc-b4bde317370f.jpg'

    # The file is read as raw bytes and decoded by OpenCV (BGR channel order).
    img0 = cv2.imdecode(np.fromfile(fname, dtype=np.uint8), cv2.IMREAD_COLOR)
    print('img0.shape:', img0.shape)

    # Warm-up call: the first call also runs load_model(), so it is kept
    # outside the timed loop.
    objs, img2 = get_plate_rects(img0, True)

    cnt = 10
    # perf_counter is the clock intended for measuring short durations.
    t1 = time.perf_counter()
    for _ in range(cnt):
        get_plate_rects(img0)
    t2 = time.perf_counter()
    print('avg:', (t2-t1)/cnt, 'total:', (t2-t1))

    # Resident set size of this process, in GiB.
    mem = psutil.Process(os.getpid()).memory_info().rss / 1024 / 1024 / 1024
    print(f'mem:{mem:.4f}GB')

torch测试脚本:

''' 车牌对象识别 ''' import os import cv2 import torch import random import numpy as np from models.experimental import attempt_load from utils.general import check_img_size, non_max_suppression, scale_coords from utils.dataloaders import letterbox import time from utils.plots import plot_one_box import psutil

# Cached network; stays None until load_model() has loaded the weights.
resnet_model = None

# Thresholds passed to non_max_suppression.
conf_thres = 0.3
iou_thres = 0.3

# Inference switches: FP16 input conversion and the `augment` model argument.
dohalf = False
augment = False

# Letterbox settings. The earlier `stride = 32` / `imgsz = 640` assignments
# were overwritten before any use, so only the effective values are kept.
stride = 64
imgsz = 1280

weights = 'weights/plates.pt'
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Filled in by load_model().
names = []
colors = []

def load_model():
    """Load the YOLOv5 weights once and fill the module-level model state.

    On the first call this sets ``resnet_model`` (switched to eval mode),
    re-validates ``imgsz`` with ``check_img_size`` against the model's largest
    stride (at least 32), and rebuilds ``names`` and ``colors``. Later calls
    return immediately.
    """
    global resnet_model, names, colors, imgsz
    if resnet_model:
        return

    net = attempt_load(weights, device=device)  # FP32 weights
    resnet_model = net
    net.eval()

    # Grid size: the largest model stride, but never below 32.
    grid = max(32, int(net.stride.max()))
    imgsz = check_img_size(imgsz, s=grid)

    # The labels live on the model itself or on its `.module` attribute.
    label_owner = net if hasattr(net, 'names') else net.module
    names = dict(enumerate(label_owner.names))

    # One random colour triple per label.
    colors = [[random.randint(0, 255) for _channel in range(3)] for _label in names]

def get_img(img0: np.ndarray):
    """Turn a BGR image into the normalised tensor fed to the network.

    The image goes through ``letterbox`` with the module-level ``imgsz`` and
    ``stride``. It is then flipped to RGB, reordered from HWC to CHW, moved to
    ``device``, converted to half or float according to ``dohalf``, scaled to
    the 0.0-1.0 range, and given a leading batch dimension if it has only
    three dimensions.

    Returns:
        ``(img0, tensor)``: the untouched input and the prepared tensor.
    """
    boxed = letterbox(img0, imgsz, stride=stride)[0]

    # Reverse the channel axis (BGR -> RGB), then move channels first.
    chw = np.ascontiguousarray(boxed[:, :, ::-1].transpose(2, 0, 1))

    tensor = torch.from_numpy(chw).to(device, non_blocking=True)
    tensor = tensor.half() if dohalf else tensor.float()
    tensor /= 255.0  # 0 - 255 to 0.0 - 1.0
    if tensor.ndimension() == 3:
        tensor = tensor.unsqueeze(0)

    # Original image, processed tensor.
    return img0, tensor

def get_plate_rects(img0: np.ndarray, draw=False):
    """Detect objects in one BGR image with the cached torch model.

    Args:
        img0: Image to analyse.
        draw: When true, each detection is drawn onto ``img0`` with
            ``plot_one_box``.

    Returns:
        A list of ``(area, class_id, [x1, y1, x2, y2], confidence)`` tuples
        sorted by confidence in descending order. Box coordinates are integers
        rescaled to the size of ``img0``.
    """
    load_model()

    # The previous `img0 = np.array(org_img)` referenced an undefined name and
    # raised NameError; the argument is used directly instead.
    img0, img = get_img(img0)

    objs = []
    with torch.no_grad():
        # The model returns a pair; only the first element is used.
        pred, _ = resnet_model(img, augment=augment)
        out = non_max_suppression(pred, conf_thres=conf_thres, iou_thres=iou_thres, labels=None, multi_label=False)

        for det in out:  # detections per image
            if not len(det):
                continue

            # Rescale boxes from the network input size to the img0 size.
            det[:, :4] = scale_coords(img.shape[2:], det[:, :4], img0.shape).round()

            for *xyxy, conf, cls in reversed(det):
                icls = int(cls)
                r = [int(v.cpu().item()) for v in xyxy]
                a = (r[3] - r[1]) * (r[2] - r[0])
                objs.append((a, icls, r, float(conf)))
                if draw:
                    # The label is only needed for drawing, so build it here.
                    label = f'{names[icls]} {conf:.2f}'
                    plot_one_box(xyxy, img0, label=label, color=colors[icls], line_thickness=1)

    objs.sort(key=lambda t: t[3], reverse=True)
    return objs

if __name__ == '__main__':
    # Benchmark entry point: one warm-up detection, then the average latency
    # of `cnt` further detections, then the resident memory of the process.

    # Same sample image as the FastDeploy benchmark so the two timings are
    # comparable. Other sample images: 'imgs/123456.jpg',
    # 'imgs/fd55869c-5f4d-4bf5-a3dc-b4bde317370f.jpg'
    fname = 'imgs/b5125c60-4dcd-4cdf-998a-d418591e041a.jpg'

    # The file is read as raw bytes and decoded by OpenCV (BGR channel order).
    img0 = cv2.imdecode(np.fromfile(fname, dtype=np.uint8), cv2.IMREAD_COLOR)
    print('img0.shape:', img0.shape)

    # Warm-up call: the first call also runs load_model(), so it is kept
    # outside the timed loop.
    objs = get_plate_rects(img0, True)

    cnt = 10
    # CUDA work is queued asynchronously; synchronise before reading the clock
    # so the measured interval contains all GPU work of the timed loop only.
    if torch.cuda.is_available():
        torch.cuda.synchronize()
    t1 = time.perf_counter()
    for _ in range(cnt):
        get_plate_rects(img0)
    if torch.cuda.is_available():
        torch.cuda.synchronize()
    t2 = time.perf_counter()
    print('avg:', (t2-t1)/cnt, 'total:', (t2-t1))

    # Resident set size of this process, in GiB.
    mem = psutil.Process(os.getpid()).memory_info().rss / 1024 / 1024 / 1024
    print(f'mem:{mem:.4f}GB')
wjj19950828 commented 1 year ago

@yz2yz 这个问题我记得之前看过,有以下几个问题:

  1. 首先确认下FD版本是fastdeploy_python-1.0.1还是fastdeploy_gpu_python-1.0.1
  2. 预处理其实没有对齐:letterbox 的 auto 参数在你的代码中为 True,而在 FD 中为 False,因此你那边的输入尺寸是 1×3×1280×832,FD 的输入尺寸是 1×3×1280×1280。输入更大,预处理和推理的耗时自然都会增加。请将你代码中的 auto 参数设置为 False,与 FD 对齐。
  3. 后处理阈值需对齐,添加如下代码到model.preprocessor.size = [1280, 1280]之后
# Set the thresholds on the postprocessor owned by the loaded model; a newly
# constructed YOLOv5Postprocessor is not attached to `model` and would not
# influence model.predict().
model.postprocessor.conf_threshold = 0.3
model.postprocessor.nms_threshold = 0.3

然后再重测一下

yz2yz commented 1 year ago

@wjj19950828 谢谢,如您指导的一样,改成auto=False,速度和fd差不多了;什么时候fd支持letterbox的auto=True预处理呢?测试情况如云盘,谢谢! 链接:https://pan.baidu.com/s/1c7bm9uzjAfByPF_RqBb_kg 提取码:bvwz