环境

FastDeploy版本： fastdeploy_python-1.0.1-cp38-cp38-win_amd64.whl
系统平台:Windows x64(Windows10)
硬件：如 Nvidia GPU 3070TI， CUDA 11.6 CUDNN 8.5
编译语言： Python 3.8

问题描述

针对相同的yolov5导出的onnx模型，先预加载运行一次，然后再运行10次计算平均时间， torch的推理时间约为0.0132秒，fastdeploy的推理时间约为0.023秒。请问下，该如何配置优化。

fastdeploy测试脚本：

''' 车牌对象识别 ''' import os import cv2 import psutil from PIL import Image import fastdeploy as fd from common import is_en_fp16, set_en_fp16, get_gpuid, get_most_idle_gpu, plot_one_box, merge_iou import numpy as np import time

# Cached detector instance; stays None until load_model() fills it in.
resnet_model = None

# Confidence and IoU thresholds for detection.
conf_thres = 0.3
iou_thres = 0.3

# ONNX model file and the file used for the TensorRT cache.
weights = 'weights/plates.onnx'
trtfile = 'weights/plates.trt'

# Class labels (indexed by class id) and the colour used when drawing boxes.
names = ['plate']
sel_color = (12, 16, 255)

def load_model():
    """Create the FastDeploy YOLOv5 detector once and cache it in ``resnet_model``.

    Calls made after a model has been cached return immediately. The model is
    configured for GPU execution with the TensorRT backend, using ``trtfile``
    as the TensorRT cache file, and FP16 is switched on when ``is_en_fp16()``
    says so.
    """
    global resnet_model, names, colors, imgsz
    if resnet_model:
        return

    # Runtime configuration for inference.
    runtime_opt = fd.RuntimeOption()

    # Choose the GPU: take the configured id, and fall back to the most idle
    # GPU when get_gpuid() returns -1 (presumably "not configured" - confirm).
    device_id = get_gpuid()
    if device_id == -1:
        device_id = get_most_idle_gpu()
    print(f'chk_plates use_gpuid:{device_id}')
    runtime_opt.use_gpu(device_id)

    # Select the inference backend: TensorRT with an on-disk cache.
    runtime_opt.use_trt_backend()
    runtime_opt.set_trt_cache_file(trtfile)
    if is_en_fp16():
        runtime_opt.enable_trt_fp16()
    # Optional settings, currently disabled:
    # runtime_opt.set_trt_input_shape('images', (1, 3, 640, 640), (1, 3, 1280, 1280), (1, 3, 1280, 1280))
    # runtime_opt.set_trt_max_workspace_size(1 << 28) # 256M 1GB

    # The params file argument is left empty for this ONNX model.
    detector = fd.vision.detection.YOLOv5(weights, '', runtime_opt)
    detector.preprocessor.size = [1280, 1280]
    resnet_model = detector

def get_plate_rects(img0: np.ndarray, draw=False):
    """Detect plates in an image and return them ordered by box area.

    Args:
        img0: Input image as a numpy array; a ``PIL.Image.Image`` is also
            accepted and converted to an array first.
        draw: When true, each returned box is drawn onto ``img0`` with its
            label and confidence.

    Returns:
        A tuple ``(objs, img0)``. ``objs`` is whatever ``merge_iou`` returns
        for the list of ``(area, class_id, box, confidence)`` tuples sorted by
        area, largest first. ``img0`` is the (possibly annotated) image array.
    """
    if isinstance(img0, Image.Image):
        img0 = np.array(img0)

    load_model()

    # Pass the module-level thresholds explicitly. Without them predict()
    # falls back to its own defaults, so conf_thres / iou_thres would be
    # ignored and results would not be comparable with the torch script.
    result = resnet_model.predict(img0, conf_thres, iou_thres)

    # Boxes are indexed as (x1, y1, x2, y2): area = height * width.
    objs = [
        ((box[3] - box[1]) * (box[2] - box[0]), cls, box, conf)
        for cls, conf, box in zip(result.label_ids, result.scores, result.boxes)
    ]
    objs.sort(key=lambda t: t[0], reverse=True)
    objs = merge_iou(objs)

    # Alternative built-in renderer, kept for reference:
    # img0 = fd.vision.visualize.vis_detection(img0, result, score_threshold=conf_thres)
    if draw:
        for _, icls, rect, conf in objs:
            label = f'{names[icls]} {conf:.2f}'
            plot_one_box(rect, img0, label=label, color=sel_color, line_thickness=2)

    return objs, img0

if __name__ == '__main__':
    # Benchmark entry point: one warm-up call, then the average of `cnt` runs.
    set_en_fp16(True)

    fname = 'imgs/b5125c60-4dcd-4cdf-998a-d418591e041a.jpg'
    # fname = 'imgs/123456.jpg'
    # fname = 'imgs/fd55869c-5f4d-4bf5-a3dc-b4bde317370f.jpg'
    img0 = cv2.imdecode(np.fromfile(fname, dtype=np.uint8), cv2.IMREAD_COLOR)  # BGR
    print('img0.shape:', img0.shape)

    # Warm-up call: the model is loaded here, so load time is kept out of
    # the timed loop below.
    objs, img2 = get_plate_rects(img0, True)
    # objs = get_plate_rects_old(img0, True)
    # print('img0.shape:', img0.shape)
    # print('objs:', objs)

    cnt = 10
    # perf_counter() is the high-resolution clock intended for timing short
    # intervals; time.time() can be too coarse for a ~0.2 s total.
    t1 = time.perf_counter()
    for _ in range(cnt):
        get_plate_rects(img0)
    t2 = time.perf_counter()
    print('avg:', (t2-t1)/cnt, 'total:', (t2-t1))

    # Resident memory of this process, in GB.
    mem = psutil.Process(os.getpid()).memory_info().rss / 1024 / 1024 / 1024
    print(f'mem:{mem:.4f}GB')

torch测试脚本：

''' 车牌对象识别 ''' import os import cv2 import torch import random import numpy as np from models.experimental import attempt_load from utils.general import check_img_size, non_max_suppression, scale_coords from utils.dataloaders import letterbox import time from utils.plots import plot_one_box import psutil

# Cached model instance; stays None until load_model() fills it in.
resnet_model = None

# Confidence and IoU thresholds handed to non_max_suppression().
conf_thres = 0.3
iou_thres = 0.3

# Inference switches: half precision input and the model's `augment` flag.
dohalf = False
augment = False

# Letterbox settings. The script first sets stride = 32 / imgsz = 640 and
# then overrides them; only the values below are in effect.
# stride = 32
# imgsz = 640
stride = 64
imgsz = 1280

weights = 'weights/plates.pt'
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Filled in by load_model().
names = []
colors = []

def load_model():
    """Load the torch checkpoint once and cache it in ``resnet_model``.

    Also updates the module globals ``imgsz`` (passed through
    ``check_img_size`` with the model's grid size), ``names`` (class index to
    label) and ``colors`` (one random RGB triple per class). Does nothing when
    a model is already cached.
    """
    global resnet_model, names, colors, imgsz
    if resnet_model:
        return

    # Load the FP32 model and switch it to inference mode.
    resnet_model = attempt_load(weights, device=device)
    resnet_model.eval()

    # Grid size is the model's largest stride, but never below 32.
    grid = max(int(resnet_model.stride.max()), 32)
    imgsz = check_img_size(imgsz, s=grid)
    # print(f'load model:{time.time()-t1}  imgsz:{imgsz}')

    # Class names live on the model itself, or on `.module` when it is wrapped.
    owner = resnet_model if hasattr(resnet_model, 'names') else resnet_model.module
    names = dict(enumerate(owner.names))
    colors = [[random.randint(0, 255) for _ in range(3)] for _ in names]

def get_img(img0: np.ndarray, auto=True):
    """Letterbox an image and turn it into a normalised batch tensor.

    Args:
        img0: Source image; the channel flip below assumes BGR order, which
            is what ``cv2.imdecode`` produces in this script.
        auto: Forwarded to ``letterbox``. The default ``True`` keeps the
            previous behaviour. Pass ``False`` to pad to the full
            ``imgsz`` x ``imgsz`` square, which is what FastDeploy's
            preprocessing does according to the discussion in this thread.

    Returns:
        ``(img0, img)``: the untouched source image and the preprocessed
        tensor on ``device``, scaled to 0.0-1.0 with a leading batch axis.
    """
    img = letterbox(img0, imgsz, stride=stride, auto=auto)[0]

    # BGR -> RGB, then HWC -> CHW.
    img = img[:, :, ::-1].transpose(2, 0, 1)
    img = np.ascontiguousarray(img)
    img = torch.from_numpy(img).to(device, non_blocking=True)
    img = img.half() if dohalf else img.float()
    img /= 255.0  # 0 - 255 to 0.0 - 1.0
    if img.ndimension() == 3:
        img = img.unsqueeze(0)

    # Original image, preprocessed tensor.
    return img0, img

def get_plate_rects(img0: np.ndarray, draw=False):
    """Detect plates with the torch model and return them by confidence.

    Args:
        img0: Input image array, passed to ``get_img`` for preprocessing.
        draw: When true, every detection is drawn onto ``img0``.

    Returns:
        A list of ``(area, class_id, [x1, y1, x2, y2], confidence)`` tuples
        sorted by confidence, highest first. Box coordinates are integers in
        the coordinate space of ``img0``.
    """
    load_model()

    # The stray `img0 = np.array(org_img)` line was dropped: `org_img` is not
    # defined anywhere, so it raised NameError on every call.
    img0, img = get_img(img0)
    # print('img0:',  img0.shape)
    # print(' img:',  img.shape)

    objs = []
    with torch.no_grad():
        pred, _ = resnet_model(img, augment=augment)  # inference and training outputs

        out = non_max_suppression(pred, conf_thres=conf_thres, iou_thres=iou_thres, labels=None, multi_label=False)

        for det in out:  # detections per image
            if not len(det):
                continue

            # Rescale boxes from the letterboxed size back to the img0 size.
            det[:, :4] = scale_coords(img.shape[2:], det[:, :4], img0.shape).round()

            for *xyxy, conf, cls in reversed(det):
                icls = int(cls)
                label = f'{names[icls]} {conf:.2f}'
                # print(f'cls:{icls}  label:{label}')

                r = [int(v.cpu().item()) for v in xyxy]
                a = (r[3]-r[1]) * (r[2]-r[0])
                objs.append((a, icls, r, float(conf)))
                if draw:
                    plot_one_box(xyxy, img0, label=label, color=colors[icls], line_thickness=1)

    # Sort by confidence (tuple index 3), highest first.
    objs = sorted(objs, key=lambda t: t[3], reverse=True)
    return objs

if __name__ == '__main__':
    # Benchmark entry point: one warm-up call, then the average of `cnt` runs.
    # Uses the same image as the FastDeploy script so the two timings are
    # measured on identical input.
    fname = 'imgs/b5125c60-4dcd-4cdf-998a-d418591e041a.jpg'
    # fname = 'imgs/123456.jpg'
    # fname = 'imgs/fd55869c-5f4d-4bf5-a3dc-b4bde317370f.jpg'
    img0 = cv2.imdecode(np.fromfile(fname, dtype=np.uint8), cv2.IMREAD_COLOR)  # BGR
    print('img0.shape:', img0.shape)

    # Warm-up call: the model is loaded here, so load time is kept out of
    # the timed loop below.
    objs = get_plate_rects(img0, True)

    cnt = 10
    # perf_counter() is the high-resolution clock intended for timing short
    # intervals; time.time() can be too coarse for a ~0.1 s total.
    t1 = time.perf_counter()
    for _ in range(cnt):
        get_plate_rects(img0)
    t2 = time.perf_counter()
    print('avg:', (t2-t1)/cnt, 'total:', (t2-t1))

    # Resident memory of this process, in GB.
    mem = psutil.Process(os.getpid()).memory_info().rss / 1024 / 1024 / 1024
    print(f'mem:{mem:.4f}GB')

wjj19950828 commented 1 year ago

@yz2yz 这个问题我记得之前看过，有以下几个问题：

首先确认下FD版本是fastdeploy_python-1.0.1还是fastdeploy_gpu_python-1.0.1
预处理其实没有对齐：letterbox 的 auto 参数，你那边为 True，FD 为 False，导致你那边的输入图片为 1*3*1280*832，而 FD 的输入图片大小为 1*3*1280*1280。图片越大，预处理以及 infer 的耗时都会增加。请将你代码中的 auto 参数设置为 False，与 FD 对齐
后处理阈值需对齐，添加如下代码到model.preprocessor.size = [1280, 1280]之后

# NOTE(review): this creates a new, standalone postprocessor object; nothing
# in this snippet attaches it to `model`. Presumably the thresholds are meant
# to be set on the model's own postprocessor or passed to predict() - confirm.
postprocessor = fd.vision.detection.YOLOv5Postprocessor()
postprocessor.conf_threshold = 0.3
postprocessor.nms_threshold = 0.3

然后再重测一下

yz2yz commented 1 year ago

@wjj19950828 谢谢，如您指导的一样，改成auto=False，速度和fd差不多了；什么时候fd支持letterbox的auto=True预处理呢？测试情况如云盘，谢谢！链接：https://pan.baidu.com/s/1c7bm9uzjAfByPF_RqBb_kg 提取码：bvwz

PaddlePaddle / FastDeploy

yolov5推理速度比torch原生慢一些 #902

环境