triple-Mu / YOLOv8-TensorRT

YOLOv8 using TensorRT accelerate !
MIT License
1.29k stars 223 forks source link

python 程序中每帧图像的耗时问题 #236

Open stephen-TT opened 1 month ago

stephen-TT commented 1 month ago

大佬 您好,我用的自己训练的yolov8s pt模型通过readme中的教程转成的engine,然后修改了infer_det.py代码使其能够调用摄像头,我想计算每帧的耗时,不知道下面的代码计算对不对:

from models import TRTModule  # isort:skip
import argparse
from pathlib import Path

import time
import cv2
import torch

from config import CLASSES_DET, COLORS
from models.torch_utils import det_postprocess
from models.utils import blob, letterbox, path_to_list

def _preprocess(bgr, device, size):
    """Letterbox *bgr* to *size* ((W, H)) and return (tensor, ratio, dwdh).

    dwdh is the letterbox padding repeated ((dw, dh, dw, dh)) as a float32
    tensor on *device*, so it can be subtracted from xyxy boxes directly.
    """
    resized, ratio, dwdh = letterbox(bgr, size)
    rgb = cv2.cvtColor(resized, cv2.COLOR_BGR2RGB)
    tensor = blob(rgb, return_seg=False)
    dwdh = torch.asarray(dwdh * 2, dtype=torch.float32, device=device)
    tensor = torch.asarray(tensor, device=device)
    return tensor, ratio, dwdh


def _draw_detections(draw, bboxes, scores, labels):
    """Draw one rectangle + 'class:score' label per detection onto *draw* in place."""
    for (bbox, score, label) in zip(bboxes, scores, labels):
        bbox = bbox.round().int().tolist()
        cls_id = int(label)
        cls = CLASSES_DET[cls_id]
        color = COLORS[cls]

        text = f'{cls}:{score:.3f}'
        x1, y1, x2, y2 = bbox

        (_w, _h), _bl = cv2.getTextSize(text, cv2.FONT_HERSHEY_SIMPLEX, 0.8, 1)
        # keep the label background inside the image
        _y1 = min(y1 + 1, draw.shape[0])

        cv2.rectangle(draw, (x1, y1), (x2, y2), color, 2)
        cv2.rectangle(draw, (x1, _y1), (x1 + _w, _y1 + _h + _bl), (0, 0, 255), -1)
        cv2.putText(draw, text, (x1, _y1 + _h), cv2.FONT_HERSHEY_SIMPLEX, 0.75, (255, 255, 255), 2)


def main(args: argparse.Namespace) -> None:
    """Run TensorRT YOLOv8 detection on --imgs files and/or the default camera.

    Image results are shown (--show) or written to --out-dir; camera frames are
    only displayed, with per-frame inference time printed to stdout.
    """
    device = torch.device(args.device)
    Engine = TRTModule(args.engine, device)
    H, W = Engine.inp_info[0].shape[-2:]

    # set desired output names order
    Engine.set_desired(['num_dets', 'bboxes', 'scores', 'labels'])

    save_path = Path(args.out_dir)

    if not args.show and not save_path.exists():
        save_path.mkdir(parents=True, exist_ok=True)

    if args.imgs:
        images = path_to_list(args.imgs)
        print(f'images:{images}')
        for image in images:
            save_image = save_path / image.name
            bgr = cv2.imread(str(image))
            if bgr is None:
                # unreadable/corrupt file: skip instead of crashing on .copy()
                print(f'{image}: failed to read!')
                continue
            draw = bgr.copy()
            tensor, ratio, dwdh = _preprocess(bgr, device, (W, H))
            # inference
            data = Engine(tensor)

            bboxes, scores, labels = det_postprocess(data)
            if bboxes.numel() == 0:
                # if no bounding box
                print(f'{image}: no object!')
                continue
            # undo letterbox padding and scaling to map boxes back to the
            # original image coordinates
            bboxes -= dwdh
            bboxes /= ratio

            _draw_detections(draw, bboxes, scores, labels)

            if args.show:
                cv2.imshow('result', draw)
                cv2.waitKey(0)
            else:
                cv2.imwrite(str(save_image), draw)

    print(f'111 camera:{args.camera}')  # 111 camera:0
    if args.camera:
        print(f'camera:{args.camera}')
        cap = cv2.VideoCapture(0)  # NOTE: camera index is hard-coded to 0
        try:
            while cap.isOpened():
                ret, frame = cap.read()
                if not ret:
                    # end of stream / read failure: just stop the loop
                    # (the old `assert cap.isOpened()` here was contradictory —
                    # a failed read with an open capture passed the assert)
                    break
                draw = frame.copy()
                tensor, ratio, dwdh = _preprocess(frame, device, (W, H))

                # CUDA work is asynchronous: without a synchronize the timer
                # only brackets kernel *launches*, not the real GPU time —
                # which makes TRT look no faster than PyTorch.
                if device.type == 'cuda':
                    torch.cuda.synchronize(device)
                t1 = time.perf_counter()
                # inference
                data = Engine(tensor)

                bboxes, scores, labels = det_postprocess(data)
                if bboxes.numel() > 0:  # same empty guard as the image path
                    bboxes -= dwdh
                    bboxes /= ratio
                if device.type == 'cuda':
                    torch.cuda.synchronize(device)
                print(f'inference time: {time.perf_counter() - t1:.3f}s')

                _draw_detections(draw, bboxes, scores, labels)
                if args.show:
                    cv2.imshow('0', draw)
                    if cv2.waitKey(1) in [ord('q'), 27]:
                        break
        finally:
            # always release the camera and HighGUI windows
            cap.release()
            cv2.destroyAllWindows()

def parse_args() -> argparse.Namespace:
    """Parse command-line options for the detection demo."""
    parser = argparse.ArgumentParser()
    parser.add_argument('--engine', type=str, help='Engine file')
    # boolean flag, NOT an index: the capture device is hard-coded to 0
    # (old help text 'camera index' was misleading for a store_true action)
    parser.add_argument('--camera',
                        action='store_true',
                        help='Read frames from camera 0')

    parser.add_argument('--imgs',
                        type=str,
                        help='Images file, dir path or single img file path')
    parser.add_argument('--show',
                        action='store_true',
                        help='Show the detection results')
    parser.add_argument('--out-dir',
                        type=str,
                        default='./output',
                        help='Path to output file')
    parser.add_argument('--device',
                        type=str,
                        default='cuda:0',
                        help='TensorRT infer device')
    args = parser.parse_args()
    return args

if __name__ == '__main__':
    # script entry point: parse CLI options, then run the demo
    main(parse_args())

和pt模型推理的速度相比,engine加速感觉没啥提升,如下图

rtx2060 win 10 trt 8.5.1.7

修改后代码的测速

image

pt模型的推理速度

image

triple-Mu commented 1 month ago

大佬 您好,我用的自己训练的yolov8s pt模型通过readme中的教程转成的engine,然后修改了infer_det.py代码使其能够调用摄像头,我想计算每帧的耗时,不知道下面的代码计算对不对:

from models import TRTModule  # isort:skip
import argparse
from pathlib import Path

import time
import cv2
import torch

from config import CLASSES_DET, COLORS
from models.torch_utils import det_postprocess
from models.utils import blob, letterbox, path_to_list

def _preprocess(bgr, device, size):
    """Letterbox *bgr* to *size* ((W, H)) and return (tensor, ratio, dwdh).

    dwdh is the letterbox padding repeated ((dw, dh, dw, dh)) as a float32
    tensor on *device*, so it can be subtracted from xyxy boxes directly.
    """
    resized, ratio, dwdh = letterbox(bgr, size)
    rgb = cv2.cvtColor(resized, cv2.COLOR_BGR2RGB)
    tensor = blob(rgb, return_seg=False)
    dwdh = torch.asarray(dwdh * 2, dtype=torch.float32, device=device)
    tensor = torch.asarray(tensor, device=device)
    return tensor, ratio, dwdh


def _draw_detections(draw, bboxes, scores, labels):
    """Draw one rectangle + 'class:score' label per detection onto *draw* in place."""
    for (bbox, score, label) in zip(bboxes, scores, labels):
        bbox = bbox.round().int().tolist()
        cls_id = int(label)
        cls = CLASSES_DET[cls_id]
        color = COLORS[cls]

        text = f'{cls}:{score:.3f}'
        x1, y1, x2, y2 = bbox

        (_w, _h), _bl = cv2.getTextSize(text, cv2.FONT_HERSHEY_SIMPLEX, 0.8, 1)
        # keep the label background inside the image
        _y1 = min(y1 + 1, draw.shape[0])

        cv2.rectangle(draw, (x1, y1), (x2, y2), color, 2)
        cv2.rectangle(draw, (x1, _y1), (x1 + _w, _y1 + _h + _bl), (0, 0, 255), -1)
        cv2.putText(draw, text, (x1, _y1 + _h), cv2.FONT_HERSHEY_SIMPLEX, 0.75, (255, 255, 255), 2)


def main(args: argparse.Namespace) -> None:
    """Run TensorRT YOLOv8 detection on --imgs files and/or the default camera.

    Image results are shown (--show) or written to --out-dir; camera frames are
    only displayed, with per-frame inference time printed to stdout.
    """
    device = torch.device(args.device)
    Engine = TRTModule(args.engine, device)
    H, W = Engine.inp_info[0].shape[-2:]

    # set desired output names order
    Engine.set_desired(['num_dets', 'bboxes', 'scores', 'labels'])

    save_path = Path(args.out_dir)

    if not args.show and not save_path.exists():
        save_path.mkdir(parents=True, exist_ok=True)

    if args.imgs:
        images = path_to_list(args.imgs)
        print(f'images:{images}')
        for image in images:
            save_image = save_path / image.name
            bgr = cv2.imread(str(image))
            if bgr is None:
                # unreadable/corrupt file: skip instead of crashing on .copy()
                print(f'{image}: failed to read!')
                continue
            draw = bgr.copy()
            tensor, ratio, dwdh = _preprocess(bgr, device, (W, H))
            # inference
            data = Engine(tensor)

            bboxes, scores, labels = det_postprocess(data)
            if bboxes.numel() == 0:
                # if no bounding box
                print(f'{image}: no object!')
                continue
            # undo letterbox padding and scaling to map boxes back to the
            # original image coordinates
            bboxes -= dwdh
            bboxes /= ratio

            _draw_detections(draw, bboxes, scores, labels)

            if args.show:
                cv2.imshow('result', draw)
                cv2.waitKey(0)
            else:
                cv2.imwrite(str(save_image), draw)

    print(f'111 camera:{args.camera}')  # 111 camera:0
    if args.camera:
        print(f'camera:{args.camera}')
        cap = cv2.VideoCapture(0)  # NOTE: camera index is hard-coded to 0
        try:
            while cap.isOpened():
                ret, frame = cap.read()
                if not ret:
                    # end of stream / read failure: just stop the loop
                    # (the old `assert cap.isOpened()` here was contradictory —
                    # a failed read with an open capture passed the assert)
                    break
                draw = frame.copy()
                tensor, ratio, dwdh = _preprocess(frame, device, (W, H))

                # CUDA work is asynchronous: without a synchronize the timer
                # only brackets kernel *launches*, not the real GPU time —
                # which makes TRT look no faster than PyTorch.
                if device.type == 'cuda':
                    torch.cuda.synchronize(device)
                t1 = time.perf_counter()
                # inference
                data = Engine(tensor)

                bboxes, scores, labels = det_postprocess(data)
                if bboxes.numel() > 0:  # same empty guard as the image path
                    bboxes -= dwdh
                    bboxes /= ratio
                if device.type == 'cuda':
                    torch.cuda.synchronize(device)
                print(f'inference time: {time.perf_counter() - t1:.3f}s')

                _draw_detections(draw, bboxes, scores, labels)
                if args.show:
                    cv2.imshow('0', draw)
                    if cv2.waitKey(1) in [ord('q'), 27]:
                        break
        finally:
            # always release the camera and HighGUI windows
            cap.release()
            cv2.destroyAllWindows()

def parse_args() -> argparse.Namespace:
    """Parse command-line options for the detection demo."""
    parser = argparse.ArgumentParser()
    parser.add_argument('--engine', type=str, help='Engine file')
    # boolean flag, NOT an index: the capture device is hard-coded to 0
    # (old help text 'camera index' was misleading for a store_true action)
    parser.add_argument('--camera',
                        action='store_true',
                        help='Read frames from camera 0')

    parser.add_argument('--imgs',
                        type=str,
                        help='Images file, dir path or single img file path')
    parser.add_argument('--show',
                        action='store_true',
                        help='Show the detection results')
    parser.add_argument('--out-dir',
                        type=str,
                        default='./output',
                        help='Path to output file')
    parser.add_argument('--device',
                        type=str,
                        default='cuda:0',
                        help='TensorRT infer device')
    args = parser.parse_args()
    return args

if __name__ == '__main__':
    # script entry point: parse CLI options, then run the demo
    main(parse_args())

和pt模型推理的速度相比,engine加速感觉没啥提升,如下图

rtx2060 win 10 trt 8.5.1.7

修改后代码的测速

image

pt模型的推理速度

image

因为本仓库采用了静态shape(640x640)进行推理,而pytorch的版本可能采用了最小padding(480x640),这样的话效率确实可能不如pytorch。您可以尝试fp16导出后再进行对比,或者使用c++的版本

Yichen-user commented 1 month ago

请问下 使用infer-det.py 进行fp16的engine模型推理时 为什么输入的图片不需要指定dtype=float16呢 。yolov5代码里是加了half()的