hpc203 / pp-yoloe-onnxrun-cpp-py

使用ONNXRuntime部署PP-YOLOE目标检测,支持PP-YOLOE-s、PP-YOLOE-m、PP-YOLOE-l、PP-YOLOE-x四种结构,包含C++和Python两个版本的程序
18 stars 1 forks source link

关于我修改了代码后只能达到6fps #3

Open simple123456T opened 2 years ago

simple123456T commented 2 years ago

Windows paddlepaddle-gpu 2.3 cuda10.1 cudnn7.6.5

对main.py做了如下的修改:

import time
import paddle
import cv2
import numpy as np
import argparse
import onnxruntime

# NOTE(review): this call only configures PaddlePaddle. Nothing else in the visible script
# uses paddle; inference below goes through an onnxruntime.InferenceSession, so this line is
# presumably irrelevant to the measured FPS — confirm whether it (and the paddle import) is needed.
paddle.device.set_device('gpu')

class PP_YOLOE():
    """PP-YOLOE object detector executed through ONNX Runtime.

    Reads class names from a text file (one name per line), opens an ONNX
    Runtime session for the model file, and draws every detection whose score
    exceeds the confidence threshold onto the image passed to ``detect``.
    """

    def __init__(self, model_path, label_path, prob_threshold=0.8, providers=None):
        """Create the inference session and cache the preprocessing constants.

        Args:
            model_path: Path of the ONNX model file.
            label_path: Path of a text file holding one class name per line.
            prob_threshold: Detections whose score is not strictly greater
                than this value are discarded.
            providers: Optional list of ONNX Runtime execution provider names.
                When omitted, CUDA is used if this onnxruntime build offers
                it, with CPU as the fallback.
        """
        with open(label_path, 'rt') as f:
            self.class_names = f.read().rstrip('\n').split('\n')
        so = onnxruntime.SessionOptions()
        so.log_severity_level = 3
        if providers is None:
            # GPU builds of ONNX Runtime 1.9+ expect the providers to be named
            # explicitly; choose only among the ones this build really has.
            available = onnxruntime.get_available_providers()
            providers = [p for p in ('CUDAExecutionProvider', 'CPUExecutionProvider')
                         if p in available]
        self.session = onnxruntime.InferenceSession(model_path, so, providers=providers)
        self.input_size = (640, 640)  # (width, height) fed to the network
        self.mean_ = np.array([0.485, 0.456, 0.406], dtype=np.float32)
        self.std_ = np.array([0.229, 0.224, 0.225], dtype=np.float32)
        self.confThreshold = prob_threshold

    def preprocess(self, srcimg):
        """Convert a BGR image into the normalized CHW float32 network input.

        Args:
            srcimg: Image array in BGR channel order (converted to RGB here).

        Returns:
            A tuple ``(img, scale_factor)``: ``img`` is the image resized to
            ``self.input_size``, scaled to [0, 1], normalized with
            ``self.mean_`` / ``self.std_`` and transposed to channel-first;
            ``scale_factor`` is a float32 array ``[1., 1.]``.
        """
        img = cv2.cvtColor(srcimg, cv2.COLOR_BGR2RGB)
        img = cv2.resize(img, self.input_size, interpolation=cv2.INTER_LINEAR)
        img = img.astype(np.float32) / 255.
        img -= self.mean_[None, None, :]
        img /= self.std_[None, None, :]
        img = np.transpose(img, [2, 0, 1])
        scale_factor = np.array([1., 1.], dtype=np.float32)
        return img, scale_factor

    def detect(self, srcimg):
        """Run the model on ``srcimg`` and draw the detections onto it in place.

        Args:
            srcimg: Image array in BGR channel order; it is modified in place.

        Returns:
            The same ``srcimg`` object with boxes and labels drawn on it.
        """
        img, scale_factor = self.preprocess(srcimg)
        inputs = {'image': img[None, :, :, :], 'scale_factor': scale_factor[None, :]}
        # Feed only the inputs that this particular model actually declares.
        ort_inputs = {i.name: inputs[i.name] for i in self.session.get_inputs() if i.name in inputs}
        # The session is expected to return exactly two outputs; only the box table is used.
        bbox, _ = self.session.run(None, ort_inputs)
        # Each row is (class id, score, xmin, ymin, xmax, ymax). Rows with a class id
        # of -1 or below are dropped (presumably padding entries — TODO confirm).
        keep_idx = (bbox[:, 1] > self.confThreshold) & (bbox[:, 0] > -1)
        bbox = bbox[keep_idx, :]
        # Boxes are in network-input coordinates; map them back to the source image.
        ratioh = srcimg.shape[0] / self.input_size[1]
        ratiow = srcimg.shape[1] / self.input_size[0]
        for (clsid, score, xmin, ymin, xmax, ymax) in bbox:
            xmin = int(xmin * ratiow)
            ymin = int(ymin * ratioh)
            xmax = int(xmax * ratiow)
            ymax = int(ymax * ratioh)
            cls_idx = int(clsid)
            # Fall back to the numeric id when the label file has fewer entries than the model.
            if 0 <= cls_idx < len(self.class_names):
                label = self.class_names[cls_idx]
            else:
                label = str(cls_idx)
            cv2.rectangle(srcimg, (xmin, ymin), (xmax, ymax), (0, 0, 255), thickness=2)
            cv2.putText(srcimg, label + ': ' + str(round(score, 2)), (xmin, ymin - 10),
                        cv2.FONT_HERSHEY_SIMPLEX, 0.4, (0, 255, 0), thickness=1)
        return srcimg

if __name__ == '__main__':
    # Command-line entry point: run the detector on live camera frames and show
    # the annotated stream until 'q' or Esc is pressed or the camera stops delivering frames.
    parser = argparse.ArgumentParser()
    parser.add_argument('--imgpath', type=str, default='', help="image path")
    parser.add_argument('--modelpath', type=str, default='model/ppyoloe_crn_s_300e_coco.onnx', help="onnx filepath")
    parser.add_argument('--classfile', type=str, default='coco.names', help="classname filepath")
    parser.add_argument('--confThreshold', default=0.7, type=float, help='class confidence')
    parser.add_argument('--cameraId', default=0, type=int, help=' camera id')
    args = parser.parse_args()

    net = PP_YOLOE(args.modelpath, args.classfile, prob_threshold=args.confThreshold)

    # camera id
    cap = cv2.VideoCapture(args.cameraId)
    cap.set(cv2.CAP_PROP_FRAME_HEIGHT, 480)
    cap.set(cv2.CAP_PROP_FRAME_WIDTH, 640)

    winName = 'Deep learning object detection in ONNXRuntime'
    try:
        while cap.isOpened():
            ok, frame = cap.read()
            # A failed grab yields no usable frame; stop instead of passing it to detect().
            if not ok or frame is None:
                break
            last_time = time.perf_counter()
            srcimg = net.detect(frame)
            elapsed = time.perf_counter() - last_time
            # Guard against a zero interval so the FPS computation cannot divide by zero.
            fps = 1 / elapsed if elapsed > 0 else 0.0
            cv2.putText(srcimg, 'fps:{}'.format(float('%.2f' % fps)), (5, 50), cv2.FONT_HERSHEY_PLAIN, 1.2, (0, 0, 255), 2)
            cv2.imshow(winName, srcimg)
            if cv2.waitKey(1) in [ord('q'), 27]:
                break
    finally:
        # Release the camera and close the window even if detection raised.
        cap.release()
        cv2.destroyAllWindows()

运行效果: image

hpc203 commented 2 years ago

Windows paddlepaddle-gpu 2.3 cuda10.1 cudnn7.6.5

对main.py做了如下的修改:

import time
import paddle
import cv2
import numpy as np
import argparse
import onnxruntime

# NOTE(review): this call only configures PaddlePaddle. Nothing else in the visible script
# uses paddle; inference below goes through an onnxruntime.InferenceSession, so this line is
# presumably irrelevant to the measured FPS — confirm whether it (and the paddle import) is needed.
paddle.device.set_device('gpu')

class PP_YOLOE():
    """PP-YOLOE object detector executed through ONNX Runtime.

    Reads class names from a text file (one name per line), opens an ONNX
    Runtime session for the model file, and draws every detection whose score
    exceeds the confidence threshold onto the image passed to ``detect``.
    """

    def __init__(self, model_path, label_path, prob_threshold=0.8, providers=None):
        """Create the inference session and cache the preprocessing constants.

        Args:
            model_path: Path of the ONNX model file.
            label_path: Path of a text file holding one class name per line.
            prob_threshold: Detections whose score is not strictly greater
                than this value are discarded.
            providers: Optional list of ONNX Runtime execution provider names.
                When omitted, CUDA is used if this onnxruntime build offers
                it, with CPU as the fallback.
        """
        with open(label_path, 'rt') as f:
            self.class_names = f.read().rstrip('\n').split('\n')
        so = onnxruntime.SessionOptions()
        so.log_severity_level = 3
        if providers is None:
            # GPU builds of ONNX Runtime 1.9+ expect the providers to be named
            # explicitly; choose only among the ones this build really has.
            available = onnxruntime.get_available_providers()
            providers = [p for p in ('CUDAExecutionProvider', 'CPUExecutionProvider')
                         if p in available]
        self.session = onnxruntime.InferenceSession(model_path, so, providers=providers)
        self.input_size = (640, 640)  # (width, height) fed to the network
        self.mean_ = np.array([0.485, 0.456, 0.406], dtype=np.float32)
        self.std_ = np.array([0.229, 0.224, 0.225], dtype=np.float32)
        self.confThreshold = prob_threshold

    def preprocess(self, srcimg):
        """Convert a BGR image into the normalized CHW float32 network input.

        Args:
            srcimg: Image array in BGR channel order (converted to RGB here).

        Returns:
            A tuple ``(img, scale_factor)``: ``img`` is the image resized to
            ``self.input_size``, scaled to [0, 1], normalized with
            ``self.mean_`` / ``self.std_`` and transposed to channel-first;
            ``scale_factor`` is a float32 array ``[1., 1.]``.
        """
        img = cv2.cvtColor(srcimg, cv2.COLOR_BGR2RGB)
        img = cv2.resize(img, self.input_size, interpolation=cv2.INTER_LINEAR)
        img = img.astype(np.float32) / 255.
        img -= self.mean_[None, None, :]
        img /= self.std_[None, None, :]
        img = np.transpose(img, [2, 0, 1])
        scale_factor = np.array([1., 1.], dtype=np.float32)
        return img, scale_factor

    def detect(self, srcimg):
        """Run the model on ``srcimg`` and draw the detections onto it in place.

        Args:
            srcimg: Image array in BGR channel order; it is modified in place.

        Returns:
            The same ``srcimg`` object with boxes and labels drawn on it.
        """
        img, scale_factor = self.preprocess(srcimg)
        inputs = {'image': img[None, :, :, :], 'scale_factor': scale_factor[None, :]}
        # Feed only the inputs that this particular model actually declares.
        ort_inputs = {i.name: inputs[i.name] for i in self.session.get_inputs() if i.name in inputs}
        # The session is expected to return exactly two outputs; only the box table is used.
        bbox, _ = self.session.run(None, ort_inputs)
        # Each row is (class id, score, xmin, ymin, xmax, ymax). Rows with a class id
        # of -1 or below are dropped (presumably padding entries — TODO confirm).
        keep_idx = (bbox[:, 1] > self.confThreshold) & (bbox[:, 0] > -1)
        bbox = bbox[keep_idx, :]
        # Boxes are in network-input coordinates; map them back to the source image.
        ratioh = srcimg.shape[0] / self.input_size[1]
        ratiow = srcimg.shape[1] / self.input_size[0]
        for (clsid, score, xmin, ymin, xmax, ymax) in bbox:
            xmin = int(xmin * ratiow)
            ymin = int(ymin * ratioh)
            xmax = int(xmax * ratiow)
            ymax = int(ymax * ratioh)
            cls_idx = int(clsid)
            # Fall back to the numeric id when the label file has fewer entries than the model.
            if 0 <= cls_idx < len(self.class_names):
                label = self.class_names[cls_idx]
            else:
                label = str(cls_idx)
            cv2.rectangle(srcimg, (xmin, ymin), (xmax, ymax), (0, 0, 255), thickness=2)
            cv2.putText(srcimg, label + ': ' + str(round(score, 2)), (xmin, ymin - 10),
                        cv2.FONT_HERSHEY_SIMPLEX, 0.4, (0, 255, 0), thickness=1)
        return srcimg

if __name__ == '__main__':
    # Command-line entry point: run the detector on live camera frames and show
    # the annotated stream until 'q' or Esc is pressed or the camera stops delivering frames.
    parser = argparse.ArgumentParser()
    parser.add_argument('--imgpath', type=str, default='', help="image path")
    parser.add_argument('--modelpath', type=str, default='model/ppyoloe_crn_s_300e_coco.onnx', help="onnx filepath")
    parser.add_argument('--classfile', type=str, default='coco.names', help="classname filepath")
    parser.add_argument('--confThreshold', default=0.7, type=float, help='class confidence')
    parser.add_argument('--cameraId', default=0, type=int, help=' camera id')
    args = parser.parse_args()

    net = PP_YOLOE(args.modelpath, args.classfile, prob_threshold=args.confThreshold)

    # camera id
    cap = cv2.VideoCapture(args.cameraId)
    cap.set(cv2.CAP_PROP_FRAME_HEIGHT, 480)
    cap.set(cv2.CAP_PROP_FRAME_WIDTH, 640)

    winName = 'Deep learning object detection in ONNXRuntime'
    try:
        while cap.isOpened():
            ok, frame = cap.read()
            # A failed grab yields no usable frame; stop instead of passing it to detect().
            if not ok or frame is None:
                break
            last_time = time.perf_counter()
            srcimg = net.detect(frame)
            elapsed = time.perf_counter() - last_time
            # Guard against a zero interval so the FPS computation cannot divide by zero.
            fps = 1 / elapsed if elapsed > 0 else 0.0
            cv2.putText(srcimg, 'fps:{}'.format(float('%.2f' % fps)), (5, 50), cv2.FONT_HERSHEY_PLAIN, 1.2, (0, 0, 255), 2)
            cv2.imshow(winName, srcimg)
            if cv2.waitKey(1) in [ord('q'), 27]:
                break
    finally:
        # Release the camera and close the window even if detection raised.
        cap.release()
        cv2.destroyAllWindows()

运行效果: image

这个程序是用onnxruntime做推理引擎的,你安装的onnxruntime是gpu版本的吗? pip install onnxruntime-gpu

simple123456T commented 2 years ago

After I changed my onnxruntime version, the result was still the same. 还是不行 (it still doesn't work).

appdirs==1.4.4
astor==0.8.1
certifi==2022.5.18.1
charset-normalizer==2.0.12
decorator==5.1.1
flatbuffers==2.0
GPUtil==1.4.0
graphsurgeon @ file:///D:/Advanced_research_projects/ppyoloe/conda%E7%8E%AF%E5%A2%83/TensorRT-6.0.1.5/graphsurgeon/graphsurgeon-0.4.1-py2.py3-none-any.whl
idna==3.3
numpy==1.22.4
onnxruntime-gpu==1.11.1
opencv-python==4.5.5.64
opt-einsum==3.3.0
paddle-bfloat==0.1.2
paddlepaddle-gpu @ file:///D:/Advanced_research_projects/ppyoloe_trt/conda%E7%8E%AF%E5%A2%83/paddlepaddle_gpu-2.3.0.post101-cp38-cp38-win_amd64.whl
Pillow==9.1.1
platformdirs==2.5.2
protobuf==3.20.1
psutil==5.9.1
pyaml==21.10.1
pycuda @ file:///D:/Advanced_research_projects/ppyoloe/conda%E7%8E%AF%E5%A2%83/pycuda-2020.1%2Bcuda101-cp38-cp38-win_amd64.whl
pynvml==11.4.1
pytools==2022.1.9
PyYAML==6.0
requests==2.27.1
scipy==1.8.1
six==1.16.0
typing_extensions==4.2.0
uff @ file:///D:/Advanced_research_projects/ppyoloe/conda%E7%8E%AF%E5%A2%83/TensorRT-6.0.1.5/uff/uff-0.6.5-py2.py3-none-any.whl
urllib3==1.26.9
wincertstore==0.2

这是我的conda 环境

LemonWang0110 commented 2 years ago

@simple123456T 对于onnxruntime, 我设置了self.session = ort.InferenceSession(model_path, so, providers=['CUDAExecutionProvider']) , 然并未发现cpu版本比cpu版本快.

gouzi-tu commented 2 years ago

cpu 比 cpu 快? -----这是我另外一个github账号,还是同一个我

LemonWang0110 commented 2 years ago

@gouzi-tu 抱歉,打错,是并未发现gpu版本比cpu快 .

JiaPai12138 commented 2 years ago

之前没注意到你已经设置为['CUDAExecutionProvider']了... 话说你显卡型号是什么呢? 如果是好显卡的话 那可能说明这个模型更适合并行而不是串行 另外可能需要你更新cuda到10.2, 并更新cudnn到8.2. 你也可以尝试['TensorrtExecutionProvider'], 就是要等一段量化时间(需要安装tensorrt, 现在支持py了)

然后那个cv2.waitKey(1)实际耗时约15ms, 所以你的帧数不可能超过66FPS; 你不如把它去掉, 在另一个进程里显示效果