OpenCV4.8.0 DNN inference speed reduced by 50%. #23911

Open ZJDATY opened 11 months ago

ZJDATY commented 11 months ago

System Information

OpenCV version: 4.8.0; Operating System / Platform: Windows 10; Compiler & compiler version: Visual Studio 2019

Compared against: OpenCV 4.5.5-openvino, in the same environment.

Detailed description

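The program prints the build information and the timing statistics; the timing output for both versions is shown below (the first result appears to be OpenCV 4.5.5 and the second OpenCV 4.8.0). The mean inference time has increased from about 20 ms to about 30 ms.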

        init >> 39.592ms
        inference >> min = 21.601ms, max = 32.751ms, mean = 21.846ms, stddev = 1.35271ms

D:\vcworkspaces\yolov4_tiny_dnn_demo\x64\Release\yolov4_tiny_dnn_demo.exe (进程 7704)已退出,代码为 0。
按任意键关闭此窗口. . .


        init >> 92.686ms
        inference >> min = 29.27ms, max = 38.598ms, mean = 32.5029ms, stddev = 1.07574ms

D:\vcworkspaces\yolov4_tiny_dnn_demo\x64\Release\yolov4_tiny_dnn_demo.exe (进程 11968)已退出,代码为 0。
按任意键关闭此窗口. . .

Steps to reproduce

#include <iostream>
#include <queue>
#include <iterator>
#include <sstream>
#include <fstream>
#include <iomanip>
#include <chrono>

#include <opencv2/core.hpp>
#include <opencv2/dnn.hpp>
#include <opencv2/dnn/all_layers.hpp>
#include <opencv2/opencv.hpp>
#include <opencv2/imgproc.hpp>
#include <opencv2/highgui.hpp>
#include <numeric>

// Post-processing parameters used by main().
// Class scores below this value are discarded (0 keeps every candidate).
constexpr float CONFIDENCE_THRESHOLD{0.0f};
// Overlap threshold handed to cv::dnn::NMSBoxes.
constexpr float NMS_THRESHOLD{0.4f};
// Number of per-class score columns read from each detection row.
constexpr int NUM_CLASSES{80};

// colors for bounding boxes
const cv::Scalar colors[] = {
    {0, 255, 255},
    {255, 255, 0},
    {0, 255, 0},
    {255, 0, 0}
const auto NUM_COLORS = sizeof(colors) / sizeof(colors[0]);

int main()
    std::cout << cv::getBuildInformation() << std::endl;
    std::vector<std::string> class_names;
        std::ifstream class_file("yolo/coco.names");
        if (!class_file)
            std::cerr << "failed to open classes.txt\n";
            return 0;

        std::string line;
        while (std::getline(class_file, line))
    std::string b = "./yolo/yolo_test.mp4";
    cv::VideoCapture source(b);

    auto net = cv::dnn::readNetFromDarknet("yolo/yolov4-tiny.cfg", "yolo/yolov4-tiny.weights");
    auto output_names = net.getUnconnectedOutLayersNames();

    cv::Mat frame(416, 416, CV_32FC3), blob; //Tiny
    std::vector<cv::Mat> detections;
    std::vector<float> runtimes;
    auto init_start = std::chrono::steady_clock::now();
    cv::dnn::blobFromImage(frame, blob, 1 / 255.0, cv::Size(416, 416), cv::Scalar(), true, false, CV_32F); //Tiny
    net.forward(detections, output_names);
    auto init_end = std::chrono::steady_clock::now();
    while (cv::waitKey(1) < 1)
        source >> frame;
        if (frame.empty())

        auto total_start = std::chrono::steady_clock::now();
        cv::dnn::blobFromImage(frame, blob, 1.0 / 255, cv::Size(416, 416), cv::Scalar(), true, false, CV_32F);  //Tiny

        auto dnn_start = std::chrono::steady_clock::now();
        net.forward(detections, output_names);
        auto dnn_end = std::chrono::steady_clock::now();

        std::vector<int> indices[NUM_CLASSES];
        std::vector<cv::Rect> boxes[NUM_CLASSES];
        std::vector<float> scores[NUM_CLASSES];

        for (auto& output : detections)
            const auto num_boxes = output.rows;
            for (int i = 0; i < num_boxes; i++)
                auto x =<float>(i, 0) * frame.cols;
                auto y =<float>(i, 1) * frame.rows;
                auto width =<float>(i, 2) * frame.cols;
                auto height =<float>(i, 3) * frame.rows;
                cv::Rect rect(x - width / 2, y - height / 2, width, height);

                for (int c = 0; c < NUM_CLASSES; c++)
                    auto confidence = *output.ptr<float>(i, 5 + c);
                    if (confidence >= CONFIDENCE_THRESHOLD)

        for (int c = 0; c < NUM_CLASSES; c++)
            cv::dnn::NMSBoxes(boxes[c], scores[c], 0.0, NMS_THRESHOLD, indices[c]);

        for (int c = 0; c < NUM_CLASSES; c++)
            for (size_t i = 0; i < indices[c].size(); ++i)
                const auto color = colors[c % NUM_COLORS];

                auto idx = indices[c][i];
                const auto& rect = boxes[c][idx];
                cv::rectangle(frame, cv::Point(rect.x, rect.y), cv::Point(rect.x + rect.width, rect.y + rect.height), color, 3);

                std::ostringstream label_ss;
                label_ss << class_names[c] << ": " << std::fixed << std::setprecision(2) << scores[c][idx];
                auto label = label_ss.str();

                int baseline;
                auto label_bg_sz = cv::getTextSize(label.c_str(), cv::FONT_HERSHEY_COMPLEX_SMALL, 1, 1, &baseline);
                cv::rectangle(frame, cv::Point(rect.x, rect.y - label_bg_sz.height - baseline - 10), cv::Point(rect.x + label_bg_sz.width, rect.y), color, cv::FILLED);
                cv::putText(frame, label.c_str(), cv::Point(rect.x, rect.y - baseline - 5), cv::FONT_HERSHEY_COMPLEX_SMALL, 1, cv::Scalar(0, 0, 0));

        auto total_end = std::chrono::steady_clock::now();

        float inference_fps = std::chrono::duration_cast<std::chrono::microseconds>(dnn_end - dnn_start).count() / 1000.0;
        //std::cout << "模型推理时间为:" << inference_fps << " ms" << std::endl;
        float total_fps = std::chrono::duration_cast<std::chrono::microseconds>(total_end - total_start).count() / 1000.0;
        //std::cout << "单帧总耗费时间为:" << total_fps << " ms" << std::endl;
        std::ostringstream stats_ss;
        stats_ss << std::fixed << std::setprecision(2);
        stats_ss << "Inference FPS: " << 1000.0 / inference_fps << ", Total FPS: " << 1000.0 / total_fps;
        auto stats = stats_ss.str();
        int baseline;
        auto stats_bg_sz = cv::getTextSize(stats.c_str(), cv::FONT_HERSHEY_COMPLEX_SMALL, 1, 1, &baseline);
        cv::rectangle(frame, cv::Point(0, 0), cv::Point(stats_bg_sz.width, stats_bg_sz.height + 10), cv::Scalar(0, 0, 0), cv::FILLED);
        cv::putText(frame, stats.c_str(), cv::Point(0, stats_bg_sz.height + 5), cv::FONT_HERSHEY_COMPLEX_SMALL, 1, cv::Scalar(255, 255, 255));
        //cv::namedWindow("output", cv::WindowFlags::WINDOW_AUTOSIZE);
        //cv::imshow("output", frame);
    auto sum = std::accumulate(std::begin(runtimes), std::end(runtimes), 0.0f);
    auto squared_sum = std::inner_product(std::begin(runtimes), std::end(runtimes), std::begin(runtimes), 0.0f);

    auto min = *std::min_element(std::begin(runtimes), std::end(runtimes));
    auto max = *std::max_element(std::begin(runtimes), std::end(runtimes));
    auto mean = sum / runtimes.size();
    auto stddev = std::sqrt(squared_sum / runtimes.size() - mean * mean);

    std::cout << '[' << "yolov4-tiny" << "]" << '\n'
        << "\tinit >> " << std::chrono::duration_cast<std::chrono::microseconds>(init_end - init_start).count() / 1000.0 << "ms" << '\n'
        << "\tinference >> " << "min = " << min << "ms, max = " << max << "ms, mean = " << mean << "ms, stddev = " << stddev << "ms" << std::endl;

    return 0;

Related models and video downloads.


Issue submission checklist

ukoehler commented 9 months ago

@ZJDATY Sorry, not easily, since the computer is not connected to the internet. However, all you have to do is add net.enableWinograd(false); after net.setInput(blob); for version 4.8.0

ZJDATY commented 9 months ago

> @ZJDATY Sorry, not easily, since the computer is not connected to the internet. However, all you have to do is add net.enableWinograd(false); after net.setInput(blob); for version 4.8.0

Thank you. I tested it and the results are the same as before: OpenCV 4.8.0 is still about 10 ms slower.

ukoehler commented 9 months ago

Well, it was worth a try.

ZJDATY commented 5 months ago

I am excited to see that version 4.9 has been released, but unfortunately, the optimization issue has not been resolved in version 4.9 yet.

ZJDATY commented 5 months ago

> I am excited to see that version 4.9 has been released, but unfortunately, the optimization issue has not been resolved in version 4.9 yet.

My computer's CPU is now an i7-10700. The inference time is 13 ms with OpenCV 4.5.5 and 23 ms with OpenCV 4.9.0.