Open JunGenius opened 4 years ago
What device are you using?
I suspect the OpenCL implementation is using a non-optimized version of mish, which is ~2x slower on integrated graphics and CPUs.
What device are you using?
I suspect the OpenCL implementation is using a non-optimized version of mish, which is ~2x slower on integrated graphics and CPUs.
Win10 + i7-8550U + Intel(R) UHD Graphics 620. Yeah, I use it on integrated graphics, so how can I use an optimized version of mish? Can you give me some suggestions? Thanks.
Sorry, yolov4-tiny does not use mish. My previous comment is irrelevant. I don't know why yolov4-tiny is so much slower than yolov3-tiny.
Sorry, yolov4-tiny does not use mish. My previous comment is irrelevant. I don't know why yolov4-tiny is so much slower than yolov3-tiny.
Maybe I made a mistake somewhere and i will try again. Thanks for your answer.
@JunGenius Can you share the full code? Can you try with this code? Change the backend and target to inference engine and OpenCL.
@YashasSamaga
This is my code:
#include <fstream>
#include <opencv2/opencv.hpp>
#include <opencv2/dnn.hpp>
#include <opencv2/imgproc.hpp>
#include <opencv2/highgui.hpp>
#include "opencv2/opencv.hpp"
#include "opencv2/highgui/highgui_c.h"
using namespace std;
using namespace cv;
using namespace cv::dnn;
// Paths to the Darknet model definition, class-name list, weights, and
// sample media, relative to the executable's working directory.
const char *cfg = "../model/yolov4-tiny.cfg";
const char *name = "../model/coco.names";
const char *weights = "../model/yolov4-tiny.weights";
const char* imagePath = "../model/dog.jpg";
const char* videoPath = "../model/test.mp4";
// Global so GetOutputsNames() can read it; loaded in main() before use.
Net net;
// Returns the names of the network's output layers, i.e. the layers with
// unconnected outputs (the YOLO detection heads) of the global `net`.
// `net` must already be loaded (see main()).
//
// NOTE(review): the original wrapped the body in `if (names.empty())` — a
// leftover from a static-cache idiom. `names` is a fresh local here, so the
// condition was always true; it has been removed. The loop index is now
// size_t to avoid the signed/unsigned comparison with outLayers.size().
vector<String> GetOutputsNames()
{
    vector<String> names;
    vector<int> outLayers = net.getUnconnectedOutLayers();
    vector<String> layersNames = net.getLayerNames();
    names.resize(outLayers.size());
    // Layer indices returned by OpenCV are 1-based, hence the -1.
    for (size_t i = 0; i < outLayers.size(); ++i)
        names[i] = layersNames[outLayers[i] - 1];
    return names;
}
// Loads a YOLOv4-tiny Darknet model, runs it on a video file frame by frame,
// draws the NMS-filtered detections with class labels and per-frame timing,
// and quits on ESC or end of stream.
int main() {
    // Detection thresholds (were magic numbers 0.5 / 0.2 inline).
    const float confThreshold = 0.5f;  // minimum class score to keep a box
    const float nmsThreshold = 0.2f;   // IoU threshold for non-max suppression

    net = readNetFromDarknet(cfg, weights);
    net.setPreferableBackend(DNN_BACKEND_INFERENCE_ENGINE);
    net.setPreferableTarget(DNN_TARGET_OPENCL);
    std::vector<String> outNames = GetOutputsNames();

    // Class labels, one per line of the names file.
    vector<string> classNamesVec;
    ifstream classNamesFile(name);
    if (classNamesFile.is_open())
    {
        string className;
        while (std::getline(classNamesFile, className))
            classNamesVec.push_back(className);
    }

    VideoCapture cap;
    cap.open(videoPath);
    Mat frame;
    while (true)
    {
        try
        {
            // Per-frame timer. (The original also started an unused timer
            // before the loop that this one shadowed; it has been removed.)
            int64 start = getTickCount();
            cap >> frame;
            if (frame.empty()) break;

            // Letterbox-free resize to the network input; swapRB because
            // Darknet expects RGB while OpenCV frames are BGR.
            Mat inputBlob = blobFromImage(frame, 1 / 255.F, Size(416, 416), Scalar(), true, false);
            net.setInput(inputBlob);
            std::vector<Mat> outs;
            net.forward(outs, outNames);

            vector<Rect> boxes;
            vector<int> classIds;
            vector<float> confidences;
            for (size_t i = 0; i < outs.size(); ++i)
            {
                // Each row is [cx, cy, w, h, objectness, class scores...],
                // with coordinates normalized to [0, 1].
                float* data = (float*)outs[i].data;
                for (int j = 0; j < outs[i].rows; ++j, data += outs[i].cols)
                {
                    Mat scores = outs[i].row(j).colRange(5, outs[i].cols);
                    Point classIdPoint;
                    double confidence;
                    minMaxLoc(scores, 0, &confidence, 0, &classIdPoint);
                    if (confidence > confThreshold)
                    {
                        // Convert normalized center/size to pixel Rect.
                        int centerX = (int)(data[0] * frame.cols);
                        int centerY = (int)(data[1] * frame.rows);
                        int width = (int)(data[2] * frame.cols);
                        int height = (int)(data[3] * frame.rows);
                        int left = centerX - width / 2;
                        int top = centerY - height / 2;
                        classIds.push_back(classIdPoint.x);
                        confidences.push_back((float)confidence);
                        boxes.push_back(Rect(left, top, width, height));
                    }
                }
            }

            vector<int> indices;
            NMSBoxes(boxes, confidences, confThreshold, nmsThreshold, indices);
            for (size_t i = 0; i < indices.size(); ++i)
            {
                int idx = indices[i];
                Rect box = boxes[idx];
                // Guard against a names file with fewer entries than the
                // model's class count (the original indexed unchecked).
                String className = (classIds[idx] >= 0 && classIds[idx] < (int)classNamesVec.size())
                                       ? String(classNamesVec[classIds[idx]])
                                       : format("id:%d", classIds[idx]);
                putText(frame, className.c_str(), box.tl(), FONT_HERSHEY_SIMPLEX, 1.0, Scalar(255, 0, 0), 2, 8);
                rectangle(frame, box, Scalar(0, 0, 255), 2, 8, 0);
            }

            // Sample the clock ONCE so the displayed FPS and ms agree (the
            // original called getTickCount() twice, giving inconsistent values).
            int64 end = getTickCount();
            float time = (end - start) / getTickFrequency();  // seconds
            float fps = 1.0f / time;
            ostringstream ss;
            ss << "FPS : " << fps << " detection time: " << time * 1000 << " ms";
            putText(frame, ss.str(), Point(20, 20), 0, 0.5, Scalar(0, 0, 255));
            imshow("YOLOv4-Detections", frame);

            char c = (char)waitKey(1);
            if (c == 27)  // ESC: leave the loop through normal cleanup
                break;    // (was `return 1`, which skipped cleanup and
                          //  reported failure to the shell)
        }
        catch (const std::exception& ex)
        {
            std::cout << ex.what() << std::endl;
        }
    }
    system("pause");  // Windows-only; a harmless failed call elsewhere
    return 0;         // original returned 1, which signals failure
}
I found that when I use yolov4 model, opencl is ~2x faster than cpu.
Opencl + yolov4:
Cpu + yolov4:
But when I use yolov4-tiny, the result is opposite.
When I use OpenCV to run the yolov3 and yolov4 models respectively, I found that the detection speed of yolov3 is twice that of yolov4. Is this normal? (I set DNN_BACKEND_INFERENCE_ENGINE and DNN_TARGET_OPENCL)
YOLOV3-tiny: (OPENCL)
YOLOV4-tiny: (OPENCL)
But when I use the CPU, it works well.
YOLOV4-tiny: (CPU)
So, how can I improve the detection speed when I use OpenCL? Thanks.