How to predict face landmarks with a Caffe model?

The following code detects faces in camera frames using yufacedetectnet-open-v2.prototxt and yufacedetectnet-open-v2.caffemodel. How can I predict face landmarks with this Caffe model?

main.cpp:

#include <chrono>
#include <cstdlib>
#include <exception>
#include <iostream>
#include <string>

#include <opencv2/opencv.hpp>

#define MODEL_INPUT_WIDTH 320
#define MODEL_INPUT_HEIGHT 240
#define MODEL_INPUT_CHANNEL 3

#define ESC_KEY_CODE 27

int main(int argc, char* argv[])
{
    //----------------------------------------------------------------------------------------------------

    if((argc < 3) || (argc > 4))
    {
        std::cout << "Usage: ./face-detector <model-prototxt> <model-caffemodel> [<camera-index>]" << std::endl;
        std::cout << "Example: ./face-detector yufacedetectnet-open-v2.prototxt yufacedetectnet-open-v2.caffemodel -1" << std::endl;
        return EXIT_FAILURE;
    }

    //----------------------------------------------------------------------------------------------------

    std::string prototxtFileName = argv[1];
    std::string caffemodelFileName = argv[2];

    int cameraIndex = -1;
    if(argc == 4)
    {
        cameraIndex = std::stoi(argv[3]);
    }

    //----------------------------------------------------------------------------------------------------

    std::cout << "Loading model..." << std::endl;

    cv::dnn::Net net = cv::dnn::readNetFromCaffe(prototxtFileName, caffemodelFileName);

    net.setPreferableBackend(cv::dnn::DNN_BACKEND_OPENCV);
    net.setPreferableTarget(cv::dnn::DNN_TARGET_CPU);

    std::cout << std::endl;

    //----------------------------------------------------------------------------------------------------

    std::cout << "Opening camera..." << std::endl;

    cv::VideoCapture videoCapture;

    if (!videoCapture.open(cameraIndex))
    {
        std::cout << "Error: Could not open camera: " << cameraIndex << std::endl;
        return EXIT_FAILURE;
    }

    std::cout << std::endl;

    //----------------------------------------------------------------------------------------------------

    std::cout << "Detecting..." << std::endl;

    while (true)
    {
        cv::Mat frame;
        videoCapture >> frame;

        if(frame.empty())
        {
            std::cout << "Error: Could not read camera frame." << std::endl;
            return EXIT_FAILURE;
        }

        //cv::resize(frame, frame, cv::Size(MODEL_INPUT_WIDTH, MODEL_INPUT_HEIGHT));

        auto beginTime = std::chrono::steady_clock::now();

        auto input = cv::dnn::blobFromImage(frame, 1.0, cv::Size(), cv::Scalar(), true);

        net.setInput(input, "data");

        auto output = net.forward("detection_out");

        auto endTime = std::chrono::steady_clock::now();

        int timespan = std::chrono::duration_cast<std::chrono::milliseconds>(endTime - beginTime).count();

        std::cout << "Detection time = " << timespan << "ms" << std::endl;

        // output.size = 1 x 1 x 50 x 7
        // detectionMat.size = 50 x 7
        cv::Mat detectionMat(output.size[2], output.size[3], CV_32F, output.ptr<float>());

        for(int i = 0; i < detectionMat.rows; i++)
        {
            float confidence = detectionMat.at<float>(i, 2);

            if(confidence < 0.5)
            {
                continue;
            }

            int x = static_cast<int>(detectionMat.at<float>(i, 3) * frame.cols);
            int y = static_cast<int>(detectionMat.at<float>(i, 4) * frame.rows);
            int width = static_cast<int>(detectionMat.at<float>(i, 5) * frame.cols + 0.5f) - x;
            int height = static_cast<int>(detectionMat.at<float>(i, 6) * frame.rows + 0.5f) - y;

            cv::putText(frame, std::to_string(confidence), cv::Point(x, y - 3), cv::FONT_HERSHEY_SIMPLEX, 0.5, cv::Scalar(0, 255, 0), 1);       

            cv::rectangle(frame, cv::Rect(x, y, width, height), cv::Scalar(0, 255, 0), 2);
        }

        //cv::resize(frame, frame, cv::Size(640, 480));

        cv::imshow("Face Detection", frame);

        if (cv::waitKey(1) == ESC_KEY_CODE)
        {
            break;
        }
    }

    std::cout << std::endl;

    //----------------------------------------------------------------------------------------------------

    std::cout << "Releasing camera..." << std::endl;

    videoCapture.release();

    std::cout << std::endl;

    //----------------------------------------------------------------------------------------------------

    return EXIT_SUCCESS;

    //----------------------------------------------------------------------------------------------------
}

CMakeLists.txt:

# Build script for the face-detector executable.
cmake_minimum_required(VERSION 3.5)

project(face-detector LANGUAGES CXX)

# The executable target is named after the project.
set(APP_NAME "${PROJECT_NAME}")

# Configuration fails if OpenCV cannot be found.
find_package(OpenCV REQUIRED)

add_executable(${APP_NAME} main.cpp)

# Require C++11 with compiler-specific extensions turned off.
set_target_properties(${APP_NAME} PROPERTIES
    CXX_STANDARD 11
    CXX_STANDARD_REQUIRED ON
    CXX_EXTENSIONS OFF
)

target_link_libraries(${APP_NAME} PRIVATE ${OpenCV_LIBS})

ShiqiYu / libfacedetection

How to predict face landmarks with a Caffe model? #271