Melody-Zhou / tensorRT_Pro-YOLOv8

This repository is based on shouxieai/tensorRT_Pro, with adjustments to support YOLOv8.
MIT License

Question: why does yolov8pose.onnx give different results in Python and C++, and how should the output tensor from the C++ API be handled? #7

Closed · ZZJzhou closed this 9 months ago

ZZJzhou commented 10 months ago

Following the official Ultralytics documentation, I converted my own trained YOLOv8 model best.pt to best.onnx. When I run inference in Python,

outputs = session.run(None, {modelinputs[0].name: img})
result = outputs[0]

the result has shape (1, 20, 8400). But when I run inference in C++,

auto output_tensors = session.Run(Ort::RunOptions{nullptr}, input_names_char.data(), input_tensors.data(),
                                  input_names_char.size(), output_names_char.data(), output_names_char.size());
if (output_tensors[0].IsTensor()) {
    float* outputData = output_tensors[0].GetTensorMutableData<float>();
    auto outputInfo = output_tensors[0].GetTensorTypeAndShapeInfo();
    std::cout << "output_tensor shape: " << print_shape(outputInfo.GetShape()) << "\n";

the shape is (1x20x33600). Why does the same model produce different outputs through the two APIs?

Also, outputData should in theory be a buffer of 1 x 20 x 33600 values, but the data I read runs past that range, so I cannot postprocess it correctly. Do you have any suggestions?

Melody-Zhou commented 10 months ago


This is most likely caused by the two models having different input resolutions. At a 640x640 input the number of output boxes is 8400, while your C++ onnxruntime output has 33600 boxes, exactly 4 times 8400. That means the model you run in C++ has twice the input resolution of the one you run in Python, i.e. 1280x1280. As for the output data running past the end of the vector, I suggest working on the raw pointer directly and only pushing the decoded results into a vector, along these lines:

vector<vector<float>> bboxes;   // decoded boxes: left, top, right, bottom, label, confidence
float confidence_threshold = 0.25;
float nms_threshold = 0.5;
for(int i = 0; i < output_numbox; ++i){
    float* ptr = outputData + i * output_numprob;   // start of the i-th box
    float objness = ptr[4];                         // objectness score
    if(objness < confidence_threshold)
        continue;

    float* pclass = ptr + 5;                        // class scores start at offset 5
    int label     = std::max_element(pclass, pclass + num_classes) - pclass;
    float prob    = pclass[label];
    float confidence = prob * objness;
    if(confidence < confidence_threshold)
        continue;

    // convert cx, cy, w, h into corner coordinates
    float cx     = ptr[0];
    float cy     = ptr[1];
    float width  = ptr[2];
    float height = ptr[3];
    float left   = cx - width * 0.5;
    float top    = cy - height * 0.5;
    float right  = cx + width * 0.5;
    float bottom = cy + height * 0.5;
    // map from network-input coordinates back to the original image with the d2i affine matrix
    float image_base_left   = d2i[0] * left   + d2i[2];
    float image_base_right  = d2i[0] * right  + d2i[2];
    float image_base_top    = d2i[0] * top    + d2i[5];
    float image_base_bottom = d2i[0] * bottom + d2i[5];
    bboxes.push_back({image_base_left, image_base_top, image_base_right, image_base_bottom, (float)label, confidence});
}
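
As a quick sanity check on those box counts: the YOLOv8 head predicts one box per grid cell at strides 8, 16 and 32, so for a square input of side s the count is (s/8)^2 + (s/16)^2 + (s/32)^2. A minimal standalone sketch, assuming those standard strides:

#include <cstdio>

// Count YOLOv8 prediction cells for a square input of side `input_size`,
// assuming the usual detection strides of 8, 16 and 32.
int num_boxes(int input_size) {
    int total = 0;
    for (int stride : {8, 16, 32}) {
        int cells = input_size / stride;
        total += cells * cells;     // one prediction per cell at this stride
    }
    return total;
}

int main() {
    std::printf("640  -> %d boxes\n", num_boxes(640));    // 6400 + 1600 + 400   = 8400
    std::printf("1280 -> %d boxes\n", num_boxes(1280));   // 25600 + 6400 + 1600 = 33600
    return 0;
}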
ZZJzhou commented 10 months ago


Thanks for the reply!

float* ptr = outputData  + i * output_numprob;

What does this line mean?

#include <onnxruntime_cxx_api.h>
#include <iostream>
#include <stdio.h>
#include <opencv2/opencv.hpp>
#include <vector>
#include <cstdint>

using namespace cv;

std::string print_shape(const std::vector<std::int64_t>& v) {
  std::stringstream ss("");
  for (std::size_t i = 0; i < v.size() - 1; i++) ss << v[i] << "x";
  ss << v[v.size() - 1];
  return ss.str();
}

int calculate_product(const std::vector<std::int64_t>& v) {
  int total = 1;
  for (auto& i : v) total *= i;
  return total;
}

template <typename T>
Ort::Value vec_to_tensor(std::vector<T>& data, const std::vector<std::int64_t>& shape) {
  Ort::MemoryInfo mem_info =
      Ort::MemoryInfo::CreateCpu(OrtAllocatorType::OrtArenaAllocator, OrtMemType::OrtMemTypeDefault);
  auto tensor = Ort::Value::CreateTensor<T>(mem_info, data.data(), data.size(), shape.data(), shape.size());
  return tensor;
}

float area_box(const std::vector<float>& box) {
    return (box[2] - box[0]) * (box[3] - box[1]);
}

float iou(const std::vector<float>& box1, const std::vector<float>& box2) {
    float left = std::max(box1[0], box2[0]);
    float top = std::max(box1[1], box2[1]);
    float right = std::min(box1[2], box2[2]);
    float bottom = std::min(box1[3], box2[3]);
    float cross = std::max((right - left), 0.0f) * std::max((bottom - top), 0.0f);
    float union_area = area_box(box1) + area_box(box2) - cross;

    if (cross == 0.0f || union_area == 0.0f) {
        return 0.0f;
    }

    return cross / union_area;
}

std::vector<std::vector<float>> NMS(const std::vector<std::vector<float>>& boxes, float iou_thres) {
    std::vector<bool> remove_flags(boxes.size(), false);
    std::vector<std::vector<float>> keep_boxes;

    for (size_t i = 0; i < boxes.size(); ++i) {
        if (remove_flags[i]) {
            continue;
        }

        keep_boxes.push_back(boxes[i]);
        for (size_t j = i + 1; j < boxes.size(); ++j) {
            if (remove_flags[j]) {
                continue;
            }

            float iou_value = iou(boxes[i], boxes[j]);
            if (iou_value > iou_thres) {
                remove_flags[j] = true;
            }
        }
    }

    return keep_boxes;
}

int main(int argc, char** argv)
{
    // load model
    const char* model_path = "../workspace/best.onnx";
    // onnxruntime setup
    Ort::Env env(ORT_LOGGING_LEVEL_WARNING, "yolov8pose");
    Ort::SessionOptions session_options;
    Ort::Session session = Ort::Session(env, model_path, session_options);

    // print name/shape of inputs
    Ort::AllocatorWithDefaultOptions allocator;
    std::vector<std::string> input_names;
    std::vector<std::int64_t> input_shapes;
    std::cout << "Input Node Name/Shape (" << input_names.size() << "):" << std::endl;
    for (std::size_t i = 0; i < session.GetInputCount(); i++) {
      input_names.emplace_back(session.GetInputNameAllocated(i, allocator).get());
      input_shapes = session.GetInputTypeInfo(i).GetTensorTypeAndShapeInfo().GetShape();
      std::cout << "\t" << input_names.at(i) << " : " << print_shape(input_shapes) << std::endl;
    }

    // print name/shape of outputs
    std::vector<std::string> output_names;
    std::cout << "Output Node Name/Shape (" << output_names.size() << "):" << std::endl;
    for (std::size_t i = 0; i < session.GetOutputCount(); i++) {
      output_names.emplace_back(session.GetOutputNameAllocated(i, allocator).get());
      auto output_shapes = session.GetOutputTypeInfo(i).GetTensorTypeAndShapeInfo().GetShape();
      std::cout << "\t" << output_names.at(i) << " : " << print_shape(output_shapes) << std::endl;
    }

    // Create a single Ort tensor of random numbers
    auto input_shape = input_shapes;
    auto total_number_elements = calculate_product(input_shape);

    // load image
    std::string imageFilepath{"../workspace/img.png"};
    cv::Mat image = cv::imread(imageFilepath, cv::ImreadModes::IMREAD_COLOR);

    // preprocess image
    cv::Mat preprocessedImage;
    int dst_width = input_shapes.at(3);
    int dst_height = input_shapes.at(2);

    double scale = std::min(static_cast<double>(dst_width) / image.cols, static_cast<double>(dst_height) / image.rows);
    double ox = (dst_width - scale * image.cols) / 2;
    double oy = (dst_height - scale * image.rows) / 2;

    cv::Mat M = (cv::Mat_<float>(2, 3) << scale, 0, ox, 0, scale, oy);

    cv::Mat img_pre;
    cv::warpAffine(image, img_pre, M, cv::Size(dst_width, dst_height), cv::INTER_LINEAR,
                   cv::BORDER_CONSTANT, cv::Scalar(114, 114, 114));

    cv::Mat IM;
    cv::invertAffineTransform(M, IM);
    img_pre.convertTo(img_pre, CV_32F, 1.0 / 255.0);

    cv::dnn::blobFromImage(img_pre, preprocessedImage);

    std::vector<float> input_tensor_values(total_number_elements);
    std::copy(preprocessedImage.begin<float>(),
                  preprocessedImage.end<float>(), input_tensor_values.begin());

    // vector to tensor
    std::vector<Ort::Value> input_tensors;
    input_tensors.emplace_back(vec_to_tensor<float>(input_tensor_values, input_shape));

    // double-check the dimensions of the input tensor
    assert(input_tensors[0].IsTensor() && input_tensors[0].GetTensorTypeAndShapeInfo().GetShape() == input_shape);
    std::cout << "\ninput_tensor shape: " << print_shape(input_tensors[0].GetTensorTypeAndShapeInfo().GetShape()) << std::endl;

    // pass data through model
    std::vector<const char*> input_names_char(input_names.size(), nullptr);
    std::transform(std::begin(input_names), std::end(input_names), std::begin(input_names_char),
                   [&](const std::string& str) { return str.c_str(); });

    std::vector<const char*> output_names_char(output_names.size(), nullptr);
    std::transform(std::begin(output_names), std::end(output_names), std::begin(output_names_char),
                   [&](const std::string& str) { return str.c_str(); });

    std::cout << "Running model..." << std::endl;
    try {
      auto output_tensors = session.Run(Ort::RunOptions{nullptr}, input_names_char.data(), input_tensors.data(),
                                        input_names_char.size(), output_names_char.data(), output_names_char.size());
      std::cout << "Done!" << std::endl;

      // double-check the dimensions of the output tensors
      // NOTE: the number of output tensors is equal to the number of output nodes specified in the Run() call
      assert(output_tensors.size() == output_names.size() && output_tensors[0].IsTensor());
      if (output_tensors[0].IsTensor())
        {
        float* outputData = output_tensors[0].GetTensorMutableData<float>();
        auto outputInfo = output_tensors[0].GetTensorTypeAndShapeInfo();
        std::cout << "output_tensor shape: " << print_shape(outputInfo.GetShape()) << "\n";
        size_t dim1 = outputInfo.GetShape()[0];
        size_t dim2 = outputInfo.GetShape()[1];
        size_t dim3 = outputInfo.GetShape()[2];
        std::cout << "dim1 " << dim1 << "\n";
        std::cout << "dim2 " << dim2 << "\n";
        std::cout << "dim3 " << dim3 << "\n";
        // transpose output tensor from (1,20,33600) to (1,33600,20)
        // use float (not int) so the scores are not truncated, and index the flat
        // row-major buffer as i * dim2 * dim3 + j * dim3 + k
        std::vector<std::vector<std::vector<float>>> output_transposed(dim1, std::vector<std::vector<float>>(dim3, std::vector<float>(dim2)));

        for (size_t i = 0; i < dim1; ++i) {
            for (size_t j = 0; j < dim2; ++j) {
                for (size_t k = 0; k < dim3; ++k) {
                    output_transposed[i][k][j] = outputData[i * dim2 * dim3 + j * dim3 + k];
                }
            }
        }

        //resize to original image shape
        float conf_thres = 0.25;
        std::vector<std::vector<float>> boxes;
        for (int img_id = 0; img_id < dim1; ++img_id) {
          for (int box_id = 0; box_id < dim3; ++box_id){
            if (output_transposed[img_id][box_id][4] > conf_thres){
              auto item = output_transposed[img_id][box_id];
              float cx = item[0], cy = item[1], w = item[2], h = item[3], conf = item[4];

              float left = (cx - w * 0.5) * IM.at<float>(0, 0) + IM.at<float>(0, 2);
              float top = (cy - h * 0.5) * IM.at<float>(1, 1) + IM.at<float>(1, 2);
              float right = (cx + w * 0.5) * IM.at<float>(0, 0) + IM.at<float>(0, 2);
              float bottom = (cy + h * 0.5) * IM.at<float>(1, 1) + IM.at<float>(1, 2);
              std::vector<float> box = {left, top, right, bottom, conf};

              std::vector<float> keypoints(item.begin() + 5, item.end());
              if (keypoints.size() % 3 == 0) {
                size_t num_kpvalues = keypoints.size();
                for (size_t i = 0; i < num_kpvalues; ++i){
                  if (i % 3 == 0){
                    keypoints[i] = keypoints[i] * IM.at<float>(0, 0) + IM.at<float>(0, 2);
                  }
                  else if (i % 3 == 1){
                    keypoints[i] = keypoints[i] * IM.at<float>(1, 1) + IM.at<float>(1, 2);
                  }
                }
              } else {
                std::cerr << "Error: Number of keypoints is not divisible by 3." << std::endl;
              }
              box.insert(box.end(), keypoints.begin(), keypoints.end());
              boxes.push_back(box);
            }
          }
        }
        std::sort(boxes.begin(), boxes.end(), [](const std::vector<float>& a, const std::vector<float>& b) {
            return a[4] > b[4];  
        });

        // NMS
        float iou_thres = 0.45;
        std::vector<std::vector<float>> kept_boxes = NMS(boxes, iou_thres);
        std::cout << kept_boxes[0].size() << "\n";

        }

    } 

    catch (const Ort::Exception& exception) 
    {
        std::cout << "ERROR running model inference: " << exception.what() << std::endl;
      exit(-1);
    }

}

Could you help me take a look at how to fix this? Thank you! :)

Melody-Zhou commented 10 months ago
float* ptr = outputData  + i * output_numprob;

This line gets the address of the i-th box; the loop decodes every box in turn. output_numprob is the number of values per box, which in your case is 20. Put simply, the network output is laid out in memory as box1, box2, ..., boxn, each box taking 20 values, so looping over output_numbox iterates over the boxes, and outputData + i * output_numprob is the address of the first value of the i-th box.
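
To make that layout concrete, here is a minimal sketch of the indexing (reusing the names outputData, output_numbox and output_numprob from the snippet above, and assuming your 20-value per-box layout):

// outputData points to a contiguous row-major buffer:
// [box0: v0..v19][box1: v0..v19] ... [box(n-1): v0..v19]
// so box i starts exactly i * output_numprob floats into the buffer.
for (int i = 0; i < output_numbox; ++i) {
    float* ptr = outputData + i * output_numprob;            // first value of box i
    float cx = ptr[0], cy = ptr[1], w = ptr[2], h = ptr[3];  // box geometry
    float conf = ptr[4];                                     // box confidence
    // ptr[5] .. ptr[output_numprob - 1] hold the remaining per-box values
}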

ZZJzhou commented 10 months ago
int label     = std::max_element(pclass, pclass + num_classes) - pclass;

What is num_classes here? If I understand correctly, pclass points to the first keypoint of each box, and pclass + num_classes would then span the x and y coordinates plus the confidence of several keypoints, so why would we take the maximum over that range and multiply it by the box confidence?

float prob    = pclass[label];
float confidence = prob * objness;
Melody-Zhou commented 10 months ago

Sorry, that may have been misleading. The example code I posted is the decoding procedure for a detection model, which is why it has to find the highest class-confidence score and the corresponding label for each box. For a pose-estimation model you don't need that step.
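
For reference, a pose model's per-box layout is typically [cx, cy, w, h, conf, kpt1_x, kpt1_y, kpt1_conf, ...], so the class argmax disappears and the single confidence at index 4 is used directly. A rough sketch under that assumption (reusing outputData, output_numbox, output_numprob and the d2i affine matrix from the earlier detection example; with 20 channels this gives 5 keypoints):

std::vector<std::vector<float>> results;        // left, top, right, bottom, conf, then keypoint triples
float confidence_threshold = 0.25f;
int num_keypoints = (output_numprob - 5) / 3;   // (20 - 5) / 3 = 5 keypoints

for (int i = 0; i < output_numbox; ++i) {
    float* ptr = outputData + i * output_numprob;
    float confidence = ptr[4];                  // single box confidence, no class scores
    if (confidence < confidence_threshold)
        continue;

    float cx = ptr[0], cy = ptr[1], w = ptr[2], h = ptr[3];
    float left   = d2i[0] * (cx - w * 0.5f) + d2i[2];
    float top    = d2i[0] * (cy - h * 0.5f) + d2i[5];
    float right  = d2i[0] * (cx + w * 0.5f) + d2i[2];
    float bottom = d2i[0] * (cy + h * 0.5f) + d2i[5];

    std::vector<float> item = {left, top, right, bottom, confidence};
    for (int k = 0; k < num_keypoints; ++k) {
        float* kpt = ptr + 5 + k * 3;
        item.push_back(d2i[0] * kpt[0] + d2i[2]);   // keypoint x mapped back to the image
        item.push_back(d2i[0] * kpt[1] + d2i[5]);   // keypoint y mapped back to the image
        item.push_back(kpt[2]);                     // keypoint confidence
    }
    results.push_back(item);
}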