lyuwenyu / RT-DETR

[CVPR 2024] Official RT-DETR (RTDETR paddle pytorch), Real-Time DEtection TRansformer, DETRs Beat YOLOs on Real-time Object Detection. 🔥 🔥 🔥
Apache License 2.0
2.65k stars 305 forks source link

pth2onnx和onnx2trt是没有问题,但是trt推理出现box坐标是特别大的值,标签和得分是正确的 #447

Open Kingxudong opened 2 months ago

Kingxudong commented 2 months ago

pth转换为onnx,用官方的代码转,并且测试是正确。但是onnx转换为trt,用的是tensorRT8.6.1,转换没有问题,但是推理出现box坐标是特别大的值,标签和得分是正确的。 D:\tool\TensorRT-8.6.1.6.Windows10.x86_64.cuda-11.8\TensorRT-8.6.1.6\bin\trtexec.exe --onnx=model.onnx --workspace=4096 --avgRuns=100 --shapes=images:1x3x640x640 --saveEngine=model.trt

我的C++推理代码如下

include

include

include

include

include

include

include <opencv2/opencv.hpp>

include "NvInfer.h"

include "NvInferRuntimeCommon.h"

include

include <opencv2/core/utils/filesystem.hpp>

using namespace nvinfer1; using namespace std;

// Abort with file/line diagnostics when a CUDA runtime call fails.
// (The leading '#' was lost in the markdown paste; restored here.)
#define CUDA_CHECK(call) \
do { \
    cudaError_t status = call; \
    if (status != cudaSuccess) { \
        fprintf(stderr, "CUDA Error in file '%s' in line %d: %s\n", \
                __FILE__, __LINE__, cudaGetErrorString(status)); \
        exit(EXIT_FAILURE); \
    } \
} while (0)

// Forwards TensorRT log messages to stdout, suppressing INFO-level noise.
class Logger : public ILogger {
public:
    void log(Severity severity, const char* msg) noexcept override {
        if (severity != Severity::kINFO) std::cout << msg << std::endl;
    }
} gLogger;

// Owns a deserialized TensorRT engine and runs RT-DETR detection on images.
// NOTE(review): the pasted declaration lost the '*' on the pointer members
// and the trailing underscores that every method definition below uses
// (runtime_, engine_, buffers_, ...); restored here so the file compiles.
class TensorRTInference {
public:
    explicit TensorRTInference(const std::string& enginePath);
    ~TensorRTInference();
    void doInference(const std::vector<std::string>& image_paths);

private:
    IRuntime* runtime_{nullptr};       // created via createInferRuntime
    ICudaEngine* engine_{nullptr};     // deserialized from the .trt file
    IExecutionContext* context_{nullptr};
    void* buffers_[5]{};               // device buffers, indexed by binding
    int inputIndex1_{};                // binding "images"
    int inputIndex2_{};                // binding "orig_target_sizes"
    int outputIndex1_{};               // binding "labels"
    int outputIndex2_{};               // binding "boxes"
    int outputIndex3_{};               // binding "scores"
    int batchSize_{};
    int inputSize1_{};                 // byte size of each binding's buffer
    int inputSize2_{};
    int outputSize1_{};
    int outputSize2_{};
    int outputSize3_{};

    void allocateBuffers();
    void preprocess(const cv::Mat& image, float* buffer1, float* buffer2);
    void postprocess(float* output1, float* output2, float* output3, cv::Mat& image);
};

// Total element count of a TensorRT Dims (product of all dimensions).
// NOTE(review): dynamic-shape engines report -1 dims, which would corrupt
// this product — this code assumes a static-shape engine.
int volume(const Dims& dims) {
    int vol = 1;
    for (int i = 0; i < dims.nbDims; ++i) {
        vol *= dims.d[i];
    }
    return vol;
}

// Loads a serialized .trt engine, creates the execution context, and caches
// binding indices and byte sizes for RT-DETR's two inputs/three outputs.
// On any failure the error is logged and the object is left unusable
// (engine_/context_ stay null), matching the original control flow.
TensorRTInference::TensorRTInference(const std::string& enginePath)
    : runtime_(nullptr), engine_(nullptr), context_(nullptr) {
    std::ifstream engineFile(enginePath, std::ios::binary);
    if (!engineFile.good()) {
        std::cerr << "Error opening engine file: " << enginePath << std::endl;
        return;
    }

    std::stringstream engineStream;
    engineStream << engineFile.rdbuf();
    engineFile.close();

    runtime_ = createInferRuntime(gLogger);
    if (!runtime_) {
        std::cerr << "Error creating InferRuntime" << std::endl;
        return;
    }

    // Keep the serialized blob in ONE named string: the original called
    // engineStream.str() twice, producing two independent temporaries whose
    // data()/size() were not guaranteed to describe the same object.
    const std::string engineData = engineStream.str();
    engine_ = runtime_->deserializeCudaEngine(engineData.data(), engineData.size(), nullptr);
    if (engine_ == nullptr) {
        std::cerr << "Error deserializing the engine file: " << enginePath << std::endl;
        return;
    }

    context_ = engine_->createExecutionContext();
    if (!context_) {
        std::cerr << "Error creating ExecutionContext" << std::endl;
        return;
    }

    inputIndex1_ = engine_->getBindingIndex("images");
    inputIndex2_ = engine_->getBindingIndex("orig_target_sizes");
    outputIndex1_ = engine_->getBindingIndex("labels");
    outputIndex2_ = engine_->getBindingIndex("boxes");
    outputIndex3_ = engine_->getBindingIndex("scores");

    // Query binding shapes to size host/device buffers.
    const Dims& inputDims1 = engine_->getBindingDimensions(inputIndex1_);
    const Dims& inputDims2 = engine_->getBindingDimensions(inputIndex2_);
    for (int i = 0; i < inputDims2.nbDims; ++i) {
        std::cout << "inputDims2[" << i << "]: " << inputDims2.d[i] << std::endl;
    }
    const Dims& outputDims1 = engine_->getBindingDimensions(outputIndex1_);
    const Dims& outputDims2 = engine_->getBindingDimensions(outputIndex2_);
    const Dims& outputDims3 = engine_->getBindingDimensions(outputIndex3_);

    batchSize_ = 1;

    inputSize1_ = volume(inputDims1) * batchSize_ * sizeof(float);
    // NOTE(review): RT-DETR exports "orig_target_sizes" as int64, NOT float32.
    // Sizing (and later filling) this binding as float is the classic cause of
    // "labels/scores correct but boxes are huge" reported in this issue —
    // confirm with engine_->getBindingDataType(inputIndex2_) and size/fill the
    // buffer with the matching integer type.
    inputSize2_ = volume(inputDims2) * sizeof(float);
    outputSize1_ = volume(outputDims1) * sizeof(float);
    outputSize2_ = volume(outputDims2) * sizeof(float);
    outputSize3_ = volume(outputDims3) * sizeof(float);

    cout << inputSize2_ << endl;

    allocateBuffers();
}

// Converts a 3-channel image to a CHW float tensor scaled to [0,1].
// Returns an empty vector on invalid input.
// (Unused by the pipeline — preprocess calls ToTensorAndNormalize below —
// but kept for compatibility. The lost `<float>` template argument on the
// return type is restored.)
std::vector<float> ToTensor(cv::Mat image) {
    if (image.empty()) {
        std::cerr << "Error: Empty image" << std::endl;
        return {};
    }
    if (image.channels() != 3) {
        std::cerr << "Error: Image must have 3 channels" << std::endl;
        return {};
    }

    image.convertTo(image, CV_32FC3, 1.0f / 255.0f);

    // HWC -> CHW: write each channel contiguously.
    std::vector<cv::Mat> channels(3);
    cv::split(image, channels);

    std::vector<float> tensor(image.total() * image.channels());
    int index = 0;
    for (int c = 0; c < 3; ++c) {
        for (int i = 0; i < channels[c].rows; ++i) {
            for (int j = 0; j < channels[c].cols; ++j) {
                tensor[index++] = channels[c].at<float>(i, j);
            }
        }
    }
    return tensor;
}

// Resizes to the 640x640 network input, scales to [0,1], and applies the
// ImageNet mean/std normalization; returns a CHW float tensor.
std::vector<float> ToTensorAndNormalize(cv::Mat image) {
    if (image.empty()) {
        std::cerr << "Error: Empty image" << std::endl;
        return {};
    }
    if (image.channels() != 3) {
        std::cerr << "Error: Image must have 3 channels" << std::endl;
        return {};
    }

    // cv::Size takes ints; the original passed float literals.
    cv::resize(image, image, cv::Size(640, 640));
    image.convertTo(image, CV_32FC3, 1.0f / 255.0f);

    // ImageNet statistics, matching the training-time transform.
    const float mean[3] = { 0.485f, 0.456f, 0.406f };
    const float stddev[3] = { 0.229f, 0.224f, 0.225f };  // renamed: `std` shadowed the namespace

    std::vector<cv::Mat> channels(3);
    cv::split(image, channels);

    std::vector<float> input_tensor;
    input_tensor.reserve(640 * 640 * 3);
    for (int c = 0; c < 3; ++c) {
        for (int i = 0; i < channels[c].rows; ++i) {
            for (int j = 0; j < channels[c].cols; ++j) {
                input_tensor.push_back((channels[c].at<float>(i, j) - mean[c]) / stddev[c]);
            }
        }
    }
    return input_tensor;
}

// Releases the device buffers and TensorRT objects. destroy() is the TRT 8.x
// teardown API (deprecated in favor of `delete` but still correct on 8.6.1).
// BUG FIX: the pasted definition used member names without the trailing
// underscore (context/engine/runtime/buffers), which do not exist.
TensorRTInference::~TensorRTInference() {
    // Free device memory before tearing down the context/engine it served.
    for (int i = 0; i < 5; ++i) {
        if (buffers_[i]) cudaFree(buffers_[i]);
    }
    if (context_) context_->destroy();
    if (engine_) engine_->destroy();
    if (runtime_) runtime_->destroy();
}

// Converts the input image to the normalized CHW float tensor the network
// expects (written into buffer1) and the original (width, height) pair
// (written into buffer2). The pasted signature lost the '*' on both buffer
// parameters; restored to match the class declaration.
void TensorRTInference::preprocess(const cv::Mat& image, float* buffer1, float* buffer2) {
    cv::Mat rgb_image;
    // The network consumes RGB; OpenCV loads BGR, so convert per channel count.
    if (image.channels() == 1) {
        cv::cvtColor(image, rgb_image, cv::COLOR_GRAY2RGB);
    } else if (image.channels() == 4) {
        cv::cvtColor(image, rgb_image, cv::COLOR_BGRA2RGB);
    } else if (image.channels() == 3) {
        cv::cvtColor(image, rgb_image, cv::COLOR_BGR2RGB);
    } else {
        rgb_image = image;
    }

    std::vector<float> tensor1 = ToTensorAndNormalize(rgb_image);

    // NOTE(review): the exported RT-DETR graph declares "orig_target_sizes"
    // as int64, not float32. Feeding float bit patterns into an int64 binding
    // is the classic cause of "labels/scores correct but boxes are huge" —
    // check engine_->getBindingDataType(inputIndex2_) and, if it is an integer
    // type, fill this buffer with matching int64 values instead of floats.
    float orig_target_sizes[2] = { static_cast<float>(image.cols),
                                   static_cast<float>(image.rows) };
    std::cout << "Original target sizes: " << orig_target_sizes[0] << ", "
              << orig_target_sizes[1] << std::endl;
    std::cout << "Input tensor size: " << tensor1.size() << std::endl;

    std::memcpy(buffer1, tensor1.data(), inputSize1_);
    std::memcpy(buffer2, orig_target_sizes, inputSize2_);
}

void TensorRTInference::postprocess(float output1, float output2, float output3, cv::Mat& image) { float numDetections = outputSize2_ / (4 sizeof(float)); float confThreshold = 0.5f;

std::cout << "Box " << (output2 + 1 * 4)[0] <<  std::endl;

for (int i = 0; i < numDetections; ++i) {

    float* bbox = output2 + i * 4;
    float labels = output1[i];

    int x1 = static_cast<float>(bbox[0]);
    int y1 = static_cast<float>(bbox[1]);
    int x2 = static_cast<float>(bbox[2]);
    int y2 = static_cast<float>(bbox[3]);

    // Draw bounding box
    cv::rectangle(image, cv::Point(x1, y1), cv::Point(x2, y2), cv::Scalar(0, 255, 0), 2);
    std::string label = "label: " + std::to_string(labels);
    cv::putText(image, label, cv::Point(x1, y1 - 5), cv::FONT_HERSHEY_SIMPLEX, 0.5, cv::Scalar(0, 255, 0), 1);

}

float* additionalData = output3;

std::cout << "First values of output3:" << std::endl;
for (int i = 0; i < 50; ++i) {
    std::cout << additionalData[i] << " ";
}
std::cout << std::endl;

}

// Substitute the first occurrence of `oldFolder` in `path` with `newFolder`.
// When `oldFolder` does not occur, the path is returned unchanged.
std::string replaceFolderName(const std::string& path, const std::string& oldFolder, const std::string& newFolder) {
    std::string result = path;
    const size_t where = result.find(oldFolder);
    if (where != std::string::npos) {
        result.replace(where, oldFolder.length(), newFolder);
    }
    return result;
}

void softmax(float output, std::vector& probs) { probs.clear(); float sum = 0.0f; for (int i = 0; i < 2; ++i) { probs.push_back(std::exp(output[i])); sum += probs.back(); } for (int i = 0; i < 2; ++i) { probs[i] /= sum; } } void TensorRTInference::allocateBuffers() { std::cout << "Allocating buffers..." << std::endl; CUDACHECK(cudaMalloc(&buffers[inputIndex1], inputSize1)); CUDACHECK(cudaMalloc(&buffers[inputIndex2], inputSize2)); CUDACHECK(cudaMalloc(&buffers[outputIndex1], outputSize1)); CUDACHECK(cudaMalloc(&buffers[outputIndex2], outputSize2)); CUDACHECK(cudaMalloc(&buffers[outputIndex3], outputSize3)); std::cout << "Buffers allocated successfully." << std::endl; } void TensorRTInference::doInference(const std::vector& image_paths) { float inputBuffer1 = new float[inputSize1 / sizeof(float)]; float* inputBuffer2 = new float[inputSize2 / sizeof(float)];

float* outputBuffer1 = new float[outputSize1_ / sizeof(float)];
float* outputBuffer2 = new float[outputSize2_ / sizeof(float)];
float* outputBuffer3 = new float[outputSize3_ / sizeof(float)];

for (const auto& filename : image_paths) {
    std::cout << "Processing image: " << filename << std::endl;
    clock_t start = clock();
    cv::Mat image = cv::imread(filename);
    int height = image.rows;
    int width = image.cols;
    int channels = image.channels();

    // Print the shape of the image
    std::cout << "Image shape: (" << height << ", " << width << ", " << channels << ")" << std::endl;

    if (image.empty()) {
        std::cerr << "Error loading image: " << filename << std::endl;
        continue;
    }

    preprocess(image, inputBuffer1, inputBuffer2);

    clock_t gpuStart = clock();
    CUDA_CHECK(cudaMemcpy(buffers_[inputIndex1_], inputBuffer1, inputSize1_, cudaMemcpyHostToDevice));
    CUDA_CHECK(cudaMemcpy(buffers_[inputIndex2_], inputBuffer2, inputSize2_, cudaMemcpyHostToDevice));

    bool success = context_->executeV2(buffers_);
    if (!success) {
        std::cerr << "TensorRT execution failed." << std::endl;
        continue;
    }

    CUDA_CHECK(cudaMemcpy(outputBuffer1, buffers_[outputIndex1_], outputSize1_, cudaMemcpyDeviceToHost));
    CUDA_CHECK(cudaMemcpy(outputBuffer2, buffers_[outputIndex2_], outputSize2_, cudaMemcpyDeviceToHost));
    CUDA_CHECK(cudaMemcpy(outputBuffer3, buffers_[outputIndex3_], outputSize3_, cudaMemcpyDeviceToHost));

    clock_t gpuEnd = clock();
    std::cout << "GPU inference time: " << (gpuEnd - gpuStart) / (double)CLOCKS_PER_SEC << " seconds." << std::endl;

    postprocess(outputBuffer1, outputBuffer2, outputBuffer3, image);
    std::cout << "First values of outputBuffer1:" << std::endl;
    for (int i = 0; i < 10; ++i) {
        std::cout << outputBuffer2[i] << " ";
    }
    std::cout << std::endl;
    std::string output_path = replaceFolderName(filename, "debug", "debug_out");
    cv::imwrite(output_path, image);

}

delete[] inputBuffer1;
delete[] inputBuffer2;
delete[] outputBuffer1;
delete[] outputBuffer2;
delete[] outputBuffer3;

}

int main(int argc, char* argv) { try { cudaSetDevice(0); TensorRTInference inference("D:\tool\TensorRT-8.6.1.6.Windows10.x86_64.cuda-11.8\TensorRT-8.6.1.6\bin\cfg_model.trt"); std::string img_dir = "E:\YOLOv8-main\val2017\debug"; std::vector image_paths; cv::utils::fs::glob(img_dir, ".png", image_paths); inference.doInference(image_paths); } catch (const std::exception& e) { std::cerr << "Exception: " << e.what() << std::endl; return EXIT_FAILURE; } return EXIT_SUCCESS; } 能帮忙检查一下吗

lyuwenyu commented 2 months ago

可以先用这个脚本测一下导出的结果

https://github.com/lyuwenyu/RT-DETR/blob/main/rtdetrv2_pytorch/references/deploy/rtdetrv2_tensorrt.py

Kingxudong commented 2 months ago

可以先用这个脚本测一下导出的结果

https://github.com/lyuwenyu/RT-DETR/blob/main/rtdetrv2_pytorch/references/deploy/rtdetrv2_tensorrt.py

有测试，无论是官方提供的权重文件还是我自己的权重文件，都提示没有“create_execution_context”这个方法，未定义

lyuwenyu commented 2 months ago

trt版本是多少


box不对看下是不是数据类型的问题

Kingxudong commented 2 months ago

trt版本是多少

box不对看下是不是数据类型的问题

trt版本是8.6.1; 但是标签和得分都是对的

gk966988 commented 2 months ago

遇见相同的问题,在onnx转tensorrt后,使用https://github.com/lyuwenyu/RT-DETR/blob/main/rtdetrv2_pytorch/references/deploy/rtdetrv2_tensorrt.py脚本推理结果不对。