NVIDIA / TensorRT

NVIDIA® TensorRT™ is an SDK for high-performance deep learning inference on NVIDIA GPUs. This repository contains the open source components of TensorRT.
https://developer.nvidia.com/tensorrt
Apache License 2.0
10.82k stars 2.14k forks source link

The same engine produces incorrect inference output using C++, but correct results using Python. TensorRT-8.6.1.6+cuda12.1 #3617

Open 154775258 opened 10 months ago

154775258 commented 10 months ago

TensorRT-8.6.1.6+cuda12.1 GPU RTX3090 24G

tp-nan commented 10 months ago

can you share your python and c++ code?

154775258 commented 10 months ago

can you share your python code?

import tensorrt as trt
import pycuda.driver as cuda
import pycuda.autoinit
import numpy as np
import cv2

# 加载TensorRT引擎
TRT_LOGGER = trt.Logger(trt.Logger.INFO)
with open("cls.engine", "rb") as f, trt.Runtime(TRT_LOGGER) as runtime:
    engine = runtime.deserialize_cuda_engine(f.read())
context = engine.create_execution_context()

# Load and preprocess the image
image = cv2.imread("1.png")
image = cv2.resize(image, (320, 48))  # Resize the image to match the input size of the model
image = image.transpose((2, 0, 1))  # Change the layout from HWC to CHW
image = image.astype(np.float32)  # Convert the image to float32
image = image / 255  # Normalize the image

# Allocate device memory for the input image
d_input = cuda.mem_alloc(image.nbytes)

# Allocate device memory for the output
output = np.empty(2, dtype=np.float32)
d_output = cuda.mem_alloc(output.nbytes)

# Transfer the input image to the GPU
cuda.memcpy_htod(d_input, np.ascontiguousarray(image))

# Perform inference
context.execute(1, [int(d_input), int(d_output)])

# Transfer the output back to the host
cuda.memcpy_dtoh(output, d_output)

# Post-process the output if needed
print(output)

# Clean up
tp-nan commented 10 months ago

how about c++ code

154775258 commented 10 months ago

how about c++ code

pragma once

include

include "NvInfer.h"

include "cuda_runtime_api.h"

include

include <opencv2/opencv.hpp>

include

using namespace nvinfer1;

class Cls { public: ~Cls(); bool Init(std::string engine_file_path); int Detect(cv::Mat img); private: int cudaInference(std::vector& prob); void MeanAndNorm(cv::Mat img, float output); const char INPUT_BLOB_NAME = "x"; const char OUTPUT_BLOB_NAME = "softmax_0.tmp_0"; ICudaEngine engine; IRuntime runtime; IExecutionContext context; };

include "TRTCls.h"

include "logging.h"

class SilentLogger : public nvinfer1::ILogger { void log(Severity severity, const char* msg) noexcept override { // 不输出任何logger信息 } };

static Logger gLogger;

Cls::~Cls() { }

int Cls::Detect(cv::Mat img) { cv::Mat out; cv::resize(img, out, cv::Size(320, 48)); int img_w = out.cols; int img_h = out.rows; std::vector prob(img_w img_h 3); MeanAndNorm(out, prob.data()); return cudaInference(prob); }

int Cls::cudaInference(std::vector& prob) { const ICudaEngine& engine = context->getEngine();

void* buffers[2];

const int inputIndex = engine.getBindingIndex(INPUT_BLOB_NAME);
const int outputIndex = engine.getBindingIndex(OUTPUT_BLOB_NAME);
int mBatchSize = engine.getMaxBatchSize();
cudaMalloc(&buffers[inputIndex], prob.size() * sizeof(float));
cudaMalloc(&buffers[outputIndex], 2 * sizeof(float));
cudaStream_t stream;
cudaStreamCreate(&stream);
cudaMemcpy(buffers[inputIndex], prob.data(), prob.size() * sizeof(float), cudaMemcpyHostToDevice);
context->enqueueV2(buffers, 0, nullptr);
float ans[2];
cudaMemcpyAsync(ans, buffers[outputIndex], 2 * sizeof(float), cudaMemcpyDeviceToHost, stream);
cudaStreamSynchronize(stream);
// Release stream and buffers
cudaStreamDestroy(stream);

cudaFree(buffers[inputIndex]);
cudaFree(buffers[outputIndex]);
std::cout << ans[0] << ' ' << ans[1] << '\n';
return ans[0] > ans[1] ? 0 : 1;

}

void Cls::MeanAndNorm(cv::Mat img, float* output) { int w = img.cols; int h = img.rows;

uchar* imgData = img.data;
for (int y = 0; y < h; ++y) {
    for (int x = 0; x < w; ++x) {
        output[y * h + x] = (imgData[(y * w + x) * 3 + 2] / 255.f);
        output[y * h + x + h * w] = (imgData[(y * w + x) * 3 + 1] / 255.f);
        output[y * h + x + 2 * (h * w)] = (imgData[(y * w + x) * 3] / 255.f );

    }
}

}

bool Cls::Init(std::string engine_file_path) { size_t size{ 0 }; char* trtModelStream{ nullptr }; std::ifstream file(engine_file_path, std::ios::binary); if (file.good()) { file.seekg(0, file.end); size = file.tellg(); file.seekg(0, file.beg); trtModelStream = new char[size]; file.read(trtModelStream, size); file.close(); }

runtime = createInferRuntime(gLogger);
assert(runtime != nullptr);
engine = runtime->deserializeCudaEngine(trtModelStream, size);
assert(engine != nullptr);
context = engine->createExecutionContext();
assert(context != nullptr);
delete[] trtModelStream;
return true;

}

tp-nan commented 10 months ago

how about modify

    for (int y = 0; y < h; ++y) {
        for (int x = 0; x < w; ++x) {
            output[y * h + x] = (imgData[(y * w + x) * 3 + 2] / 255.f);
            output[y * h + x + h * w] = (imgData[(y * w + x) * 3 + 1] / 255.f);
            output[y * h + x + 2 * (h * w)] = (imgData[(y * w + x) * 3] / 255.f );

        }
    }

to

    for (int y = 0; y < h; ++y) {
        for (int x = 0; x < w; ++x) {
            output[y * h + x] = (imgData[(y * w + x) * 3 + 0] / 255.f);
            output[y * h + x + h * w] = (imgData[(y * w + x) * 3 + 1] / 255.f);
            output[y * h + x + 2 * (h * w)] = (imgData[(y * w + x) * 3+2] / 255.f );

        }
    }
154775258 commented 10 months ago

how about modify

    for (int y = 0; y < h; ++y) {
        for (int x = 0; x < w; ++x) {
            output[y * h + x] = (imgData[(y * w + x) * 3 + 2] / 255.f);
            output[y * h + x + h * w] = (imgData[(y * w + x) * 3 + 1] / 255.f);
            output[y * h + x + 2 * (h * w)] = (imgData[(y * w + x) * 3] / 255.f );

        }
    }

to

    for (int y = 0; y < h; ++y) {
        for (int x = 0; x < w; ++x) {
            output[y * h + x] = (imgData[(y * w + x) * 3 + 0] / 255.f);
            output[y * h + x + h * w] = (imgData[(y * w + x) * 3 + 1] / 255.f);
            output[y * h + x + 2 * (h * w)] = (imgData[(y * w + x) * 3+2] / 255.f );

        }
    }

Tried but no effect. This engine model is used for classifying the text angle. The image loading uses OpenCV. The original code converts the image data from OpenCV to RGB and normalizes it, which is not a problem. I have tried this modification and it does not affect the final result.

154775258 commented 10 months ago

how about modify

    for (int y = 0; y < h; ++y) {
        for (int x = 0; x < w; ++x) {
            output[y * h + x] = (imgData[(y * w + x) * 3 + 2] / 255.f);
            output[y * h + x + h * w] = (imgData[(y * w + x) * 3 + 1] / 255.f);
            output[y * h + x + 2 * (h * w)] = (imgData[(y * w + x) * 3] / 255.f );

        }
    }

to

    for (int y = 0; y < h; ++y) {
        for (int x = 0; x < w; ++x) {
            output[y * h + x] = (imgData[(y * w + x) * 3 + 0] / 255.f);
            output[y * h + x + h * w] = (imgData[(y * w + x) * 3 + 1] / 255.f);
            output[y * h + x + 2 * (h * w)] = (imgData[(y * w + x) * 3+2] / 255.f );

        }
    }

cls.zip This model is the onnx model I encountered problems with. You Can convert it to an engine to reproduce it。

RajUpadhyay commented 10 months ago

Have you tried to run this with deepstream sdk? Maybe write the custom post-processing and see if even there it does incorrect inference. Since python kind of uses wrapper c++ tensorrt, this is kind of weird. Could you please try deepstream? just use any one of the nvidia's docker image.