Open 154775258 opened 10 months ago
can you share your python and c++ code?
can you share your python code?
import tensorrt as trt
import pycuda.driver as cuda
import pycuda.autoinit
import numpy as np
import cv2
# Load the TensorRT engine
TRT_LOGGER = trt.Logger(trt.Logger.INFO)
# The runtime is bound to a module-level name so it stays alive for as long as
# the engine and execution context created from it are in use.
runtime = trt.Runtime(TRT_LOGGER)
with open("cls.engine", "rb") as f:
    engine = runtime.deserialize_cuda_engine(f.read())
# NOTE(review): deserialization is expected to yield None on failure - confirm
# against the TensorRT Python API reference for the installed version.
if engine is None:
    raise RuntimeError("failed to deserialize cls.engine")
context = engine.create_execution_context()

# Load and preprocess the image
image = cv2.imread("1.png")
if image is None:
    # cv2.imread signals an unreadable/missing file by returning None.
    raise FileNotFoundError("failed to read 1.png")
image = cv2.resize(image, (320, 48))  # Resize the image to match the input size of the model
image = image.transpose((2, 0, 1))  # Change the layout from HWC to CHW
image = image.astype(np.float32)  # Convert the image to float32
image = image / 255  # Normalize the image
# The transpose above returns a non-contiguous view; make the buffer contiguous
# once so the size used for allocation and the data copied are the same object.
image = np.ascontiguousarray(image)

# Allocate device memory for the input image
d_input = cuda.mem_alloc(image.nbytes)
# Allocate device memory for the output (two float32 class scores)
output = np.empty(2, dtype=np.float32)
d_output = cuda.mem_alloc(output.nbytes)

# Transfer the input image to the GPU
cuda.memcpy_htod(d_input, image)
# Perform inference; bindings are passed as raw device addresses
context.execute(1, [int(d_input), int(d_output)])
# Transfer the output back to the host
cuda.memcpy_dtoh(output, d_output)

# Post-process the output if needed
print(output)
# Clean up
how about c++ code
how about c++ code
using namespace nvinfer1;
class Cls {
public:
~Cls();
bool Init(std::string engine_file_path);
int Detect(cv::Mat img);
private:
int cudaInference(std::vector
class SilentLogger : public nvinfer1::ILogger { void log(Severity severity, const char* msg) noexcept override { // 不输出任何logger信息 } };
static Logger gLogger;
Cls::~Cls() { }
int Cls::Detect(cv::Mat img) {
cv::Mat out;
cv::resize(img, out, cv::Size(320, 48));
int img_w = out.cols;
int img_h = out.rows;
std::vector
int Cls::cudaInference(std::vector
void* buffers[2];
const int inputIndex = engine.getBindingIndex(INPUT_BLOB_NAME);
const int outputIndex = engine.getBindingIndex(OUTPUT_BLOB_NAME);
int mBatchSize = engine.getMaxBatchSize();
cudaMalloc(&buffers[inputIndex], prob.size() * sizeof(float));
cudaMalloc(&buffers[outputIndex], 2 * sizeof(float));
cudaStream_t stream;
cudaStreamCreate(&stream);
cudaMemcpy(buffers[inputIndex], prob.data(), prob.size() * sizeof(float), cudaMemcpyHostToDevice);
context->enqueueV2(buffers, 0, nullptr);
float ans[2];
cudaMemcpyAsync(ans, buffers[outputIndex], 2 * sizeof(float), cudaMemcpyDeviceToHost, stream);
cudaStreamSynchronize(stream);
// Release stream and buffers
cudaStreamDestroy(stream);
cudaFree(buffers[inputIndex]);
cudaFree(buffers[outputIndex]);
std::cout << ans[0] << ' ' << ans[1] << '\n';
return ans[0] > ans[1] ? 0 : 1;
}
// Converts an interleaved 3-channel 8-bit image into planar (CHW) floats
// scaled by 1/255.
//
// For every pixel, source channel 2 goes to plane 0, channel 1 to plane 1 and
// channel 0 to plane 2 (i.e. the channel order is reversed).
// NOTE(review): the Python reference script in this thread feeds the channels
// in cv2.imread order without reversing them - confirm which order the model
// was trained with.
//
// img:    source image, read as 3 bytes per pixel.
// output: caller-allocated buffer holding at least 3 * img.rows * img.cols floats.
void Cls::MeanAndNorm(cv::Mat img, float* output) {
    const int w = img.cols;
    const int h = img.rows;
    const int planeSize = h * w;
    for (int y = 0; y < h; ++y) {
        // Fetch the row through the Mat so a padded (non-continuous) row
        // stride is respected as well.
        const uchar* row = img.ptr<uchar>(y);
        for (int x = 0; x < w; ++x) {
            // Row stride of each destination plane is the image WIDTH. Using
            // the height here makes rows overlap and leaves most of the plane
            // unwritten whenever w != h (e.g. the 320x48 input).
            const int dst = y * w + x;
            const uchar* px = row + x * 3;
            output[dst] = px[2] / 255.f;
            output[dst + planeSize] = px[1] / 255.f;
            output[dst + 2 * planeSize] = px[0] / 255.f;
        }
    }
}
// Reads a serialized TensorRT engine from disk and creates the runtime, engine
// and execution context members from it.
//
// engine_file_path: path of the serialized engine file.
// Returns true when all three objects were created; false if the file could
// not be read or any TensorRT creation step returned null.
bool Cls::Init(std::string engine_file_path) {
    std::ifstream file(engine_file_path, std::ios::binary);
    if (!file.good()) {
        return false;
    }

    // Determine the file length, then read the whole file into memory.
    file.seekg(0, file.end);
    const std::streamoff length = file.tellg();
    file.seekg(0, file.beg);
    if (length <= 0) {
        return false;
    }
    const size_t size = static_cast<size_t>(length);
    // std::vector owns the buffer, so every early return below releases it.
    std::vector<char> trtModelStream(size);
    file.read(trtModelStream.data(), static_cast<std::streamsize>(size));
    if (!file) {
        return false;
    }
    file.close();

    // Explicit checks instead of assert(): asserts compile away under NDEBUG,
    // which would let a null pointer be dereferenced in release builds.
    runtime = createInferRuntime(gLogger);
    if (runtime == nullptr) {
        return false;
    }
    engine = runtime->deserializeCudaEngine(trtModelStream.data(), size);
    if (engine == nullptr) {
        return false;
    }
    context = engine->createExecutionContext();
    if (context == nullptr) {
        return false;
    }
    return true;
}
how about modify
for (int y = 0; y < h; ++y) {
for (int x = 0; x < w; ++x) {
output[y * h + x] = (imgData[(y * w + x) * 3 + 2] / 255.f);
output[y * h + x + h * w] = (imgData[(y * w + x) * 3 + 1] / 255.f);
output[y * h + x + 2 * (h * w)] = (imgData[(y * w + x) * 3] / 255.f );
}
}
to
for (int y = 0; y < h; ++y) {
for (int x = 0; x < w; ++x) {
output[y * h + x] = (imgData[(y * w + x) * 3 + 0] / 255.f);
output[y * h + x + h * w] = (imgData[(y * w + x) * 3 + 1] / 255.f);
output[y * h + x + 2 * (h * w)] = (imgData[(y * w + x) * 3+2] / 255.f );
}
}
how about modify
for (int y = 0; y < h; ++y) { for (int x = 0; x < w; ++x) { output[y * h + x] = (imgData[(y * w + x) * 3 + 2] / 255.f); output[y * h + x + h * w] = (imgData[(y * w + x) * 3 + 1] / 255.f); output[y * h + x + 2 * (h * w)] = (imgData[(y * w + x) * 3] / 255.f ); } }
to
for (int y = 0; y < h; ++y) { for (int x = 0; x < w; ++x) { output[y * h + x] = (imgData[(y * w + x) * 3 + 0] / 255.f); output[y * h + x + h * w] = (imgData[(y * w + x) * 3 + 1] / 255.f); output[y * h + x + 2 * (h * w)] = (imgData[(y * w + x) * 3+2] / 255.f ); } }
Tried but no effect. This engine model is used for classifying the text angle. The image loading uses OpenCV. The original code converts the image data from OpenCV to RGB and normalizes it, which is not a problem. I have tried this modification and it does not affect the final result.
how about modify
for (int y = 0; y < h; ++y) { for (int x = 0; x < w; ++x) { output[y * h + x] = (imgData[(y * w + x) * 3 + 2] / 255.f); output[y * h + x + h * w] = (imgData[(y * w + x) * 3 + 1] / 255.f); output[y * h + x + 2 * (h * w)] = (imgData[(y * w + x) * 3] / 255.f ); } }
to
for (int y = 0; y < h; ++y) { for (int x = 0; x < w; ++x) { output[y * h + x] = (imgData[(y * w + x) * 3 + 0] / 255.f); output[y * h + x + h * w] = (imgData[(y * w + x) * 3 + 1] / 255.f); output[y * h + x + 2 * (h * w)] = (imgData[(y * w + x) * 3+2] / 255.f ); } }
cls.zip This is the ONNX model I encountered problems with. You can convert it to an engine to reproduce the issue.
Have you tried to run this with deepstream sdk? Maybe write the custom post-processing and see if even there it does incorrect inference. Since python kind of uses wrapper c++ tensorrt, this is kind of weird. Could you please try deepstream? just use any one of the nvidia's docker image.
TensorRT-8.6.1.6+cuda12.1 GPU RTX3090 24G