cyrusbehr / tensorrt-cpp-api

TensorRT C++ API Tutorial
MIT License

Will this project upgrade to the latest API, like "enqueueV3"? #15

Closed feiyuhuahuo closed 1 year ago

feiyuhuahuo commented 1 year ago

Hi, really a nice project. Since "enqueueV2" is deprecated in the latest version of TensorRT, will this project be upgraded to the latest API? I'm learning to deploy ONNX models with TensorRT, but due to the lack of a complete example with the latest API, it's hard for me to write a demo. I hope I can get some help here.

cyrusbehr commented 1 year ago

@feiyuhuahuo I wasn't aware there was a new API, thanks for bringing this to my attention. Yes, I will upgrade my implementation when I have some time over the next few weeks.

cyrusbehr commented 1 year ago

Hi @feiyuhuahuo, I have upgraded to the latest TensorRT API (8.6) and now use enqueueV3.

As of right now, the changes are in the v3.0 branch, available here. They will be merged into main once some other changes have been completed.
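For anyone migrating their own code, the main difference from enqueueV2 is that there is no longer a bindings array: each I/O tensor's device buffer is registered on the execution context by name before enqueueing. A minimal sketch of the pattern (the tensor names and the inputDevice / outputDevice buffers below are placeholders, not code from the v3.0 branch):

// Minimal enqueueV3 sketch: assumes the engine, context, CUDA stream, and
// device buffers inputDevice / outputDevice have already been created and sized.
context->setTensorAddress("input", inputDevice);   // "input" is a placeholder tensor name
context->setTensorAddress("output", outputDevice); // "output" is a placeholder tensor name
bool ok = context->enqueueV3(stream);              // replaces enqueueV2(bindings, stream, nullptr)
cudaStreamSynchronize(stream);                     // wait for completion before reading results on the host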

feiyuhuahuo commented 1 year ago

Hi, I tried this project and can get the result successfully. But when I try to run inference with my own engine, I just can't get the result. Could you help me take a look at my code?

#include <cassert>
#include <chrono>
#include <fstream>
#include <iostream>
#include <string>
#include <vector>
#include <NvInfer.h>
#include <NvOnnxParser.h>
#include <logger.h>
#include "cuda_runtime_api.h"
#include <opencv2/opencv.hpp>

using namespace std;
using namespace nvinfer1;
using namespace nvonnxparser;
using namespace sample;
using namespace cv;

Logger logger;

ICudaEngine *deserialize_engine(string path) // Read the serialized engine from file and deserialize it
{
    IRuntime *runtime = createInferRuntime(logger);
    ICudaEngine *engine = nullptr;

    // Open the engine file
    ifstream file(path, std::ios::binary);
    if (file.good())
    {
        // Get the file size
        file.seekg(0, file.end);
        size_t size = file.tellg();
        file.seekg(0, file.beg);

        // Allocate a buffer for the serialized engine
        vector<char> trtModelStream(size);
        assert(trtModelStream.data());

        // Read the file contents
        file.read(trtModelStream.data(), size);
        file.close();

        // Deserialize the engine
        engine = runtime->deserializeCudaEngine(trtModelStream.data(), size);
    }

    return engine;
}

vector<int32_t> mat2vector(string img_path, int img_w, int img_h)
{
    Mat img_src = imread(img_path, IMREAD_COLOR);
    Mat img_resized;

    resize(img_src, img_resized, Size(img_w, img_h), InterpolationFlags::INTER_LINEAR);

    vector<int32_t> img_vector = vector<int32_t>(img_resized.reshape(1, 1)); // Must be flattened to one dimension, otherwise the vector cannot be constructed
    return img_vector;
}

#define CHECK(status)                                          \
    do                                                         \
    {                                                          \
        auto ret = (status);                                   \
        if (ret != 0)                                          \
        {                                                      \
            std::cerr << "Cuda failure: " << ret << std::endl; \
            abort();                                           \
        }                                                      \
    } while (0)

struct OneIO
{
    const char *name;
    Dims dim;
    uint64_t pixel_num;
    nvinfer1::DataType dtype;
    uint64_t size;
};

int main()
{
    ICudaEngine *engine = deserialize_engine("test_int32.engine");
    assert(engine != nullptr);

    int32_t io_tensor_num = engine->getNbIOTensors();
    cout << "IO tensor count: " << io_tensor_num << endl;

    const int bs = 1;
    vector<OneIO> all_ios;

    for (size_t i = 0; i < io_tensor_num; ++i)
    {
        OneIO one_io;
        const char *name = engine->getIOTensorName(i);
        one_io.name = name;
        cout << "IO name " << i << ":  " << name;

        Dims dim = engine->getTensorShape(name);
        one_io.dim = dim;
        uint64_t pixel_num = 1;
        cout << "  shape: ";
        for (size_t j = 0; j < dim.nbDims; ++j)
        {
            cout << dim.d[j] << ", ";
            pixel_num *= dim.d[j];
        }
        one_io.pixel_num = pixel_num;

        nvinfer1::DataType dtype = engine->getTensorDataType(name);
        one_io.dtype = dtype;
        cout << "dtype: " << static_cast<int32_t>(dtype) << "  dtype size: " << sizeof(dtype);

        uint64_t size = pixel_num * sizeof(dtype);
        one_io.size = size;
        cout << "  pixel num: " << pixel_num << "  all size: " << size << endl;

        all_ios.push_back(one_io);
    }

    IExecutionContext *context = engine->createExecutionContext();
    assert(context != nullptr);

    int32_t output_data[all_ios[1].pixel_num];
    for (int i = 0; i < all_ios[1].pixel_num; ++i)
    {
        output_data[i] = 999;
    }

    for (int i = 0; i < 1; i++)
    {
        auto start = chrono::system_clock::now();

        vector<int32_t> img_vector = mat2vector("test2.bmp", all_ios[0].dim.d[1], all_ios[0].dim.d[0]);
        assert(img_vector.size() == all_ios[0].pixel_num);

        int32_t *img_data = img_vector.data();

        auto end = chrono::system_clock::now();
        auto time_read_img = chrono::duration_cast<chrono::milliseconds>(end - start).count();
        cout << "read img time:  " << time_read_img << endl;

        start = chrono::system_clock::now();

        void *input_mem;
        void *output_mem1;

        CHECK(cudaMalloc(&input_mem, all_ios[0].size));
        CHECK(cudaMalloc(&output_mem1, all_ios[1].size));

        cudaStream_t stream;
        CHECK(cudaStreamCreate(&stream));

        cudaMemcpyAsync(input_mem, img_data, all_ios[0].size, cudaMemcpyHostToDevice, stream);

        context->setTensorAddress(all_ios[0].name, img_data);
        context->setTensorAddress(all_ios[1].name, output_mem1);

        bool status = context->enqueueV3(stream);
        cout << "enqueue status  " << status << endl;

        cudaMemcpyAsync(output_data, output_mem1, all_ios[1].size, cudaMemcpyDeviceToHost, stream);
        cout << "one pixel  "<< output_data[0] << endl;

        cudaStreamSynchronize(stream);
        cudaStreamDestroy(stream);

        // CHECK(cudaFree(input_mem));
        // CHECK(cudaFree(output_mem1));

        // end = std::chrono::system_clock::now();
        // time_infer = std::chrono::duration_cast<std::chrono::milliseconds>(end - start).count() + time_infer;
        // std::cout << std::chrono::duration_cast<std::chrono::milliseconds>(end - start).count() << "ms" << std::endl;
    }

    // // Destroy the engine
    // context->destroy();
    // engine->destroy();
    // runtime->destroy();

    return 0;
}

I checked the values in output_data and they are still 999, which is not a possible value for the model's inference output. I just don't know where I went wrong. I put the engine file and the image here (https://drive.google.com/drive/folders/1N-cscZ0uReMm2x2MuLd9OSjqNNgUAKka?usp=sharing). 💐 for your kind help.