Closed. Cuzny closed this issue 1 year ago.
Hi @Cuzny, jetson-inference uses TensorRT underneath to run the DNN inferencing, so the conv3d operation would perform the same either way.
Okay, thanks. Is there anything I need to pay attention to when defining a 3D network? This simple conv3d takes nearly 30 ms... Here is my test code:
#include "NvInfer.h"
#include <iostream>
#include <vector>
#include <math.h>
#include <dirent.h>
#include "cuda_runtime_api.h"
#include "logging.h"
#include "common.h"
#include <map>
#include <fstream>
#include <chrono>
#define DEVICE 0 // GPU id
#define BATCH_SIZE 1 // currently, only support BATCH=1
#define USE_FP32 // set USE_INT8 or USE_FP16 or USE_FP32
#define CHECK(status) \
do\
{\
auto ret = (status);\
if (ret != 0)\
{\
std::cerr << "Cuda failure: " << ret << std::endl;\
abort();\
}\
} while (0)
std::map<std::string, Weights> loadWeights(const std::string file)
{
std::cout << "Loading weights: " << file << std::endl;
std::map<std::string, Weights> weightMap;
// Open weights file
std::ifstream input(file);
assert(input.is_open() && "Unable to load weight file.");
// Read number of weight blobs
int32_t count;
input >> count;
assert(count > 0 && "Invalid weight map file.");
while (count--)
{
Weights wt{DataType::kFLOAT, nullptr, 0};
uint32_t size;
// Read name and type of blob
std::string name;
input >> name >> std::dec >> size;
wt.type = DataType::kFLOAT;
// Load blob
uint32_t* val = reinterpret_cast<uint32_t*>(malloc(sizeof(val) * size));
for (uint32_t x = 0, y = size; x < y; ++x)
{
input >> std::hex >> val[x];
}
wt.values = val;
wt.count = size;
weightMap[name] = wt;
}
return weightMap;
}
ICudaEngine* createEngine(unsigned int maxBatchSize, IBuilder* builder, IBuilderConfig* config, DataType dt)
{
INetworkDefinition* network = builder->createNetworkV2(0U);
ITensor* data = network->addInput(INPUT_BLOB_NAME, dt, Dims4{3, 8, 256, 320});
assert(data);
std::map<std::string, Weights> weightMap = loadWeights("./conv3d.wts");
Weights emptywts{DataType::kFLOAT, nullptr, 0};
IConvolutionLayer* conv1_slow = network->addConvolutionNd(*data, 64, Dims3{1, 7, 7}, weightMap["conv.weight"], emptywts);
conv1_slow->setStrideNd(Dims3{1, 2, 2});
conv1_slow->setPaddingNd(Dims3{0, 3, 3});
assert(conv1_slow);
conv1_slow->getOutput(0)->setName(OUTPUT_BLOB_NAME);
network->markOutput(*conv1_slow->getOutput(0));
// Build engine
builder->setMaxBatchSize(maxBatchSize);
config->setMaxWorkspaceSize(16 << 20);
#ifdef USE_FP16
config->setFlag(BuilderFlag::kFP16);
#endif
ICudaEngine* engine = builder->buildEngineWithConfig(*network, *config);
// Don't need the network any more
network->destroy();
// Release host memory
for (auto& mem : weightMap)
{
free((void*) (mem.second.values));
}
return engine;
}
void doInference(IExecutionContext& context, float* input, float* output, int batchSize)
{
const ICudaEngine& engine = context.getEngine();
// Pointers to input and output device buffers to pass to engine.
// Engine requires exactly IEngine::getNbBindings() number of buffers.
// assert(engine.getNbBindings() == 2);
void* buffers[2];
// In order to bind the buffers, we need to know the names of the input and output tensors.
// Note that indices are guaranteed to be less than IEngine::getNbBindings()
const int inputIndex = engine.getBindingIndex(INPUT_BLOB_NAME);
const int outputIndex = engine.getBindingIndex(OUTPUT_BLOB_NAME);
// Create GPU buffers on device
CHECK(cudaMalloc(&buffers[inputIndex], batchSize *3*8*256*320 * sizeof(float)));
CHECK(cudaMalloc(&buffers[outputIndex], batchSize * 64*8*128*160 * sizeof(float)));
// Create stream
cudaStream_t stream;
CHECK(cudaStreamCreate(&stream));
// DMA input batch data to device, infer on the batch asynchronously, and DMA output back to host
CHECK(cudaMemcpyAsync(buffers[inputIndex], input, batchSize *3*8*256*320* sizeof(float), cudaMemcpyHostToDevice, stream));
context.enqueue(batchSize, buffers, stream, nullptr);
CHECK(cudaMemcpyAsync(output, buffers[outputIndex], batchSize *64*8*128*160* sizeof(float), cudaMemcpyDeviceToHost, stream));
cudaStreamSynchronize(stream);
// Release stream and buffers
cudaStreamDestroy(stream);
CHECK(cudaFree(buffers[inputIndex]));
CHECK(cudaFree(buffers[outputIndex]));
}
int main()
{
cudaSetDevice(DEVICE);
IBuilder* builder = createInferBuilder(gLogger);
IBuilderConfig* config = builder->createBuilderConfig();
// Create model to populate the network, then set the outputs and create an engine
ICudaEngine* engine = createEngine(1, builder, config, DataType::kFLOAT);
assert(engine != nullptr);
IExecutionContext* context = engine->createExecutionContext();
assert(context != nullptr);
std::cout << "Register engine successfully!" << std::endl;
static float data[3*8*256*320];
static float prob_data[64*8*128*160];
for(int i=0; i<30; i++) doInference(*context, data, prob_data, 1);
auto conv3d_start = std::chrono::system_clock::now();
doInference(*context, data, prob_data, 1);
auto conv3d_end = std::chrono::system_clock::now();
std::cout << "cost of conv3d: " << std::chrono::duration_cast<std::chrono::milliseconds>(conv3d_end - conv3d_start).count() << "ms" << std::endl;
// Close everything down
}
Other than enabling INT8/FP16 mode, my only other suggestion would be to decrease the resolution / grid size if you can.
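For reference, a minimal sketch of what those precision flags look like on the builder config (TensorRT 7-era API, matching the code above; the INT8 path additionally needs a calibrator or per-tensor dynamic ranges, which are not shown here):

    // Sketch only: enable reduced precision if the platform supports it
    if (builder->platformHasFastFp16())
        config->setFlag(BuilderFlag::kFP16);

    if (builder->platformHasFastInt8())
    {
        config->setFlag(BuilderFlag::kINT8);
        // INT8 additionally requires calibration data, e.g.:
        // config->setInt8Calibrator(calibrator);  // calibrator not shown here
    }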
I'm confused: my SlowFast network takes almost one second, while it only takes 60+ ms on the PyTorch side. Even if I use FP16, it takes 350+ ms.
It looks like you are doing 30 inferencing runs in your benchmark; are you accounting for that in the total execution time?
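For what it's worth, a small timing sketch that keeps the warm-up separate from the measured runs and reports an average (reusing doInference(), data, and prob_data from the code above) could look like this:

    // Sketch only: warm up first, then average over several measured runs so that
    // one-off CUDA/context initialization is not counted in the reported time
    const int warmupRuns = 10;
    const int timedRuns  = 100;

    for (int i = 0; i < warmupRuns; i++)
        doInference(*context, data, prob_data, 1);

    auto t0 = std::chrono::high_resolution_clock::now();
    for (int i = 0; i < timedRuns; i++)
        doInference(*context, data, prob_data, 1);
    auto t1 = std::chrono::high_resolution_clock::now();

    float totalMs = std::chrono::duration_cast<std::chrono::milliseconds>(t1 - t0).count();
    std::cout << "average conv3d time: " << totalMs / timedRuns << " ms" << std::endl;

Note that doInference() as written also allocates and frees the device buffers and the CUDA stream on every call, so that per-call overhead is included in whatever number gets measured.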
Regardless, the TensorRT forum may be a better venue for discussion about this: https://forums.developer.nvidia.com/c/ai-data-science/deep-learning/tensorrt/92
Thanks, I will go to the TensorRT forum.
Hello, could you please tell me the differences between jetson-inference and the official TensorRT? I use the official TensorRT on my Jetson Xavier NX, but the conv3d operation takes so long....