Closed. Cuzny closed this issue 1 year ago.
Hi @Cuzny, jetson-inference uses TensorRT underneath to run the DNN inferencing, so the conv3d operation would perform the same either way.
Okay, thanks. Is there anything I need to pay attention to when defining a 3D network? This simple conv3d takes nearly 30 ms... Here is my test code:
#include "NvInfer.h"
#include <iostream>
#include <vector>
#include <math.h>
#include <dirent.h>
#include "cuda_runtime_api.h"
#include "logging.h"
#include "common.h"
#include <map>
#include <fstream>
#include <chrono>
#define DEVICE 0 // GPU id
#define BATCH_SIZE 1 // currently, only support BATCH=1
#define USE_FP32 // set USE_INT8 or USE_FP16 or USE_FP32
#define CHECK(status) \
do\
{\
auto ret = (status);\
if (ret != 0)\
{\
std::cerr << "Cuda failure: " << ret << std::endl;\
abort();\
}\
} while (0)
std::map<std::string, Weights> loadWeights(const std::string file)
{
std::cout << "Loading weights: " << file << std::endl;
std::map<std::string, Weights> weightMap;
// Open weights file
std::ifstream input(file);
assert(input.is_open() && "Unable to load weight file.");
// Read number of weight blobs
int32_t count;
input >> count;
assert(count > 0 && "Invalid weight map file.");
while (count--)
{
Weights wt{DataType::kFLOAT, nullptr, 0};
uint32_t size;
// Read name and type of blob
std::string name;
input >> name >> std::dec >> size;
wt.type = DataType::kFLOAT;
// Load blob
uint32_t* val = reinterpret_cast<uint32_t*>(malloc(sizeof(val) * size));
for (uint32_t x = 0, y = size; x < y; ++x)
{
input >> std::hex >> val[x];
}
wt.values = val;
wt.count = size;
weightMap[name] = wt;
}
return weightMap;
}
ICudaEngine* createEngine(unsigned int maxBatchSize, IBuilder* builder, IBuilderConfig* config, DataType dt)
{
INetworkDefinition* network = builder->createNetworkV2(0U);
ITensor* data = network->addInput(INPUT_BLOB_NAME, dt, Dims4{3, 8, 256, 320});
assert(data);
std::map<std::string, Weights> weightMap = loadWeights("./conv3d.wts");
Weights emptywts{DataType::kFLOAT, nullptr, 0};
IConvolutionLayer* conv1_slow = network->addConvolutionNd(*data, 64, Dims3{1, 7, 7}, weightMap["conv.weight"], emptywts);
conv1_slow->setStrideNd(Dims3{1, 2, 2});
conv1_slow->setPaddingNd(Dims3{0, 3, 3});
assert(conv1_slow);
conv1_slow->getOutput(0)->setName(OUTPUT_BLOB_NAME);
network->markOutput(*conv1_slow->getOutput(0));
// Build engine
builder->setMaxBatchSize(maxBatchSize);
config->setMaxWorkspaceSize(16 << 20);
#ifdef USE_FP16
config->setFlag(BuilderFlag::kFP16);
#endif
ICudaEngine* engine = builder->buildEngineWithConfig(*network, *config);
// Don't need the network any more
network->destroy();
// Release host memory
for (auto& mem : weightMap)
{
free((void*) (mem.second.values));
}
return engine;
}
void doInference(IExecutionContext& context, float* input, float* output, int batchSize)
{
const ICudaEngine& engine = context.getEngine();
// Pointers to input and output device buffers to pass to engine.
// Engine requires exactly IEngine::getNbBindings() number of buffers.
// assert(engine.getNbBindings() == 2);
void* buffers[2];
// In order to bind the buffers, we need to know the names of the input and output tensors.
// Note that indices are guaranteed to be less than IEngine::getNbBindings()
const int inputIndex = engine.getBindingIndex(INPUT_BLOB_NAME);
const int outputIndex = engine.getBindingIndex(OUTPUT_BLOB_NAME);
// Create GPU buffers on device
CHECK(cudaMalloc(&buffers[inputIndex], batchSize *3*8*256*320 * sizeof(float)));
CHECK(cudaMalloc(&buffers[outputIndex], batchSize * 64*8*128*160 * sizeof(float)));
// Create stream
cudaStream_t stream;
CHECK(cudaStreamCreate(&stream));
// DMA input batch data to device, infer on the batch asynchronously, and DMA output back to host
CHECK(cudaMemcpyAsync(buffers[inputIndex], input, batchSize *3*8*256*320* sizeof(float), cudaMemcpyHostToDevice, stream));
context.enqueue(batchSize, buffers, stream, nullptr);
CHECK(cudaMemcpyAsync(output, buffers[outputIndex], batchSize *64*8*128*160* sizeof(float), cudaMemcpyDeviceToHost, stream));
cudaStreamSynchronize(stream);
// Release stream and buffers
cudaStreamDestroy(stream);
CHECK(cudaFree(buffers[inputIndex]));
CHECK(cudaFree(buffers[outputIndex]));
}
int main()
{
cudaSetDevice(DEVICE);
IBuilder* builder = createInferBuilder(gLogger);
IBuilderConfig* config = builder->createBuilderConfig();
// Create model to populate the network, then set the outputs and create an engine
ICudaEngine* engine = createEngine(1, builder, config, DataType::kFLOAT);
assert(engine != nullptr);
IExecutionContext* context = engine->createExecutionContext();
assert(context != nullptr);
std::cout << "Register engine successfully!" << std::endl;
static float data[3*8*256*320];
static float prob_data[64*8*128*160];
for(int i=0; i<30; i++) doInference(*context, data, prob_data, 1);
auto conv3d_start = std::chrono::system_clock::now();
doInference(*context, data, prob_data, 1);
auto conv3d_end = std::chrono::system_clock::now();
std::cout << "cost of conv3d: " << std::chrono::duration_cast<std::chrono::milliseconds>(conv3d_end - conv3d_start).count() << "ms" << std::endl;
// Close everything down
}
Other than enabling INT8/FP16 mode, my only other suggestion would be to decrease the resolution / grid size if you can.
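For reference, a minimal sketch of what those precision flags look like on the builder config (TensorRT 7-era API, matching the code above; the INT8 path additionally needs a calibrator or per-tensor dynamic ranges, which are not shown here):

    // Sketch only: enable reduced precision if the platform supports it
    if (builder->platformHasFastFp16())
        config->setFlag(BuilderFlag::kFP16);

    if (builder->platformHasFastInt8())
    {
        config->setFlag(BuilderFlag::kINT8);
        // INT8 additionally requires calibration data, e.g.:
        // config->setInt8Calibrator(calibrator);  // calibrator not shown here
    }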
I'm confused: my SlowFast network takes almost one second, while it only takes 60+ ms on the PyTorch side. Even if I use FP16, it takes 350+ ms.
It looks like you are doing 30 inferencing runs in your benchmark; are you accounting for that in the total execution time?
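For what it's worth, a small timing sketch that keeps the warm-up separate from the measured runs and reports an average (reusing doInference(), data, and prob_data from the code above) could look like this:

    // Sketch only: warm up first, then average over several measured runs so that
    // one-off CUDA/context initialization is not counted in the reported time
    const int warmupRuns = 10;
    const int timedRuns  = 100;

    for (int i = 0; i < warmupRuns; i++)
        doInference(*context, data, prob_data, 1);

    auto t0 = std::chrono::high_resolution_clock::now();
    for (int i = 0; i < timedRuns; i++)
        doInference(*context, data, prob_data, 1);
    auto t1 = std::chrono::high_resolution_clock::now();

    float totalMs = std::chrono::duration_cast<std::chrono::milliseconds>(t1 - t0).count();
    std::cout << "average conv3d time: " << totalMs / timedRuns << " ms" << std::endl;

Note that doInference() as written also allocates and frees the device buffers and the CUDA stream on every call, so that per-call overhead is included in whatever number gets measured.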
Regardless, the TensorRT forum may be a better venue for discussion about this: https://forums.developer.nvidia.com/c/ai-data-science/deep-learning/tensorrt/92
Thanks, I will go to the TensorRT forum.
Hello, could you please tell me the differences between jetson-inference and the official TensorRT? I use the official TensorRT on my Jetson Xavier NX, but the conv3d operation takes so long....