NVIDIA / TensorRT

NVIDIA® TensorRT™ is an SDK for high-performance deep learning inference on NVIDIA GPUs. This repository contains the open source components of TensorRT.
https://developer.nvidia.com/tensorrt
Apache License 2.0

Network has dynamic or shape inputs, but no optimization profile has been defined. #1116

Closed 1icas closed 3 years ago

1icas commented 3 years ago

Environment

TensorRT Version: 7.2.2.3
NVIDIA GPU: 1080TI
NVIDIA Driver Version: 450.102.04
CUDA Version: 11.0
CUDNN Version: 11.0
Operating System: Ubuntu
Python Version (if applicable): 3.8
Tensorflow Version (if applicable):
PyTorch Version (if applicable): 1.8
Baremetal or Container (if so, version):

Relevant Files

#include "BatchStream.h"
#include "EntropyCalibrator.h"
#include "argsParser.h"
#include "buffers.h"
#include "common.h"
#include "logger.h"
#include "parserOnnxConfig.h"

#include "NvInfer.h"
#include <cuda_runtime_api.h>
#include <random>

#include <opencv2/opencv.hpp>

const std::string gSampleName = "TensorRT.sample_dynamic_reshape";

//! \brief The SampleDynamicReshape class implements the dynamic reshape sample.
//!
//! \details This class builds one engine that resizes a given input to the correct size, and a
//! second engine based on an ONNX MNIST model that generates a prediction.
//!
class SampleDynamicReshape
{
    template <typename T>
    using SampleUniquePtr = std::unique_ptr<T, samplesCommon::InferDeleter>;

public:
    SampleDynamicReshape(const samplesCommon::OnnxSampleParams& params)
        : mParams(params)
    {
    }

    //!
    //! \brief Builds both engines.
    //!
    bool build();

    //!
    //! \brief Prepares the model for inference by creating execution contexts and allocating buffers.
    //!
    bool prepare();

    //!
    //! \brief Runs inference using TensorRT on a random image.
    //!
    bool infer();

private:
    bool buildPreprocessorEngine(const SampleUniquePtr<nvinfer1::IBuilder>& builder);
    bool buildPredictionEngine(const SampleUniquePtr<nvinfer1::IBuilder>& builder);
    Dims loadPGMFile(const std::string& fileName);
    bool validateOutput();

    samplesCommon::OnnxSampleParams mParams; //!< The parameters for the sample.

    nvinfer1::Dims mPredictionInputDims;  //!< The dimensions of the input of the MNIST model.
    nvinfer1::Dims mPredictionOutputDims; //!< The dimensions of the output of the MNIST model.

    // Engines used for inference. The first is used for resizing inputs, the second for prediction.
    SampleUniquePtr<nvinfer1::ICudaEngine> mPreprocessorEngine{nullptr}, mPredictionEngine{nullptr};

    SampleUniquePtr<nvinfer1::IExecutionContext> mPreprocessorContext{nullptr}, mPredictionContext{nullptr};

    samplesCommon::ManagedBuffer mInput{};          //!< Host and device buffers for the input.
    samplesCommon::DeviceBuffer mPredictionInput{}; //!< Device buffer for the output of the preprocessor, i.e. the
                                                    //!< input to the prediction model.
    samplesCommon::ManagedBuffer mOutput{};         //!< Host and device buffers for the output.

    template <typename T>
    SampleUniquePtr<T> makeUnique(T* t)
    {
        return SampleUniquePtr<T>{t};
    }
};

//!
//! \brief Builds the two engines required for inference.
//!
//! \details This function creates one TensorRT engine for resizing inputs to the correct sizes,
//!          then creates a TensorRT network by parsing the ONNX model and builds
//!          an engine that will be used to run inference (mPredictionEngine).
//!
//! \return Returns false if there is an error building the preprocessor or prediction engine.
//!
bool SampleDynamicReshape::build()
{
    auto builder = makeUnique(nvinfer1::createInferBuilder(sample::gLogger.getTRTLogger()));
    if (!builder)
    {
        sample::gLogError << "Create inference builder failed." << std::endl;
        return false;
    }
    // This function will also set mPredictionInputDims and mPredictionOutputDims,
    // so it needs to be called before building the preprocessor.
    return buildPredictionEngine(builder) && buildPreprocessorEngine(builder);
}
bool SampleDynamicReshape::buildPreprocessorEngine(const SampleUniquePtr<nvinfer1::IBuilder>& builder)
{
    std::cout << "can't enter this code" << std::endl;
    // Create the preprocessor engine using a network that supports full dimensions (createNetworkV2).
    auto preprocessorNetwork = makeUnique(
        builder->createNetworkV2(1U << static_cast<uint32_t>(NetworkDefinitionCreationFlag::kEXPLICIT_BATCH)));
    if (!preprocessorNetwork)
    {
        sample::gLogError << "Create network failed." << std::endl;
        return false;
    }

    // Reshape a dynamically shaped input to the size expected by the prediction model (mPredictionInputDims).
    auto input = preprocessorNetwork->addInput("input", nvinfer1::DataType::kFLOAT, Dims4{1, 3, -1, -1});
    auto resizeLayer = preprocessorNetwork->addResize(*input);
    resizeLayer->setOutputDimensions(mPredictionInputDims);
    preprocessorNetwork->markOutput(*resizeLayer->getOutput(0));

    // Finally, configure and build the preprocessor engine.
    auto preprocessorConfig = makeUnique(builder->createBuilderConfig());
    if (!preprocessorConfig)
    {
        sample::gLogError << "Create builder config failed." << std::endl;
        return false;
    }

    // Create an optimization profile so that we can specify a range of input dimensions.
    auto profile = builder->createOptimizationProfile();
    // This profile will be valid for all images whose size falls in the range [(1, 3, 640, 640), (1, 3, 2000, 2000)],
    // but TensorRT will optimize for (1, 3, 840, 840).
    // We do not need to check the return of setDimension and addOptimizationProfile here as all dims are explicitly set
    profile->setDimensions(input->getName(), OptProfileSelector::kMIN, Dims4{1, 3, 640, 640});
    profile->setDimensions(input->getName(), OptProfileSelector::kOPT, Dims4{1, 3, 840, 840});
    profile->setDimensions(input->getName(), OptProfileSelector::kMAX, Dims4{1, 3, 2000, 2000});
    preprocessorConfig->addOptimizationProfile(profile);

    // Create a calibration profile.
    auto profileCalib = builder->createOptimizationProfile();
    const int calibBatchSize{1};
    // We do not need to check the return of setDimension and setCalibrationProfile here as all dims are explicitly set
    profileCalib->setDimensions(input->getName(), OptProfileSelector::kMIN, Dims4{calibBatchSize, 3, 840, 840});
    profileCalib->setDimensions(input->getName(), OptProfileSelector::kOPT, Dims4{calibBatchSize, 3, 840, 840});
    profileCalib->setDimensions(input->getName(), OptProfileSelector::kMAX, Dims4{calibBatchSize, 3, 840, 840});
    preprocessorConfig->setCalibrationProfile(profileCalib);
    std::unique_ptr<IInt8Calibrator> calibrator;
    if (mParams.int8)
    {
        preprocessorConfig->setFlag(BuilderFlag::kINT8);
        const int nCalibBatches{10};
        MNISTBatchStream calibrationStream(
            calibBatchSize, nCalibBatches, "train-images-idx3-ubyte", "train-labels-idx1-ubyte", mParams.dataDirs);
        calibrator.reset(
            new Int8EntropyCalibrator2<MNISTBatchStream>(calibrationStream, 0, "MNISTPreprocessor", "input"));
        preprocessorConfig->setInt8Calibrator(calibrator.get());
    }

    mPreprocessorEngine = makeUnique(builder->buildEngineWithConfig(*preprocessorNetwork, *preprocessorConfig));
    if (!mPreprocessorEngine)
    {
        sample::gLogError << "Preprocessor engine build failed." << std::endl;
        return false;
    }
    sample::gLogInfo << "Profile dimensions in preprocessor engine:" << std::endl;
    sample::gLogInfo << "    Minimum = " << mPreprocessorEngine->getProfileDimensions(0, 0, OptProfileSelector::kMIN)
                     << std::endl;
    sample::gLogInfo << "    Optimum = " << mPreprocessorEngine->getProfileDimensions(0, 0, OptProfileSelector::kOPT)
                     << std::endl;
    sample::gLogInfo << "    Maximum = " << mPreprocessorEngine->getProfileDimensions(0, 0, OptProfileSelector::kMAX)
                     << std::endl;
    return true;
}

bool SampleDynamicReshape::buildPredictionEngine(const SampleUniquePtr<nvinfer1::IBuilder>& builder)
{
    // Create a network using the parser.
    const auto explicitBatch = 1U << static_cast<uint32_t>(NetworkDefinitionCreationFlag::kEXPLICIT_BATCH);
    auto network = makeUnique(builder->createNetworkV2(explicitBatch));
    if (!network)
    {
        sample::gLogError << "Create network failed." << std::endl;
        return false;
    }

    auto parser = samplesCommon::infer_object(nvonnxparser::createParser(*network, sample::gLogger.getTRTLogger()));
    bool parsingSuccess = parser->parseFromFile(mParams.onnxFileName.c_str(),
        static_cast<int>(sample::gLogger.getReportableSeverity()));
    if (!parsingSuccess)
    {
        sample::gLogError << "Failed to parse model." << std::endl;
        return false;
    }

    // Attach a softmax layer to the end of the network.
    //auto softmax = network->addSoftMax(*network->getOutput(0));
    // Set softmax axis to 1 since network output has shape [1, 10] in full dims mode
    //softmax->setAxes(1 << 1);
    //network->unmarkOutput(*network->getOutput(0));
    //network->markOutput(*softmax->getOutput(0));

    // Get information about the inputs/outputs directly from the model.
    mPredictionInputDims = network->getInput(0)->getDimensions();
    mPredictionOutputDims = network->getOutput(0)->getDimensions();

    std::cout << mPredictionInputDims.d[2] << std::endl;
    auto nbOutputs = network->getNbOutputs();
    std::cout << "output numbers: " << nbOutputs << std::endl;
    // Create a builder config
    auto config = makeUnique(builder->createBuilderConfig());
    if (!config)
    {
        sample::gLogError << "Create builder config failed." << std::endl;
        return false;
    }
    config->setMaxWorkspaceSize(16_MiB);

    auto profileCalib = builder->createOptimizationProfile();
    const auto inputName = mParams.inputTensorNames[0].c_str();
    const int calibBatchSize{1};
    std::cout << inputName << std::endl;
    // We do not need to check the return of setDimension and setCalibrationProfile here as all dims are explicitly set
    profileCalib->setDimensions(inputName, OptProfileSelector::kMIN, Dims4{calibBatchSize, 3, 640, 640});
    profileCalib->setDimensions(inputName, OptProfileSelector::kOPT, Dims4{calibBatchSize, 3, 840, 840});
    profileCalib->setDimensions(inputName, OptProfileSelector::kMAX, Dims4{calibBatchSize, 3, 2000, 2000});
//    const int a = 1;
//    profileCalib->setShapeValues(inputName, OptProfileSelector::kMIN, &a, 1);
//    profileCalib->setShapeValues(inputName, OptProfileSelector::kOPT, &a, 1);
//    profileCalib->setShapeValues(inputName, OptProfileSelector::kMAX, &a, 1);
    auto nn = config->setCalibrationProfile(profileCalib);
    std::cout << nn << std::endl;
    std::cout << mParams.int8 << std::endl;

    // Build the prediction engine.
    mPredictionEngine = makeUnique(builder->buildEngineWithConfig(*network, *config));
    if (!mPredictionEngine)
    {
        sample::gLogError << "Prediction engine build failed." << std::endl;
        return false;
    }
    std::cout << "hello ---" << std::endl;
    return true;
}
bool SampleDynamicReshape::prepare()
{
    mPreprocessorContext = makeUnique(mPreprocessorEngine->createExecutionContext());
    if (!mPreprocessorContext)
    {
        sample::gLogError << "Preprocessor context build failed." << std::endl;
        return false;
    }

    mPredictionContext = makeUnique(mPredictionEngine->createExecutionContext());
    if (!mPredictionContext)
    {
        sample::gLogError << "Prediction context build failed." << std::endl;
        return false;
    }

    // Since input dimensions are not known ahead of time, we only allocate the output buffer and preprocessor output
    // buffer.
    mPredictionInput.resize(mPredictionInputDims);
    mOutput.hostBuffer.resize(mPredictionOutputDims);
    mOutput.deviceBuffer.resize(mPredictionOutputDims);
    return true;
}

bool SampleDynamicReshape::infer()
{
    // Load a random PGM file into a host buffer, then copy to device.
    std::random_device rd{};
    std::default_random_engine generator{rd()};
    std::uniform_int_distribution<int> digitDistribution{0, 9};
    int digit = digitDistribution(generator);

    Dims inputDims = loadPGMFile("test");
    mInput.deviceBuffer.resize(inputDims);
    CHECK(cudaMemcpy(
        mInput.deviceBuffer.data(), mInput.hostBuffer.data(), mInput.hostBuffer.nbBytes(), cudaMemcpyHostToDevice));

    // Set the input size for the preprocessor
    CHECK_RETURN_W_MSG(mPreprocessorContext->setBindingDimensions(0, inputDims), false, "Invalid binding dimensions.");

    // We can only run inference once all dynamic input shapes have been specified.
    if (!mPreprocessorContext->allInputDimensionsSpecified())
    {
        return false;
    }

    // Run the preprocessor to resize the input to the correct shape
    std::vector<void*> preprocessorBindings = {mInput.deviceBuffer.data(), mPredictionInput.data()};
    // For engines using full dims, we can use executeV2, which does not include a separate batch size parameter.
    bool status = mPreprocessorContext->executeV2(preprocessorBindings.data());
    if (!status)
    {
        return false;
    }

    // Next, run the model to generate a prediction.
    std::vector<void*> predictionBindings = {mPredictionInput.data(), mOutput.deviceBuffer.data()};
    status = mPredictionContext->executeV2(predictionBindings.data());
    if (!status)
    {
        return false;
    }

    // Copy the outputs back to the host and verify the output.
    CHECK(cudaMemcpy(mOutput.hostBuffer.data(), mOutput.deviceBuffer.data(), mOutput.deviceBuffer.nbBytes(),
        cudaMemcpyDeviceToHost));
    return validateOutput();
}

Dims SampleDynamicReshape::loadPGMFile(const std::string& fileName)
{

    // Despite the name, this loads a JPEG with OpenCV and converts it to a float image in [-1, 1].
    cv::Mat img = cv::imread("./1389.jpg");
    cv::Mat dst;
    img.convertTo(dst, CV_32F, 1.0 / 127.5, -1.0);
    float* input_data = (float*)dst.data;
    auto inputC = dst.channels(); // dst.dims returns the number of matrix dimensions, not the channel count
    auto inputW = dst.cols;
    auto inputH = dst.rows;
    Dims4 inputDims{1, inputC, inputH, inputW};
    // Normalize and copy to the host buffer.
    mInput.hostBuffer.resize(inputDims);
    float* hostDataBuffer = static_cast<float*>(mInput.hostBuffer.data());
    //std::transform(fileData.begin(), fileData.end(), hostDataBuffer,
    //    [](uint8_t x) { return 1.0 - static_cast<float>(x / 255.0); });
    for (int c = 0; c < inputC; ++c) {
        for(int j = 0, volChl=inputH*inputW; j < volChl; ++j) {
            hostDataBuffer[c*volChl + j] = input_data[j*inputC + c];
        }
    }
    return inputDims;
}

bool SampleDynamicReshape::validateOutput()
{
    const float* bufRaw = static_cast<const float*>(mOutput.hostBuffer.data());
    //std::vector<float> prob(bufRaw, bufRaw + mOutput.hostBuffer.size());
    std::cout <<  mOutput.hostBuffer.size() << std::endl;

    return true;
}

samplesCommon::OnnxSampleParams initializeSampleParams(const samplesCommon::Args& args)
{
    samplesCommon::OnnxSampleParams params;
    params.onnxFileName = "detect_test.onnx";
    params.inputTensorNames.push_back("input");
    params.outputTensorNames.push_back("output");
    return params;
}

//!
//! \brief Prints the help information for running this sample
//!
void printHelpInfo()
{
    std::cout << "Usage: ./sample_dynamic_reshape [-h or --help] [-d or --datadir=<path to data directory>]"
              << std::endl;
    std::cout << "--help, -h      Display help information" << std::endl;
    std::cout << "--datadir       Specify path to a data directory, overriding the default. This option can be used "
                 "multiple times to add multiple directories. If no data directories are given, the default is to use "
                 "(data/samples/mnist/, data/mnist/)"
              << std::endl;
    std::cout << "--int8          Run in Int8 mode." << std::endl;
    std::cout << "--fp16          Run in FP16 mode." << std::endl;
}

int main(int argc, char** argv)
{
    samplesCommon::Args args;
    bool argsOK = samplesCommon::parseArgs(args, argc, argv);
    if (!argsOK)
    {
        sample::gLogError << "Invalid arguments" << std::endl;
        printHelpInfo();
        return EXIT_FAILURE;
    }
    if (args.help)
    {
        printHelpInfo();
        return EXIT_SUCCESS;
    }

    auto sampleTest = sample::gLogger.defineTest(gSampleName, argc, argv);

    sample::gLogger.reportTestStart(sampleTest);

    SampleDynamicReshape sample{initializeSampleParams(args)};

    if (!sample.build())
    {
        return sample::gLogger.reportFail(sampleTest);
    }
    if (!sample.prepare())
    {
        return sample::gLogger.reportFail(sampleTest);
    }
    if (!sample.infer())
    {
        return sample::gLogger.reportFail(sampleTest);
    }

    return sample::gLogger.reportPass(sampleTest);
}

Steps To Reproduce and Description

I modified the code from https://github.com/NVIDIA/TensorRT/tree/master/samples/opensource/sampleDynamicReshape . I don't know why the code has a bug:

[03/12/2021-17:53:57] [E] [TRT] Network has dynamic or shape inputs, but no optimization profile has been defined.
[03/12/2021-17:53:57] [E] [TRT] Network validation failed.
[03/12/2021-17:53:57] [E] Prediction engine build failed.

PS: detect_test.onnx is generated using the following code:

import torch
import torchvision
import torch.onnx
from model_retina import RetinaFaceResNetBackbone
from torch.autograd import Variable

# An instance of your model
model = RetinaFaceResNetBackbone()  # torchvision.models.resnet18()
#model.load_state_dict({k.replace('module.',''):v for k,v in torch.load("./lite.pth", map_location="cpu").items()}, strict=False)
# An example input you would normally provide to your model's forward() method
x = Variable(torch.rand(1, 3, 840, 840))

dynamic_ax = {'input' : {2 : 'image_height', 3 : 'image_width'}, 'output' : {2 : 'image_height', 3 : 'image_width'}}

input_name = ['input']
output_name = ['output']

# Export the model
torch_out = torch.onnx._export(model, x, "detect_test.onnx", input_names=input_name, output_names=output_name, verbose=True, dynamic_axes=dynamic_ax)

And I can use detect_test.onnx with the Python TensorRT API for dynamic input, but the C++ TensorRT code does not work. I am very confused.

ttyio commented 3 years ago

Hello @1icas, sorry for the delayed response. It seems you have not called addOptimizationProfile in buildPredictionEngine; could you check? Thanks!
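
For reference, a minimal sketch of the missing step, written against the buildPredictionEngine code posted above; the dimension ranges are copied from the calibration profile that code already sets and are an assumption to adapt to the model's real range:

    // Inside buildPredictionEngine, before calling buildEngineWithConfig:
    // a calibration profile alone is not enough; a network with dynamic input
    // shapes also needs at least one regular optimization profile on the config.
    auto profile = builder->createOptimizationProfile();
    const auto inputName = mParams.inputTensorNames[0].c_str();
    profile->setDimensions(inputName, OptProfileSelector::kMIN, Dims4{1, 3, 640, 640});
    profile->setDimensions(inputName, OptProfileSelector::kOPT, Dims4{1, 3, 840, 840});
    profile->setDimensions(inputName, OptProfileSelector::kMAX, Dims4{1, 3, 2000, 2000});
    config->addOptimizationProfile(profile);

    // Only after the profile has been added:
    mPredictionEngine = makeUnique(builder->buildEngineWithConfig(*network, *config));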

ttyio commented 3 years ago

Closing since there has been no activity for more than 3 weeks. Please reopen if you still have questions, thanks!

chenjun2hao commented 3 years ago

@ttyio @rajeevsrao, I am hitting the same problem:

[09/07/2021-15:54:19] [W] [TRT] /home/nvidia/TensorRT/parsers/onnx/onnx2trt_utils.cpp:220: Your ONNX model has been generated with INT64 weights, while TensorRT does not natively support INT64. Attempting to cast down to INT32.
[09/07/2021-15:54:19] [W] [TRT] Tensor DataType is determined at build time for tensors not marked as input or output.
[09/07/2021-15:54:19] [E] [TRT] Network has dynamic or shape inputs, but no optimization profile has been defined.
[09/07/2021-15:54:19] [E] [TRT] Network validation failed.
[09/07/2021-15:54:19] [E] Engine creation failed
[09/07/2021-15:54:19] [E] Engine set up failed

However, the same model works well on my PC, but on TX2 and NX it fails with trtexec. This is my model: https://drive.google.com/file/d/1s36DFVYA0xftf_ihDaVhsxZjEXIaf7g-/view?usp=sharing

ttyio commented 3 years ago

Hello @chenjun2hao, what is your command line for running this model? Try:

      trtexec --onnx=*.onnx --best --optShapes='inputx':1x3x480x640 
chenjun2hao commented 3 years ago

@ttyio , the command line is:

./trtexec --onnx=DDRNet23_OCR_17Class_stable_BN2_dynamic.onnx --saveEngine=DDRNet23_OCR_17Class_stable_BN2_dynamicf16.trt --workspace=64 --minShapes=inputx:1x3x480x640 --optShapes=inputx:16x3x480x640 --maxShapes=inputx:32x3x480x640 --fp16

It works fine on a PC with a 3080 and TensorRT 7.2.2.3,

but on TX2 and NX the same model and command print the error: Network has dynamic or shape inputs, but no optimization profile has been defined. I checked the source code, and it seems the profile is configured.

chenjun2hao commented 3 years ago

@ttyio I just tested your command on the TX2. It gives the same problem...

ttyio commented 3 years ago

@chenjun2hao, I am not sure if there is a bug in trtexec; TRT is 7.1.3 on Jetson. Have you tried the TRT API to build the engine? Thanks.

chenjun2hao commented 3 years ago

@ttyio, I only use trtexec, not the TRT API. How do I use the TRT API to build the engine from ONNX? Thanks.
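
(For reference, a rough sketch of building an engine from ONNX with the C++ API, following the same pattern as the sample code earlier in this issue; the tensor name "inputx", the shape range, and the workspace size are taken from the trtexec command above and are assumptions to adapt, with error checking and object destruction omitted for brevity:)

#include "NvInfer.h"
#include "NvOnnxParser.h"
#include "logger.h" // sample::gLogger from the TensorRT samples

using namespace nvinfer1;

ICudaEngine* buildEngineFromOnnx(const char* onnxPath)
{
    auto builder = createInferBuilder(sample::gLogger.getTRTLogger());
    const auto explicitBatch = 1U << static_cast<uint32_t>(NetworkDefinitionCreationFlag::kEXPLICIT_BATCH);
    auto network = builder->createNetworkV2(explicitBatch);

    // Parse the ONNX file into the network definition.
    auto parser = nvonnxparser::createParser(*network, sample::gLogger.getTRTLogger());
    if (!parser->parseFromFile(onnxPath, static_cast<int>(ILogger::Severity::kWARNING)))
    {
        return nullptr;
    }

    auto config = builder->createBuilderConfig();
    config->setMaxWorkspaceSize(64 * (1 << 20)); // 64 MiB, as in the trtexec run above

    // This is the step trtexec performs via --minShapes/--optShapes/--maxShapes.
    auto profile = builder->createOptimizationProfile();
    profile->setDimensions("inputx", OptProfileSelector::kMIN, Dims4{1, 3, 480, 640});
    profile->setDimensions("inputx", OptProfileSelector::kOPT, Dims4{16, 3, 480, 640});
    profile->setDimensions("inputx", OptProfileSelector::kMAX, Dims4{32, 3, 480, 640});
    config->addOptimizationProfile(profile);

    return builder->buildEngineWithConfig(*network, *config);
}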

ttyio commented 3 years ago

@chenjun2hao

I have just tried a TX2 and cannot reproduce the failure. So maybe you are using an old JetPack version; there was once a bug that needed --explicitBatch as a workaround. Could you try adding --explicitBatch to your command? Thanks!

chenjun2hao commented 3 years ago

@ttyio Same error as well! This is my error and my command:

[09/09/2021-16:02:40] [W] [TRT] /home/nvidia/TensorRT/parsers/onnx/onnx2trt_utils.cpp:220: Your ONNX model has been generated with INT64 weights, while TensorRT does not natively support INT64. Attempting to cast down to INT32.
[09/09/2021-16:02:40] [W] [TRT] Tensor DataType is determined at build time for tensors not marked as input or output.
[09/09/2021-16:02:40] [E] [TRT] Network has dynamic or shape inputs, but no optimization profile has been defined.
[09/09/2021-16:02:40] [E] [TRT] Network validation failed.
[09/09/2021-16:02:40] [E] Engine creation failed
[09/09/2021-16:02:40] [E] Engine set up failed
&&&& FAILED TensorRT.trtexec # ./trtexec_debug --onnx=/home/nvidia/chenjun/model/DDRNet23_OCR_17Class_stable_BN2_dynamic.onnx --saveEngine=/home/nvidia/chenjun/model/DDRNet23_OCR_17Class_stable_BN2_dynamicf16.trt --minShapes=inputx:1x3x244x244 --optShapes=inputx:16x3x244x244 --maxShapes=inputx:32x3x244x244 --workspace=32 --explicitBatch

My JetPack version is JetPack 4.4 [L4T 32.4.3].

ttyio commented 3 years ago

@chenjun2hao, sorry, I cannot tell from your log. Could you try the trtexec shipped with the system instead of your debug build? If that still fails, maybe you need to upgrade your JetPack.

chenjun2hao commented 3 years ago

@ttyio OK, I will try.

chenjun2hao commented 3 years ago

@ttyio, I have solved this problem. The solution is to use the trtexec in the /usr/src/tensorrt/bin folder on the TX2 or NX. I had built the project myself, and that build produced the error.