NVIDIA / TensorRT

NVIDIA® TensorRT™ is an SDK for high-performance deep learning inference on NVIDIA GPUs. This repository contains the open source components of TensorRT.
https://developer.nvidia.com/tensorrt
Apache License 2.0
10.68k stars 2.12k forks source link

dynamic gather layer: 8.4.0.6 fails, 8.0.3.4 ok #2099

Closed zhoutianzi666 closed 11 months ago

zhoutianzi666 commented 2 years ago

Description

Environment

TensorRT Version: 8406 NVIDIA GPU: T4 NVIDIA Driver Version: CUDA Version: CUDNN Version: Operating System: Ubuntu 18 Python Version (if applicable): Tensorflow Version (if applicable): PyTorch Version (if applicable): Baremetal or Container (if so, version):

Relevant Files

Steps To Reproduce


#include "/usr/local/tensorrt/include/NvInfer.h"
#include <vector>
#include <iostream>
// g++ A.cc -lnvinfer -lcudart  -L /usr/local/tensorrt/lib/ -L/usr/local/cuda/lib64 -I /usr/local/cuda/include
// export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/tensorrt/lib

// Combine the TensorRT version components into a single integer,
// e.g. 8406 for TensorRT 8.4.0.6.
// Parenthesize the full expansion so the macro remains correct when it is
// embedded in a larger expression (an unparenthesized sum would bind wrong
// next to *, /, <<, etc.).
#define TRT_VERSION                                      \
  (NV_TENSORRT_MAJOR * 1000 + NV_TENSORRT_MINOR * 100 + \
   NV_TENSORRT_PATCH * 10 + NV_TENSORRT_BUILD)

#include "/usr/local/cuda/include/cuda_runtime.h"
class TensorrtLogger : public nvinfer1::ILogger {
  nvinfer1::ILogger::Severity verbosity_;

 public:
  TensorrtLogger(Severity verbosity = Severity::kWARNING)
      : verbosity_(verbosity) {}
  void log(Severity severity, const char* msg) noexcept override {
    if (severity <= verbosity_) {
        printf("%s\n", msg);
  }
  }
};

int main()
{
    std::cout << TRT_VERSION  << std::endl;
    static TensorrtLogger trt_logger(nvinfer1::ILogger::Severity::kWARNING);
    auto trt_builder = nvinfer1::createInferBuilder(trt_logger);
    std::cout << "trt_builder->getNbDLACores():" <<  trt_builder->getNbDLACores() << std::endl;
    std::cout << "trt_builder->platformHasFastFp16()" << trt_builder->platformHasFastFp16() << std::endl;
    const auto explicitBatch = 1U << static_cast<uint32_t>(nvinfer1::NetworkDefinitionCreationFlag::kEXPLICIT_BATCH);
    auto trt_network = trt_builder->createNetworkV2(explicitBatch);
    auto trt_config = trt_builder->createBuilderConfig();
    trt_config->setMaxWorkspaceSize(1<<30);

    nvinfer1::Dims a;
    a.nbDims = 4;
    a.d[0] = -1;
    a.d[1] = -1;
    a.d[2] = -1;
    a.d[3] = -1;
    nvinfer1::Dims b;
    b.nbDims = 1;
    b.d[0] = -1;
    trt_network->addInput("foo0", nvinfer1::DataType::kFLOAT, a);
    trt_network->addInput("foo1", nvinfer1::DataType::kINT32, b);

nvinfer1::IOptimizationProfile* profile1 = trt_builder->createOptimizationProfile();
profile1->setDimensions("foo0", nvinfer1::OptProfileSelector::kMIN, nvinfer1::Dims4(2, 4, 4, 32));
profile1->setDimensions("foo0", nvinfer1::OptProfileSelector::kOPT, nvinfer1::Dims4(16, 64, 16, 32));
profile1->setDimensions("foo0", nvinfer1::OptProfileSelector::kMAX, nvinfer1::Dims4(128, 256, 64, 32));
nvinfer1::Dims c;
c.nbDims = 1;
c.d[0] = 1;
profile1->setDimensions("foo1", nvinfer1::OptProfileSelector::kMIN, c);
c.d[0] = 2;
profile1->setDimensions("foo1", nvinfer1::OptProfileSelector::kOPT, c);
c.d[0] = 4;
profile1->setDimensions("foo1", nvinfer1::OptProfileSelector::kMAX, c);
trt_config->addOptimizationProfile(profile1);

    auto x = trt_network->getInput(0);
    auto y = trt_network->getInput(1);
    auto layer = trt_network->addGather(*x, *y, 0);
    trt_network->markOutput(*layer->getOutput(0));

    auto engine = trt_builder->buildEngineWithConfig(*trt_network, *trt_config);
    auto engine_out_dims = engine->getBindingDimensions(0);
    for (int i = 0 ; i < engine_out_dims.nbDims; i++)
    {
        std::cout << engine_out_dims.d[i] << std::endl;
    }
}
8406
trt_builder->getNbDLACores():0
trt_builder->platformHasFastFp16()1
TensorRT was linked against cuBLAS/cuBLAS LT 11.8.0 but loaded cuBLAS/cuBLAS LT 11.4.1
TensorRT was linked against cuDNN 8.3.2 but loaded cuDNN 8.1.1
Myelin graph with multiple dynamic values may have poor performance if they differ. Dynamic values are: 
 (# 0 (SHAPE foo1))
 (# 2 (SHAPE foo0))
 (# 0 (SHAPE foo0))
 (# 1 (SHAPE foo0))
Skipping tactic 0 due to insuficient memory on requested size of 4294968320 detected for tactic 0.
10: [optimizer.cpp::computeCosts::2033] Error Code 10: Internal Error (Could not find any implementation for node {ForeignNode[(Unnamed Layer* 0) [Gather]]}.)
8034
trt_builder->getNbDLACores():0
trt_builder->platformHasFastFp16()1
TensorRT was linked against cuBLAS/cuBLAS LT 11.5.1 but loaded cuBLAS/cuBLAS LT 11.4.1
TensorRT was linked against cuDNN 8.2.1 but loaded cuDNN 8.1.1
Detected invalid timing cache, setup a local cache instead
TensorRT was linked against cuBLAS/cuBLAS LT 11.5.1 but loaded cuBLAS/cuBLAS LT 11.4.1
TensorRT was linked against cuDNN 8.2.1 but loaded cuDNN 8.1.1
-1
-1
-1
32
nvpohanh commented 2 years ago

@zhoutianzi666 Could you set the max workspace size to 4GB or 5GB? trt_config->setMaxWorkspaceSize(5<<30);

zhoutianzi666 commented 2 years ago

@zhoutianzi666 Could you set the max workspace size to 4GB or 5GB? trt_config->setMaxWorkspaceSize(5<<30);

5L<<30 works, but 4L<<30 still fails. (Note the 64-bit literal is required: a plain `5<<30` overflows a 32-bit int.) Why does the Gather layer need so much workspace in TRT 8.4?

nvpohanh commented 2 years ago

Long story short: In TRT 8.4, we use a new backend for Gather layer so that it can be fused with other point-wise layers and achieve better performance. However, that backend requires significant more workspace when there are multiple dynamic axes. We are currently fixing this issue so that hopefully this can be solved in next version or next-next version

ttyio commented 11 months ago

closing legacy issues, thanks all!