grimoire / torch2trt_dynamic

A PyTorch to TensorRT converter with dynamic shape support
MIT License

Error number of channels in input tensor to a convolution layer must not be dynamic #27

Open Kracozebr opened 2 years ago

Kracozebr commented 2 years ago

Hi. I'm trying to use torch2trt_dynamic to convert models from PyTorch to TensorRT and ran into an unexpected issue. To reproduce it I use a LeNet5-like neural network.

from torch2trt_dynamic import torch2trt_dynamic
import torch
from lenet5 import LeNet5

model = LeNet5()
model_path = 'models/lenet5_mnist.pt'
model_to_save = 'models/lenet5_mnist_trt.pt'

# loading the model and getting model parameters by using load_state_dict
model.load_state_dict(torch.load(model_path))
model.eval().cuda()

# create example data
x = torch.ones((1, 1, 32, 32)).cuda()

# convert to TensorRT feeding sample data as input
opt_shape_param = [
    [
        [5, 1, 16, 16],   # min
        [5, 1, 32, 32],   # opt
        [5, 1, 64, 64]    # max
    ]
]
model_trt = torch2trt_dynamic(model, [x], fp16_mode=False, opt_shape_param=opt_shape_param)

x = torch.rand(1,1,32,32).cuda()
with torch.no_grad():
    y = model(x)
    y_trt = model_trt(x)

# check the output against PyTorch
print(torch.max(torch.abs(y - y_trt)))

I get the following error:

[TensorRT] ERROR: (Unnamed Layer* 31) [Convolution]: number of channels in input tensor to a convolution layer must not be dynamic
[TensorRT] ERROR: Builder failed while analyzing shapes.
Traceback (most recent call last):
  File "example_converter.py", line 39, in <module>
    y_trt = model_trt(x)

I tried to work around it and found that it works if I change opt_shape_param to:

opt_shape_param = [
    [
        [5, 1, 32, 32],   # min
        [5, 1, 32, 32],   # opt
        [5, 1, 32, 32]    # max
    ]
]

@grimoire So I do not understand why this error occurs: in your example, opt_shape_param also has different shapes for height and width, doesn't it?

Environment:
TensorRT Version: 7.1.3
GPU Type: GeForce RTX 2080 Ti
Nvidia Driver Version: 470.103.01
CUDA Version: 10.2
CUDNN Version: 8.0.5
Operating System + Version: Ubuntu 18.04
Python Version: 3.6.9
PyTorch Version: 1.8.1
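For reference, a minimal isolation test (hypothetical; it uses the same torch2trt_dynamic call as above) would be to convert only the convolutional body, without the flatten/Linear head, against the same dynamic profile:

import torch
import torch.nn as nn
from torch2trt_dynamic import torch2trt_dynamic

# convolutional body only, no flatten / Linear head (hypothetical test model)
body = nn.Sequential(
    nn.Conv2d(in_channels=1, out_channels=6, kernel_size=5),
    nn.ReLU(inplace=True),
    nn.MaxPool2d(kernel_size=2),
    nn.Conv2d(in_channels=6, out_channels=16, kernel_size=5),
    nn.ReLU(inplace=True),
    nn.MaxPool2d(kernel_size=2),
).eval().cuda()

x = torch.ones((1, 1, 32, 32)).cuda()
opt_shape_param = [
    [
        [5, 1, 16, 16],   # min
        [5, 1, 32, 32],   # opt
        [5, 1, 64, 64]    # max
    ]
]
# if this converts, the convolutions alone accept dynamic H/W
body_trt = torch2trt_dynamic(body, [x], fp16_mode=False, opt_shape_param=opt_shape_param)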

grimoire commented 2 years ago

Hi, according to the error log you provided, is there some structure like torch.nn.functional.conv2d(input, weight, ...) in your model where weight is not a static parameter?
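A minimal sketch of the kind of structure being asked about (the module and names are hypothetical): the convolution weight is computed at runtime and passed to the functional conv instead of living in a fixed nn.Conv2d parameter.

import torch
import torch.nn as nn
import torch.nn.functional as F

class RuntimeWeightConv(nn.Module):
    # hypothetical example of a convolution whose weight is not a static parameter
    def __init__(self):
        super().__init__()
        # produces a (6, 1, 5, 5) weight tensor from a runtime input
        self.weight_gen = nn.Linear(8, 6 * 1 * 5 * 5)

    def forward(self, x, code):
        weight = self.weight_gen(code).view(6, 1, 5, 5)  # weight depends on `code`
        return F.conv2d(x, weight)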

Kracozebr commented 2 years ago

Thanks for the fast reply. No, there is no such structure. Here is the code of the model:

import torch.nn as nn

class LeNet5(nn.Module):
    def __init__(self):
        super().__init__()

        self._body = nn.Sequential(
            nn.Conv2d(in_channels=1, out_channels=6, kernel_size=5),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(kernel_size=2),
            nn.Conv2d(in_channels=6, out_channels=16, kernel_size=5),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(kernel_size=2),
        )
        self._head = nn.Sequential(
            nn.Linear(in_features=16 * 5 * 5, out_features=120),
            nn.ReLU(inplace=True),
            nn.Linear(in_features=120, out_features=84),
            nn.ReLU(inplace=True),
            nn.Linear(in_features=84, out_features=10)
        )

    def forward(self, x):
        x = self._body(x)
        x = x.view(x.size()[0], -1)
        x = self._head(x)
        return x

I think it is due to the TensorRT version. I also built the model directly with the TensorRT API, and when I run inference there is an error. The code:

import torch
import torch.nn as nn

import common
import numpy as np

import time

import tensorrt as trt

TRT_LOGGER = trt.Logger(trt.Logger.WARNING)

BATCH_SIZE = 8

class LeNet5(nn.Module):
    def __init__(self):
        super().__init__()
        self._body = nn.Sequential(
            nn.Conv2d(in_channels=1, out_channels=6, kernel_size=5), 
            nn.ReLU(inplace=True),
            nn.MaxPool2d(kernel_size=2),

            nn.Conv2d(in_channels=6, out_channels=16, kernel_size=5),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(kernel_size=2),
        )

        self._head = nn.Sequential(
            nn.Linear(in_features=16 * 5 * 5, out_features=120), 
            nn.ReLU(inplace=True),
            nn.Linear(in_features=120, out_features=84), 
            nn.ReLU(inplace=True),
            nn.Linear(in_features=84, out_features=10)
        )

    def forward(self, x):
        x = self._body(x)
        x = x.view(x.size()[0], -1)
        x = self._head(x)
        return x

# initialize the model
lenet5_model = LeNet5()

class ModelData(object):
    INPUT_NAME = "data"
    MODEL_PATH = 'models/lenet5_mnist.pt'
    INPUT_SHAPE = (-1, 1, -1, -1)
    OUTPUT_NAME = "prob"
    DTYPE = trt.float32

class LeNet5TRT(object):
    def __init__(self, weights) -> None:
        super().__init__()
        self.weights = weights
        self.engine = self.build_engine()

    def populate_network(self):
        # Configure the network layers based on the self.weights provided.
        input_tensor = self.network.add_input(name=ModelData.INPUT_NAME, dtype=ModelData.DTYPE, shape=ModelData.INPUT_SHAPE)

        # body
        _body_conv1_w = self.weights['_body.0.weight'].numpy()
        _body_conv1_b = self.weights['_body.0.bias'].numpy()
        _body_conv1 = self.network.add_convolution(input=input_tensor,
                                             num_output_maps=6,
                                             kernel_shape=(5, 5),
                                             kernel=_body_conv1_w ,
                                             bias=_body_conv1_b)
        _body_conv1.stride = (1, 1)
        _body_conv1.padding = (0, 0)

        _body_relu1 = self.network.add_activation(
                                            input=_body_conv1.get_output(0), 
                                            type=trt.ActivationType.RELU)

        _body_maxpool1 = self.network.add_pooling(input=_body_relu1.get_output(0), 
                                        type=trt.PoolingType.MAX, 
                                        window_size=(2, 2))

        _body_conv2_w = self.weights['_body.3.weight'].numpy()
        _body_conv2_b = self.weights['_body.3.bias'].numpy()
        _body_conv2 = self.network.add_convolution(input=_body_maxpool1.get_output(0),
                                             num_output_maps=16,
                                             kernel_shape=(5, 5),
                                             kernel=_body_conv2_w,
                                             bias=_body_conv2_b)
        _body_conv2.stride = (1, 1)
        _body_conv2.padding = (0, 0)

        _body_relu2 = self.network.add_activation(
                                            input=_body_conv2.get_output(0), 
                                            type=trt.ActivationType.RELU)
        _body_maxpool2 = self.network.add_pooling(input=_body_relu2.get_output(0), 
                                        type=trt.PoolingType.MAX, 
                                        window_size=(2, 2))

        # head
        _head_linear1_w = self.weights['_head.0.weight'].numpy()
        _head_linear1_b = self.weights['_head.0.bias'].numpy()
        _head_linear1 = self.network.add_fully_connected(
                                                         input=_body_maxpool2.get_output(0),
                                                         num_outputs=120,
                                                         kernel=_head_linear1_w,
                                                         bias=_head_linear1_b)
        _head_relu1 = self.network.add_activation(
                                                  input=_head_linear1.get_output(0),
                                                  type=trt.ActivationType.RELU)

        _head_linear2_w = self.weights['_head.2.weight'].numpy()
        _head_linear2_b = self.weights['_head.2.bias'].numpy()
        _head_linear2 = self.network.add_fully_connected(
                                                         input=_head_relu1.get_output(0),
                                                         num_outputs=84,
                                                         kernel=_head_linear2_w,
                                                         bias=_head_linear2_b)

        _head_relu2 = self.network.add_activation(
                                                  input=_head_linear2.get_output(0),
                                                  type=trt.ActivationType.RELU)

        _head_linear3_w = self.weights['_head.4.weight'].numpy()
        _head_linear3_b = self.weights['_head.4.bias'].numpy()
        _head_linear3 = self.network.add_fully_connected(
                                                         input=_head_relu2.get_output(0),
                                                         num_outputs=10,
                                                         kernel=_head_linear3_w,
                                                         bias=_head_linear3_b)

        _head_linear3.get_output(0).name = "prob"
        self.network.mark_output(tensor=_head_linear3.get_output(0))

    def GiB(self, val):
        return val * 1 << 30

    def build_engine(self):
        logger= trt.Logger(trt.Logger.INFO)

        with trt.Builder(logger) as builder:
                builder.max_batch_size = BATCH_SIZE

                network_flag = 1 << int(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH)

                with builder.create_network(network_flag) as net:
                    self.network = net
                    self.populate_network()
                    self.network.get_input(0).dtype=trt.DataType.HALF
                    self.network.get_output(0).dtype=trt.DataType.HALF
                    # we set the inputs and outputs to be float16 type to enable
                    # maximum fp16 acceleration. Also helps for int8

                    config=builder.create_builder_config()
                    # we specify all the important parameters like precision, 
                    # device type, fallback in config object

                    config.max_workspace_size = 1 << 30  # 1 GiB

                    config.set_flag(trt.BuilderFlag.GPU_FALLBACK)
                    config.set_flag(trt.BuilderFlag.FP16)

                    config.profiling_verbosity = trt.ProfilingVerbosity.VERBOSE
                    #building with verbose profiling helps debug the engine if there are
                    #errors in inference output. Does not impact throughput.
                    profile = builder.create_optimization_profile()
                    profile.set_shape(ModelData.INPUT_NAME, 
                                     (BATCH_SIZE, 1, 16, 16), 
                                     (BATCH_SIZE, 1, 32, 32), 
                                     (BATCH_SIZE, 1, 64, 64)) 
                    config.add_optimization_profile(profile)

                    return builder.build_engine(net, config)

def load_random_test_case(pagelocked_buffer):
    # Select an image at random to be the test case.
    img = np.random.rand(BATCH_SIZE,1,32,32).astype(np.float32)
    # Copy to the pagelocked input buffer
    np.copyto(pagelocked_buffer, img.ravel())
    return img

def main():
    common.add_help(description="Yeah!")
    # Get the PyTorch weights
    lenet5_model = LeNet5()
    lenet5_model.eval()
    lenet5_model.load_state_dict(torch.load(ModelData.MODEL_PATH))
    weights = lenet5_model.state_dict()

    # Do inference with TensorRT.
    with LeNet5TRT(weights).engine as engine:
        # Build an engine, allocate buffers and create a stream.
        # For more information on buffer allocation, refer to the introductory samples.
        with open('models/lenet5_mnist.trt', "wb") as f:
            f.write(engine.serialize())

        with open('models/lenet5_mnist.trt', "rb") as f, trt.Runtime(TRT_LOGGER) as runtime:
            engine = runtime.deserialize_cuda_engine(f.read())
            inputs, outputs, bindings, stream = common.allocate_buffers(engine, batch_size=1)
            with engine.create_execution_context() as context:
                t = 0
                for _ in range(1):
                    img = load_random_test_case(pagelocked_buffer=inputs[0].host)
                    # For more information on performing inference, refer to the introductory samples.
                    # The common.do_inference function will return a list of outputs 
                    a = time.time()
                    context.set_binding_shape(0, (BATCH_SIZE, 1, 32, 32))
                    context.active_optimization_profile = 0
                    pred_trt = common.do_inference_v2(context, bindings=bindings, inputs=inputs, 
                                                      outputs=outputs, stream=stream)

                    t += time.time() - a

        with torch.no_grad():   
            pred_torch = lenet5_model.cuda()(torch.from_numpy(img).cuda())
            print('baseline: ', pred_torch.cpu().numpy())
        print(np.asarray(pred_trt, dtype=np.float32).shape)
        print('output:   ', np.asarray(pred_trt, dtype=np.float32).reshape((BATCH_SIZE, 10)))
        print('diff:    ', torch.max(torch.abs(pred_torch.cpu() - 
                                     torch.as_tensor(np.asarray(pred_trt, dtype=np.float32).reshape((BATCH_SIZE, 10))))))
    print('Time: ', t)

if __name__ == '__main__':
    main()

And the error is:

[TensorRT] ERROR: Try increasing the workspace size with IBuilderConfig::setMaxWorkspaceSize() if using IBuilder::buildEngineWithConfig, or IBuilder::setMaxWorkspaceSize() if using IBuilder::buildCudaEngine.
[TensorRT] ERROR: ../builder/tacticOptimizer.cpp (1715) - TRTInternal Error in computeCosts: 0 (Could not find any implementation for node (Unnamed Layer* 6) [Fully Connected] + (Unnamed Layer* 7) [Activation].)
[TensorRT] ERROR: ../builder/tacticOptimizer.cpp (1715) - TRTInternal Error in computeCosts: 0 (Could not find any implementation for node (Unnamed Layer* 6) [Fully Connected] + (Unnamed Layer* 7) [Activation].)
Traceback (most recent call last):
  File "torch_inf.py", line 261, in <module>
    main()
  File "torch_inf.py", line 224, in main
    with LeNet5TRT(weights).engine as engine:
AttributeError: __enter__

When I change the input shape and profile shape to:

INPUT_SHAPE = (-1, 1, 32, 32)

profile.set_shape(ModelData.INPUT_NAME,
                  (BATCH_SIZE, 1, 32, 32),
                  (BATCH_SIZE, 1, 32, 32),
                  (BATCH_SIZE, 1, 32, 32))

Then it works properly. So it is possible that this error is due to an old version of TensorRT. I wrote to the NVIDIA team and they answered:

There were similar known issues in older TRT versions, and those are fixed in later versions. We recommend you use the latest JetPack version, which comes with TRT 8.
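For reference, a quick way to check which TensorRT version the Python environment is actually picking up:

import tensorrt as trt
print(trt.__version__)  # 7.1.3 in the setup above; NVIDIA recommends TRT 8+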