NVIDIA / TensorRT

NVIDIA® TensorRT™ is an SDK for high-performance deep learning inference on NVIDIA GPUs. This repository contains the open source components of TensorRT.
https://developer.nvidia.com/tensorrt
Apache License 2.0

[Bug] Tensorrt output incorrect with batch inference #2808

Closed phamkhactu closed 1 year ago

phamkhactu commented 1 year ago

Thanks for this excellent repo.

I get incorrect results when I run inference with batch size > 1.

First of all, I tried running inference on each image individually. The result is correct for every image:

['561694,76', '5,00','561694,76', '5,00','561694,76', '5,00','561694,76', '5,00']

Then I tried batch inference (8 images). Only the first element of the result is correct; the remaining elements are empty:

['561694,76','','', '','', '','', '']

Here is my code:

import time

import cv2
import numpy as np
import pycuda.autoinit  # noqa: F401 -- creates the CUDA context on import
import pycuda.driver as cuda
import tensorrt as trt
import torch
from PIL import Image
from torchvision import transforms

TRT_LOGGER = trt.Logger(trt.Logger.INFO)


class HostDeviceMem(object):
    def __init__(self, host_mem, device_mem):
        self.host = host_mem
        self.device = device_mem

    def __str__(self):
        return "Host:\n" + str(self.host) + "\nDevice:\n" + str(self.device)

    def __repr__(self):
        return self.__str__()

class OCR_Tensorrt:
    def __init__(self, model_path,batch_size=8) -> None:
        self.engine = self._load_model(model_path)
        self.context = self.engine.create_execution_context()

        self.inputs, self.outputs, self.bindings, self.stream = self.allocate_buffers(batch_size)
        self.target_transform =  transforms.Compose([
            transforms.Resize((32, 128), transforms.InterpolationMode.BICUBIC),
            transforms.ToTensor(),
            transforms.Normalize(0.5, 0.5)
        ])

    def _load_model(self, model_path):
        with open(model_path, "rb") as f, trt.Runtime(TRT_LOGGER) as runtime:
            return runtime.deserialize_cuda_engine(f.read())

    def allocate_buffers(self, batch_size=1):
        inputs = []
        outputs = []
        bindings = []
        stream = cuda.Stream()
        for binding in self.engine:
            # With a fixed-shape engine this allocates batch_size copies of the
            # binding's volume, but the engine itself still computes only one batch element.
            size = trt.volume(self.engine.get_binding_shape(binding)) * batch_size
            dtype = trt.nptype(self.engine.get_binding_dtype(binding))
            # Allocate host and device buffers
            host_mem = cuda.pagelocked_empty(size, dtype)
            device_mem = cuda.mem_alloc(host_mem.nbytes)
            # Append the device buffer to device bindings.
            bindings.append(int(device_mem))
            # Append to the appropriate list.
            if self.engine.binding_is_input(binding):
                inputs.append(HostDeviceMem(host_mem, device_mem))
            else:
                outputs.append(HostDeviceMem(host_mem, device_mem))
        return inputs, outputs, bindings, stream

    def do_inference(self, bindings, inputs, outputs, stream, batch_size=1):
        # Transfer input data to the GPU.
        [cuda.memcpy_htod_async(inp.device, inp.host, stream) for inp in inputs]
        # Run inference. Note: execute_async() is the legacy implicit-batch API; for an
        # engine built with EXPLICIT_BATCH its batch_size argument has no effect.
        self.context.execute_async(batch_size=batch_size, bindings=bindings, stream_handle=stream.handle)
        # Transfer predictions back from the GPU.
        [cuda.memcpy_dtoh_async(out.host, out.device, stream) for out in outputs]
        # Synchronize the stream
        stream.synchronize()
        # Return only the host outputs.
        return [out.host for out in outputs]

    def infer_batch(self, images:list):
        transformed_images = []
        for image in images:
            image = Image.fromarray(image).convert("RGB")
            image = self.target_transform(image)
            transformed_images.append(image.detach().cpu().numpy())

        transformed_images = np.array(transformed_images)
        # Copy the whole flattened batch into the page-locked input buffer.
        np.copyto(self.inputs[0].host, transformed_images.ravel())
        outputs = self.do_inference(self.bindings, self.inputs, self.outputs, self.stream, len(images))
        return outputs

    def infer(self, image):
        image = Image.fromarray(image).convert("RGB")
        image = self.target_transform(image)
        np.copyto(self.inputs[0].host, image.ravel())
        outputs = self.do_inference(self.bindings, self.inputs, self.outputs, self.stream)
        return outputs


if __name__ == '__main__':
    lst_imgs=[
        cv2.imread("crop_imgs/0.jpg"),
        cv2.imread("crop_imgs/1.jpg"),
        cv2.imread("crop_imgs/0.jpg"),
        cv2.imread("crop_imgs/1.jpg"),
        cv2.imread("crop_imgs/0.jpg"),
        cv2.imread("crop_imgs/1.jpg"),
        cv2.imread("crop_imgs/0.jpg"),
        cv2.imread("crop_imgs/1.jpg")
        ]

    ocr_trt = OCR_Tensorrt("en_space_not_fintune_86.3990_f32.plan")

    tic = time.time()
    outputs = ocr_trt.infer_batch(lst_imgs)
    outputs = np.reshape(outputs,(8,1,26,95))

    toc = time.time()
    print(toc-tic)
    print("*"*50)

    charset_train = "0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~ "
    tokenizer = Tokenizer(charset_train)  # Tokenizer comes from the OCR project itself (not shown here)

    for output in outputs:
        p = torch.from_numpy(output).softmax(-1)
        predict , probs = tokenizer.decode(p)
        print(predict, probs)

When I ran into this problem, I suspected it was because I had set batch=1 when converting from ONNX to TensorRT:

# Note: TRT_LOGGER, EXPLICIT_BATCH and GiB are defined as in the full script later in this thread.
def build_detec_engine(onnx_path, using_half=True, dynamic_input=True, workspace_size=2, name_input="x",
                min_shape=(1,3,100,100), opt_shape=(1,3,960,960), max_shape=(1,3,1280,1280)):
    trt.init_libnvinfer_plugins(None, '')
    with trt.Builder(TRT_LOGGER) as builder, builder.create_network(EXPLICIT_BATCH) as network, trt.OnnxParser(network, TRT_LOGGER) as parser:
        builder.max_batch_size = 1 # always 1 for explicit batch
        config = builder.create_builder_config()
        config.max_workspace_size = GiB(int(workspace_size))
        if using_half:
            config.set_flag(trt.BuilderFlag.FP16)
        # Load the Onnx model and parse it in order to populate the TensorRT network.
        with open(onnx_path, 'rb') as model:
            if not parser.parse(model.read()):
                print ('ERROR: Failed to parse the ONNX file.')
                for error in range(parser.num_errors):
                    print (parser.get_error(error))
                return None

        if dynamic_input:
            profile = builder.create_optimization_profile()
            profile.set_shape(name_input, min_shape, opt_shape, max_shape) 
            config.add_optimization_profile(profile)
        print("hihihihi")
        return builder.build_serialized_network(network, config) 

def save_engine(serialized_engine, file_name):
    # buf = serialized_engine.serialize()
    with open(file_name, 'wb') as f:
       f.write(serialized_engine)

def load_engine(trt_runtime, plan_path):
   with open(plan_path, 'rb') as f:
       engine_data = f.read()
   engine = trt_runtime.deserialize_cuda_engine(engine_data)
   return engine

def convert_to_rt():

    engine_name = "en_space_not_fintune_86.3990_f32.plan"
    onnx_path = "en_space_not_fintune_86.3990.onnx" 
    batch_size = 16

    model = ModelProto()
    with open(onnx_path, "rb") as f:
        model.ParseFromString(f.read())

    d0 = model.graph.input[0].type.tensor_type.shape.dim[1].dim_value
    d1 = model.graph.input[0].type.tensor_type.shape.dim[2].dim_value
    d2 = model.graph.input[0].type.tensor_type.shape.dim[3].dim_value
    shape = [batch_size, d0, d1, d2]
    print("shape input model:", shape) #[1, 3, -1, -1]
    engine = build_detec_engine(onnx_path=onnx_path, name_input="input", min_shape=(1,3,32,128),opt_shape=(1,3,32,128),max_shape=(1,3,32,128), using_half=False)
    save_engine(engine, engine_name) 

if __name__ == '__main__':
    convert_to_rt()

But at inference time I allocate the buffers for the batch and pass batch_size into do_inference (it comes from infer_batch), so I don't think that causes the problem. Thanks in advance for your help.
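
For context: with an engine built using the EXPLICIT_BATCH flag, execute_async() is the legacy implicit-batch API and its batch_size argument has no effect, so only one batch element is actually computed. Below is a minimal sketch of the explicit-batch inference path, assuming an engine built with a dynamic batch dimension; do_inference_v2 is a hypothetical name, not part of the code above:

# Sketch only: assumes a dynamic-batch engine and the HostDeviceMem buffers above.
def do_inference_v2(context, bindings, inputs, outputs, stream, batch_size):
    # Declare the actual input shape for this call (binding 0 is the "input" tensor).
    context.set_binding_shape(0, (batch_size, 3, 32, 128))
    [cuda.memcpy_htod_async(inp.device, inp.host, stream) for inp in inputs]
    # execute_async_v2 is the explicit-batch API: no batch_size argument.
    context.execute_async_v2(bindings=bindings, stream_handle=stream.handle)
    [cuda.memcpy_dtoh_async(out.host, out.device, stream) for out in outputs]
    stream.synchronize()
    return [out.host for out in outputs]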

zerollzeng commented 1 year ago

Can you try it with polygraphy? see https://github.com/NVIDIA/TensorRT/tree/main/tools/Polygraphy/examples/cli/run/01_comparing_frameworks
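
For reference, a minimal comparison with that tool looks like this (using the ONNX filename from this thread; flags follow the linked example):

polygraphy run en_space_not_fintune_86.3990.onnx --trt --onnxrt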

zerollzeng commented 1 year ago

Or can you share the model here? Thanks!

phamkhactu commented 1 year ago

Hi @zerollzeng

I've set batch_size=8 when converting from ONNX to TensorRT, and the model works well (8 outputs). So if the engine is built with batch_size=1, it cannot run inference with batch_size=8 even though we allocate 8x the buffer memory, because the engine only processes a single 3x32x128 (flattened) buffer.

Do you want to check my model? I can share it.

zerollzeng commented 1 year ago

I've set batch_size=8 when converting from ONNX to TensorRT, and the model works well (8 outputs). So if the engine is built with batch_size=1, it cannot run inference with batch_size=8 even though we allocate 8x the buffer memory, because the engine only processes a single 3x32x128 (flattened) buffer.

If your model has a fixed shape, then this is expected. If your model has a dynamic shape in the batch dimension, then you can set a dynamic shape profile. Please refer to https://docs.nvidia.com/deeplearning/tensorrt/developer-guide/index.html#work_dynamic_shapes
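
Concretely, that means giving the optimization profile a range over the batch dimension instead of min = opt = max at batch 1. A minimal sketch, assuming the build function above with the input binding named "input" and the 3x32x128 image shape; the 1-to-8 batch range is only an example:

# Inside the build function: let the engine accept any batch size from 1 to 8.
profile = builder.create_optimization_profile()
profile.set_shape("input", (1, 3, 32, 128), (8, 3, 32, 128), (8, 3, 32, 128))  # min, opt, max
config.add_optimization_profile(profile)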

phamkhactu commented 1 year ago

@zerollzeng I have another question: when I export the model to ONNX with dynamic input axes, the export succeeds.

dummy_input = torch.randn(1, 3, 32, 128)
dynamic_axes= {'input':[0,2,3]}

torch.onnx.export(model, dummy_input, model_path, verbose=True, input_names=['input'], 
                  output_names=['output'], opset_version=14, dynamic_axes=dynamic_axes)

But when I convert the ONNX model to TensorRT, the parser cannot handle input_dynamic_axes_1 (parse error; see the attached screenshot). I would be very grateful for any guidance on this problem. Thank you very much.
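
One export-side detail worth checking: passing dynamic_axes as a list makes torch.onnx auto-generate axis names such as input_dynamic_axes_1, while the dict form assigns explicit names. A sketch of the same export with named axes (the axis names are arbitrary, and making the output batch dynamic is an assumption):

dummy_input = torch.randn(1, 3, 32, 128)
# Dict form: {tensor name: {axis index: axis name}} instead of a bare list of indices.
dynamic_axes = {'input': {0: 'batch', 2: 'height', 3: 'width'},
                'output': {0: 'batch'}}
torch.onnx.export(model, dummy_input, model_path, verbose=True,
                  input_names=['input'], output_names=['output'],
                  opset_version=14, dynamic_axes=dynamic_axes)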

zerollzeng commented 1 year ago

Can you reproduce the parse error with trtexec? if yes please share the onnx here then I can take a further check. Thanks!
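
For reference, a minimal reproduction attempt would be something like (filename as used later in this thread):

trtexec --onnx=enn_space_not_fintune_86.3990.onnx --verbose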

phamkhactu commented 1 year ago

Can you reproduce the parse error with trtexec? if yes please share the onnx here then I can take a further check. Thanks!

Hi @zerollzeng,

Here is my model.

zerollzeng commented 1 year ago

Passed for me

[03/30/2023-14:06:27] [I] === Performance summary ===
[03/30/2023-14:06:27] [I] Throughput: 500.57 qps
[03/30/2023-14:06:27] [I] Latency: min = 1.99854 ms, max = 2.17212 ms, mean = 2.01351 ms, median = 2.01257 ms, percentile(90%) = 2.02368 ms, percentile(95%) = 2.03467 ms, percentile(99%) = 2.04382 ms
[03/30/2023-14:06:27] [I] Enqueue Time: min = 0.754272 ms, max = 1.53345 ms, mean = 0.940432 ms, median = 0.932678 ms, percentile(90%) = 1.02271 ms, percentile(95%) = 1.04611 ms, percentile(99%) = 1.0979 ms
[03/30/2023-14:06:27] [I] H2D Latency: min = 0.0088501 ms, max = 0.0406494 ms, mean = 0.0184139 ms, median = 0.0200806 ms, percentile(90%) = 0.0227051 ms, percentile(95%) = 0.0231934 ms, percentile(99%) = 0.0334473 ms
[03/30/2023-14:06:27] [I] GPU Compute Time: min = 1.98218 ms, max = 2.14471 ms, mean = 1.99089 ms, median = 1.98865 ms, percentile(90%) = 1.99634 ms, percentile(95%) = 2.01318 ms, percentile(99%) = 2.01727 ms
[03/30/2023-14:06:27] [I] D2H Latency: min = 0.00341797 ms, max = 0.012207 ms, mean = 0.00420898 ms, median = 0.00390625 ms, percentile(90%) = 0.00537109 ms, percentile(95%) = 0.0057373 ms, percentile(99%) = 0.00622559 ms
[03/30/2023-14:06:27] [I] Total Host Walltime: 3.00458 s
[03/30/2023-14:06:27] [I] Total GPU Compute Time: 2.9943 s
[03/30/2023-14:06:27] [I] Explanations of the performance metrics are printed in the verbose logs.
[03/30/2023-14:06:27] [I]
&&&& PASSED TensorRT.trtexec [TensorRT v8601] # trtexec --onnx=enn_space_not_fintune_86.3990.onnx --optShapes=input:1x3x32x128

What error do you see? can you share the error log here?

phamkhactu commented 1 year ago

Passed for me ... What error do you see? can you share the error log here?

Here is my full conversion script:

import tensorrt as trt
from onnx import ModelProto
import os 
os.environ["CUDA_VISIBLE_DEVICES"]="1"

TRT_LOGGER = trt.Logger(trt.Logger.INFO)
EXPLICIT_BATCH = 1 << (int)(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH)

def GiB(val):
    return val * 1 << 30

def build_detec_engine(onnx_path,using_int8= False, using_half=False, dynamic_input=True, workspace_size=2, name_input="x",
                min_shape=(1,3,100,100), opt_shape=(1,3,960,960), max_shape=(1,3,1280,1280)):
    trt.init_libnvinfer_plugins(None, '')
    with trt.Builder(TRT_LOGGER) as builder, builder.create_network(EXPLICIT_BATCH) as network, trt.OnnxParser(network, TRT_LOGGER) as parser:
        builder.max_batch_size = 1 # always 1 for explicit batch
        config = builder.create_builder_config()
        config.max_workspace_size = GiB(int(workspace_size))
        if using_half:
            config.set_flag(trt.BuilderFlag.FP16)
        if using_int8:
            config.set_flag(trt.BuilderFlag.INT8)
        # Load the Onnx model and parse it in order to populate the TensorRT network.
        with open(onnx_path, 'rb') as model:
            if not parser.parse(model.read()):
                print ('ERROR: Failed to parse the ONNX file.')
                for error in range(parser.num_errors):
                    print (parser.get_error(error))
                return None

        if dynamic_input:
            profile = builder.create_optimization_profile()
            profile.set_shape(name_input, min_shape, opt_shape, max_shape) 
            config.add_optimization_profile(profile)
        print("hihihihi")
        return builder.build_serialized_network(network, config) 

def save_engine(serialized_engine, file_name):
    # buf = serialized_engine.serialize()
    with open(file_name, 'wb') as f:
       f.write(serialized_engine)

def load_engine(trt_runtime, plan_path):
   with open(plan_path, 'rb') as f:
       engine_data = f.read()
   engine = trt_runtime.deserialize_cuda_engine(engine_data)
   return engine

def convert_to_rt():

    engine_name = "trt_weights/enn_space_not_fintune_86.3990.plan"
    onnx_path = "onnx_weights/enn_space_not_fintune_86.3990.onnx" 
    batch_size = 1

    model = ModelProto()
    with open(onnx_path, "rb") as f:
        model.ParseFromString(f.read())

    d0 = model.graph.input[0].type.tensor_type.shape.dim[1].dim_value
    d1 = model.graph.input[0].type.tensor_type.shape.dim[2].dim_value
    d2 = model.graph.input[0].type.tensor_type.shape.dim[3].dim_value
    shape = [batch_size, d0, d1, d2]
    print("shape input model:", shape) #[1, 3, -1, -1]
    engine = build_detec_engine(onnx_path=onnx_path, name_input="input", min_shape=(1,3,32,32),opt_shape=(4,3,32,128),max_shape=(8,3,32,150), using_half=True)
    save_engine(engine, engine_name) 

if __name__ == '__main__':
    convert_to_rt()

My error is:

[E] 2: [convBaseBuilder.cpp::createConvolution::247] Error Code 2: Internal Error (Assertion isOpConsistent(convolution.get()) failed. Cask convolution isConsistent check failed.)
[03/31/2023-09:02:37] [TRT] [E] 2: [builder.cpp::buildSerializedNetwork::751] Error Code 2: Internal Error (Assertion engine != nullptr failed. )

However, with

    engine = build_detec_engine(onnx_path=onnx_path, name_input="input", min_shape=(1,3,32,128), opt_shape=(4,3,32,128), max_shape=(8,3,32,128), using_half=True)

the conversion succeeds.

zerollzeng commented 1 year ago

min_shape is invalid.

[04/03/2023-15:27:35] [I] [TRT] BuilderFlag::kTF32 is set but hardware does not support TF32. Disabling TF32.
[04/03/2023-15:27:35] [V] [TRT] /decoder/layers.0/self_attn/MatMul: broadcasting input1 to make tensors conform, dims(input0)=[26,-1,384][NONE] dims(input1)=[1,384,384][NONE].
[04/03/2023-15:27:35] [V] [TRT] /decoder/layers.0/self_attn_1/MatMul: broadcasting input1 to make tensors conform, dims(input0)=[26,-1,384][NONE] dims(input1)=[1,384,384][NONE].
[04/03/2023-15:27:35] [V] [TRT] /decoder/layers.0/cross_attn/MatMul: broadcasting input1 to make tensors conform, dims(input0)=[26,-1,384][NONE] dims(input1)=[1,384,384][NONE].
[04/03/2023-15:27:35] [V] [TRT] /decoder/layers.0/cross_attn/MatMul_1: broadcasting input1 to make tensors conform, dims(input0)=[128,-1,384][NONE] dims(input1)=[1,384,768][NONE].
[04/03/2023-15:27:35] [V] [TRT] /decoder/layers.0/self_attn_1/MatMul_1: broadcasting input1 to make tensors conform, dims(input0)=[26,-1,384][NONE] dims(input1)=[1,384,768][NONE].
[04/03/2023-15:27:35] [V] [TRT] /decoder/layers.0/cross_attn_1/MatMul: broadcasting input1 to make tensors conform, dims(input0)=[26,-1,384][NONE] dims(input1)=[1,384,384][NONE].
[04/03/2023-15:27:35] [V] [TRT] /decoder/layers.0/cross_attn_1/MatMul_1: broadcasting input1 to make tensors conform, dims(input0)=[128,-1,384][NONE] dims(input1)=[1,384,768][NONE].
[04/03/2023-15:27:35] [E] Error[4]: [graphShapeAnalyzer.cpp::analyzeShapes::2013] Error Code 4: Miscellaneous (IElementWiseLayer /encoder/Add: broadcast dimensions must be conformable)
[04/03/2023-15:27:35] [E] Engine could not be created from network
[04/03/2023-15:27:35] [E] Building engine failed
[04/03/2023-15:27:35] [E] Failed to create engine from model or file.
[04/03/2023-15:27:35] [E] Engine set up failed
&&&& FAILED TensorRT.trtexec [TensorRT v8601] # trtexec --onnx=enn_space_not_fintune_86.3990.onnx --verbose --optShapes=input:1x3x32x32
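
Since the fixed 32x128 spatial shape builds fine, a profile that varies only the batch dimension should pass as well; a sketch of the equivalent trtexec invocation (not re-run here):

trtexec --onnx=enn_space_not_fintune_86.3990.onnx --minShapes=input:1x3x32x128 --optShapes=input:4x3x32x128 --maxShapes=input:8x3x32x128
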
phamkhactu commented 1 year ago

min_shape is invalid.

@zerollzeng Thank you.

littleMatch03 commented 6 months ago

@phamkhactu, can you tell me how you fixed this issue? I'm facing the same issue with my model.

phamkhactu commented 6 months ago

Hi @littleMatch03,

At that time, my project was lost because of broken storage, so I can no longer reproduce it. I am sorry for the inconvenience.