TensorRT inference problem

Przemyslaw5 commented 1 month ago

I trained the pytorch model according to the instruction in the repository and then exported it to onnx and performed photo inference on that model:

import os 
import sys
sys.path.insert(0, os.path.join(os.path.dirname(os.path.abspath(__file__)), '..'))

import argparse
import numpy as np 

from src.core import YAMLConfig

import torch
import torch.nn as nn 

def main(args, ):
    """main
    """
    cfg = YAMLConfig(args.config, resume=args.resume)

    if args.resume:
        checkpoint = torch.load(args.resume, map_location='cpu') 
        if 'ema' in checkpoint:
            state = checkpoint['ema']['module']
        else:
            state = checkpoint['model']
    else:
        raise AttributeError('only support resume to load model.state_dict by now.')

    # NOTE load train mode state -> convert to deploy mode
    cfg.model.load_state_dict(state)

    class Model(nn.Module):
        def __init__(self, ) -> None:
            super().__init__()
            self.model = cfg.model.deploy()
            self.postprocessor = cfg.postprocessor.deploy()
            print(self.postprocessor.deploy_mode)

        def forward(self, images, orig_target_sizes):
            outputs = self.model(images)
            return self.postprocessor(outputs, orig_target_sizes)

    model = Model()

    dynamic_axes = {
        'images': {0: 'N', },
        'orig_target_sizes': {0: 'N'}
    }

    data = torch.rand(1, 3, 640, 640)
    size = torch.tensor([[640, 640]])

    torch.onnx.export(
        model, 
        (data, size), 
        args.file_name,
        input_names=['images', 'orig_target_sizes'],
        output_names=['labels', 'boxes', 'scores'],
        dynamic_axes=dynamic_axes,
        opset_version=16, 
        verbose=False
    )

    if args.check:
        import onnx
        onnx_model = onnx.load(args.file_name)
        onnx.checker.check_model(onnx_model)
        print('Check export onnx model done...')

    if args.simplify:
        import onnxsim
        dynamic = True 
        input_shapes = {'images': data.shape, 'orig_target_sizes': size.shape} if dynamic else None
        onnx_model_simplify, check = onnxsim.simplify(args.file_name, input_shapes=input_shapes, dynamic_input_shape=dynamic)
        onnx.save(onnx_model_simplify, args.file_name)
        print(f'Simplify onnx model {check}...')

    import onnxruntime as ort 
    from PIL import Image, ImageDraw, ImageFont
    from torchvision.transforms import ToTensor
    from src.data.coco.coco_dataset import mscoco_category2name, mscoco_category2label, mscoco_label2category

    # print(onnx.helper.printable_graph(mm.graph))

    # Load the original image without resizing
    original_im = Image.open('./unprocessed.jpg').convert('RGB')
    original_size = original_im.size

    # Resize the image for model input
    im = original_im.resize((640, 640))
    im_data = ToTensor()(im)[None]
    print(im_data.shape)

    sess = ort.InferenceSession(args.file_name)
    output = sess.run(
        # output_names=['labels', 'boxes', 'scores'],
        output_names=None,
        input_feed={'images': im_data.data.numpy(), "orig_target_sizes": size.data.numpy()}
    )

    # print(type(output))
    # print([out.shape for out in output])

    labels, boxes, scores = output

    draw = ImageDraw.Draw(original_im)  # Draw on the original image
    thrh = 0.5

    for i in range(im_data.shape[0]):

        scr = scores[i]
        lab = labels[i][scr > thrh]
        box = boxes[i][scr > thrh]

        print(scr)

        print(i, sum(scr > thrh))

        for b, l in zip(box, lab):
            # Scale the bounding boxes back to the original image size
            b = [coord * original_size[j % 2] / 640 for j, coord in enumerate(b)]
            # Get the category name from the label
            category_name = mscoco_category2name[mscoco_label2category[l]]
            draw.rectangle(list(b), outline='red', width=2)
            font = ImageFont.truetype("Arial.ttf", 15)
            draw.text((b[0], b[1]), text=category_name, fill='yellow', font=font)

    # Save the original image with bounding boxes
    original_im.save('processed.jpg')
if __name__ == '__main__':

    parser = argparse.ArgumentParser()
    parser.add_argument('--config', '-c', type=str, )
    parser.add_argument('--resume', '-r', type=str, )
    parser.add_argument('--file-name', '-f', type=str, default='model_old.onnx')
    parser.add_argument('--check',  action='store_true', default=False,)
    parser.add_argument('--simplify',  action='store_true', default=False,)

    args = parser.parse_args()

    main(args)

The best score for this photo is 0.6

Then I used the command trtexec --onnx=./model.onnx --saveEngine=model.trt --buildOnly --fp16 to convert the model to tensorRT.

The next step was to use the trtinfer.py script extended by photo processing:

import time 
import contextlib
from collections import namedtuple, OrderedDict

import torch
import numpy as np
import tensorrt as trt

class TimeProfiler(contextlib.ContextDecorator):
    def __init__(self, ):
        self.total = 0

    def __enter__(self, ):
        self.start = self.time()
        return self 

    def __exit__(self, type, value, traceback):
        self.total += self.time() - self.start

    def reset(self, ):
        self.total = 0

    def time(self, ):
        if torch.cuda.is_available():
            torch.cuda.synchronize()
        return time.time()

class TRTInference(object):
    def __init__(self, engine_path, device='cuda:0', backend='torch', max_batch_size=32, verbose=False):
        self.engine_path = engine_path
        self.device = device
        self.backend = backend
        self.max_batch_size = max_batch_size

        self.logger = trt.Logger(trt.Logger.VERBOSE) if verbose else trt.Logger(trt.Logger.INFO)  

        self.engine = self.load_engine(engine_path)

        self.context = self.engine.create_execution_context()

        self.bindings = self.get_bindings(self.engine, self.context, self.max_batch_size, self.device)
        self.bindings_addr = OrderedDict((n, v.ptr) for n, v in self.bindings.items())

        self.input_names = self.get_input_names()
        self.output_names = self.get_output_names()

        if self.backend == 'cuda':
            self.stream = cuda.Stream()

        self.time_profile = TimeProfiler()

    def init(self, ):
        self.dynamic = False 

    def load_engine(self, path):
        '''load engine
        '''
        trt.init_libnvinfer_plugins(self.logger, '')
        with open(path, 'rb') as f, trt.Runtime(self.logger) as runtime:
            return runtime.deserialize_cuda_engine(f.read())

    def get_input_names(self, ):
        names = []
        for _, name in enumerate(self.engine):
            if self.engine.get_tensor_mode(name) == trt.TensorIOMode.INPUT:
                names.append(name)
        return names

    def get_output_names(self, ):
        names = []
        for _, name in enumerate(self.engine):
            if self.engine.get_tensor_mode(name) == trt.TensorIOMode.OUTPUT:
                names.append(name)
        return names

    def get_bindings(self, engine, context, max_batch_size=32, device=None):
        '''build binddings
        '''
        Binding = namedtuple('Binding', ('name', 'dtype', 'shape', 'data', 'ptr'))
        bindings = OrderedDict()
        # max_batch_size = 1

        for i, name in enumerate(engine):
            shape = engine.get_tensor_shape(name)
            dtype = trt.nptype(engine.get_tensor_dtype(name))

            if shape[0] == -1:
                dynamic = True 
                shape[0] = max_batch_size
                if engine.get_tensor_mode(name) == trt.TensorIOMode.INPUT:  # dynamic
                    context.set_input_shape(name, shape)

            if self.backend == 'cuda':
                if engine.get_tensor_mode(name) == trt.TensorIOMode.INPUT:
                    data = np.random.randn(*shape).astype(dtype)
                    ptr = cuda.mem_alloc(data.nbytes)
                    bindings[name] = Binding(name, dtype, shape, data, ptr) 
                else:
                    data = cuda.pagelocked_empty(trt.volume(shape), dtype)
                    ptr = cuda.mem_alloc(data.nbytes)
                    bindings[name] = Binding(name, dtype, shape, data, ptr) 

            else:
                data = torch.from_numpy(np.empty(shape, dtype=dtype)).to(device)
                bindings[name] = Binding(name, dtype, shape, data, data.data_ptr())

        return bindings

    def run_torch(self, blob):
        '''torch input
        '''
        for n in self.input_names:
            if self.bindings[n].shape != blob[n].shape:
                self.context.set_input_shape(n, blob[n].shape) 
                self.bindings[n] = self.bindings[n]._replace(shape=blob[n].shape)

        self.bindings_addr.update({n: blob[n].data_ptr() for n in self.input_names})
        self.context.execute_v2(list(self.bindings_addr.values()))
        outputs = {n: self.bindings[n].data for n in self.output_names}

        return outputs

    def async_run_cuda(self, blob):
        '''numpy input
        '''
        for n in self.input_names:
            cuda.memcpy_htod_async(self.bindings_addr[n], blob[n], self.stream)

        bindings_addr = [int(v) for _, v in self.bindings_addr.items()]
        self.context.execute_async_v2(bindings=bindings_addr, stream_handle=self.stream.handle)

        outputs = {}
        for n in self.output_names:
            cuda.memcpy_dtoh_async(self.bindings[n].data, self.bindings[n].ptr, self.stream)
            outputs[n] = self.bindings[n].data

        self.stream.synchronize()

        return outputs

    def __call__(self, blob):
        if self.backend == 'torch':
            return self.run_torch(blob)

        elif self.backend == 'cuda':
            return self.async_run_cuda(blob)

    def synchronize(self, ):
        if self.backend == 'torch' and torch.cuda.is_available():
            torch.cuda.synchronize()

        elif self.backend == 'cuda':
            self.stream.synchronize()

    def warmup(self, blob, n):
        for _ in range(n):
            _ = self(blob)

    def speed(self, blob, n):
        self.time_profile.reset()
        for _ in range(n):
            with self.time_profile:
                _ = self(blob)

        return self.time_profile.total / n 

# Missing import 
import pycuda.driver as cuda 

# New Import 
import cv2 

cuda.init()
device_ctx = cuda.Device(0).make_context()
mpath="./model.trt"
image_file="./unprocessed.jpg"
model = TRTInference(mpath, backend='cuda')
img = cv2.imread(image_file)

im_shape = np.array([[float(img.shape[0]), float(img.shape[1])]]).astype('float32')
size = np.array([[640,640]])
size = np.ascontiguousarray(size).astype(np.int32)
blob = {"images" : np.ascontiguousarray(im_shape), "orig_target_sizes": np.ascontiguousarray(size).astype(np.int32)}

res = model(blob)
print(res)
device_ctx.pop()

In this case, the best 'score' in the photo was 11%, which is incomparable to the result of 60% for onnx.

What's your problem, am I doing something wrong?

lyuwenyu commented 1 month ago

blob = {"images" : np.ascontiguousarray(im_shape), "orig_target_sizes": np.ascontiguousarray(size).astype(np.int32)}

images should be image value not im_shape

Przemyslaw5 commented 1 month ago

You're right, but I made this correction and the output I receive is completely incomprehensible:

{'scores': array([nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
       nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
       nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
       nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
       nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
       nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
       nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
       nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
       nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
       nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
       nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
       nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
       nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
       nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
       nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
       nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
       nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
       nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
       nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
       nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
       nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
       nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
       nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
       nan], dtype=float32), 'labels': array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
       17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33,
       34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50,
       51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67,
       68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79,  0,  1,  2,  3,  4,
        5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21,
       22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38,
       39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55,
       56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72,
       73, 74, 75, 76, 77, 78, 79,  0,  1,  2,  3,  4,  5,  6,  7,  8,  9,
       10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26,
       27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43,
       44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60,
       61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77,
       78, 79,  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14,
       15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
       32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48,
       49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59], dtype=int32), 'boxes': array([nan, nan, nan, ..., nan, nan, nan], dtype=float32)}

My code now looks like this:

cuda.init()
device_ctx = cuda.Device(0).make_context()
mpath="./model.trt"
image_file="./unprocessed.jpg"
model = TRTInference(mpath, backend='cuda')
img = cv2.imread(image_file)

im_shape = np.array([[float(img.shape[0]), float(img.shape[1])]]).astype('float32')
size = np.array([[640,640]])
size = np.ascontiguousarray(size).astype(np.int32)
blob = {"images" : np.ascontiguousarray(img), "orig_target_sizes": np.ascontiguousarray(size).astype(np.int32)}

res = model(blob)
print(res)
device_ctx.pop()

lyuwenyu commented 1 month ago

images's format should be NCHW

Przemyslaw5 commented 1 month ago

@lyuwenyu THANKS a lot!! It helped a little, the score is at 34%, but it's a bit short of tech 60% on onnx, this is what the corrected code looks like:

cuda.init()
device_ctx = cuda.Device(0).make_context()
mpath="./model.trt"
image_file="./unprocessed.jpg"
model = TRTInference(mpath, backend='cuda')

img = cv2.imread(image_file)
img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
img = img.transpose((2, 0, 1))
img = np.expand_dims(img, 0)
img = img.astype(np.float32)
img /= 255

im_shape = np.array([[float(img.shape[0]), float(img.shape[1])]]).astype('float32')
size = np.array([[640,640]])
size = np.ascontiguousarray(size).astype(np.int32)
blob = {"images" : np.ascontiguousarray(img), "orig_target_sizes": np.ascontiguousarray(size).astype(np.int32)}

res = model(blob)
print(res)
device_ctx.pop()

Am I doing something not entirely correct since the results are different for these models?

wantacool commented 1 month ago

Second to this issue. I converted the exported onnx model to tensorrt engine with fp16 flag, and observed significant accuracy drop, while fp32 built works okay. Will the paddle model make any difference? I had no experience with paddle though.

PrinceP commented 1 month ago

With FP16 I don't see any accuracy drop. https://github.com/PrinceP/tensorrt-cpp-for-onnx?tab=readme-ov-file#rt-detr

lyuwenyu / RT-DETR

TensorRT inference problem #383