[CVPR 2024] Official RT-DETR (RTDETR paddle pytorch), Real-Time DEtection TRansformer, DETRs Beat YOLOs on Real-time Object Detection.
TensorRT inference problem #383

Open Przemyslaw5 opened 1 month ago

Przemyslaw5 commented 1 month ago

I trained the pytorch model according to the instruction in the repository and then exported it to onnx and performed photo inference on that model:

import os 
import sys
sys.path.insert(0, os.path.join(os.path.dirname(os.path.abspath(__file__)), '..'))

import argparse
import numpy as np 

from src.core import YAMLConfig

import torch
import torch.nn as nn 

def main(args, ):
    """Export the configured model to ONNX and run it once on a test image.

    Steps, in order:
      1. Build the config and load weights from ``args.resume`` (EMA weights
         are preferred when the checkpoint contains an ``'ema'`` entry).
      2. Wrap model + postprocessor in a single ``nn.Module`` and export it
         to ``args.file_name`` with a dynamic batch axis.
      3. Optionally validate (``args.check``) and simplify (``args.simplify``)
         the exported file.
      4. Run the ONNX file with onnxruntime on ``./unprocessed.jpg``, draw the
         detections scoring above 0.5, and write ``processed.jpg``.

    Args:
        args: parsed command-line namespace providing ``config``, ``resume``,
            ``file_name``, ``check`` and ``simplify``.

    Raises:
        AttributeError: if ``args.resume`` is not given.
    """
    cfg = YAMLConfig(args.config, resume=args.resume)

    if args.resume:
        checkpoint = torch.load(args.resume, map_location='cpu')
        if 'ema' in checkpoint:
            state = checkpoint['ema']['module']
        else:
            state = checkpoint['model']
    else:
        raise AttributeError('only support resume to load model.state_dict by now.')

    # NOTE load train mode state -> convert to deploy mode
    cfg.model.load_state_dict(state)

    class Model(nn.Module):
        """Model and postprocessor fused so both end up in one ONNX graph."""

        def __init__(self, ) -> None:
            # nn.Module must be initialised before submodules are assigned.
            super().__init__()
            self.model = cfg.model.deploy()
            self.postprocessor = cfg.postprocessor.deploy()

        def forward(self, images, orig_target_sizes):
            outputs = self.model(images)
            return self.postprocessor(outputs, orig_target_sizes)

    model = Model()

    # Only the batch dimension is exported as dynamic.
    dynamic_axes = {
        'images': {0: 'N', },
        'orig_target_sizes': {0: 'N'}
    }

    data = torch.rand(1, 3, 640, 640)
    size = torch.tensor([[640, 640]])

    torch.onnx.export(
        model,
        (data, size),
        args.file_name,
        input_names=['images', 'orig_target_sizes'],
        output_names=['labels', 'boxes', 'scores'],
        dynamic_axes=dynamic_axes,
        # NOTE(review): opset/verbose values restored from memory of the
        # upstream export script -- confirm against the repository.
        opset_version=16,
        verbose=False,
    )

    if args.check:
        import onnx
        onnx_model = onnx.load(args.file_name)
        onnx.checker.check_model(onnx_model)
        print('Check export onnx model done...')

    if args.simplify:
        # Imported here as well: previously `onnx` was only bound inside the
        # `--check` branch, so `--simplify` alone raised NameError on save.
        import onnx
        import onnxsim
        dynamic = True
        input_shapes = {'images': data.shape, 'orig_target_sizes': size.shape} if dynamic else None
        onnx_model_simplify, check = onnxsim.simplify(args.file_name, input_shapes=input_shapes, dynamic_input_shape=dynamic)
        onnx.save(onnx_model_simplify, args.file_name)
        print(f'Simplify onnx model {check}...')

    import onnxruntime as ort
    from PIL import Image, ImageDraw, ImageFont
    from torchvision.transforms import ToTensor
    # NOTE(review): module path restored -- confirm where the repository
    # defines these COCO lookup tables.
    from src.data.coco.coco_dataset import mscoco_category2name, mscoco_category2label, mscoco_label2category

    # print(onnx.helper.printable_graph(mm.graph))

    # Load the original image without resizing
    original_im = Image.open('./unprocessed.jpg').convert('RGB')
    original_size = original_im.size

    # Resize the image for model input
    im = original_im.resize((640, 640))
    im_data = ToTensor()(im)[None]

    sess = ort.InferenceSession(args.file_name)
    output = sess.run(
        # output_names=['labels', 'boxes', 'scores'],
        output_names=None,
        input_feed={'images': im_data.data.numpy(), "orig_target_sizes": size.data.numpy()}
    )

    # print(type(output))
    # print([out.shape for out in output])

    labels, boxes, scores = output

    draw = ImageDraw.Draw(original_im)  # Draw on the original image
    thrh = 0.5

    for i in range(im_data.shape[0]):

        scr = scores[i]
        lab = labels[i][scr > thrh]
        box = boxes[i][scr > thrh]

        print(i, sum(scr > thrh))

        for b, l in zip(box, lab):
            # Scale the bounding boxes back to the original image size
            # (even indices use the width, odd indices use the height).
            b = [coord * original_size[j % 2] / 640 for j, coord in enumerate(b)]
            # Get the category name from the label
            category_name = mscoco_category2name[mscoco_label2category[l]]
            draw.rectangle(list(b), outline='red', width=2)
            font = ImageFont.truetype("Arial.ttf", 15)
            draw.text((b[0], b[1]), text=category_name, fill='yellow', font=font)

    # Save the original image with bounding boxes
    original_im.save('processed.jpg')

if __name__ == '__main__':

    # Command-line entry point: collect export options and run main().
    parser = argparse.ArgumentParser()
    parser.add_argument('--config', '-c', type=str, )
    parser.add_argument('--resume', '-r', type=str, )
    parser.add_argument('--file-name', '-f', type=str, default='model_old.onnx')
    parser.add_argument('--check',  action='store_true', default=False,)
    parser.add_argument('--simplify',  action='store_true', default=False,)

    args = parser.parse_args()

    # Without this call the script parses its arguments and exits silently.
    main(args)


The best score for this photo is 0.6

Then I used the command trtexec --onnx=./model.onnx --saveEngine=model.trt --buildOnly --fp16 to convert the model to tensorRT.

The next step was to use the script extended by photo processing:

import time 
import contextlib
from collections import namedtuple, OrderedDict

import torch
import numpy as np
import tensorrt as trt

class TimeProfiler(contextlib.ContextDecorator):
    """Accumulate wall-clock time spent inside ``with`` blocks.

    Usable as a context manager or, via ``ContextDecorator``, as a decorator.
    The elapsed seconds of every entered block are added to ``self.total``.
    """

    def __init__(self, ):
        self.total = 0

    def __enter__(self, ):
        self.start = self.time()
        return self

    def __exit__(self, type, value, traceback):
        # Returns None, so exceptions raised inside the block propagate.
        self.total += self.time() - self.start

    def reset(self, ):
        """Clear the accumulated time."""
        self.total = 0

    def time(self, ):
        """Return the current time in seconds.

        When CUDA is available the device is synchronised first, so pending
        GPU work is included in the measured interval.
        """
        if torch.cuda.is_available():
            torch.cuda.synchronize()
        return time.time()

class TRTInference(object):
    '''Thin wrapper that loads a serialized TensorRT engine and runs it.

    Two backends are supported:
      * ``'torch'`` - inputs are torch tensors whose device pointers are
        handed to the engine; outputs are pre-allocated torch tensors.
      * ``'cuda'``  - inputs are numpy arrays copied with pycuda on a
        dedicated stream; outputs are page-locked host buffers.

    NOTE(review): the ``'cuda'`` backend uses the module-level ``cuda`` name
    (``pycuda.driver``), which must be imported by the surrounding script.
    '''

    def __init__(self, engine_path, device='cuda:0', backend='torch', max_batch_size=32, verbose=False):
        self.engine_path = engine_path
        self.device = device
        self.backend = backend
        self.max_batch_size = max_batch_size

        self.logger = trt.Logger(trt.Logger.VERBOSE) if verbose else trt.Logger(trt.Logger.INFO)

        self.engine = self.load_engine(engine_path)

        self.context = self.engine.create_execution_context()

        # name -> Binding(name, dtype, shape, data, ptr), in engine order.
        self.bindings = self.get_bindings(self.engine, self.context, self.max_batch_size, self.device)
        self.bindings_addr = OrderedDict((n, v.ptr) for n, v in self.bindings.items())

        self.input_names = self.get_input_names()
        self.output_names = self.get_output_names()

        if self.backend == 'cuda':
            self.stream = cuda.Stream()

        self.time_profile = TimeProfiler()

    def init(self, ):
        self.dynamic = False

    def load_engine(self, path):
        '''Deserialize the engine file at ``path`` and return the engine.
        '''
        trt.init_libnvinfer_plugins(self.logger, '')
        with open(path, 'rb') as f, trt.Runtime(self.logger) as runtime:
            return runtime.deserialize_cuda_engine(f.read())

    def get_input_names(self, ):
        '''Return the engine's input tensor names, in engine order.
        '''
        return [
            name for name in self.engine
            if self.engine.get_tensor_mode(name) == trt.TensorIOMode.INPUT
        ]

    def get_output_names(self, ):
        '''Return the engine's output tensor names, in engine order.
        '''
        return [
            name for name in self.engine
            if self.engine.get_tensor_mode(name) == trt.TensorIOMode.OUTPUT
        ]

    def get_bindings(self, engine, context, max_batch_size=32, device=None):
        '''Allocate one buffer per engine tensor and return them by name.

        A leading dimension of -1 is replaced by ``max_batch_size``; for
        input tensors that shape is also set on ``context``.
        '''
        Binding = namedtuple('Binding', ('name', 'dtype', 'shape', 'data', 'ptr'))
        bindings = OrderedDict()
        # max_batch_size = 1

        for name in engine:
            shape = engine.get_tensor_shape(name)
            dtype = trt.nptype(engine.get_tensor_dtype(name))

            if shape[0] == -1:
                shape[0] = max_batch_size
                if engine.get_tensor_mode(name) == trt.TensorIOMode.INPUT:  # dynamic
                    context.set_input_shape(name, shape)

            if self.backend == 'cuda':
                if engine.get_tensor_mode(name) == trt.TensorIOMode.INPUT:
                    # Host array only determines the device allocation size.
                    data = np.random.randn(*shape).astype(dtype)
                    ptr = cuda.mem_alloc(data.nbytes)
                    bindings[name] = Binding(name, dtype, shape, data, ptr)
                else:
                    # Flat page-locked host buffer that receives the output.
                    data = cuda.pagelocked_empty(trt.volume(shape), dtype)
                    ptr = cuda.mem_alloc(data.nbytes)
                    bindings[name] = Binding(name, dtype, shape, data, ptr)
            else:
                data = torch.from_numpy(np.empty(shape, dtype=dtype)).to(device)
                bindings[name] = Binding(name, dtype, shape, data, data.data_ptr())

        return bindings

    def run_torch(self, blob):
        '''Run the engine on torch inputs.

        ``blob`` maps each input name to a tensor exposing ``shape`` and
        ``data_ptr()``. Returns a dict of output name -> pre-allocated
        output tensor.
        '''
        for n in self.input_names:
            if self.bindings[n].shape != blob[n].shape:
                self.context.set_input_shape(n, blob[n].shape)
                self.bindings[n] = self.bindings[n]._replace(shape=blob[n].shape)

        # Point the input bindings at the caller's tensors (no copy).
        self.bindings_addr.update({n: blob[n].data_ptr() for n in self.input_names})
        self.context.execute_v2(list(self.bindings_addr.values()))
        outputs = {n: self.bindings[n].data for n in self.output_names}

        return outputs

    def async_run_cuda(self, blob):
        '''Run the engine on numpy inputs using pycuda.

        ``blob`` maps each input name to a host array. Returns a dict of
        output name -> host buffer, valid once the stream has synchronised
        (done before returning).

        NOTE(review): input shapes are not updated here, so the context keeps
        whatever shape ``get_bindings`` set -- confirm this matches ``blob``.
        '''
        for n in self.input_names:
            cuda.memcpy_htod_async(self.bindings_addr[n], blob[n], self.stream)

        bindings_addr = [int(v) for _, v in self.bindings_addr.items()]
        self.context.execute_async_v2(bindings=bindings_addr, stream_handle=self.stream.handle)

        outputs = {}
        for n in self.output_names:
            cuda.memcpy_dtoh_async(self.bindings[n].data, self.bindings[n].ptr, self.stream)
            outputs[n] = self.bindings[n].data

        # The copies above are asynchronous; wait before handing back buffers.
        self.stream.synchronize()

        return outputs

    def __call__(self, blob):
        '''Dispatch to the configured backend; returns None for other values.
        '''
        if self.backend == 'torch':
            return self.run_torch(blob)

        elif self.backend == 'cuda':
            return self.async_run_cuda(blob)

    def synchronize(self, ):
        '''Block until outstanding work of the active backend has finished.
        '''
        if self.backend == 'torch' and torch.cuda.is_available():
            torch.cuda.synchronize()

        elif self.backend == 'cuda':
            self.stream.synchronize()

    def warmup(self, blob, n):
        '''Run ``n`` untimed inferences on ``blob``.
        '''
        for _ in range(n):
            _ = self(blob)

    def speed(self, blob, n):
        '''Return the mean seconds per inference over ``n`` timed runs.
        '''
        # Start from zero so earlier measurements do not leak into this one.
        self.time_profile.reset()
        for _ in range(n):
            with self.time_profile:
                _ = self(blob)

        return self.time_profile.total / n

# Missing import 
import pycuda.driver as cuda 

# New Import 
import cv2 

# First attempt at running the TensorRT engine through TRTInference.
# NOTE(review): `mpath` and `image_file` are not defined in the visible snippet.
device_ctx = cuda.Device(0).make_context()
model = TRTInference(mpath, backend='cuda')
img = cv2.imread(image_file)

# Height/width of the loaded image as a single float32 row.
im_shape = np.array([[float(img.shape[0]), float(img.shape[1])]]).astype('float32')
size = np.array([[640,640]])
size = np.ascontiguousarray(size).astype(np.int32)
# NOTE(review): "images" receives `im_shape` (the two size values), not the
# pixel data in `img`.
blob = {"images" : np.ascontiguousarray(im_shape), "orig_target_sizes": np.ascontiguousarray(size).astype(np.int32)}

res = model(blob)

In this case, the best score for the photo was 11%, which is far below the 60% obtained with ONNX.

What is the problem here? Am I doing something wrong?

lyuwenyu commented 1 month ago

blob = {"images" : np.ascontiguousarray(im_shape), "orig_target_sizes": np.ascontiguousarray(size).astype(np.int32)}

`images` should be the image data, not `im_shape`.

Przemyslaw5 commented 1 month ago

You're right, but I made this correction and the output I receive is completely incomprehensible:

My code now looks like this:

# Second attempt: the image array itself is now passed as "images".
device_ctx = cuda.Device(0).make_context()
model = TRTInference(mpath, backend='cuda')
img = cv2.imread(image_file)

# NOTE(review): `im_shape` is computed but no longer used below.
im_shape = np.array([[float(img.shape[0]), float(img.shape[1])]]).astype('float32')
size = np.array([[640,640]])
size = np.ascontiguousarray(size).astype(np.int32)
# NOTE(review): `img` is passed exactly as returned by cv2.imread, with no
# layout, dtype or size conversion -- confirm it matches the engine's input.
blob = {"images" : np.ascontiguousarray(img), "orig_target_sizes": np.ascontiguousarray(size).astype(np.int32)}

res = model(blob)
lyuwenyu commented 1 month ago

The `images` input should be in NCHW format.

Przemyslaw5 commented 1 month ago

@lyuwenyu THANKS a lot!! It helped a little: the score is now 34%, but that still falls short of the 60% on ONNX. This is what the corrected code looks like:

# Third attempt: convert the image to RGB, channel-first, batched float32 in [0, 1].
device_ctx = cuda.Device(0).make_context()
model = TRTInference(mpath, backend='cuda')

img = cv2.imread(image_file)
img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
img = img.transpose((2, 0, 1))
img = np.expand_dims(img, 0)
img = img.astype(np.float32)
img /= 255
# NOTE(review): unlike the ONNX script, the image is never resized to
# 640x640 before inference -- confirm the engine expects this input size.

# NOTE(review): `im_shape` is unused; after expand_dims/transpose,
# img.shape[0] and img.shape[1] are the batch and channel sizes, not H and W.
im_shape = np.array([[float(img.shape[0]), float(img.shape[1])]]).astype('float32')
size = np.array([[640,640]])
size = np.ascontiguousarray(size).astype(np.int32)
blob = {"images" : np.ascontiguousarray(img), "orig_target_sizes": np.ascontiguousarray(size).astype(np.int32)}

res = model(blob)

Am I doing something not entirely correct since the results are different for these models?

wantacool commented 1 month ago

Seconding this issue. I converted the exported ONNX model to a TensorRT engine with the fp16 flag and observed a significant accuracy drop, while the fp32 build works okay. Would the Paddle model make any difference? I have no experience with Paddle, though.

PrinceP commented 1 month ago

With FP16 I don't see any accuracy drop.