How to achieve throughput in trtexec with python api?

When I run the engine with "trtexec", the throughput is about 6 qps, but when I run it from my own Python script, the throughput drops to about 3 qps. Here is my code; please advise.

import numpy as np
import torch
import tensorrt as trt
import pycuda.driver as cuda
import pycuda.autoinit # without this, "LogicError: explicit_context_dependent failed: invalid device context - no currently active context?"
from time import time

from model_wrapper import DMC_decoder_wrapper

# Fixed tensor shapes used by TRTModel_pip below, both to set the engine's
# input shapes and to size the host/device I/O buffers.
encoder_input = (1, 3, 1600, 2560)  # sizes the 'x_hat' and 'ref_frame' buffers
intra_no_ar_y_hat = (1, 256, 100, 160)  # sizes the 'y_hat' buffer
dmc_y_hat = (1, 128, 100, 160)  # input shape and buffer size for 'ref_y'
dmc_c1 = (1, 48, 1600, 2560)  # input shape for 'c1'; also sizes the 'ref_feature' buffer
dmc_c2 = (1, 64, 800, 1280)  # input shape and buffer size for 'c2'
dmc_c3 = (1, 96, 400, 640)  # input shape and buffer size for 'c3'

class TRTModel_pip:
    '''
    Runs a serialized TensorRT engine whose I/O tensors live in mapped host memory.

    Every I/O tensor gets a host array that is registered with CUDA using the
    DEVICEMAP flag; the device pointer of that mapping is what the execution
    context is bound to, so no explicit host<->device memcpy is issued here.
    '''
    class HostDeviceMem(object):
        '''
        Small record tying a host buffer to the device pointer bound for it.
        '''
        def __init__(self, host_mem, device_mem):
            self.host, self.device = host_mem, device_mem

        def __str__(self):
            return "\n".join(("Host:", str(self.host), "Device:", str(self.device)))

        def __repr__(self):
            return str(self)

    def __init__(self, engine_path):
        '''
        Deserialize the engine at `engine_path`, fix the input shapes, allocate
        the I/O buffers and bind their addresses to the execution context.
        '''
        self.engine_path = engine_path
        self.logger = trt.Logger(trt.Logger.WARNING)
        self.runtime = trt.Runtime(self.logger)

        # Engine first, then an execution context created from it.
        self.engine = self.load_engine()
        self.context = self.engine.create_execution_context()

        # Shapes for the inputs that are set explicitly ('y_hat' is not set here).
        for input_name, input_shape in (
            ('ref_y', dmc_y_hat),
            ('c1', dmc_c1),
            ('c2', dmc_c2),
            ('c3', dmc_c3),
        ):
            self.context.set_input_shape(input_name, input_shape)

        # Buffers are created once and reused for every call.
        buffers = self.allocate_buffers(self.engine)
        self.inputs, self.outputs, self.bindings, self.stream = buffers

        # bindings[k] is the address of the k-th I/O tensor of the engine.
        for index, address in enumerate(self.bindings):
            self.context.set_tensor_address(self.engine.get_tensor_name(index), address)

    def load_engine(self):
        '''
        Read the file at `self.engine_path` and return the deserialized engine.
        '''
        with open(self.engine_path, 'rb') as engine_file:
            return self.runtime.deserialize_cuda_engine(engine_file.read())

    def allocate_buffers(self, engine):
        '''
        Create one mapped host buffer per I/O tensor of `engine`.

        Returns a 4-tuple `(inputs, outputs, bindings, stream)`: two lists of
        HostDeviceMem (split by tensor mode), the list of integer device
        addresses in engine tensor order, and a new CUDA stream.
        '''
        # Element counts come from the module-level shapes, keyed by tensor
        # name; any tensor not listed is treated as a single element.
        shape_by_name = {
            'y_hat': intra_no_ar_y_hat,
            'x_hat': encoder_input,
            'ref_y': dmc_y_hat,
            'c1': dmc_c1,
            'c2': dmc_c2,
            'c3': dmc_c3,
            'ref_frame': encoder_input,
            'ref_feature': dmc_c1,
        }
        single_element = (1,)

        inputs, outputs, bindings = [], [], []
        stream = cuda.Stream()

        for index in range(engine.num_io_tensors):
            name = engine.get_tensor_name(index)
            np_dtype = trt.nptype(engine.get_tensor_dtype(name))
            element_count = trt.volume(shape_by_name.get(name, single_element))

            # aligned_empty only allocates an aligned host array; registering
            # it with DEVICEMAP is what pins it and maps it for the device.
            mapped_host = cuda.register_host_memory(
                cuda.aligned_empty(element_count, np_dtype),
                flags=cuda.mem_host_register_flags.DEVICEMAP,
            )
            device_ptr = mapped_host.base.get_device_pointer()

            # The context is bound to plain integer addresses.
            bindings.append(int(device_ptr))

            is_input = engine.get_tensor_mode(name) == trt.TensorIOMode.INPUT
            target = inputs if is_input else outputs
            target.append(self.HostDeviceMem(mapped_host, device_ptr))

        return inputs, outputs, bindings, stream

    def __call__(self):
        '''
        Run one inference on the bound buffers and wait for it to finish.

        Returns a list with each output's host array reshaped to (1, -1).
        '''
        rows = 1

        # Enqueue on our stream, then block until the stream has drained.
        self.context.execute_async_v3(stream_handle=self.stream.handle)
        self.stream.synchronize()

        results = []
        for output in self.outputs:
            results.append(output.host.reshape(rows, -1))
        return results

def test_engine():
    '''
    Benchmark the P-frame decoder engine with random inputs.

    Builds a TRTModel_pip for 'FM_P_D_fp16.engine', fills its input buffers
    with random data, runs 10 timed inferences, prints each latency in seconds
    and finally a mean-latency / throughput summary.

    Raises:
        ValueError: if the engine exposes a different number of inputs than
            the five arrays prepared here.
    '''
    # perf_counter is a monotonic, high-resolution clock meant for measuring
    # short intervals; time() is wall-clock and can be coarse or adjusted.
    from time import perf_counter

    # load and deserialize TRT engine
    P_frame_engine_filename = 'FM_P_D_fp16.engine'

    print(f'Running inference on engine {P_frame_engine_filename}')
    model = TRTModel_pip(P_frame_engine_filename)

    # Random data, one array per engine input.
    # NOTE(review): the order assumes the engine lists its inputs as
    # ref_y, c1, c2, c3, q (as in the trtexec log) - confirm for other engines.
    y_hat = torch.randn(dmc_y_hat)
    c1 = torch.randn(dmc_c1)
    c2 = torch.randn(dmc_c2)
    c3 = torch.randn(dmc_c3)
    q_index = torch.tensor([15.5], dtype=torch.float32)
    feeds = [y_hat.numpy(), c1.numpy(), c2.numpy(), c3.numpy(), q_index.numpy()]

    if len(feeds) != len(model.inputs):
        raise ValueError(
            f'engine has {len(model.inputs)} inputs but {len(feeds)} arrays were prepared'
        )

    # Write straight into the mapped host buffers the engine is bound to.
    for buffer, feed in zip(model.inputs, feeds):
        np.copyto(buffer.host, feed.ravel())

    latencies = []
    for _ in range(10):
        start = perf_counter()
        model()
        elapsed = perf_counter() - start
        latencies.append(elapsed)
        print(elapsed)

    # The first run is treated as warm-up and left out of the summary.
    steady = latencies[1:]
    mean_latency = sum(steady) / len(steady)
    print(f'mean latency (excluding first run): {mean_latency:.6f} s, '
          f'throughput: {1.0 / mean_latency:.3f} qps')

# Run the benchmark only when this file is executed as a script.
if __name__ == "__main__":
    test_engine()

I've used the asynchronous API with a zero-copy approach: first allocate aligned host memory, then call "register_host_memory" to pin it, and finally use "get_device_pointer" to obtain the mapped device pointers and add them as bindings. This way I don't need any explicit memcpy between host and device, but it makes no difference to the performance.

Here's my result with python api:

Running inference on engine FM_P_D_fp16.engine
0.4072411060333252
0.3620748519897461
0.36286306381225586
0.36284446716308594
0.362790584564209
0.3629634380340576
0.3627777099609375
0.36302876472473145
0.362729549407959
0.3620717525482178

Here's my result with trtexec:

(python3.8) PS C:\Users\22715\Desktop\DCVC-main\DCVC-FM> trtexec --loadEngine=FM_P_D_fp16.engine --shapes=ref_y:1x128x100x160,c1:1x48x1600x2560,c2:1x64x800x1280,c3:1x96x400x640
&&&& RUNNING TensorRT.trtexec [TensorRT v100200] # D:\TensorRT-10.2.0.19\bin\trtexec.exe --loadEngine=FM_P_D_fp16.engine --shapes=ref_y:1x128x100x160,c1:1x48x1600x2560,c2:1x64x800x1280,c3:1x96x400x640
[07/30/2024-15:30:36] [I] === Model Options ===
[07/30/2024-15:30:36] [I] Format: *
[07/30/2024-15:30:36] [I] Model:
[07/30/2024-15:30:36] [I] Output:
[07/30/2024-15:30:36] [I]
[07/30/2024-15:30:36] [I] === System Options ===
[07/30/2024-15:30:36] [I] Device: 0
[07/30/2024-15:30:36] [I] DLACore:
[07/30/2024-15:30:36] [I] Plugins:
[07/30/2024-15:30:36] [I] setPluginsToSerialize:
[07/30/2024-15:30:36] [I] dynamicPlugins:
[07/30/2024-15:30:36] [I] ignoreParsedPluginLibs: 0
[07/30/2024-15:30:36] [I]
[07/30/2024-15:30:36] [I] === Inference Options ===
[07/30/2024-15:30:36] [I] Batch: Explicit
[07/30/2024-15:30:36] [I] Input inference shape : ref_y=1x128x100x160
[07/30/2024-15:30:36] [I] Input inference shape : c1=1x48x1600x2560
[07/30/2024-15:30:36] [I] Input inference shape : c2=1x64x800x1280
[07/30/2024-15:30:36] [I] Input inference shape : c3=1x96x400x640
[07/30/2024-15:30:36] [I] Iterations: 10
[07/30/2024-15:30:36] [I] Duration: 3s (+ 200ms warm up)
[07/30/2024-15:30:36] [I] Sleep time: 0ms
[07/30/2024-15:30:36] [I] Idle time: 0ms
[07/30/2024-15:30:36] [I] Inference Streams: 1
[07/30/2024-15:30:36] [I] ExposeDMA: Disabled
[07/30/2024-15:30:36] [I] Data transfers: Enabled
[07/30/2024-15:30:36] [I] Spin-wait: Disabled
[07/30/2024-15:30:36] [I] Multithreading: Disabled
[07/30/2024-15:30:36] [I] CUDA Graph: Disabled
[07/30/2024-15:30:36] [I] Separate profiling: Disabled
[07/30/2024-15:30:36] [I] Time Deserialize: Disabled
[07/30/2024-15:30:36] [I] Time Refit: Disabled
[07/30/2024-15:30:36] [I] NVTX verbosity: 0
[07/30/2024-15:30:36] [I] Persistent Cache Ratio: 0
[07/30/2024-15:30:36] [I] Optimization Profile Index: 0
[07/30/2024-15:30:36] [I] Weight Streaming Budget: 100.000000%
[07/30/2024-15:30:36] [I] Inputs:
[07/30/2024-15:30:36] [I] Debug Tensor Save Destinations:
[07/30/2024-15:30:36] [I] === Reporting Options ===
[07/30/2024-15:30:36] [I] Verbose: Disabled
[07/30/2024-15:30:36] [I] Averages: 10 inferences
[07/30/2024-15:30:36] [I] Percentiles: 90,95,99
[07/30/2024-15:30:36] [I] Dump refittable layers:Disabled
[07/30/2024-15:30:36] [I] Dump output: Disabled
[07/30/2024-15:30:36] [I] Profile: Disabled
[07/30/2024-15:30:36] [I] Export timing to JSON file:
[07/30/2024-15:30:36] [I] Export output to JSON file:
[07/30/2024-15:30:36] [I] Export profile to JSON file:
[07/30/2024-15:30:36] [I]
[07/30/2024-15:30:36] [I] === Device Information ===
[07/30/2024-15:30:36] [I] Available Devices:
[07/30/2024-15:30:36] [I]   Device 0: "NVIDIA GeForce RTX 4070 Ti SUPER" UUID: GPU-510a3b74-1549-d232-1050-6536a63240f4
[07/30/2024-15:30:36] [I] Selected Device: NVIDIA GeForce RTX 4070 Ti SUPER
[07/30/2024-15:30:36] [I] Selected Device ID: 0
[07/30/2024-15:30:36] [I] Selected Device UUID: GPU-510a3b74-1549-d232-1050-6536a63240f4
[07/30/2024-15:30:36] [I] Compute Capability: 8.9
[07/30/2024-15:30:36] [I] SMs: 66
[07/30/2024-15:30:36] [I] Device Global Memory: 16375 MiB
[07/30/2024-15:30:36] [I] Shared Memory per SM: 100 KiB
[07/30/2024-15:30:36] [I] Memory Bus Width: 256 bits (ECC disabled)
[07/30/2024-15:30:36] [I] Application Compute Clock Rate: 2.61 GHz
[07/30/2024-15:30:36] [I] Application Memory Clock Rate: 10.501 GHz
[07/30/2024-15:30:36] [I]
[07/30/2024-15:30:36] [I] Note: The application clock rates do not reflect the actual clock rates that the GPU is currently running at.
[07/30/2024-15:30:36] [I]
[07/30/2024-15:30:36] [I] TensorRT version: 10.2.0
[07/30/2024-15:30:36] [I] Loading standard plugins
[07/30/2024-15:30:36] [I] [TRT] Loaded engine size: 9 MiB
[07/30/2024-15:30:36] [I] Engine deserialized in 0.029496 sec.
[07/30/2024-15:30:36] [I] [TRT] [MemUsageChange] TensorRT-managed allocation in IExecutionContext creation: CPU +1, GPU +3188, now: CPU 1, GPU 3194 (MiB)
[07/30/2024-15:30:36] [I] Setting persistentCacheLimit to 0 bytes.
[07/30/2024-15:30:36] [I] Set shape of input tensor ref_y to: 1x128x100x160
[07/30/2024-15:30:36] [I] Set shape of input tensor c1 to: 1x48x1600x2560
[07/30/2024-15:30:36] [I] Set shape of input tensor c2 to: 1x64x800x1280
[07/30/2024-15:30:36] [I] Set shape of input tensor c3 to: 1x96x400x640
[07/30/2024-15:30:36] [I] Created execution context with device memory size: 3187.5 MiB
[07/30/2024-15:30:36] [I] Using random values for input ref_y
[07/30/2024-15:30:36] [I] Input binding for ref_y with dimensions 1x128x100x160 is created.
[07/30/2024-15:30:36] [I] Using random values for input c1
[07/30/2024-15:30:41] [I] Input binding for c1 with dimensions 1x48x1600x2560 is created.
[07/30/2024-15:30:41] [I] Using random values for input c2
[07/30/2024-15:30:42] [I] Input binding for c2 with dimensions 1x64x800x1280 is created.
[07/30/2024-15:30:42] [I] Using random values for input c3
[07/30/2024-15:30:43] [I] Input binding for c3 with dimensions 1x96x400x640 is created.
[07/30/2024-15:30:43] [I] Using random values for input q
[07/30/2024-15:30:43] [I] Input binding for q with dimensions 1 is created.
[07/30/2024-15:30:43] [I] Output binding for ref_frame with dimensions 1x3x1600x2560 is created.
[07/30/2024-15:30:43] [I] Output binding for ref_feature with dimensions 1x48x1600x2560 is created.
[07/30/2024-15:30:43] [I] Starting inference
[07/30/2024-15:30:47] [I] Warmup completed 1 queries over 200 ms
[07/30/2024-15:30:47] [I] Timing trace has 21 queries over 3.40382 s
[07/30/2024-15:30:47] [I]
[07/30/2024-15:30:47] [I] === Trace details ===
[07/30/2024-15:30:47] [I] Trace averages of 10 runs:
[07/30/2024-15:30:47] [I] Average on 10 runs - GPU latency: 154.185 ms - Host latency: 308.815 ms (enqueue 0.502057 ms)
[07/30/2024-15:30:47] [I] Average on 10 runs - GPU latency: 154.347 ms - Host latency: 307.723 ms (enqueue 0.445728 ms)
[07/30/2024-15:30:47] [I]
[07/30/2024-15:30:47] [I] === Performance summary ===
[07/30/2024-15:30:47] [I] Throughput: 6.16954 qps
[07/30/2024-15:30:47] [I] Latency: min = 305.199 ms, max = 314.668 ms, mean = 308.122 ms, median = 307.468 ms, percentile(90%) = 308.716 ms, percentile(95%) = 312.771 ms, percentile(99%) = 314.668 ms
[07/30/2024-15:30:47] [I] Enqueue Time: min = 0.393005 ms, max = 0.608154 ms, mean = 0.474543 ms, median = 0.470459 ms, percentile(90%) = 0.535767 ms, percentile(95%) = 0.562378 ms, percentile(99%) = 0.608154 ms
[07/30/2024-15:30:47] [I] H2D Latency: min = 89.5918 ms, max = 97.2084 ms, mean = 90.3088 ms, median = 89.6726 ms, percentile(90%) = 89.7966 ms, percentile(95%) = 95.2426 ms, percentile(99%) = 97.2084 ms
[07/30/2024-15:30:47] [I] GPU Compute Time: min = 151.874 ms, max = 155.407 ms, mean = 154.152 ms, median = 154.055 ms, percentile(90%) = 155.143 ms, percentile(95%) = 155.146 ms, percentile(99%) = 155.407 ms
[07/30/2024-15:30:47] [I] D2H Latency: min = 63.5786 ms, max = 63.8279 ms, mean = 63.6618 ms, median = 63.6525 ms, percentile(90%) = 63.6958 ms, percentile(95%) = 63.719 ms, percentile(99%) = 63.8279 ms
[07/30/2024-15:30:47] [I] Total Host Walltime: 3.40382 s
[07/30/2024-15:30:47] [I] Total GPU Compute Time: 3.23719 s
[07/30/2024-15:30:47] [I] Explanations of the performance metrics are printed in the verbose logs.
[07/30/2024-15:30:47] [I]
&&&& PASSED TensorRT.trtexec [TensorRT v100200] # D:\TensorRT-10.2.0.19\bin\trtexec.exe --loadEngine=FM_P_D_fp16.engine --shapes=ref_y:1x128x100x160,c1:1x48x1600x2560,c2:1x64x800x1280,c3:1x96x400x640

NVIDIA / TensorRT

How to achieve throughput in trtexec with python api? #4036