NVIDIA / TensorRT

NVIDIA® TensorRT™ is an SDK for high-performance deep learning inference on NVIDIA GPUs. This repository contains the open source components of TensorRT.
https://developer.nvidia.com/tensorrt
Apache License 2.0

Error when creating TRT engine with INT8 calibration #3937

Open labderrafie opened 3 weeks ago

labderrafie commented 3 weeks ago

Hi! I am trying to implement code that generates a TRT engine with INT8 calibration as an option. I'm basing my code on YOLOv8's engine export file, which I tested and which works fine: https://github.com/ultralytics/ultralytics/blob/main/ultralytics/engine/exporter.py#L675

Since I wanted to run it on other models, I wrote the following script after removing all of YOLOv8's cross-file dependencies:

from __future__ import print_function

import numpy as np
import tensorrt as trt
import pycuda.driver as cuda
import pycuda.autoinit
import torch
from base_opts import Opts

from pathlib import Path
import json
import sys, os

from utils import colorstr
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms
from PIL import Image
import glob

sys.path.insert(1, os.path.join(sys.path[0], ".."))

# Custom dataset class for image loading and preprocessing
class ImageFolderDataset(Dataset):
    def __init__(self, folder_path, img_size):
        self.img_paths = glob.glob(os.path.join(folder_path, '*'))
        self.transform = transforms.Compose([
            transforms.Resize(img_size),
            transforms.ToTensor(),
        ])

    def __len__(self):
        return len(self.img_paths)

    def __getitem__(self, idx):
        img_path = self.img_paths[idx]
        img = Image.open(img_path).convert('RGB')
        img = self.transform(img)
        return img

# Function to create a DataLoader for INT8 calibration
def get_int8_calibration_dataloader(folder_path, img_size=(640, 640), batch_size=16, prefix=colorstr("TensorRT:")):
    """Build and return a dataloader suitable for calibration of INT8 models."""
    print(f"{prefix} collecting INT8 calibration images from 'data={folder_path}'")

    dataset = ImageFolderDataset(folder_path, img_size)

    if len(dataset) < 300:
        print(f"WARNING ⚠️ >300 images recommended for INT8 calibration, found {len(dataset)} images.")

    dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True, num_workers=0)
    return dataloader

def export_engine(f_onnx, savedir, is_half, is_int8, data, imgsz, dynamic, batch, n_workspace, verbose, prefix=colorstr("TensorRT:")):
    """YOLOv8 TensorRT export https://developer.nvidia.com/tensorrt."""

    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    assert device != "cpu", "export running on CPU but must be on GPU, i.e. use 'device=0'"
    # check_version(trt.__version__, "7.0.0", hard=True)  # require tensorrt>=7.0.0

    # Setup and checks
    print(f"\n{prefix} starting export with TensorRT {trt.__version__}...")
    is_trt10 = int(trt.__version__.split(".")[0]) >= 10  # is TensorRT >= 10

    assert Path(f_onnx).exists(), f"ONNX file not found: {f_onnx}"

    basename = f_onnx.split('/')[-1]
    f = os.path.join(savedir, basename.replace('.onnx', '.engine'))  # TensorRT engine file
    logger = trt.Logger(trt.Logger.VERBOSE if verbose else trt.Logger.INFO)

    # Engine builder
    builder = trt.Builder(logger)
    config = builder.create_builder_config()
    workspace = int(n_workspace * (1 << 30))
    if is_trt10:
        config.set_memory_pool_limit(trt.MemoryPoolType.WORKSPACE, workspace)
    else:  # TensorRT versions 7, 8
        config.max_workspace_size = workspace
    flag = 1 << int(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH)
    network = builder.create_network(flag)
    half = builder.platform_has_fast_fp16 and is_half
    int8 = builder.platform_has_fast_int8 and is_int8
    # Read ONNX file
    parser = trt.OnnxParser(network, logger)
    if not parser.parse_from_file(f_onnx):
        raise RuntimeError(f"failed to load ONNX file: {f_onnx}")

    # Network inputs
    inputs = [network.get_input(i) for i in range(network.num_inputs)]
    outputs = [network.get_output(i) for i in range(network.num_outputs)]
    for inp in inputs:
        print(f'{prefix} input "{inp.name}" with shape{inp.shape} {inp.dtype}')
    for out in outputs:
        print(f'{prefix} output "{out.name}" with shape{out.shape} {out.dtype}')

    if dynamic:
        shape = torch.zeros(batch, 3, *imgsz).shape
        if shape[0] <= 1:
            print(f"{prefix} WARNING ⚠️ 'dynamic=True' model requires max batch size, i.e. 'batch=16'")
        profile = builder.create_optimization_profile()
        min_shape = (1, 3, *imgsz)  # minimum input shape
        max_shape = (64, 3, *imgsz)  # maximum input shape
        for inp in inputs:
            profile.set_shape(inp.name, min=min_shape, opt=shape, max=max_shape)
        config.add_optimization_profile(profile)

    print(f"{prefix} building {'INT8' if int8 else 'FP' + ('16' if half else '32')} engine as {f}")
    if int8:
        config.set_flag(trt.BuilderFlag.INT8)
        if dynamic:
            config.set_calibration_profile(profile)  # profile only exists when dynamic=True
        config.profiling_verbosity = trt.ProfilingVerbosity.DETAILED

        class EngineCalibrator(trt.IInt8Calibrator):
            def __init__(self, dataloader, batch_size, cache_file=""):
                trt.IInt8Calibrator.__init__(self)
                self.dataloader = dataloader
                self.data_iter = iter(self.dataloader)
                self.batch_size = batch_size
                self.cache_file = cache_file

            def get_algorithm(self):
                return trt.CalibrationAlgoType.ENTROPY_CALIBRATION_2

            def get_batch_size(self):
                return self.batch_size

            def get_batch(self, names):
                try:
                    # keep a reference on self so the device memory is not
                    # freed before TensorRT has consumed the pointer
                    self.batch = next(self.data_iter).to('cuda')
                    return [int(self.batch.data_ptr())]
                except StopIteration:
                    return None

            def read_calibration_cache(self):
                if os.path.exists(self.cache_file):
                    with open(self.cache_file, 'rb') as f:
                        return f.read()
                return None

            def write_calibration_cache(self, cache):
                with open(self.cache_file, 'wb') as f:
                    f.write(cache)

        # Load dataset w/ builder (for batching) and calibrate
        config.int8_calibrator = EngineCalibrator(
            dataloader=get_int8_calibration_dataloader(data, imgsz, 2 * batch),
            batch_size=2 * batch,
            cache_file=str(f_onnx.replace('.onnx', '.cache')),
        )

    elif half:
        config.set_flag(trt.BuilderFlag.FP16)

    # Free CUDA memory
    torch.cuda.empty_cache()

    # Write file
    build = builder.build_serialized_network if is_trt10 else builder.build_engine
    with build(network, config) as engine, open(f, "wb") as t:
        # Model
        t.write(engine if is_trt10 else engine.serialize())

    return f, None

def main():
    """Create a TensorRT engine for ONNX-based YOLOv8 and run inference."""

    opt = Opts().parse()

    export_engine(
        f_onnx=opt.f_onnx,
        savedir=opt.savedir,
        is_half=opt.fp16,
        is_int8=opt.int8,
        data=opt.data,
        imgsz=tuple(opt.imgsz),
        dynamic=opt.dynamic,
        batch=opt.batch,
        n_workspace=opt.workspace,
        verbose=opt.verbose,
    )

if __name__ == '__main__':
    main()

When I run this code, the export to an FP32 or FP16 engine works fine, but it doesn't work with the INT8 option and gives the following error:

root@618fe51132ed:/workspace# python3 quantization/mainv2.py -imgsz 640 640 -int8

TensorRT: starting export with TensorRT 8.6.1...
[06/06/2024-14:41:42] [TRT] [I] [MemUsageChange] Init CUDA: CPU +2, GPU +0, now: CPU 114, GPU 139 (MiB)
[06/06/2024-14:41:45] [TRT] [I] [MemUsageChange] Init builder kernel library: CPU +1445, GPU +268, now: CPU 1635, GPU 407 (MiB)
/workspace/quantization/mainv2.py:98: DeprecationWarning: Use set_memory_pool_limit instead.
  config.max_workspace_size = workspace
[06/06/2024-14:41:45] [TRT] [I] ----------------------------------------------------------------
[06/06/2024-14:41:45] [TRT] [I] Input filename:   /models/model.onnx
[06/06/2024-14:41:45] [TRT] [I] ONNX IR version:  0.0.8
[06/06/2024-14:41:45] [TRT] [I] Opset version:    17
[06/06/2024-14:41:45] [TRT] [I] Producer name:    pytorch
[06/06/2024-14:41:45] [TRT] [I] Producer version: 2.3.0
[06/06/2024-14:41:45] [TRT] [I] Domain:           
[06/06/2024-14:41:45] [TRT] [I] Model version:    0
[06/06/2024-14:41:45] [TRT] [I] Doc string:       
[06/06/2024-14:41:45] [TRT] [I] ----------------------------------------------------------------
[06/06/2024-14:41:45] [TRT] [W] onnx2trt_utils.cpp:374: Your ONNX model has been generated with INT64 weights, while TensorRT does not natively support INT64. Attempting to cast down to INT32.
TensorRT: input "images" with shape(-1, 3, -1, -1) DataType.FLOAT
TensorRT: output "output0" with shape(-1, 84, -1) DataType.FLOAT
TensorRT: building INT8 engine as /output/model.engine
TensorRT: collecting INT8 calibration images from 'data=/images'
/workspace/quantization/mainv2.py:186: DeprecationWarning: Use build_serialized_network instead.
  with build(network, config) as engine, open(f, "wb") as t:
[06/06/2024-14:41:45] [TRT] [I] Graph optimization time: 0.0022734 seconds.
[06/06/2024-14:41:45] [TRT] [I] Timing cache disabled. Turning it on will improve builder speed.
[06/06/2024-14:41:46] [TRT] [I] Detected 1 inputs and 3 output network tensors.
[06/06/2024-14:41:46] [TRT] [I] Total Host Persistent Memory: 334496
[06/06/2024-14:41:46] [TRT] [I] Total Device Persistent Memory: 1645056
[06/06/2024-14:41:46] [TRT] [I] Total Scratch Memory: 0
[06/06/2024-14:41:46] [TRT] [I] [MemUsageStats] Peak memory usage of TRT CPU/GPU memory allocators: CPU 1 MiB, GPU 150 MiB
[06/06/2024-14:41:46] [TRT] [I] [BlockAssignment] Started assigning block shifts. This will take 308 steps to complete.
[06/06/2024-14:41:46] [TRT] [I] [BlockAssignment] Algorithm ShiftNTopDown took 57.1936ms to assign 28 blocks to 308 nodes requiring 206504448 bytes.
[06/06/2024-14:41:46] [TRT] [I] Total Activation Memory: 206504448
[06/06/2024-14:41:46] [TRT] [I] [MemUsageChange] TensorRT-managed allocation in IExecutionContext creation: CPU +0, GPU +199, now: CPU 0, GPU 213 (MiB)
[06/06/2024-14:41:46] [TRT] [I] Starting Calibration.
[06/06/2024-14:41:46] [TRT] [E] 1: [calibrator.cu::absTensorMax::146] Error Code 1: Cuda Runtime (invalid resource handle)
[06/06/2024-14:41:46] [TRT] [E] 1: [calibrator.cu::absTensorMax::146] Error Code 1: Cuda Runtime (invalid resource handle)
[06/06/2024-14:41:46] [TRT] [E] 1: [calibrator.cu::absTensorMax::146] Error Code 1: Cuda Runtime (invalid resource handle)
[06/06/2024-14:41:46] [TRT] [E] 1: [calibrator.cu::absTensorMax::146] Error Code 1: Cuda Runtime (invalid resource handle)
[06/06/2024-14:41:46] [TRT] [E] 1: [convBaseRunner.cpp::execute::295] Error Code 1: Cask (Cask convolution execution)
[06/06/2024-14:41:46] [TRT] [E] 1: [checkMacros.cpp::catchCudaError::203] Error Code 1: Cuda Runtime (invalid resource handle)
[06/06/2024-14:41:47] [TRT] [E] 3: [engine.cpp::~Engine::298] Error Code 3: API Usage Error (Parameter check failed at: runtime/api/engine.cpp::~Engine::298, condition: mExecutionContextCounter.use_count() == 1. Destroying an engine object before destroying the IExecutionContext objects it created leads to undefined behavior.
)
[06/06/2024-14:41:47] [TRT] [E] 2: [calibrator.cpp::calibrateEngine::1181] Error Code 2: Internal Error (Assertion context->executeV2(&bindings[0]) failed. )
Traceback (most recent call last):
  File "/workspace/quantization/mainv2.py", line 211, in <module>
    main()
  File "/workspace/quantization/mainv2.py", line 197, in main
    export_engine(
  File "/workspace/quantization/mainv2.py", line 186, in export_engine
    with build(network, config) as engine, open(f, "wb") as t:
AttributeError: __enter__

I only changed the way YOLOv8 creates its dataloader, going for a basic one instead. I don't know if that's where the error comes from, and I can't find a solution. Any help? Thank you!

lix19937 commented 2 weeks ago

> When I run this code, the export to an FP32 or FP16 engine works fine, but it doesn't work with the INT8 option and gives the following error:

You should check whether the calibration data path/file is correct. You can also try using torch.randn to generate calibration data.
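For example, something along these lines (an untested sketch; RandomDataset is a hypothetical stand-in for your ImageFolderDataset, meant only to isolate data-loading problems, since a usable INT8 engine still needs representative images):

import torch
from torch.utils.data import DataLoader, Dataset

class RandomDataset(Dataset):
    """Serves random tensors shaped like the real calibration images."""

    def __init__(self, imgsz=(640, 640), n_images=320):
        self.data = torch.randn(n_images, 3, *imgsz)

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        return self.data[idx]

# drop-in replacement for get_int8_calibration_dataloader(...)
dataloader = DataLoader(RandomDataset(), batch_size=32, num_workers=0)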

Otherwise, if it is not a file/path issue, it may be a CUDA context issue. Usually we do not use torch and pycuda simultaneously.
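If it turns out to be the context issue, a minimal sketch of the change against the script above (untested): drop the pycuda imports so that torch owns the only CUDA context, and guard the build call, since build_engine / build_serialized_network return None on failure rather than raising, which is what produces the final AttributeError: __enter__.

# remove these imports; torch already creates and manages a CUDA context,
# and pycuda.autoinit creates a second, competing one on import:
#   import pycuda.driver as cuda
#   import pycuda.autoinit

# then guard the build instead of entering it directly:
engine = build(network, config)
if engine is None:
    raise RuntimeError("engine build failed, see the TensorRT log above")
with open(f, "wb") as t:
    t.write(engine if is_trt10 else engine.serialize())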