Issue (status: Open) — opened by xiatwhu, 1 year ago
import torch import os import tensorrt as trt from cuda import cudart import numpy as np
class CLIP(torch.nn.Module):
    """Minimal repro module: returns a causal-style mask (-inf strictly above
    the diagonal, 0 elsewhere) together with the masked input."""

    def __init__(self):
        # Fixed: the quoted text had `def init` / `super().init()` — markdown
        # stripped the dunder underscores, which would leave nn.Module
        # uninitialized and make `CLIP()` raise.
        super().__init__()

    def forward(self, x):
        # Mask matches x's three dims; assumes x is (batch, rows, cols) — the
        # repro always passes a 3-D tensor.
        mask = torch.empty(x.shape[0], x.shape[1], x.shape[2], device=x.device)
        mask.fill_(float('-inf'))
        # triu_(1) keeps -inf strictly above the main diagonal, zeroes the rest.
        mask.triu_(1)
        return mask, mask + x
# --- Repro driver: export to ONNX, build a TensorRT engine, compare outputs ---
model = CLIP()
device = torch.device('cuda')
x = torch.ones(1, 2, 2, device=device)

# Export with opset 18 so the mask construction is emitted as an ONNX Trilu op.
# Args are passed as a 1-tuple, the documented form for torch.onnx.export.
torch.onnx.export(
    model, (x,), 'test.onnx',
    export_params=True,
    opset_version=18,
    do_constant_folding=True,
    keep_initializers_as_inputs=True,
    input_names=['x'],
    output_names=['mask', 'y'],
)

# Build a serialized TensorRT engine from the exported model.
os.system("trtexec --onnx=test.onnx --saveEngine=test.plan")

# Deserialize the engine and create an execution context.
trt_logger = trt.Logger(trt.Logger.INFO)
with open('test.plan', 'rb') as f, trt.Runtime(trt_logger) as runtime:
    trt_engine = runtime.deserialize_cuda_engine(f.read())
    trt_ctx = trt_engine.create_execution_context()

# Bind device pointers for the input and both outputs, then run inference.
mask = torch.empty(1, 2, 2, device=device)
y = torch.empty(1, 2, 2, device=device)
trt_ctx.set_tensor_address('x', x.data_ptr())
trt_ctx.set_tensor_address('mask', mask.data_ptr())
trt_ctx.set_tensor_address('y', y.data_ptr())
# cudart calls return (error_code, value); [1] extracts the stream handle.
stream = cudart.cudaStreamCreateWithPriority(cudart.cudaStreamNonBlocking, 0)[1]
try:
    trt_ctx.execute_async_v3(stream)
    cudart.cudaStreamSynchronize(stream)
finally:
    # Fixed: the original never released the CUDA stream.
    cudart.cudaStreamDestroy(stream)
print('trt mask:', mask)
print('trt y:', y)

# Eager PyTorch reference run for comparison.
mask, y = model(x)
print('torch mask: ', mask)
print('torch y: ', y)
- Expected Behavior (预期行为): TensorRT should produce output identical to PyTorch's.
trt mask: tensor([[[0., -inf], [0., 0.]]], device='cuda:0') trt y: tensor([[[1., -inf], [1., 1.]]], device='cuda:0') torch mask: tensor([[[0., -inf], [0., 0.]]], device='cuda:0') torch y: tensor([[[1., -inf], [1., 1.]]], device='cuda:0')
- Actual Behavior (实际行为): positions that should be 0 in the Trilu output come back as NaN, which then propagates NaN through every downstream computation.
trt mask: tensor([[[nan, -inf], [nan, nan]]], device='cuda:0') trt y: tensor([[[nan, -inf], [nan, nan]]], device='cuda:0') torch mask: tensor([[[0., -inf], [0., 0.]]], device='cuda:0') torch y: tensor([[[1., -inf], [1., 1.]]], device='cuda:0')
- Additional Notes: The root cause is a utility function, `createZeroTensor`, used when converting the ONNX `Trilu` op into TensorRT layers. It is meant to create a tensor of zeros with the same shape as the input, but it does so by multiplying the input by a constant 0 — and when the input contains `-inf`, `0 * -inf` yields NaN rather than 0. [Trilu conversion code](https://github.com/onnx/onnx-tensorrt/blob/main/builtin_op_importers.cpp#L4967) ```cpp auto* rows = iota(ctx, iotadims, 0); auto* cols = iota(ctx, iotadims, 1); auto* zero = createZeroTensor(ctx, data);
The `createZeroTensor` implementation:
nvinfer1::ITensor* createZeroTensor(IImporterContext* ctx, nvinfer1::ITensor* data) { nvinfer1::ITensor* zero = addConstant(ctx, std::vector<float>{0.f}, ::ONNX_NAMESPACE::TensorProto::FLOAT, {0, {1}})->getOutput(0); zero = castHelper(ctx, zero, data->getType()); broadcastTensors(ctx, zero, data); zero = ctx->network()->addElementWise(*data, *zero, nvinfer1::ElementWiseOperation::kPROD)->getOutput(0); return zero; }
(Comment) I ran into the same problem; I worked around it by replacing one of the constants in the exported ONNX model.
class CLIP(torch.nn.Module): def init(self): super().init()
# Duplicate of the repro driver above, as re-quoted by a second commenter.
model = CLIP()
device = torch.device('cuda')
x = torch.ones(1, 2, 2, device=device)

# Export with opset 18 so the mask construction is emitted as an ONNX Trilu op.
torch.onnx.export(model, (x), 'test.onnx', export_params=True, opset_version=18, do_constant_folding=True, keep_initializers_as_inputs=True, input_names=['x'], output_names=['mask', 'y'])

# Build a serialized TensorRT engine from the exported model.
os.system("trtexec --onnx=test.onnx --saveEngine=test.plan")

# Deserialize the engine and create an execution context.
trt_logger = trt.Logger(trt.Logger.INFO)
with open('test.plan', 'rb') as f, trt.Runtime(trt_logger) as runtime:
    trt_engine = runtime.deserialize_cuda_engine(f.read())
    trt_ctx = trt_engine.create_execution_context()

# Eager PyTorch reference run for comparison (this quote omits the TRT run).
mask, y = model(x)
print('torch mask: ', mask)
print('torch y: ', y)
trt mask: tensor([[[0., -inf], [0., 0.]]], device='cuda:0') trt y: tensor([[[1., -inf], [1., 1.]]], device='cuda:0') torch mask: tensor([[[0., -inf], [0., 0.]]], device='cuda:0') torch y: tensor([[[1., -inf], [1., 1.]]], device='cuda:0')
trt mask: tensor([[[nan, -inf], [nan, nan]]], device='cuda:0') trt y: tensor([[[nan, -inf], [nan, nan]]], device='cuda:0') torch mask: tensor([[[0., -inf], [0., 0.]]], device='cuda:0') torch y: tensor([[[1., -inf], [1., 1.]]], device='cuda:0')
The `createZeroTensor` implementation (same snippet as quoted above).