NVIDIA / TensorRT

NVIDIA® TensorRT™ is an SDK for high-performance deep learning inference on NVIDIA GPUs. This repository contains the open source components of TensorRT.
https://developer.nvidia.com/tensorrt
Apache License 2.0

CUDA memcpy failure with float16 in a TensorRT 10.3 custom plugin when running inference on 2x H100 NVL #4118

Open avianion opened 2 months ago

avianion commented 2 months ago

Description

I am unable to perform an identity operation that copies out a float16 tensor 1:1. The same operation works fine with int32 and int64 tensors, but the plugin I am trying to create needs to manipulate float16. If even the identity (copy the input straight to the output) operation doesn't work, I'm not sure what to do.

`self.debug_buffer["full_hidden_states"]`:

tensor([[ 4.1797, -0.2051, -1.8369,  ..., -2.8945,  1.3564,  0.3196],
        [ 2.1680, -2.5840, -2.0742,  ..., -2.5312, -0.4636, -2.8867],
        [ 2.1934, -3.1797,  1.9854,  ...,  0.7856, -0.0352, -1.5967],
        ...,
        [-0.6577,  2.3828,  7.0742,  ...,  2.1074, -1.9043, -0.1153],
        [ 0.9321, -3.4199,  0.9727,  ...,  0.4680, -3.3691, -1.2725],
        [-0.6484,  0.9282,  0.6196,  ...,  5.9570, -4.6875, -0.6816]],
       device='cuda:0', dtype=torch.float16)

That is the input tensor I am trying to copy out, and below is the result I get from the TensorRT plugin I have created:

`self.debug_buffer['output_hidden_states']`:

tensor([[[ 4.1797, -0.2051, -1.8369,  ..., -2.8945,  1.3564,  0.3196],
         [ 2.1680, -2.5840, -2.0742,  ..., -2.5312, -0.4636, -2.8867],
         [ 2.1934, -3.1797,  1.9854,  ...,  0.7856, -0.0352, -1.5967],
         ...,
         [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
         [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
         [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000]]],
       device='cuda:0', dtype=torch.float16)

This is bizarre; I'm hoping I have simply misconfigured something, because the behavior is clearly wrong. In the code below I am doing nothing more than a straight copy-out. The model I'm running is Meta Llama 3 Instruct 8B, but I believe this issue is independent of the model.

I am building the network with TensorRT-LLM.
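
To be concrete, below is a minimal sketch of the kind of identity copy I mean, sizing the copy from the tensor's own `PluginTensorDesc`. The `volume` and `copyHalfTensor` helpers are illustrative only and are not part of my plugin:

```cpp
// Sketch of a 1:1 (identity) copy of a float16 tensor, with the byte count
// derived from the tensor's own runtime descriptor.
#include <cuda_fp16.h>
#include <cuda_runtime_api.h>
#include <NvInfer.h>

// Number of elements described by a Dims struct.
static size_t volume(nvinfer1::Dims const &dims)
{
    size_t count = 1;
    for (int i = 0; i < dims.nbDims; ++i)
    {
        count *= static_cast<size_t>(dims.d[i]);
    }
    return count;
}

// Device-to-device copy of a half-precision tensor described by `desc`.
static cudaError_t copyHalfTensor(void *dst, void const *src,
                                  nvinfer1::PluginTensorDesc const &desc,
                                  cudaStream_t stream)
{
    size_t const nbBytes = volume(desc.dims) * sizeof(__half);
    return cudaMemcpyAsync(dst, src, nbBytes, cudaMemcpyDeviceToDevice, stream);
}
```

Inside `enqueue` this would be called as `copyHalfTensor(outputs[1], inputs[1], inputDesc[1], stream)`.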


The copy that produces the truncated output is this line from `enqueue`:

`cudaMemcpyAsync(outputs[1], inputs[1], hidden_states_size_original, cudaMemcpyDeviceToDevice, stream);`

Full plugin source:

> /*
>  * SPDX-FileCopyrightText: Copyright (c) 1993-2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
>  * SPDX-License-Identifier: Apache-2.0
>  *
>  * Licensed under the Apache License, Version 2.0 (the "License");
>  * you may not use this file except in compliance with the License.
>  * You may obtain a copy of the License at
>  *
>  * http://www.apache.org/licenses/LICENSE-2.0
>  *
>  * Unless required by applicable law or agreed to in writing, software
>  * distributed under the License is distributed on an "AS IS" BASIS,
>  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
>  * See the License for the specific language governing permissions and
>  * limitations under the License.
>  */
> #include "updateInferenceInputsPlugin.h"
> #include "tensorrt_llm/runtime/iBuffer.h"
> #include <iostream>
> 
> using namespace nvinfer1;
> using tensorrt_llm::plugins::UpdateInferenceInputsPlugin;
> using tensorrt_llm::plugins::UpdateInferenceInputsPluginCreator;
> 
> static char const *UPDATE_INFERENCE_INPUTS_PLUGIN_VERSION{"1"};
> static char const *UPDATE_INFERENCE_INPUTS_PLUGIN_NAME{"UpdateInferenceInputs"};
> PluginFieldCollection UpdateInferenceInputsPluginCreator::mFC{};
> std::vector<nvinfer1::PluginField> UpdateInferenceInputsPluginCreator::mPluginAttributes;
> 
> UpdateInferenceInputsPlugin::UpdateInferenceInputsPlugin()
> {
>     std::cout << "UpdateInferenceInputsPlugin constructor called" << std::endl;
> }
> 
> UpdateInferenceInputsPlugin::UpdateInferenceInputsPlugin(void const *data, size_t length)
> {
>     std::cout << "UpdateInferenceInputsPlugin parameterized constructor called" << std::endl;
>     char const *d = reinterpret_cast<char const *>(data), *a = d;
>     TLLM_CHECK_WITH_INFO(d == a + length,
>                          "Expected length (%d) != real length (%d). This is often "
>                          "caused by using different TensorRT-LLM version to build "
>                          "engine and run engine.",
>                          (int)length, (int)(d - a));
> }
> 
> nvinfer1::IPluginV2DynamicExt *UpdateInferenceInputsPlugin::clone() const noexcept
> {
>     std::cout << "UpdateInferenceInputsPlugin::clone called" << std::endl;
>     auto *plugin = new UpdateInferenceInputsPlugin(*this);
>     plugin->setPluginNamespace(mNamespace.c_str());
>     return plugin;
> }
> 
> nvinfer1::DimsExprs UpdateInferenceInputsPlugin::getOutputDimensions(
>     int outputIndex, nvinfer1::DimsExprs const *inputs, int nbInputs, nvinfer1::IExprBuilder &exprBuilder) noexcept
> {
>     // Assuming inputs[0] is the condition, inputs[1] is true_input_ids, inputs[2] is true_hidden_states,
>     // inputs[3] is false_input_ids, inputs[4] is false_hidden_states
>     if (outputIndex == 0)
>     {
> 
>         nvinfer1::DimsExprs output;
>         output.nbDims = 2;
> 
>         output.d[0] = inputs[0].d[0];
>         output.d[1] = exprBuilder.operation(nvinfer1::DimensionOperation::kSUM, *inputs[0].d[1], *exprBuilder.constant(5));
> 
>         // Dimensions for input_ids (same for both true and false branches)
>         return output;
>     }
>     else
>     {
>         nvinfer1::DimsExprs output;
>         output.nbDims = 3;
> 
>         output.d[0] = inputs[1].d[0];
>         output.d[1] = inputs[1].d[1];
>         output.d[2] = inputs[1].d[2];
> 
>         return output;
>     }
> }
> 
> bool UpdateInferenceInputsPlugin::supportsFormatCombination(
>     int pos, nvinfer1::PluginTensorDesc const *inOut, int nbInputs, int nbOutputs) noexcept
> {
> 
>     // NOTE: this early return accepts every type/format combination and makes
>     // the checks below unreachable.
>     return true;
>     // Ensure we don't access out of bounds
>     if (pos >= nbInputs + nbOutputs)
>         return false;
> 
>     // Check input tensors
>     if (pos < nbInputs)
>     {
>         if (pos == 0 || pos == 1 || pos == 4 || pos == 5)
>         {
>             return inOut[pos].type == DataType::kINT32 && inOut[pos].format == TensorFormat::kLINEAR;
>         }
>         else if (pos == 2 || pos == 3 || pos == 8)
>         {
>             return inOut[pos].type == DataType::kHALF && inOut[pos].format == TensorFormat::kLINEAR;
>         }
>         else if (pos == 6 || pos == 7)
>         {
>             return inOut[pos].type == DataType::kINT64 && inOut[pos].format == TensorFormat::kLINEAR;
>         }
>         else
>         {
>             return false; // Unknown input
>         }
>     }
>     // Check output tensors
>     else
>     {
>         int output_index = pos - nbInputs;
>         if (output_index == 0)
>         {
>             return inOut[pos].type == DataType::kINT32 && inOut[pos].format == TensorFormat::kLINEAR;
>         }
>         else if (output_index == 1)
>         {
>             return inOut[pos].type == DataType::kHALF && inOut[pos].format == TensorFormat::kLINEAR;
>         }
>         else
>         {
>             return false; // Unknown output
>         }
>     }
> }
> 
> void UpdateInferenceInputsPlugin::configurePlugin(nvinfer1::DynamicPluginTensorDesc const *in, int nbInputs,
>                                                   nvinfer1::DynamicPluginTensorDesc const *out, int nbOutputs) noexcept
> {
>     std::cout << "UpdateInferenceInputsPlugin::configurePlugin called" << std::endl;
> }
> 
> size_t UpdateInferenceInputsPlugin::getWorkspaceSize(nvinfer1::PluginTensorDesc const *inputs, int nbInputs,
>                                                      nvinfer1::PluginTensorDesc const *outputs, int nbOutputs) const noexcept
> {
>     std::cout << "UpdateInferenceInputsPlugin::getWorkspaceSize called" << std::endl;
>     return 0;
> }
> 
> int UpdateInferenceInputsPlugin::enqueue(nvinfer1::PluginTensorDesc const *inputDesc,
>                                          nvinfer1::PluginTensorDesc const *outputDesc, void const *const *inputs, void *const *outputs, void *workspace,
>                                          cudaStream_t stream) noexcept
> {
> 
>     int64_t *input_ids = reinterpret_cast<int64_t *>(const_cast<void *>(inputs[0]));
> 
>     half *full_hidden_states = reinterpret_cast<half *>(const_cast<void *>(inputs[1]));
> 
>     half *lm_logits = reinterpret_cast<half *>(const_cast<void *>(inputs[2]));
> 
>     int64_t *retrieve_indices = reinterpret_cast<int64_t *>(const_cast<void *>(inputs[3]));
> 
>     int64_t *draft_tokens = reinterpret_cast<int64_t *>(const_cast<void *>(inputs[4]));
> 
>     int64_t *candidates = reinterpret_cast<int64_t *>(const_cast<void *>(inputs[5]));
>     int64_t *accept_length = reinterpret_cast<int64_t *>(const_cast<void *>(inputs[6]));
> 
>     half *sample_p = reinterpret_cast<half *>(const_cast<void *>(inputs[7]));
> 
>     int64_t *best_candidate = reinterpret_cast<int64_t *>(const_cast<void *>(inputs[8]));
> 
>     int batch_size = inputDesc[0].dims.d[0];
>     int sequence_length = inputDesc[0].dims.d[1];
>     int max_candidate_length = inputDesc[5].dims.d[1];
>     int candidate_length = 4;
>     int vocab_size = inputDesc[7].dims.d[0];
>     int hidden_size = inputDesc[1].dims.d[2];
>     int num_candidates = inputDesc[3].dims.d[0];
>     int retrieve_seq_length = inputDesc[3].dims.d[1];
> 
>     size_t input_ids_size = batch_size * sequence_length * sizeof(int64_t);
>     size_t hidden_states_size = batch_size * (1 + candidate_length) * hidden_size * sizeof(half);
>     size_t hidden_states_size_original = batch_size * sequence_length * hidden_size * sizeof(half);
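>     // Note: hidden_states_size is computed above but never used below;
>     // hidden_states_size_original takes sequence_length from inputDesc[0]
>     // (input_ids) and hidden_size from inputDesc[1] (full_hidden_states).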
> 
>     std::cout << "Simple Case" << std::endl;
>     cudaMemcpyAsync(outputs[0], inputs[0], input_ids_size, cudaMemcpyDeviceToDevice, stream);
>     cudaMemcpyAsync(outputs[1], inputs[1], hidden_states_size_original, cudaMemcpyDeviceToDevice, stream);
> 
>     return 0;
> }
> 
> nvinfer1::DataType UpdateInferenceInputsPlugin::getOutputDataType(
>     int index, nvinfer1::DataType const *inputTypes, int nbInputs) const noexcept
> {
>     if (index == 0)
>     {
>         // For output_input_ids, return INT64
>         return DataType::kINT64;
>     }
>     else
>     {
>         // For output_hidden_states, return HALF
>         return DataType::kHALF;
>     }
> }
> 
> char const *UpdateInferenceInputsPlugin::getPluginType() const noexcept
> {
>     std::cout << "UpdateInferenceInputsPlugin::getPluginType called" << std::endl;
>     return UPDATE_INFERENCE_INPUTS_PLUGIN_NAME;
> }
> 
> char const *UpdateInferenceInputsPlugin::getPluginVersion() const noexcept
> {
>     std::cout << "UpdateInferenceInputsPlugin::getPluginVersion called" << std::endl;
>     return UPDATE_INFERENCE_INPUTS_PLUGIN_VERSION;
> }
> 
> int UpdateInferenceInputsPlugin::getNbOutputs() const noexcept
> {
>     std::cout << "UpdateInferenceInputsPlugin::getNbOutputs called" << std::endl;
>     return 2;
> }
> 
> int UpdateInferenceInputsPlugin::initialize() noexcept
> {
>     std::cout << "UpdateInferenceInputsPlugin::initialize called" << std::endl;
>     return 0;
> }
> 
> void UpdateInferenceInputsPlugin::terminate() noexcept
> {
>     std::cout << "UpdateInferenceInputsPlugin::terminate called" << std::endl;
> }
> 
> size_t UpdateInferenceInputsPlugin::getSerializationSize() const noexcept
> {
>     std::cout << "UpdateInferenceInputsPlugin::getSerializationSize called" << std::endl;
>     return 0;
> }
> 
> void UpdateInferenceInputsPlugin::serialize(void *buffer) const noexcept
> {
>     std::cout << "UpdateInferenceInputsPlugin::serialize called" << std::endl;
> }
> 
> void UpdateInferenceInputsPlugin::destroy() noexcept
> {
>     std::cout << "UpdateInferenceInputsPlugin::destroy called" << std::endl;
>     delete this;
> }
> 
> ///////////////
> 
> UpdateInferenceInputsPluginCreator::UpdateInferenceInputsPluginCreator()
> {
>     std::cout << "UpdateInferenceInputsPluginCreator constructor called" << std::endl;
>     mPluginAttributes.clear();
>     mFC.nbFields = mPluginAttributes.size();
>     mFC.fields = mPluginAttributes.data();
> }
> 
> char const *UpdateInferenceInputsPluginCreator::getPluginName() const noexcept
> {
>     std::cout << "UpdateInferenceInputsPluginCreator::getPluginName called" << std::endl;
>     return UPDATE_INFERENCE_INPUTS_PLUGIN_NAME;
> }
> 
> char const *UpdateInferenceInputsPluginCreator::getPluginVersion() const noexcept
> {
>     std::cout << "UpdateInferenceInputsPluginCreator::getPluginVersion called" << std::endl;
>     return UPDATE_INFERENCE_INPUTS_PLUGIN_VERSION;
> }
> 
> PluginFieldCollection const *UpdateInferenceInputsPluginCreator::getFieldNames() noexcept
> {
>     std::cout << "UpdateInferenceInputsPluginCreator::getFieldNames called" << std::endl;
>     return &mFC;
> }
> 
> IPluginV2 *UpdateInferenceInputsPluginCreator::createPlugin(char const *name, PluginFieldCollection const *fc) noexcept
> {
>     std::cout << "UpdateInferenceInputsPluginCreator::createPlugin called" << std::endl;
>     try
>     {
>         auto *obj = new UpdateInferenceInputsPlugin();
>         obj->setPluginNamespace(mNamespace.c_str());
>         return obj;
>     }
>     catch (std::exception const &e)
>     {
>         caughtError(e);
>     }
>     return nullptr;
> }
> 
> IPluginV2 *UpdateInferenceInputsPluginCreator::deserializePlugin(
>     char const *name, void const *serialData, size_t serialLength) noexcept
> {
>     // This object will be deleted when the network is destroyed, which will
>     // call UpdateInferenceInputsPlugin::destroy()
> 
>     std::cout << "Plugin being deserialized" << std::endl;
> 
>     try
>     {
> 
>         std::cout << "success" << std::endl;
> 
>         auto *obj = new UpdateInferenceInputsPlugin(serialData, serialLength);
>         obj->setPluginNamespace(mNamespace.c_str());
>         return obj;
>     }
>     catch (std::exception const &e)
>     {
>         std::cout << "fail" << std::endl;
> 
>         caughtError(e);
>     }
>     return nullptr;
> }
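
For completeness, the plugin above does not check the return values of the `cudaMemcpyAsync` calls in `enqueue`. A small helper along these lines (the `checkCuda` name is mine, not from TensorRT-LLM) would surface copy failures explicitly instead of silently returning 0:

```cpp
#include <cuda_runtime_api.h>
#include <iostream>

// Report a CUDA error instead of ignoring it; returns true on success.
static bool checkCuda(cudaError_t status, char const *what)
{
    if (status != cudaSuccess)
    {
        std::cerr << what << " failed: " << cudaGetErrorString(status) << std::endl;
        return false;
    }
    return true;
}

// Usage inside enqueue():
//   if (!checkCuda(cudaMemcpyAsync(outputs[1], inputs[1], hidden_states_size_original,
//                                  cudaMemcpyDeviceToDevice, stream),
//                  "hidden_states copy"))
//   {
//       return 1; // non-zero return value reports failure to TensorRT
//   }
```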

Environment

TensorRT Version: 10.3

NVIDIA GPU: 2x H100 NVL

NVIDIA Driver Version: 555.42.06

CUDA Version: 12.5

CUDNN Version:

Operating System:

Python Version (if applicable): 3.10

Tensorflow Version (if applicable):

PyTorch Version (if applicable):

Baremetal or Container (if so, version):

Relevant Files

Model link: https://huggingface.co/meta-llama/Meta-Llama-3-8B-Instruct

Steps To Reproduce

1) Build Llama 3 8B with TensorRT-LLM using the official instructions.
2) Create a TensorRT-LLM plugin with the code above.
3) Copy out any float16 tensor.
4) Use self.debug_buffer in the Python runtime to log the float16 tensor and observe that it differs from the input.

Commands or scripts:

Have you tried the latest release?:

Yes

Can this model run on other frameworks? For example run ONNX model with ONNXRuntime (polygraphy run <model.onnx> --onnxrt):

Yes.

moraxu commented 1 month ago

@samurdhikaru could you help advise on the plugin question?

samurdhikaru commented 1 month ago

@avianion Your plugin implementation is strange:

What is the intended effect here?