NVIDIA / TensorRT-LLM

TensorRT-LLM provides users with an easy-to-use Python API to define Large Language Models (LLMs) and build TensorRT engines that contain state-of-the-art optimizations to perform inference efficiently on NVIDIA GPUs. TensorRT-LLM also contains components to create Python and C++ runtimes that execute those TensorRT engines.
https://nvidia.github.io/TensorRT-LLM
Apache License 2.0

Windows C++ Executor on v0.10.0 #2457


rifkybujana commented 3 days ago

System Info

Who can help?

@byshiue

Information

Tasks

Reproduction

Building steps:

My CMakeLists.txt for testing the library:

cmake_minimum_required(VERSION 3.14)
project(TestInferenceEngineGPU)

# Set C++ standard
set(CMAKE_CXX_STANDARD 17)
set(CMAKE_CXX_STANDARD_REQUIRED ON)

# Source files
set(SOURCES
    main.cpp
)

# Create the executable
add_executable(TestInferenceEngineGPU ${SOURCES})

#----------------------------------------------------------------------------------
# CUDA Setup
#----------------------------------------------------------------------------------

# Find CUDA Toolkit
find_package(CUDAToolkit REQUIRED)
message(STATUS "CUDA library status:")
message(STATUS "    version: ${CUDAToolkit_VERSION}")
message(STATUS "    libraries: ${CUDAToolkit_LIBRARY_DIR}")
message(STATUS "    include path: ${CUDAToolkit_INCLUDE_DIRS}")

if(${CUDAToolkit_VERSION} VERSION_GREATER_EQUAL "11")
  add_definitions("-DENABLE_BF16")
  message(
    STATUS
      "CUDAToolkit_VERSION ${CUDAToolkit_VERSION} is greater than or equal to 11.0, enabling the -DENABLE_BF16 flag"
  )
endif()

if(${CUDAToolkit_VERSION} VERSION_GREATER_EQUAL "11.8")
  add_definitions("-DENABLE_FP8")
  message(
    STATUS
      "CUDAToolkit_VERSION ${CUDAToolkit_VERSION} is greater than or equal to 11.8, enabling the -DENABLE_FP8 flag"
  )
endif()

set(TRT_INCLUDE_DIR    ${CMAKE_CURRENT_SOURCE_DIR}/../external/TensorRT/TensorRT-10.0.1.6/include)
set(TRTLLM_INCLUDE_DIR ${CMAKE_CURRENT_SOURCE_DIR}/../external/tensorrt_llm/include)
set(TRTLLM_LIB_DIR     ${CMAKE_CURRENT_SOURCE_DIR}/../external/tensorrt_llm/libs)

target_include_directories(TestInferenceEngineGPU PUBLIC ${TRT_INCLUDE_DIR} ${TRTLLM_INCLUDE_DIR} ${CUDAToolkit_INCLUDE_DIRS})

# tensorrt_llm
find_library(TRTLLM NAMES tensorrt_llm PATHS ${TRTLLM_LIB_DIR} NO_DEFAULT_PATH REQUIRED)
add_library(tensorrt_llm SHARED IMPORTED)
set_target_properties(tensorrt_llm PROPERTIES
  IMPORTED_IMPLIB                   "${TRTLLM_LIB_DIR}/tensorrt_llm.lib"
  IMPORTED_LOCATION                 "${TRTLLM_LIB_DIR}/tensorrt_llm.dll"
  IMPORTED_LINK_INTERFACE_LIBRARIES "CUDA::cuda_driver;CUDA::cudart_static;CUDA::nvml"
)

# nvinfer_plugin_tensorrt_llm
find_library(NVINFER_TRTLLM NAMES nvinfer_plugin_tensorrt_llm PATHS ${TRTLLM_LIB_DIR} NO_DEFAULT_PATH REQUIRED)
add_library(nvinfer_plugin_tensorrt_llm SHARED IMPORTED)
set_target_properties(nvinfer_plugin_tensorrt_llm PROPERTIES
  IMPORTED_IMPLIB                   "${TRTLLM_LIB_DIR}/nvinfer_plugin_tensorrt_llm.lib"
  IMPORTED_LOCATION                 "${TRTLLM_LIB_DIR}/nvinfer_plugin_tensorrt_llm.dll"
  IMPORTED_LINK_INTERFACE_LIBRARIES tensorrt_llm
)

find_library(TENSORRT NAMES nvinfer_10 REQUIRED)
find_library(CUDNN    NAMES cudnn      REQUIRED)

message(STATUS "TensorRT library status:")
message(STATUS "    TensorRT-LLM libraries: ${TRTLLM}")
message(STATUS "    TensorRT-LLM Plugins libraries: ${NVINFER_TRTLLM}")
message(STATUS "    TensorRT libraries: ${TENSORRT}")
message(STATUS "    CuDNN libraries: ${CUDNN}")

target_link_libraries(TestInferenceEngineGPU PUBLIC 
  nvinfer_plugin_tensorrt_llm
  ${TENSORRT}
  ${CUDNN}
  CUDA::cuda_driver
  CUDA::cudart
  CUDA::nvml
)

# Set output directory
set_target_properties(TestInferenceEngineGPU PROPERTIES
    RUNTIME_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/bin/$<CONFIG>"
)
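A note on the imported targets: on Windows, linking goes against the .lib import libraries (IMPORTED_IMPLIB), and the matching DLLs (IMPORTED_LOCATION) only need to be found at load time, next to the executable or on PATH. Loading works fine here; the failure described below happens earlier, at link time.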

Code that builds and runs successfully:

#include "tensorrt_llm/plugins/api/tllmPlugin.h"

#include <iostream>
#include <chrono>

int main(int argc, char* argv[])
{
    initTrtLlmPlugins();
}
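So tensorrt_llm.lib itself is found and linked: the plugin initialization entry point resolves and the program runs.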

Code that fails at link time:

#include "tensorrt_llm/executor/executor.h"

#include <iostream>
#include <chrono>

int main(int argc, char* argv[])
{
    tensorrt_llm::executor::ExecutorConfig config;
}
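For context, the single declaration above is just the minimal trigger. A rough sketch of what I am ultimately trying to compile is below; the engine path is a placeholder, and the Executor construction follows the C++ executor example in this repository, if I am reading it correctly:

#include "tensorrt_llm/executor/executor.h"

#include <filesystem>

namespace tle = tensorrt_llm::executor;

int main(int argc, char* argv[])
{
    // Placeholder path to a built TensorRT-LLM engine directory.
    std::filesystem::path const enginePath{"C:/engines/my_model"};

    // Default-constructed config; this alone references the unresolved
    // ExecutorConfig constructor shown in the linker output below.
    tle::ExecutorConfig config;

    // Executor construction as in the repository's C++ executor example.
    tle::Executor executor(enginePath, tle::ModelType::kDECODER_ONLY, config);

    return 0;
}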

Expected behavior

main.cpp should compile and link without any errors when using classes and structures from tensorrt_llm/executor/executor.h.

actual behavior

I receive unresolved external symbol errors whenever I use anything from tensorrt_llm/executor/executor.h; everything else builds and runs without any problem.

unresolved external symbol "public: __cdecl tensorrt_llm::executor::SchedulerConfig::SchedulerConfig(enum tensorrt_llm::executor::CapacitySchedulerPolicy,class std::optional<enum tensorrt_llm::executor::ContextChunkingPolicy>)" (??0SchedulerConfig@executor@tensorrt_llm@@QEAA@W4CapacitySchedulerPolicy@12@V?$optional@W4ContextChunkingPolicy@executor@tensorrt_llm@@@std@@@Z) referenced in function main
unresolved external symbol "public: __cdecl tensorrt_llm::executor::KvCacheConfig::KvCacheConfig(bool,class std::optional<int> const &,class std::optional<int> const &,class std::optional<int> const &,class std::optional<float> const &,class std::optional<unsigned __int64> const &,bool)" (??0KvCacheConfig@executor@tensorrt_llm@@QEAA@_NAEBV?$optional@H@std@@11AEBV?$optional@M@4@AEBV?$optional@_K@4@0@Z) referenced in function main
unresolved external symbol "public: __cdecl tensorrt_llm::executor::ExecutorConfig::ExecutorConfig(int,class tensorrt_llm::executor::SchedulerConfig const &,class tensorrt_llm::executor::KvCacheConfig const &,bool,bool,int,int,enum tensorrt_llm::executor::BatchingType,class std::optional<class tensorrt_llm::executor::ParallelConfig>,class std::optional<class tensorrt_llm::executor::PeftCacheConfig> const &,class std::optional<class std::unordered_map<class std::basic_string<char,struct std::char_traits<char>,class std::allocator<char> >,class std::function<void __cdecl(unsigned __int64,class tensorrt_llm::executor::Tensor &,class std::vector<class std::vector<int,class std::allocator<int> >,class std::allocator<class std::vector<int,class std::allocator<int> > > > const &,class std::shared_ptr<class tensorrt_llm::runtime::CudaStream> &)>,struct std::hash<class std::basic_string<char,struct std::char_traits<char>,class std::allocator<char> > >,struct std::equal_to<class std::basic_string<char,struct std::char_traits<char>,class std::allocator<char> > >,class std::allocator<struct std::pair<class std::basic_string<char,struct std::char_traits<char>,class std::allocator<char> > const ,class std::function<void __cdecl(unsigned __int64,class tensorrt_llm::executor::Tensor &,class std::vector<class std::vector<int,class std::allocator<int> >,class std::allocator<class std::vector<int,class std::allocator<int> > > > const &,class std::shared_ptr<class tensorrt_llm::runtime::CudaStream> &)> > > > >,class std::optional<class std::vector<class std::vector<int,class std::allocator<int> >,class std::allocator<class std::vector<int,class std::allocator<int> > > > >,class std::optional<enum tensorrt_llm::executor::DecodingMode>,float)" (??0ExecutorConfig@executor@tensorrt_llm@@QEAA@HAEBVSchedulerConfig@12@AEBVKvCacheConfig@12@_N2HHW4BatchingType@12@V?$optional@VParallelConfig@executor@tensorrt_llm@@@std@@AEBV?$optional@VPeftCacheConfig@executor@tensorrt_llm@@@7@V?$optional@V?$unordered_map@V?$basic_string@DU?$char_traits@D@std@@V?$allocator@D@2@@std@@V?$function@$$A6AX_KAEAVTensor@executor@tensorrt_llm@@AEBV?$vector@V?$vector@HV?$allocator@H@std@@@std@@V?$allocator@V?$vector@HV?$allocator@H@std@@@std@@@2@@std@@AEAV?$shared_ptr@VCudaStream@runtime@tensorrt_llm@@@5@@Z@2@U?$hash@V?$basic_string@DU?$char_traits@D@std@@V?$allocator@D@2@@std@@@2@U?$equal_to@V?$basic_string@DU?$char_traits@D@std@@V?$allocator@D@2@@std@@@2@V?$allocator@U?$pair@$$CBV?$basic_string@DU?$char_traits@D@std@@V?$allocator@D@2@@std@@V?$function@$$A6AX_KAEAVTensor@executor@tensorrt_llm@@AEBV?$vector@V?$vector@HV?$allocator@H@std@@@std@@V?$allocator@V?$vector@HV?$allocator@H@std@@@std@@@2@@std@@AEAV?$shared_ptr@VCudaStream@runtime@tensorrt_llm@@@5@@Z@2@@std@@@2@@std@@@7@V?$optional@V?$vector@V?$vector@HV?$allocator@H@std@@@std@@V?$allocator@V?$vector@HV?$allocator@H@std@@@std@@@2@@std@@@7@V?$optional@W4DecodingMode@executor@tensorrt_llm@@@7@M@Z) referenced in function main
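All three unresolved symbols are constructors declared in tensorrt_llm/executor/executor.h (SchedulerConfig, KvCacheConfig, ExecutorConfig), so it looks like the prebuilt Windows tensorrt_llm.lib/tensorrt_llm.dll in this release does not export the executor API. One way to confirm would be dumpbin /EXPORTS tensorrt_llm.dll | findstr ExecutorConfig, which should list these constructors if the DLL exports them at all.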

additional notes

Is tensorrt_llm::executor with the C++ runtime not supported on Windows in version 0.10.0? Do newer versions support it?