NVIDIA / TensorRT-LLM

TensorRT-LLM provides users with an easy-to-use Python API to define Large Language Models (LLMs) and build TensorRT engines that contain state-of-the-art optimizations to perform inference efficiently on NVIDIA GPUs. TensorRT-LLM also contains components to create Python and C++ runtimes that execute those TensorRT engines.
https://nvidia.github.io/TensorRT-LLM
Apache License 2.0

[bug] Offloading to host memory leads to error #1687

Open akhoroshev opened 1 month ago

akhoroshev commented 1 month ago

I'm testing the KV cache reuse feature.

Everything works fine until I try to use offloading to host memory.

I enable offloading with these lines:

 optionalParams.kvCacheConfig.hostCacheSize = 40000000000; // ~40 GB of host memory for offloaded KV blocks
 optionalParams.kvCacheConfig.onboardBlocks = true;        // copy offloaded blocks back to GPU before reuse
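
For context, since the issue is about reuse interacting with offloading, here is the full configuration I'm exercising. This is a sketch, not verbatim from my server; the header path, the TrtGptModelOptionalParams type, and the enableBlockReuse field are my reading of the public batch_manager headers.

```cpp
#include "tensorrt_llm/batch_manager/trtGptModelOptionalParams.h"

// Sketch of the KV cache configuration under test. enableBlockReuse turns on
// the reuse feature (which works fine on its own); the two offloading lines
// are the ones quoted above and are what triggers the error.
tensorrt_llm::batch_manager::TrtGptModelOptionalParams optionalParams;
optionalParams.kvCacheConfig.enableBlockReuse = true;     // KV cache block reuse under test
optionalParams.kvCacheConfig.hostCacheSize = 40000000000; // ~40 GB host buffer for offloaded blocks
optionalParams.kvCacheConfig.onboardBlocks = true;        // bring offloaded blocks back to GPU on reuse
```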

But when I tried to use the server, I encountered this error:

[TensorRT-LLM][ERROR] Encountered an error in forwardAsync function: [TensorRT-LLM][ERROR] Assertion failed: mFreePrimaryBlocks list has no GPU blocks (/home/jenkins/agent/workspace/LLM/main/L0_MergeRequest/llm/cpp/tensorrt_llm/batch_manager/kvCacheManager.cpp:340)
1             0x412b9e tensorrt_llm::common::throwRuntimeError(char const*, int, std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const&) + 71
2       0x7fc0e2c2fae6 tensorrt_llm::batch_manager::kv_cache_manager::BlockManager::findBestGPUBlockToFree() + 582
3       0x7fc0e2c320e7 tensorrt_llm::batch_manager::kv_cache_manager::BlockManager::getFreeBlock() + 23
4       0x7fc0e2c33b4c tensorrt_llm::batch_manager::kv_cache_manager::BlockManager::loadOrAllocateBlocks(std::__cxx11::list<std::vector<int, std::allocator<int> >, std::allocator<std::vector<int, std::allocator<int> > > > const&, tensorrt_llm::batch_manager::kv_cache_manager::GenerationRequest&, int, int) + 572
5       0x7fc0e2c34107 tensorrt_llm::batch_manager::kv_cache_manager::BlockManager::addSequence(tensorrt_llm::batch_manager::kv_cache_manager::GenerationRequest&, int, std::shared_ptr<tensorrt_llm::batch_manager::LlmRequest> const&) + 295
6       0x7fc0e2c34603 tensorrt_llm::batch_manager::kv_cache_manager::KVCacheManager::addSequence(int, int, int, std::shared_ptr<tensorrt_llm::batch_manager::LlmRequest> const&) + 707
7       0x7fc0e2c51c7e tensorrt_llm::batch_manager::RuntimeBuffers::setFromInputs(std::vector<std::shared_ptr<tensorrt_llm::batch_manager::LlmRequest>, std::allocator<std::shared_ptr<tensorrt_llm::batch_manager::LlmRequest> > > const&, std::vector<std::shared_ptr<tensorrt_llm::batch_manager::LlmRequest>, std::allocator<std::shared_ptr<tensorrt_llm::batch_manager::LlmRequest> > > const&, int, int, tensorrt_llm::batch_manager::DecoderBuffers&, tensorrt_llm::batch_manager::kv_cache_manager::KVCacheManager*, tensorrt_llm::batch_manager::ssm_state_manager::SsmStateManager*, std::map<unsigned long, std::shared_ptr<std::vector<tensorrt_llm::runtime::LoraCache::TaskLayerModuleConfig, std::allocator<tensorrt_llm::runtime::LoraCache::TaskLayerModuleConfig> > >, std::less<unsigned long>, std::allocator<std::pair<unsigned long const, std::shared_ptr<std::vector<tensorrt_llm::runtime::LoraCache::TaskLayerModuleConfig, std::allocator<tensorrt_llm::runtime::LoraCache::TaskLayerModuleConfig> > > > > > const&, tensorrt_llm::runtime::TllmRuntime const&, tensorrt_llm::runtime::ModelConfig const&, tensorrt_llm::runtime::WorldConfig const&) + 3774
8       0x7fc0e2c536dd tensorrt_llm::batch_manager::RuntimeBuffers::prepareStep[abi:cxx11](std::vector<std::shared_ptr<tensorrt_llm::batch_manager::LlmRequest>, std::allocator<std::shared_ptr<tensorrt_llm::batch_manager::LlmRequest> > > const&, std::vector<std::shared_ptr<tensorrt_llm::batch_manager::LlmRequest>, std::allocator<std::shared_ptr<tensorrt_llm::batch_manager::LlmRequest> > > const&, int, int, tensorrt_llm::batch_manager::DecoderBuffers&, tensorrt_llm::batch_manager::kv_cache_manager::KVCacheManager*, tensorrt_llm::batch_manager::ssm_state_manager::SsmStateManager*, std::map<unsigned long, std::shared_ptr<std::vector<tensorrt_llm::runtime::LoraCache::TaskLayerModuleConfig, std::allocator<tensorrt_llm::runtime::LoraCache::TaskLayerModuleConfig> > >, std::less<unsigned long>, std::allocator<std::pair<unsigned long const, std::shared_ptr<std::vector<tensorrt_llm::runtime::LoraCache::TaskLayerModuleConfig, std::allocator<tensorrt_llm::runtime::LoraCache::TaskLayerModuleConfig> > > > > > const&, tensorrt_llm::runtime::TllmRuntime const&, tensorrt_llm::runtime::ModelConfig const&, tensorrt_llm::runtime::WorldConfig const&) + 157
9       0x7fc0e2c5ea82 tensorrt_llm::batch_manager::TrtGptModelInflightBatching::setupContext(std::vector<std::shared_ptr<tensorrt_llm::batch_manager::LlmRequest>, std::allocator<std::shared_ptr<tensorrt_llm::batch_manager::LlmRequest> > > const&, std::vector<std::shared_ptr<tensorrt_llm::batch_manager::LlmRequest>, std::allocator<std::shared_ptr<tensorrt_llm::batch_manager::LlmRequest> > > const&, int, int) + 146
10      0x7fc0e2c5ed21 tensorrt_llm::batch_manager::TrtGptModelInflightBatching::executeBatch(tensorrt_llm::batch_manager::ScheduledRequests const&) + 577
11      0x7fc0e2c68851 tensorrt_llm::batch_manager::TrtGptModelInflightBatching::forwardAsync(std::__cxx11::list<std::shared_ptr<tensorrt_llm::batch_manager::LlmRequest>, std::allocator<std::shared_ptr<tensorrt_llm::batch_manager::LlmRequest> > > const&) + 2177
12      0x7fc0e2c1ae34 tensorrt_llm::batch_manager::GptManager::forwardAsync(std::__cxx11::list<std::shared_ptr<tensorrt_llm::batch_manager::LlmRequest>, std::allocator<std::shared_ptr<tensorrt_llm::batch_manager::LlmRequest> > >&, std::unordered_set<unsigned long, std::hash<unsigned long>, std::equal_to<unsigned long>, std::allocator<unsigned long> >&) + 36
13      0x7fc0e2c21bb7 tensorrt_llm::batch_manager::GptManager::decoupled_execution_loop() + 215

I'm using the version from April 30 and the GptManager backend.
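
Reading the trace, the failing path is addSequence → loadOrAllocateBlocks → getFreeBlock → findBestGPUBlockToFree, and the assertion says mFreePrimaryBlocks contains no GPU blocks. My guess is that with offloading enabled, every free primary block can end up offloaded to host, so the search for a GPU-resident block comes up empty. A rough, self-contained illustration of the invariant as I read it (the types and the onGpu predicate are stand-ins, not TRT-LLM source):

```cpp
#include <list>
#include <memory>
#include <stdexcept>

// Guessed stand-ins for the real TRT-LLM types, just to show the invariant.
struct KVBlock
{
    bool onGpu; // false once the block's contents are offloaded to host memory
};

struct BlockManager
{
    std::list<std::shared_ptr<KVBlock>> mFreePrimaryBlocks;

    // Mirrors what findBestGPUBlockToFree() appears to require: at least one
    // free primary block still resident on the GPU. If offloading has moved
    // them all to host, the assertion in kvCacheManager.cpp:340 fires.
    std::shared_ptr<KVBlock> findBestGPUBlockToFree()
    {
        for (auto const& block : mFreePrimaryBlocks)
        {
            if (block->onGpu)
            {
                return block;
            }
        }
        throw std::runtime_error("mFreePrimaryBlocks list has no GPU blocks");
    }
};
```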

byshiue commented 1 month ago

Could you share your end-to-end reproduction steps to help us reproduce the issue?

github-actions[bot] commented 2 weeks ago

This issue is stale because it has been open 30 days with no activity. Remove the stale label or comment, or this will be closed in 15 days.