rtalaricw opened this issue 12 months ago
Follow the instructions in the README from "Create the model repository" onwards. I needed a couple of iterations, since some packages for the pre/postprocessing were missing. In the end, pip install sentencepiece protobuf made the tokenizer happy and it started working.
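For reference, the dependency fix above amounts to the following (versions unpinned; pin them as your environment requires):

```bash
# Tokenizer dependencies needed by the pre/postprocessing models
pip install sentencepiece protobuf
```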
@jfolz Can you please paste your config.pbtxt for Llama2? I am getting the following error when I use this config.pbtxt:
# Copyright 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
# are met:
# * Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
# * Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution.
# * Neither the name of NVIDIA CORPORATION nor the names of its
# contributors may be used to endorse or promote products derived
# from this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
name: "tensorrt_llm"
backend: "tensorrtllm"
max_batch_size: 128
model_transaction_policy {
  decoupled: False
}
input [
  {
    name: "input_ids"
    data_type: TYPE_INT32
    dims: [ -1 ]
  },
  {
    name: "input_lengths"
    data_type: TYPE_INT32
    dims: [ 1 ]
    reshape: { shape: [ ] }
  },
  {
    name: "request_output_len"
    data_type: TYPE_UINT32
    dims: [ 1 ]
  },
  {
    name: "end_id"
    data_type: TYPE_UINT32
    dims: [ 1 ]
    reshape: { shape: [ ] }
    optional: true
  },
  {
    name: "pad_id"
    data_type: TYPE_UINT32
    dims: [ 1 ]
    reshape: { shape: [ ] }
    optional: true
  },
  {
    name: "beam_width"
    data_type: TYPE_UINT32
    dims: [ 1 ]
    reshape: { shape: [ ] }
    optional: true
  },
  {
    name: "temperature"
    data_type: TYPE_FP32
    dims: [ 1 ]
    reshape: { shape: [ ] }
    optional: true
  },
  {
    name: "runtime_top_k"
    data_type: TYPE_UINT32
    dims: [ 1 ]
    reshape: { shape: [ ] }
    optional: true
  },
  {
    name: "runtime_top_p"
    data_type: TYPE_FP32
    dims: [ 1 ]
    reshape: { shape: [ ] }
    optional: true
  },
  {
    name: "len_penalty"
    data_type: TYPE_FP32
    dims: [ 1 ]
    reshape: { shape: [ ] }
    optional: true
  },
  {
    name: "repetition_penalty"
    data_type: TYPE_FP32
    dims: [ 1 ]
    reshape: { shape: [ ] }
    optional: true
  },
  {
    name: "min_length"
    data_type: TYPE_UINT32
    dims: [ 1 ]
    reshape: { shape: [ ] }
    optional: true
  },
  {
    name: "presence_penalty"
    data_type: TYPE_FP32
    dims: [ 1 ]
    reshape: { shape: [ ] }
    optional: true
  },
  {
    name: "random_seed"
    data_type: TYPE_UINT64
    dims: [ 1 ]
    reshape: { shape: [ ] }
    optional: true
  },
  {
    name: "stop"
    data_type: TYPE_BOOL
    dims: [ 1 ]
    optional: true
  },
  {
    name: "streaming"
    data_type: TYPE_BOOL
    dims: [ 1 ]
    optional: true
  }
]
output [
  {
    name: "output_ids"
    data_type: TYPE_INT32
    dims: [ -1, -1 ]
  }
]
instance_group [
  {
    count: 1
    kind: KIND_CPU
  }
]
parameters: {
  key: "max_beam_width"
  value: {
    string_value: "1"
  }
}
parameters: {
  key: "FORCE_CPU_ONLY_INPUT_TENSORS"
  value: {
    string_value: "no"
  }
}
parameters: {
  key: "gpt_model_type"
  value: {
    string_value: "V1"
  }
}
parameters: {
  key: "gpt_model_path"
  value: {
    string_value: "/finetune-gpt-neox/triton_model_repo/tensorrt_llm/1"
  }
}
parameters: {
  key: "max_tokens_in_paged_kv_cache"
  value: {
    string_value: "${max_tokens_in_paged_kv_cache}"
  }
}
parameters: {
  key: "batch_scheduler_policy"
  value: {
    string_value: "${batch_scheduler_policy}"
  }
}
parameters: {
  key: "kv_cache_free_gpu_mem_fraction"
  value: {
    string_value: "${kv_cache_free_gpu_mem_fraction}"
  }
}
parameters: {
  key: "max_num_sequences"
  value: {
    string_value: "${max_num_sequences}"
  }
}
parameters: {
  key: "enable_trt_overlap"
  value: {
    string_value: "${enable_trt_overlap}"
  }
}
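Note that the last five parameters above are still unsubstituted ${...} template placeholders, which the backend will receive as literal strings. The tensorrtllm_backend repo ships tools/fill_template.py to fill them in place; a minimal sketch, with values that are purely illustrative rather than recommendations:

```bash
# Fill the ${...} placeholders in place; the values below are illustrative --
# tune them for your engine and GPU.
python3 tools/fill_template.py -i triton_model_repo/tensorrt_llm/config.pbtxt \
    "max_tokens_in_paged_kv_cache:4096,batch_scheduler_policy:guaranteed_no_evict,kv_cache_free_gpu_mem_fraction:0.9,max_num_sequences:64,enable_trt_overlap:false"
```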
Error:
E1114 17:20:34.602436 37 backend_model.cc:634] ERROR: Failed to create instance: unexpected error when creating modelInstanceState: [json.exception.out_of_range.403] key 'tokens_per_block' not found
E1114 17:20:34.602463 37 model_lifecycle.cc:621] failed to load 'tensorrt_llm' version 1: Internal: unexpected error when creating modelInstanceState: [json.exception.out_of_range.403] key 'tokens_per_block' not found
I1114 17:20:34.602472 37 model_lifecycle.cc:756] failed to load 'tensorrt_llm'
I1114 17:20:34.602550 37 server.cc:592]
+------------------+------+
| Repository Agent | Path |
+------------------+------+
+------------------+------+
I1114 17:20:34.602584 37 server.cc:619]
+-------------+-----------------------------------------------------------------+------------------------------------------------------------------------------------+
| Backend | Path | Config |
+-------------+-----------------------------------------------------------------+------------------------------------------------------------------------------------+
| tensorrtllm | /opt/tritonserver/backends/tensorrtllm/libtriton_tensorrtllm.so | {"cmdline":{"auto-complete-config":"true","backend-directory":"/opt/tritonserver/b |
| | | ackends","min-compute-capability":"6.000000","default-max-batch-size":"4"}} |
+-------------+-----------------------------------------------------------------+------------------------------------------------------------------------------------+
I1114 17:20:34.602614 37 server.cc:662]
+--------------+---------+-------------------------------------------------------------------------------------------------------------------------------------------+
| Model | Version | Status |
+--------------+---------+-------------------------------------------------------------------------------------------------------------------------------------------+
| tensorrt_llm | 1 | UNAVAILABLE: Internal: unexpected error when creating modelInstanceState: [json.exception.out_of_range.403] key 'tokens_per_block' not fo |
| | | und |
+--------------+---------+-------------------------------------------------------------------------------------------------------------------------------------------+
I1114 17:20:34.641717 37 metrics.cc:817] Collecting metrics for GPU 0: NVIDIA A40
I1114 17:20:34.642304 37 metrics.cc:710] Collecting CPU metrics
I1114 17:20:34.642439 37 tritonserver.cc:2458]
+----------------------------------+--------------------------------------------------------------------------------------------------------------------------------+
| Option | Value |
+----------------------------------+--------------------------------------------------------------------------------------------------------------------------------+
| server_id | triton |
| server_version | 2.39.0 |
| server_extensions | classification sequence model_repository model_repository(unload_dependents) schedule_policy model_configuration system_shared |
| | _memory cuda_shared_memory binary_tensor_data parameters statistics trace logging |
| model_repository_path[0] | /finetune-gpt-neox/triton_model_repo |
| model_control_mode | MODE_NONE |
| strict_model_config | 0 |
| rate_limit | OFF |
| pinned_memory_pool_byte_size | 268435456 |
| cuda_memory_pool_byte_size{0} | 67108864 |
| min_supported_compute_capability | 6.0 |
| strict_readiness | 1 |
| exit_timeout | 30 |
| cache_enabled | 0 |
+----------------------------------+--------------------------------------------------------------------------------------------------------------------------------+
I1114 17:20:34.642459 37 server.cc:293] Waiting for in-flight requests to complete.
I1114 17:20:34.642464 37 server.cc:309] Timeout 30: Found 0 model versions that have in-flight inferences
I1114 17:20:34.642468 37 server.cc:324] All models are stopped, unloading models
I1114 17:20:34.642471 37 server.cc:331] Timeout 30: Found 0 live models and 0 in-flight non-inference requests
error: creating server: Internal - failed to load all models
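The load failure above is the backend looking for a tokens_per_block key in the engine's build-time config and not finding it. Assuming the engine directory contains the config.json that TensorRT-LLM writes next to the engine, a quick way to see what the engine was actually built with (the path simply follows the gpt_model_path from the config above):

```bash
# Dump the engine's build-time config and look for KV-cache related keys.
# Path follows gpt_model_path above; adjust if your layout differs.
python3 -m json.tool /finetune-gpt-neox/triton_model_repo/tensorrt_llm/1/config.json \
    | grep -i -E "tokens_per_block|paged_kv_cache"
```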
Hi, I am able to build and run the model locally via TensorRT-LLM.
I build using:
I run using:
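(The actual build and run commands did not survive in the post. Purely for context, a Llama2 build/run with the TensorRT-LLM examples of that era typically looked like the sketch below; every path and flag is illustrative, not the poster's command.)

```bash
# Illustrative only -- the poster's actual commands were not included.
# Build an engine from HF Llama2 weights (examples/llama in TensorRT-LLM):
python build.py --model_dir ./llama-2-7b-hf \
    --dtype float16 \
    --use_gpt_attention_plugin float16 \
    --use_gemm_plugin float16 \
    --output_dir ./engines/llama2-7b

# Smoke-test the engine locally:
python run.py --engine_dir ./engines/llama2-7b \
    --tokenizer_dir ./llama-2-7b-hf \
    --max_output_len 64
```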
When I inspect the model directory built by TensorRT-LLM, I do not find a config.pbtxt that would let me launch it via Triton Inference Server using the TRT-LLM backend / in-flight batching backend.
Is there a config.pbtxt I can use? How do I run it on Triton?
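For what it's worth, the server in the log above was pointed at a model repository via --model-repository; a minimal sketch assuming that same layout:

```bash
# Launch Triton against the model repository seen in the log above
# (model_repository_path[0]); assumes the tensorrtllm backend is installed.
tritonserver --model-repository=/finetune-gpt-neox/triton_model_repo
```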