NVIDIA / TensorRT-LLM

TensorRT-LLM provides users with an easy-to-use Python API to define Large Language Models (LLMs) and build TensorRT engines that contain state-of-the-art optimizations to perform inference efficiently on NVIDIA GPUs. TensorRT-LLM also contains components to create Python and C++ runtimes that execute those TensorRT engines.
https://nvidia.github.io/TensorRT-LLM
Apache License 2.0

How to use batchsize in running llava? #928

Closed jkl375 closed 9 months ago

jkl375 commented 9 months ago

### System Info

(System information was attached as a screenshot in the original issue.)

### Who can help?

@symphonylyh

### Information

### Tasks

### Reproduction

  1. Generate the TRT-LLM engine for LLaMA following the example in examples/llama/README.md (I want to use a batch size of 32):
    python ../llama/build.py \
    --model_dir /workspace/examples/multimodal/llava-v1.5-7b \
    --output_dir trt_engines/llava-v1.5-7b/fp16/1-gpu \
    --dtype float16 \
    --remove_input_padding \
    --use_gpt_attention_plugin float16 \
    --enable_context_fmha \
    --use_gemm_plugin float16 \
    --max_batch_size 32 \
    --max_prompt_embedding_table_size 18432 # 576 (visual_feature_dim) * 32 (max_batch_size)
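
The prompt-embedding table must hold the visual features for every sample in the batch, so this flag scales linearly with the batch size. A minimal sketch of the arithmetic, assuming LLaVA-1.5's 576 visual tokens per image (the variable names below are illustrative only, not part of the build script):

    # Illustrative only: how the value passed to --max_prompt_embedding_table_size is derived.
    num_visual_features = 576   # visual tokens per image for LLaVA-1.5
    max_batch_size = 32
    print(num_visual_features * max_batch_size)  # 18432, the value used above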
  2. Set maxBS to 32 in examples/multimodal/build_visual_engine.py:
    def build_trt_engine(part_id,
                         img_height,
                         img_width,
                         output_dir,
                         minBS=1,
                         optBS=2,
                         maxBS=32):
        part_name = 'visual_encoder' if part_id == 0 else 'Qformer'
        onnx_file = '%s/%s.onnx' % (output_dir, part_name)
        engine_file = '%s/%s_fp16.engine' % (output_dir, part_name)
        logger.log(trt.Logger.INFO, "Building TRT engine for %s" % part_name)
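
For reference, minBS/optBS/maxBS end up as a TensorRT optimization profile on the visual engine; at runtime, a batch dimension outside the [minBS, maxBS] range is rejected with a "Runtime dimension does not satisfy any optimization profile" error. A rough sketch of how such a profile is typically registered with the TensorRT Python API (the tensor name 'input' and the 3x336x336 image shape are assumptions for illustration, not code from build_visual_engine.py):

    import tensorrt as trt

    # Sketch: an optimization profile covering batch sizes 1..32 for an image input.
    builder = trt.Builder(trt.Logger(trt.Logger.INFO))
    config = builder.create_builder_config()
    profile = builder.create_optimization_profile()
    profile.set_shape('input',              # assumed input tensor name
                      (1, 3, 336, 336),     # min  (minBS)
                      (2, 3, 336, 336),     # opt  (optBS)
                      (32, 3, 336, 336))    # max  (maxBS)
    config.add_optimization_profile(profile)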
  3. Build TensorRT engines for visual components
    python build_visual_engine.py --model_name llava-v1.5-7b --model_path /workspace/examples/multimodal/llava-v1.5-7b
  4. Modify run.py
    
    import argparse
    import os
    import sys
    from pathlib import Path

import numpy as np
import requests
import tensorrt as trt
import torch
from PIL import Image
from transformers import (AutoConfig, AutoTokenizer,
                          Blip2ForConditionalGeneration, Blip2Processor)

import tensorrt_llm
import tensorrt_llm.profiler as profiler
from tensorrt_llm import logger
from tensorrt_llm._utils import torch_to_numpy
from tensorrt_llm.runtime import ModelRunner, Session, TensorInfo

sys.path.append(str(Path(__file__).parent.parent))
from enc_dec.run import TRTLLMEncDecModel

def parse_arguments():
    parser = argparse.ArgumentParser()
    parser.add_argument('--max_new_tokens', type=int, default=30)
    parser.add_argument('--batch_size', type=int, default=1)
    parser.add_argument('--log_level', type=str, default='info')
    parser.add_argument('--visual_engine_dir', type=str, default=None,
                        help='Directory containing visual TRT engines')
    parser.add_argument('--llm_engine_dir', type=str, default=None,
                        help='Directory containing TRT-LLM engines')
    parser.add_argument('--hf_model_dir', type=str, default=None,
                        help="Directory containing tokenizer")
    parser.add_argument(
        '--decoder_llm', action='store_true',
        help='Whether LLM is decoder-only or an encoder-decoder variant?')
    parser.add_argument('--blip_encoder', action='store_true',
                        help='Whether visual encoder is a BLIP model')
    parser.add_argument('--input_text', type=str,
                        default='Question: which city is this? Answer:',
                        help='Text prompt to LLM')
    parser.add_argument('--num_beams', type=int, default=1,
                        help="Use beam search if num_beams >1")
    parser.add_argument('--top_k', type=int, default=1)

    return parser.parse_args()

def trt_dtype_to_torch(dtype):
    if dtype == trt.float16:
        return torch.float16
    elif dtype == trt.float32:
        return torch.float32
    elif dtype == trt.int32:
        return torch.int32
    else:
        raise TypeError("%s is not supported" % dtype)

class MultiModalModel:

def __init__(self, args):
    self.args = args

    runtime_rank = tensorrt_llm.mpi_rank()
    device_id = runtime_rank % torch.cuda.device_count()
    torch.cuda.set_device(device_id)
    self.stream = torch.cuda.current_stream().cuda_stream

    self.init_image_encoder()
    self.init_tokenizer()
    self.init_llm()

def init_tokenizer(self):
    self.tokenizer = AutoTokenizer.from_pretrained(self.args.hf_model_dir,
                                                   use_fast=False,
                                                   use_legacy=False)
    self.tokenizer.padding_side = "right"
    self.tokenizer.pad_token = self.tokenizer.eos_token

def init_image_encoder(self):
    vit_path = os.path.join(self.args.visual_engine_dir,
                            'visual_encoder_fp16.engine')
    logger.info(f'Loading engine from {vit_path}')
    with open(vit_path, 'rb') as f:
        engine_buffer = f.read()
    logger.info(f'Creating session from engine {vit_path}')
    self.vit_session = Session.from_serialized_engine(engine_buffer)

    if self.args.blip_encoder:
        qformer_path = os.path.join(self.args.visual_engine_dir,
                                    'Qformer_fp16.engine')
        logger.info(f'Loading engine from {qformer_path}')
        with open(qformer_path, 'rb') as f:
            engine_buffer_qformer = f.read()
        logger.info(f'Creating session from engine {qformer_path}')
        self.vit_qformer = Session.from_serialized_engine(
            engine_buffer_qformer)

def init_llm(self):
    if self.args.decoder_llm:
        self.model = ModelRunner.from_dir(self.args.llm_engine_dir,
                                          rank=tensorrt_llm.mpi_rank(),
                                          debug_mode=False)
        self.model_config = self.model.session._model_config
    else:
        self.model = TRTLLMEncDecModel.from_engine(
            self.args.hf_model_dir.split('/')[-1],
            self.args.llm_engine_dir,
            debug_mode=False)
        self.model_config = self.model.encoder_model_config

        hf_config = AutoConfig.from_pretrained(self.args.hf_model_dir)
        self.decoder_input_ids = torch.IntTensor(
            [[hf_config.decoder_start_token_id]]).repeat(
                (self.args.batch_size, 1)).to("cuda")

def generate(self, pre_prompt, post_prompt, image, max_new_tokens):
    visual_features, visual_atts = self.get_visual_features(image)

    pre_input_ids = self.tokenizer(pre_prompt,
                                   return_tensors="pt",
                                   padding=True).input_ids.to("cuda")
    if post_prompt is not None:
        post_input_ids = self.tokenizer(post_prompt,
                                        return_tensors="pt",
                                        padding=True).input_ids.to("cuda")
        length = pre_input_ids.shape[1] + post_input_ids.shape[
            1] + visual_atts.shape[1]
    else:
        post_input_ids = None
        length = pre_input_ids.shape[1] + visual_atts.shape[1]

    input_atts = torch.ones((1, length)).to(torch.int32).to("cuda")
    input_lengths = torch.sum(input_atts, dim=1)
    # import pdb; pdb.set_trace()
    input_ids, ptuning_args = self.setup_fake_prompts(
        visual_features, pre_input_ids, post_input_ids, input_lengths)

    if self.args.decoder_llm:
        prompt_table = ptuning_args[0]
        prompt_table = torch.stack([prompt_table])
        np.save('prompt_table.npy', torch_to_numpy(prompt_table))
    profiler.start("LLM")
    if self.args.decoder_llm:
        output_ids = self.model.generate(
            input_ids.to("cpu"),
            sampling_config=None,
            prompt_table_path='prompt_table.npy',
            max_new_tokens=max_new_tokens,
            end_id=self.tokenizer.eos_token_id,
            pad_id=self.tokenizer.pad_token_id,
            top_k=self.args.top_k,
            num_beams=self.args.num_beams,
            output_sequence_lengths=False,
            return_dict=False)
    else:
        output_ids = self.model.generate(
            input_ids,
            self.decoder_input_ids,
            max_new_tokens,
            num_beams=self.args.num_beams,
            bos_token_id=self.tokenizer.bos_token_id,
            pad_token_id=self.tokenizer.pad_token_id,
            eos_token_id=self.tokenizer.eos_token_id,
            debug_mode=False,
            prompt_embedding_table=ptuning_args[0],
            prompt_tasks=ptuning_args[1],
            prompt_vocab_size=ptuning_args[2])
        # Clear before batch decode in next step
        input_lengths = torch.zeros(input_lengths.shape,
                                    dtype=input_lengths.dtype)
    profiler.stop("LLM")

    if tensorrt_llm.mpi_rank() == 0:
        # Extract a list of tensors of shape beam_width x output_ids.
        # import pdb; pdb.set_trace()
        output_beams_list = [
            self.tokenizer.batch_decode(
                output_ids[batch_idx, :, input_lengths[batch_idx]:],
                skip_special_tokens=True)
            for batch_idx in range(self.args.batch_size)
        ]

        stripped_text = [[
            output_beams_list[batch_idx][beam_idx].strip()
            for beam_idx in range(self.args.num_beams)
        ] for batch_idx in range(self.args.batch_size)]
        return stripped_text
    else:
        return None

def get_visual_features(self, image):
    features, atts = self.vit_pass(image)
    if self.args.blip_encoder:
        features, atts = self.qformer_pass(features, atts)
    return features, atts

def vit_pass(self, image):
    # import pdb; pdb.set_trace()
    visual_features = {'input': image.half()}
    visual_output_info = self.vit_session.infer_shapes(
        [TensorInfo('input', trt.DataType.HALF, image.shape)])
    visual_outputs = {
        t.name: torch.empty(tuple(t.shape),
                            dtype=trt_dtype_to_torch(t.dtype),
                            device="cuda")
        for t in visual_output_info
    }

    ok = self.vit_session.run(visual_features, visual_outputs, self.stream)
    assert ok, "Runtime execution failed for vit session"

    image_embeds = visual_outputs['output']
    image_atts = torch.ones(image_embeds.size()[:-1],
                            dtype=torch.long).to("cuda")

    return image_embeds, image_atts

def qformer_pass(self, image_embeds, image_atts):
    query_tokens = self.query_tokens.expand(image_embeds.shape[0], -1,
                                            -1).contiguous().to("cuda")
    qformer_inputs = {
        'query_tokens': query_tokens.half(),
        'image_embeds': image_embeds.half(),
        'image_atts': image_atts
    }
    qformer_output_info = self.vit_qformer.infer_shapes([
        TensorInfo('query_tokens', trt.DataType.HALF, query_tokens.shape),
        TensorInfo('image_embeds', trt.DataType.HALF, image_embeds.shape),
        TensorInfo('image_atts', trt.DataType.INT64, image_atts.shape)
    ])
    qformer_outputs = {
        t.name: torch.empty(tuple(t.shape),
                            dtype=trt_dtype_to_torch(t.dtype),
                            device="cuda")
        for t in qformer_output_info
    }
    ok = self.vit_qformer.run(qformer_inputs, qformer_outputs, self.stream)
    assert ok, "Runtime execution failed for Qformer session"

    visual_features = qformer_outputs["query_output"]
    visual_atts = torch.ones(visual_features.size()[:-1],
                             dtype=torch.long).to("cuda")

    return visual_features, visual_atts

def setup_fake_prompts(self, visual_features, pre_input_ids, post_input_ids,
                       input_lengths):
    # Assemble fake prompts which points to image embedding actually
    fake_prompt_id = torch.arange(
        self.model_config.vocab_size,
        self.model_config.vocab_size +
        visual_features.shape[0] * visual_features.shape[1],
        device="cuda")
    fake_prompt_id = fake_prompt_id.reshape(visual_features.shape[0],
                                            visual_features.shape[1])

    if post_input_ids is not None:
        input_ids = [pre_input_ids, fake_prompt_id, post_input_ids]
    else:
        input_ids = [fake_prompt_id, pre_input_ids]
    input_ids = torch.cat(input_ids,
                          dim=1).contiguous().to(torch.int32).cuda()

    if self.args.decoder_llm or self.model.encoder_runtime_mapping.is_first_pp_rank(
    ):
        ptuning_args = self.ptuning_setup(visual_features, input_ids,
                                          input_lengths)
    else:
        ptuning_args = [None, None, None]

    return input_ids, ptuning_args

def ptuning_setup(self, prompt_table, input_ids, input_lengths):
    if prompt_table is not None:
        task_vocab_size = torch.tensor([prompt_table.shape[1]],
                                       dtype=torch.int32,
                                       device="cuda")
        prompt_table = prompt_table.view(
            (prompt_table.shape[0] * prompt_table.shape[1],
             prompt_table.shape[2]))

        hidden_size = self.model_config.hidden_size
        if not self.args.decoder_llm:
            hidden_size *= self.model.encoder_runtime_mapping.tp_size
        assert prompt_table.shape[
            1] == hidden_size, "Prompt table dimensions do not match hidden size"

        prompt_table = prompt_table.cuda().to(
            dtype=tensorrt_llm._utils.str_dtype_to_torch(
                self.model_config.dtype))
    else:
        prompt_table = torch.empty([1, hidden_size]).cuda()
        task_vocab_size = torch.zeros([1]).cuda()

    if self.model_config.remove_input_padding:
        tasks = torch.zeros([torch.sum(input_lengths)],
                            dtype=torch.int32).cuda()
        if args.decoder_llm: tasks = tasks.unsqueeze(0)
    else:
        tasks = torch.zeros(input_ids.shape, dtype=torch.int32).cuda()

    return [prompt_table, tasks, task_vocab_size]

def setup_llava_prompt(query):
    # Import these here to avoid installing llava when running blip models only
    from llava.constants import DEFAULT_IMAGE_TOKEN
    from llava.conversation import conv_templates

    query = DEFAULT_IMAGE_TOKEN + "\n" + query

    conv_mode = 'llava_v1'
    conv = conv_templates[conv_mode].copy()
    conv.append_message(conv.roles[0], query)
    conv.append_message(conv.roles[1], None)
    prompt = conv.get_prompt()

    image_token_index = prompt.find(DEFAULT_IMAGE_TOKEN)
    pre_prompt = prompt[:image_token_index]
    post_prompt = prompt[image_token_index + len(DEFAULT_IMAGE_TOKEN):]

    return pre_prompt, post_prompt

def load_test_image():
    img_url = 'https://storage.googleapis.com/sfr-vision-language-research/LAVIS/assets/merlion.png'
    return Image.open(requests.get(img_url, stream=True).raw).convert('RGB')

if __name__ == '__main__':
    os.environ["TOKENIZERS_PARALLELISM"] = "false"
    args = parse_arguments()
    tensorrt_llm.logger.set_level(args.log_level)
    runtime_rank = tensorrt_llm.mpi_rank()

image = load_test_image()
if args.blip_encoder:
    if 'opt-2.7b' in args.hf_model_dir:
        model_type = 'Salesforce/blip2-opt-2.7b'
    else:
        model_type = 'Salesforce/blip2-flan-t5-xl'

    device = torch.device("cuda") if torch.cuda.is_available() else "cpu"
    processor = Blip2Processor.from_pretrained(model_type)
    model = Blip2ForConditionalGeneration.from_pretrained(
        model_type, torch_dtype=torch.float16)
    model.to(device)

    prompt = "Question: which city is this in? Answer:"
    inputs = processor(image, prompt, return_tensors="pt").to(device)
    image = inputs['pixel_values']
    image = image.expand(args.batch_size, -1, -1,
                         -1).contiguous().to("cuda")

    query_tokens = model.query_tokens

    pre_prompt = [args.input_text] * args.batch_size
    post_prompt = None
else:
    pre_prompt, post_prompt = setup_llava_prompt(args.input_text)
    pre_prompt = [pre_prompt] * args.batch_size
    post_prompt = [post_prompt] * args.batch_size
    from llava.mm_utils import get_model_name_from_path
    from llava.model.builder import load_pretrained_model

    model_path = '/workspace/examples/multimodal/llava-v1.5-7b'
    model_name = get_model_name_from_path(model_path)
    _, _, image_processor, _ = load_pretrained_model(
        model_path, None, model_name)

    image = image_processor(image, return_tensors='pt')['pixel_values']
    image = image.expand(args.batch_size, -1, -1,
                         -1).contiguous()

    image = image.half().to("cuda")

    query_tokens = None

model = MultiModalModel(args)
model.query_tokens = query_tokens

num_iters = 100
for _ in range(num_iters):
    stripped_text = model.generate(pre_prompt, post_prompt, image,
                                   args.max_new_tokens)

if runtime_rank == 0:
    logger.info("---------------------------------------------------------")
    logger.info(f"\n[Q] {args.input_text}")
    logger.info(f"\n[A] {stripped_text}")
    logger.info(
        f'TensorRT-LLM LLM latency: {profiler.elapsed_time_in_sec("LLM") / num_iters} sec'
    )
    logger.info("---------------------------------------------------------")
  5. Add the `--decoder_llm` argument to the inference script, since LLaMA is a decoder-only LLM.

    python run.py \
        --max_new_tokens 50 \
        --batch_size 32 \
        --input_text "Question: 图里面有什么? Answer:" \
        --hf_model_dir /workspace/examples/multimodal/llava-v1.5-7b \
        --visual_engine_dir visual_engines/llava-v1.5-7b \
        --llm_engine_dir trt_engines/llava-v1.5-7b/fp16/1-gpu \
        --decoder_llm

  6. The following error occurred:

root@f0d49b6037c6:/workspace/examples/multimodal# python run.py --max_new_tokens 50 --batch_size 32 --input_text "Question: 图里面有什么? Answer:" --hf_model_dir /workspace/examples/multimodal/llava-v1.5-7b --visual_engine_dir visual_engines/llava-v1.5-7b --llm_engine_dir trt_engines/llava-v1.5-7b/fp16/1-gpu --decoder_llm
Loading checkpoint shards: 100%|██████████| 2/2 [00:12<00:00,  6.38s/it]
[01/22/2024-03:31:15] [TRT-LLM] [I] Loading engine from visual_engines/llava-v1.5-7b/visual_encoder_fp16.engine
[01/22/2024-03:31:15] [TRT-LLM] [I] Creating session from engine visual_engines/llava-v1.5-7b/visual_encoder_fp16.engine
[01/22/2024-03:31:15] [TRT] [I] Loaded engine size: 599 MiB
[01/22/2024-03:31:15] [TRT] [I] [MemUsageChange] TensorRT-managed allocation in engine deserialization: CPU +0, GPU +595, now: CPU 0, GPU 595 (MiB)
[01/22/2024-03:31:16] [TRT] [I] [MemUsageChange] TensorRT-managed allocation in IExecutionContext creation: CPU +0, GPU +396, now: CPU 0, GPU 991 (MiB)
[01/22/2024-03:31:26] [TRT] [I] Loaded engine size: 12855 MiB
[01/22/2024-03:31:28] [TRT] [I] [MemUsageChange] Init cuBLAS/cuBLASLt: CPU +0, GPU +8, now: CPU 13055, GPU 27837 (MiB)
[01/22/2024-03:31:28] [TRT] [I] [MemUsageChange] Init cuDNN: CPU +2, GPU +10, now: CPU 13057, GPU 27847 (MiB)
[01/22/2024-03:31:28] [TRT] [W] TensorRT was linked against cuDNN 8.9.6 but loaded cuDNN 8.9.4
[01/22/2024-03:31:28] [TRT] [I] [MemUsageChange] TensorRT-managed allocation in engine deserialization: CPU +0, GPU +12853, now: CPU 0, GPU 13844 (MiB)
[01/22/2024-03:31:28] [TRT] [I] [MemUsageChange] Init cuBLAS/cuBLASLt: CPU +0, GPU +8, now: CPU 13076, GPU 28179 (MiB)
[01/22/2024-03:31:28] [TRT] [I] [MemUsageChange] Init cuDNN: CPU +1, GPU +8, now: CPU 13077, GPU 28187 (MiB)
[01/22/2024-03:31:28] [TRT] [W] TensorRT was linked against cuDNN 8.9.6 but loaded cuDNN 8.9.4
[01/22/2024-03:31:28] [TRT] [I] [MemUsageChange] TensorRT-managed allocation in IExecutionContext creation: CPU +0, GPU +0, now: CPU 0, GPU 13844 (MiB)
[01/22/2024-03:31:28] [TRT] [I] [MemUsageChange] Init cuBLAS/cuBLASLt: CPU +0, GPU +8, now: CPU 13100, GPU 28205 (MiB)
[01/22/2024-03:31:28] [TRT] [I] [MemUsageChange] Init cuDNN: CPU +0, GPU +10, now: CPU 13100, GPU 28215 (MiB)
[01/22/2024-03:31:28] [TRT] [W] TensorRT was linked against cuDNN 8.9.6 but loaded cuDNN 8.9.4
[01/22/2024-03:31:28] [TRT] [I] [MemUsageChange] TensorRT-managed allocation in IExecutionContext creation: CPU +0, GPU +0, now: CPU 0, GPU 13844 (MiB)
[01/22/2024-03:31:28] [TRT-LLM] [I] Load engine takes: 12.56920576095581 sec
[01/22/2024-03:31:29] [TRT] [W] Using default stream in enqueue()/enqueueV2()/enqueueV3() may lead to performance issues due to additional cudaDeviceSynchronize() calls by TensorRT to ensure correct synchronizations. Please use non-default stream instead.
/usr/local/lib/python3.10/dist-packages/torch/nested/__init__.py:165: UserWarning: The PyTorch API of nested tensors is in prototype stage and will change in the near future. (Triggered internally at /opt/pytorch/pytorch/aten/src/ATen/NestedTensorImpl.cpp:178.)
  return _nested.nested_tensor(
[01/22/2024-03:31:30] [TRT] [E] 3: [executionContext.cpp::setInputShape::2309] Error Code 3: API Usage Error (Parameter check failed at: runtime/api/executionContext.cpp::setInputShape::2309, condition: satisfyProfile Runtime dimension does not satisfy any optimization profile.)
(the setInputShape error above is printed 3 times)
[01/22/2024-03:31:30] [TRT] [E] 3: [executionContext.cpp::resolveSlots::2991] Error Code 3: API Usage Error (Parameter check failed at: runtime/api/executionContext.cpp::resolveSlots::2991, condition: allInputDimensionsSpecified(routine) )
(the resolveSlots error above is repeated several dozen times)
Traceback (most recent call last):
  File "/workspace/examples/multimodal/run.py", line 399, in <module>
    stripped_text = model.generate(pre_prompt, post_prompt, image,
  File "/workspace/examples/multimodal/run.py", line 159, in generate
    output_ids = self.model.generate(
  File "/usr/local/lib/python3.10/dist-packages/tensorrt_llm/runtime/model_runner.py", line 584, in generate
    outputs = self.session.decode(
  File "/usr/local/lib/python3.10/dist-packages/tensorrt_llm/runtime/generation.py", line 746, in wrapper
    ret = func(self, *args, **kwargs)
  File "/usr/local/lib/python3.10/dist-packages/tensorrt_llm/runtime/generation.py", line 2789, in decode
    return self.decode_regular(
  File "/usr/local/lib/python3.10/dist-packages/tensorrt_llm/runtime/generation.py", line 2450, in decode_regular
    should_stop, next_step_tensors, tasks, context_lengths, host_context_lengths, attention_mask, logits, encoder_input_lengths = self.handle_per_step(
  File "/usr/local/lib/python3.10/dist-packages/tensorrt_llm/runtime/generation.py", line 2136, in handle_per_step
    raise RuntimeError(f"Executing TRT engine failed step={step}!")
RuntimeError: Executing TRT engine failed step=0!
root@f0d49b6037c6:/workspace/examples/multimodal#



### Expected behavior

The code should run normally with a batch size of 32.

### Actual behavior

An error occurred (see the log above).

### Additional notes

None
jdemouth-nvidia commented 9 months ago

@symphonylyh, can you re-assign this issue to someone on the team, please?

jkl375 commented 9 months ago

Later I tried again: with a batch size of 1 to 6, it runs normally, but when the batch size is greater than 6, the error occurs. Is there any parameter that has not been set properly?

root@f0d49b6037c6:/workspace/examples/multimodal# python run.py --max_new_tokens 50 --batch_size 6 --input_text "Question: 图里面有什么? Answer:" --hf_model_dir /workspace/examples/multimodal/llava-v1.5-7b --visual_engine_dir visual_engines/llava-v1.5-7b --llm_engine_dir trt_engines/llava-v1.5-7b/fp16/1-gpu --decoder_llm
Loading checkpoint shards: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2/2 [00:20<00:00, 10.39s/it]
[01/23/2024-06:59:16] [TRT-LLM] [I] Loading engine from visual_engines/llava-v1.5-7b/visual_encoder_fp16.engine
[01/23/2024-06:59:17] [TRT-LLM] [I] Creating session from engine visual_engines/llava-v1.5-7b/visual_encoder_fp16.engine
[01/23/2024-06:59:17] [TRT] [I] Loaded engine size: 599 MiB
[01/23/2024-06:59:17] [TRT] [I] [MemUsageChange] TensorRT-managed allocation in engine deserialization: CPU +0, GPU +595, now: CPU 0, GPU 595 (MiB)
[01/23/2024-06:59:17] [TRT] [I] [MemUsageChange] TensorRT-managed allocation in IExecutionContext creation: CPU +0, GPU +396, now: CPU 0, GPU 991 (MiB)
[01/23/2024-06:59:29] [TRT] [I] Loaded engine size: 12855 MiB
[01/23/2024-06:59:32] [TRT] [I] [MemUsageChange] Init cuBLAS/cuBLASLt: CPU +0, GPU +8, now: CPU 13055, GPU 27837 (MiB)
[01/23/2024-06:59:32] [TRT] [I] [MemUsageChange] Init cuDNN: CPU +2, GPU +10, now: CPU 13057, GPU 27847 (MiB)
[01/23/2024-06:59:32] [TRT] [W] TensorRT was linked against cuDNN 8.9.6 but loaded cuDNN 8.9.4
[01/23/2024-06:59:32] [TRT] [I] [MemUsageChange] TensorRT-managed allocation in engine deserialization: CPU +0, GPU +12853, now: CPU 0, GPU 13844 (MiB)
[01/23/2024-06:59:32] [TRT] [I] [MemUsageChange] Init cuBLAS/cuBLASLt: CPU +0, GPU +8, now: CPU 13076, GPU 28179 (MiB)
[01/23/2024-06:59:32] [TRT] [I] [MemUsageChange] Init cuDNN: CPU +1, GPU +8, now: CPU 13077, GPU 28187 (MiB)
[01/23/2024-06:59:32] [TRT] [W] TensorRT was linked against cuDNN 8.9.6 but loaded cuDNN 8.9.4
[01/23/2024-06:59:33] [TRT] [I] [MemUsageChange] TensorRT-managed allocation in IExecutionContext creation: CPU +0, GPU +0, now: CPU 0, GPU 13844 (MiB)
[01/23/2024-06:59:33] [TRT] [I] [MemUsageChange] Init cuBLAS/cuBLASLt: CPU +0, GPU +8, now: CPU 13100, GPU 28205 (MiB)
[01/23/2024-06:59:33] [TRT] [I] [MemUsageChange] Init cuDNN: CPU +0, GPU +10, now: CPU 13100, GPU 28215 (MiB)
[01/23/2024-06:59:33] [TRT] [W] TensorRT was linked against cuDNN 8.9.6 but loaded cuDNN 8.9.4
[01/23/2024-06:59:33] [TRT] [I] [MemUsageChange] TensorRT-managed allocation in IExecutionContext creation: CPU +0, GPU +0, now: CPU 0, GPU 13844 (MiB)
[01/23/2024-06:59:33] [TRT-LLM] [I] Load engine takes: 16.056230783462524 sec
[01/23/2024-06:59:35] [TRT] [W] Using default stream in enqueue()/enqueueV2()/enqueueV3() may lead to performance issues due to additional cudaDeviceSynchronize() calls by TensorRT to ensure correct synchronizations. Please use non-default stream instead.
/usr/local/lib/python3.10/dist-packages/torch/nested/__init__.py:165: UserWarning: The PyTorch API of nested tensors is in prototype stage and will change in the near future. (Triggered internally at /opt/pytorch/pytorch/aten/src/ATen/NestedTensorImpl.cpp:178.)
  return _nested.nested_tensor(
[01/23/2024-07:01:12] [TRT-LLM] [I] ---------------------------------------------------------
[01/23/2024-07:01:12] [TRT-LLM] [I] 
[Q] Question: 图里面有什么? Answer:
[01/23/2024-07:01:12] [TRT-LLM] [I] 
[A] [['图里面有一个巨大的喷泉,喷泉水从高处喷出,周围是一座建筑,还有一个巨大'], ['图里面有一个巨大的喷泉,喷泉水从高处喷出,周围是一座建筑,还有一个巨大'], ['图里面有一个巨大的喷泉,喷泉水从高处喷出,周围是一座建筑,还有一个巨大'], ['图里面有一个巨大的喷泉,喷泉水从高处喷出,周围是一座建筑,还有一个巨大'], ['图里面有一个巨大的喷泉,喷泉水从高处喷出,周围是一座建筑,还有一个巨大'], ['图里面有一个巨大的喷泉,喷泉水从高处喷出,周围是一座建筑,还有一个巨大']]
[01/23/2024-07:01:12] [TRT-LLM] [I] TensorRT-LLM LLM latency: 0.8995234870910644 sec
[01/23/2024-07:01:12] [TRT-LLM] [I] ---------------------------------------------------------
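
One way to narrow this down is to check which shape ranges the LLM engine was actually built with: the "Runtime dimension does not satisfy any optimization profile" error at batch sizes above 6 means some runtime input shape falls outside the engine's optimization profile. A hypothetical inspection sketch using the TensorRT Python API (the engine file name is only an example; point it at the actual file inside the --llm_engine_dir directory):

    import tensorrt as trt

    # Hypothetical diagnostic: print the min/opt/max shapes baked into a serialized engine.
    logger = trt.Logger(trt.Logger.WARNING)
    engine_path = 'trt_engines/llava-v1.5-7b/fp16/1-gpu/llama_float16_tp1_rank0.engine'  # example name
    with open(engine_path, 'rb') as f:
        engine = trt.Runtime(logger).deserialize_cuda_engine(f.read())

    for i in range(engine.num_io_tensors):
        name = engine.get_tensor_name(i)
        if engine.get_tensor_mode(name) == trt.TensorIOMode.INPUT:
            # Returns the (min, opt, max) dims for optimization profile 0.
            print(name, engine.get_tensor_profile_shape(name, 0))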
jkl375 commented 9 months ago

who can help me?

symphonylyh commented 9 months ago

Hi @jkl375, thanks for providing the detailed reproducer, and sorry for the delayed response. From your post, I have two action items planned: (1) build_visual_engine.py should take command-line args for the max batch size; (2) reproduce the bs > 6 failure on our end, since it currently appears to pass the visual encoder part and fail at the LLM part.

One request: can you please edit the description above and briefly list the modifications you made to run.py? That would make it clearer for us to understand your changes.

jkl375 commented 9 months ago

OK. Here are my modifications:

  1. Set pre_prompt's and post_prompt's batch size
    pre_prompt = [pre_prompt] * args.batch_size
    post_prompt = [post_prompt] * args.batch_size

    (added below the line pre_prompt, post_prompt = setup_llava_prompt(args.input_text))

  2. Set the image's batch size
    image = image.expand(args.batch_size, -1, -1,
                             -1).contiguous()

    (added below the line image = image_processor(image, return_tensors='pt')['pixel_values'])

  3. Repeat input_lengths batch_size times
    input_lengths = input_lengths.repeat(32, 1)

    (added below profiler.stop("LLM"))

amukkara commented 9 months ago

Hi @jkl375, I tested up to batch size 32 and could run the example without error.

Your changes to pre_prompt, post_prompt and image dimensions are correct. But input_lengths should be repeated batch_size times before the call to setup_fake_prompts(). In summary, you should replace

input_atts = torch.ones((1, length)).to(torch.int32).to("cuda")
input_lengths = torch.sum(input_atts, dim=1)

with

input_lengths = torch.IntTensor([length] * args.batch_size).to(torch.int32).to("cuda")

Make sure to remove your last change input_lengths = input_lengths.repeat(32, 1)

Let us know if you still face errors.
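
Putting the suggestion together, the relevant part of generate() would then look roughly like this (a sketch of the intent, not the exact repository code):

    # Sketch: batch-aware input lengths, computed before setup_fake_prompts() is called.
    length = pre_input_ids.shape[1] + visual_atts.shape[1]
    if post_input_ids is not None:
        length += post_input_ids.shape[1]

    input_lengths = torch.IntTensor([length] * args.batch_size).to(torch.int32).to("cuda")
    input_ids, ptuning_args = self.setup_fake_prompts(
        visual_features, pre_input_ids, post_input_ids, input_lengths)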

fhudson96 commented 9 months ago

Hi @amukkara, I am experiencing exactly the same problem that @jkl375 describes. I have set up the prompts and lengths in an almost identical way to what you have both described, but I am unable to run inference with a batch_size > 6. All values under 6 work fine.

My error message is identical to @jkl375's. And again, the visual embedder works fine, tested up to a batch size of 64.

My engine building command is:

python ../llama/build.py \
    --model_dir hf_models/llava-v1.5-13b \
    --output_dir llava/8_batch/trt_engines/fp16/llava-v1.5-13b \
    --dtype float16 \
    --remove_input_padding \
    --use_gpt_attention_plugin float16 \
    --enable_context_fmha \
    --use_gemm_plugin float16 \
    --max_batch_size 8 \
    --max_prompt_embedding_table_size 4608 #576*8

Could this be a fault in the engine-building process rather than the inference runner? Since smaller batch sizes run correctly, that suggests the inference script itself is fine. I am running on an A100 GPU.

amukkara commented 9 months ago

Hi @fhudson96, can you test with the latest scripts? There have been several changes in the last few weeks, and it is possible an issue with the building process was fixed in the latest release.

jkl375 commented 9 months ago

Hi @amukkara, there is no examples/llama/build.py in the latest branch, and there is no max_multimodal_len option in examples/llama/convert_checkpoint.py.

amukkara commented 9 months ago

Hi @jkl375, the build process for the LLaMA model has changed in a recent update. Please use the following commands to build LLaMA, as explained in examples/llama/README.md:

python ../llama/convert_checkpoint.py \
    --model_dir tmp/hf_models/${MODEL_NAME} \
    --output_dir tmp/trt_models/${MODEL_NAME}/fp16/1-gpu \
    --dtype float16

trtllm-build \
    --checkpoint_dir tmp/trt_models/${MODEL_NAME}/fp16/1-gpu \
    --output_dir trt_engines/${MODEL_NAME}/fp16/1-gpu \
    --gpt_attention_plugin float16 \
    --gemm_plugin float16 \
    --max_batch_size 2 \
    --max_input_len 2048 \
    --max_output_len 512 \
    --max_multimodal_len 1152 # 2 (max_batch_size) * 576 (num_visual_features)
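
As with --max_prompt_embedding_table_size in the old flow, --max_multimodal_len must be scaled to the intended batch size. A quick sanity check, assuming 576 visual tokens per image:

    # For the batch size of 32 used earlier in this issue:
    num_visual_features = 576
    print(32 * num_visual_features)  # 18432 -> --max_multimodal_len for max_batch_size 32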
jkl375 commented 9 months ago

When I try to run the following code:

python ../llama/convert_checkpoint.py \
    --model_dir /workspace/examples/multimodal/llava-v1.5-7b \
    --output_dir ./tllm_checkpoint \
    --dtype float16

the following error occurred:

Traceback (most recent call last):
  File "/workspace/examples/multimodal/../llama/convert_checkpoint.py", line 1973, in <module>
    main()
  File "/workspace/examples/multimodal/../llama/convert_checkpoint.py", line 1700, in main
    'architecture': hf_config.architectures[0]
TypeError: 'NoneType' object is not subscriptable

I found that with transformers==4.36.1, the value of hf_config is:

LlamaConfig {
  "attention_bias": false,
  "attention_dropout": 0.0,
  "bos_token_id": 1,
  "eos_token_id": 2,
  "hidden_act": "silu",
  "hidden_size": 4096,
  "initializer_range": 0.02,
  "intermediate_size": 11008,
  "max_position_embeddings": 2048,
  "model_type": "llama",
  "num_attention_heads": 32,
  "num_hidden_layers": 32,
  "num_key_value_heads": 32,
  "pretraining_tp": 1,
  "rms_norm_eps": 1e-06,
  "rope_scaling": null,
  "rope_theta": 10000.0,
  "tie_word_embeddings": false,
  "transformers_version": "4.36.1",
  "use_cache": true,
  "vocab_size": 32000
}

when executing the line hf_config = LlavaConfig.from_pretrained(args.model_dir).text_config, there is no architectures field in hf_config.
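
Before running the conversion, a quick sanity check is to load the model config and confirm that the fields convert_checkpoint.py relies on are present. This is an illustrative sketch only: the path is a placeholder, and the specific fields checked are inferred from the traceback above rather than taken from the conversion script:

    from transformers import AutoConfig

    # Placeholder path: point at the checkpoint directory you intend to convert.
    cfg = AutoConfig.from_pretrained("/workspace/examples/multimodal/llava-v1.5-7b")

    print(type(cfg).__name__)                        # e.g. LlavaConfig for the HF-format checkpoint
    print(getattr(cfg, "architectures", None))       # top-level architectures, if any
    text_cfg = getattr(cfg, "text_config", None)
    if text_cfg is not None:
        # The value convert_checkpoint.py reads here must not be None.
        print(getattr(text_cfg, "architectures", None))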

jkl375 commented 9 months ago

Sorry, I downloaded the wrong model. It should be llava-1.5-7b-hf. It works!

bleedingfight commented 8 months ago

@jkl375 What is your environment? Mine:

ganliqiang commented 7 months ago

Hi, can you run the llava-v1.5-7b model successfully?