NVIDIA / TensorRT-LLM

TensorRT-LLM provides users with an easy-to-use Python API to define Large Language Models (LLMs) and build TensorRT engines that contain state-of-the-art optimizations to perform inference efficiently on NVIDIA GPUs. TensorRT-LLM also contains components to create Python and C++ runtimes that execute those TensorRT engines.
https://nvidia.github.io/TensorRT-LLM
Apache License 2.0

Different output with transformers lib and tensorrt llm when using lora #2022

Open Alireza3242 opened 1 month ago

Alireza3242 commented 1 month ago

System Info

A100

Who can help?

@juney-nvidia @ncomly-nvidia @kaiyux @byshiue

Information

Tasks

Reproduction

I want to set LoRA weights at runtime for a LLaMA 3 8B based model, but the outputs of the transformers library and tensorrt_llm are different. When I use the transformers library, my code is like this:

import json
import numpy as np
import os
from peft import PeftModel
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
import pandas as pd

class RunWithTransformer:
    def __init__(self):
        self.file_path = os.path.dirname(__file__)
        # self.model_config = json.loads(args["model_config"])
        self.model = self.get_model()
        self.tokenizer = self.get_tokenizer()

    def get_model(self):
        base_model_path = self.file_path + "/../../data/base_model/"
        base_model = AutoModelForCausalLM.from_pretrained(base_model_path, device_map="auto", trust_remote_code=True, torch_dtype=torch.float16)

        lora_model_path = self.file_path + "/../../data/lora/torch/"
        ft_model = PeftModel.from_pretrained(base_model, lora_model_path)

        ft_model.eval()
        return ft_model

    def get_tokenizer(self):
        tokenizer_path = self.file_path + "/../../data/base_model/"
        tokenizer = AutoTokenizer.from_pretrained(tokenizer_path)
        return tokenizer

    def execute(self, prompts):
        model_input = self.tokenizer(prompts, return_tensors="pt").input_ids.to("cuda")
        config = dict()
        config["do_sample"] = True
        config["max_new_tokens"] = 512
        config["temperature"] = 0.1
        config["top_p"] = 0.96
        config["top_k"] = 1
        config["repetition_penalty"] = 1.2
        config["pad_token_id"] = self.tokenizer.eos_token_id

        model_output = self.model.generate(input_ids=model_input, **config)
        predicted_texts = self.tokenizer.batch_decode(model_output)
        return predicted_texts

if __name__ == "__main__":
    run_with_transformer = RunWithTransformer()

    prompts = ["### Human: \nپیشنهاد غذا برای دورهمی های ایرانی ### Assistant:"]
    run_with_transformer.execute(prompts)
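
A related sanity check (just a sketch on my side, using PEFT's merge_and_unload) is to fold the adapter into the base model and save a merged checkpoint, which can then be converted and built exactly like the base model:

# Sketch: merge the LoRA adapter into the base weights and save a standalone checkpoint.
from peft import PeftModel
from transformers import AutoModelForCausalLM
import torch

base_model = AutoModelForCausalLM.from_pretrained(
    "./data/base_model", torch_dtype=torch.float16, trust_remote_code=True)
merged = PeftModel.from_pretrained(base_model, "./data/lora/torch").merge_and_unload()
merged.save_pretrained("./data/merged_model")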

To run with tensorrt_llm, I use the Docker image tritonserver 24.06-trtllm-python-py3; my tensorrt_llm version is 0.10. I run this Python file: https://github.com/NVIDIA/TensorRT-LLM/blob/v0.10.0/examples/llama/convert_checkpoint.py

python3 src/convert/convert_checkpoint.py --model_dir ./data/base_model \
                              --output_dir ./data/tllm_checkpoint \
                              --dtype float16
trtllm-build --checkpoint_dir ./data/tllm_checkpoint \
            --output_dir ./data/trt_engines \
            --gpt_attention_plugin float16 \
            --gemm_plugin float16 \
            --remove_input_padding enable \
            --context_fmha enable \
            --lora_plugin float16 \
            --lora_dir ./data/lora/torch \
            --max_lora_rank 256 \
            --lora_target_modules "attn_q" "attn_k" "attn_v" "attn_dense" "mlp_h_to_4h" "mlp_4h_to_h" "mlp_gate"
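
For reference, my understanding of how these --lora_target_modules names line up with the PEFT target_modules in my adapter_config.json below (based on my reading of hf_lora_convert.py; treat the exact pairing as an assumption):

# Assumed HF (PEFT) -> TensorRT-LLM LoRA module-name mapping for LLaMA-style models.
hf_to_trtllm_lora_modules = {
    "q_proj": "attn_q",
    "k_proj": "attn_k",
    "v_proj": "attn_v",
    "o_proj": "attn_dense",
    "gate_proj": "mlp_h_to_4h",
    "down_proj": "mlp_4h_to_h",
    "up_proj": "mlp_gate",
}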

Then I convert the LoRA weights:

python3 src/convert/hf_lora_convert.py -i ./data/lora/torch -o ./data/lora/tensorrt --storage-type float16
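
Before inference I quickly inspect the converted files with numpy to check shapes and ranks (my understanding is that each config row is a (module id, layer id, rank) triple, so take that part as an assumption):

# Quick shape/dtype check of the converted LoRA tensors produced above (pure numpy).
import numpy as np

weights = np.load("./data/lora/tensorrt/model.lora_weights.npy")
config = np.load("./data/lora/tensorrt/model.lora_config.npy")
print("weights:", weights.shape, weights.dtype)
print("config:", config.shape, config.dtype)
# Assuming the last config column is the adapter rank, it should not exceed --max_lora_rank (256).
print("max rank:", config[0][:, -1].max())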

At inference time, I run this code:

import tensorrt_llm.bindings.executor as trtllm
import torch
import numpy as np
import os
import importlib.util

def import_lib(path, file_name, package_name):
    # Load a module from an explicit file path and return the named attribute.
    file_path = os.path.join(path, file_name + ".py")
    spec = importlib.util.spec_from_file_location(file_name, file_path)
    imported_file = importlib.util.module_from_spec(spec)
    spec.loader.exec_module(imported_file)
    return getattr(imported_file, package_name)

load_tokenizer = import_lib(os.path.dirname(__file__) + "/../run", "utils", "load_tokenizer")

file_path = os.path.dirname(__file__)

def load_base_model():
    engine_dir = file_path + "/../../data/trt_engines"
    kv_cache_config = trtllm.KvCacheConfig(
                free_gpu_memory_fraction=0.1,
                max_attention_window=None,
                sink_token_length=None)

    executor = trtllm.Executor(
                engine_dir, trtllm.ModelType.DECODER_ONLY,
                trtllm.ExecutorConfig(max_beam_width=1,
                                      kv_cache_config=kv_cache_config,
                                      medusa_choices=None))
    return executor

def get_sampling_config(**kwargs):
    accepted_parameters = [
        "num_beams", "top_k", "top_p", "top_p_min", "top_p_reset_ids",
        "top_p_decay", "random_seed", "temperature", "min_length",
        "beam_search_diversity_rate", "repetition_penalty",
        "presence_penalty", "frequency_penalty", "length_penalty",
        "early_stopping"
    ]
    rename_params = {"num_beams": "beam_width"}
    sampling_params = {
        k: v
        for k, v in kwargs.items() if k in accepted_parameters
    }
    for k, v in rename_params.items():
        if k in sampling_params:
            sampling_params[v] = sampling_params.pop(k)
    if "top_p" in sampling_params and sampling_params["top_p"] == 0.0:
        sampling_params["top_p"] = None

    if "temperature" in sampling_params and sampling_params[
            "temperature"] == 0.0:
        print("Convert `temperature=0.0` to `temperature=None` and `top_k=1` to prevent overflow.")
        sampling_params['temperature'] = None
        sampling_params['top_k'] = 1

    # These final values are hardcoded because get_sampling_config() is called
    # without kwargs below; they match the transformers generate() config above.
    sampling_params['top_k'] = 1
    sampling_params['top_p'] = 0.96
    sampling_params['temperature'] = 0.1
    sampling_params['repetition_penalty'] = 1.2
    sampling_config = trtllm.SamplingConfig(**sampling_params)
    return sampling_config

def get_output_config():
    output_config = trtllm.OutputConfig(
            return_context_logits=False,
            return_generation_logits=False,
            return_log_probs=False,
        )
    return output_config

def get_lora_config_from_request():
    task_id = int(1)
    weights = torch.tensor(np.load(file_path + "/../../data/lora/tensorrt/model.lora_weights.npy")[0])
    config = torch.tensor(np.load(file_path + "/../../data/lora/tensorrt/model.lora_config.npy")[0])
    lora_config = trtllm.LoraConfig(task_id=task_id, weights=weights, config=config)
    return lora_config

def get_tokenizer():
    # tokenizer_dir = file_path + "/../../data/merged_model"
    tokenizer_dir = file_path + "/../../data/base_model"
    tokenizer, pad_id, end_id = load_tokenizer(
        tokenizer_dir=tokenizer_dir,
        vocab_file=None,
        model_name=None,
        model_version=None,
        tokenizer_type=None,
    )
    return tokenizer, pad_id, end_id

def prepare_inputs(batch_input_ids, pad_id, remove_input_padding):
    # Cast to int32
    batch_input_ids = [x.type(torch.int32) for x in batch_input_ids]
    input_lengths = [x.size(0) for x in batch_input_ids]
    max_length = max(input_lengths)

    if remove_input_padding:
        batch_input_ids = torch.concat(batch_input_ids)
    else:
        # Right padding for trt-llm
        paddings = [
            torch.ones(max_length - l, dtype=torch.int32) * pad_id
            for l in input_lengths
        ]
        batch_input_ids = [
            torch.cat([x, pad]) for x, pad in zip(batch_input_ids, paddings)
        ]
        batch_input_ids = torch.stack(batch_input_ids)
    input_lengths = torch.tensor(input_lengths, dtype=torch.int32)
    return batch_input_ids, input_lengths

def make_a_request(executor, prepared_input_ids, lora_enable):
    input_ids = prepared_input_ids
    max_new_tokens=512
    pad_id=2
    end_id=2
    stop_words_list=None
    bad_words_list=None
    sampling_config=get_sampling_config()
    streaming=False
    output_config=get_output_config()
    prompt_tuning_config=None
    lora_config = None
    if lora_enable:
        lora_config = get_lora_config_from_request()

    requests = [trtllm.Request(input_token_ids=input_ids,
                               max_new_tokens=max_new_tokens,
                               pad_id=pad_id,
                               end_id=end_id,
                               stop_words=stop_words_list,
                               bad_words=bad_words_list,
                               sampling_config=sampling_config,
                               streaming=streaming,
                               output_config=output_config,
                               prompt_tuning_config=prompt_tuning_config,
                               lora_config=lora_config)]

    request_ids = executor.enqueue_requests(requests)
    multi_responses = executor.await_responses(request_ids)
    response = multi_responses[0][0]

    output_ids = [[[]] for _ in range(len(multi_responses))]
    reqid_pos = request_ids.index(response.request_id)
    for beam, output_tokens in enumerate(
            response.result.output_token_ids):
        output_ids[reqid_pos][beam] += output_tokens

    print(output_ids[0][0][-10:-1])

    with torch.no_grad():
        output_ids = torch.tensor(output_ids,
                                          dtype=torch.int32,
                                          device="cuda:0")
        torch.cuda.synchronize()

    return output_ids

prompts = "### Human: \nپیشنهاد غذا برای دورهمی های ایرانی ### Assistant:"
executor = load_base_model()
tokenizer, pad_id, end_id = get_tokenizer()
prepared_input_ids = tokenizer.encode(prompts)
output_ids = make_a_request(executor, prepared_input_ids, lora_enable=True)

print(tokenizer.decode(output_ids.tolist()[0][0]))

If I manually set the LoRA weights to zero, both responses are the same. But if I use the real LoRA weights, the outputs are different.
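
Concretely, the zero-weights check is the variant below (a sketch; same imports, file_path, and paths as the inference script above), with everything else unchanged:

# Variant of get_lora_config_from_request() used for the zero-weights check:
# the config tensor is kept, the weights are replaced by zeros of the same shape and dtype.
def get_zeroed_lora_config():
    weights = torch.tensor(np.load(file_path + "/../../data/lora/tensorrt/model.lora_weights.npy")[0])
    config = torch.tensor(np.load(file_path + "/../../data/lora/tensorrt/model.lora_config.npy")[0])
    return trtllm.LoraConfig(task_id=1, weights=torch.zeros_like(weights), config=config)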

Expected behavior

Get the same answer from both transformers and TensorRT-LLM

Actual behavior

Get a different answer

Additional notes

My adapter_config.json:

{
  "alpha_pattern": {},
  "auto_mapping": null,
  "base_model_name_or_path": "/path/to/my/model",
  "bias": "none",
  "fan_in_fan_out": false,
  "inference_mode": true,
  "init_lora_weights": true,
  "layers_pattern": null,
  "layers_to_transform": null,
  "loftq_config": {},
  "lora_alpha": 128,
  "lora_dropout": 0.1,
  "megatron_config": null,
  "megatron_core": "megatron.core",
  "modules_to_save": null,
  "peft_type": "LORA",
  "r": 256,
  "rank_pattern": {},
  "revision": null,
  "target_modules": [
    "down_proj",
    "gate_proj",
    "up_proj",
    "k_proj",
    "v_proj",
    "q_proj",
    "o_proj"
  ],
  "task_type": "CAUSAL_LM",
  "use_rslora": false
}
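
One detail worth flagging from this config: with use_rslora set to false, PEFT scales the adapter output by lora_alpha / r, i.e. 128 / 256 = 0.5, and r = 256 matches the --max_lora_rank passed to trtllm-build. A difference in how that scaling is applied at runtime could explain this kind of divergence. A trivial check, just for reference:

# Effective LoRA scaling implied by adapter_config.json (standard PEFT formula when use_rslora is false).
import json

with open("./data/lora/torch/adapter_config.json") as f:
    cfg = json.load(f)
print("scaling =", cfg["lora_alpha"] / cfg["r"])  # 128 / 256 = 0.5
assert cfg["r"] <= 256, "r must not exceed --max_lora_rank used in trtllm-build"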
QiJune commented 1 month ago

Hi @Alireza3242, tritonserver 24.07-trtllm-python-py3 has been released, which contains tensorrt_llm 0.11. Could you please try it?

Alireza3242 commented 1 month ago

Hi @QiJune, this problem is solved with tritonserver 24.07-trtllm-python-py3. I also have another question: tensorrt_llm supports ['attn_q', 'attn_v', 'attn_k', 'attn_qkv', ...] layers for LoRA, but it does not support "lm_head". Why?