NVIDIA / TensorRT

NVIDIA® TensorRT™ is an SDK for high-performance deep learning inference on NVIDIA GPUs. This repository contains the open source components of TensorRT.
https://developer.nvidia.com/tensorrt
Apache License 2.0

Using TensorRT for BERT inference is slower than onnxruntime #3673

Closed yan123456jie closed 7 months ago

yan123456jie commented 8 months ago

When I use TensorRT for BERT inference, it is slower than onnxruntime: TensorRT takes 10 ms while onnxruntime takes 6 ms. The model is just a simple BERT classification model. Could someone help me?

ONNX code:

import numpy as np
import onnxruntime as ort
from transformers import BertConfig, BertTokenizer, BertForSequenceClassification
from fastapi import FastAPI, Request
import uvicorn
import time

app = FastAPI()

label_list = ["finance", "realty", "stocks", "education", "science", "society", "politics", "sports", "game", "entertainment", ]

pretrained_bert_dir = "/var/log/model_repository/bert_classification_v1/1/"
tokenizer = BertTokenizer.from_pretrained(pretrained_bert_dir)
bert_config = BertConfig.from_pretrained(pretrained_bert_dir, num_labels=len(label_list))

sess = ort.InferenceSession(pretrained_bert_dir+'model.onnx', providers=['CUDAExecutionProvider'])
@app.get("/predictSingle")
def query(q):
    costs = []
    t0 = time.time()
    t1 = time.time()

    inputs = tokenizer(q, max_length=32, padding="max_length", truncation="longest_first", return_tensors="pt")
    costs.append(f"token={time.time() - t1}")
    t1 = time.time()

    input_dict = {"input_ids": inputs["input_ids"].numpy(), "token_type_ids": inputs["token_type_ids"].numpy(),
                  "attention_mask": inputs["attention_mask"].numpy()}

    # Run onnxruntime inference and take the argmax over the logits.
    outs = sess.run(None, input_dict)
    num = np.argmax(outs)

    costs.append(f"onnx={time.time() - t1}")
    costs.append(f"all={time.time() - t0}")

    return {f"predictions={label_list[num]}  cost={costs}"}

if __name__ == '__main__':
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument('-p', '--port', default="8582", help='port')
    args, _ = parser.parse_known_args()
    print(args.port)
    uvicorn.run(app, host='0.0.0.0', port=int(args.port), workers=1)

TensorRT code:

from fastapi import FastAPI, HTTPException
import tensorrt as trt
from transformers import BertTokenizer
import torch
import numpy as np
import uvicorn
import pycuda.driver as cuda
import collections
import time
import pycuda.autoinit
from flask import Flask, request
import os
import ctypes

app = Flask(__name__)

handle = ctypes.CDLL("libnvinfer_plugin.so", mode=ctypes.RTLD_GLOBAL)
if not handle:
    raise RuntimeError(
        "Could not load plugin library.")

_Feature = collections.namedtuple(  # pylint: disable=invalid-name
            "Feature",
            ["input_ids", "attention_mask", "token_type_ids", "batch_size"])

# app = FastAPI()

max_seq_length=32
class TRTModule(torch.nn.Module):
    def __init__(self, engine=None, input_names=None, output_names=None):
        super(TRTModule, self).__init__()
        # self._register_state_dict_hook(TRTModule._on_state_dict)
        self.engine = engine
        if self.engine is not None:
            self.context = self.engine.create_execution_context()
        self.input_names = input_names
        self.output_names = output_names
    def forward(self, features):
        outputs = []
        # print(features[0])
        batch_size = features[0].batch_size
        # batch_size = 2
        print(f"batch_size={batch_size}")
        # The input shape is (batch_size, max_seq_length); all three inputs are int32.
        input_shape = (batch_size, max_seq_length)
        input_nbytes = trt.volume(input_shape) * trt.int32.itemsize

        # Allocate device buffers for the three inputs and create a CUDA stream for this call.
        d_inputs = [cuda.mem_alloc(input_nbytes) for _ in range(3)]
        stream = cuda.Stream()

        for binding in range(3):
            self.context.set_binding_shape(binding, input_shape)
        assert self.context.all_binding_shapes_specified

        # Allocate output buffer by querying the size from the context. This may be different for different input shapes.
        h_output = cuda.pagelocked_empty(tuple(self.context.get_binding_shape(3)), dtype=np.float32)
        d_output = cuda.mem_alloc(h_output.nbytes)

        eval_time_elapsed = 0

        for feature_index, feature in enumerate(features):
            # Register host memory
            input_ids = cuda.register_host_memory(np.ascontiguousarray(feature.input_ids.ravel()))
            segment_ids = cuda.register_host_memory(np.ascontiguousarray(feature.token_type_ids.ravel()))
            input_mask = cuda.register_host_memory(np.ascontiguousarray(feature.attention_mask.ravel()))
            try:
                eval_start_time = time.time()
                cuda.memcpy_htod_async(d_inputs[engine.get_binding_index("input_ids")], input_ids, stream)
                cuda.memcpy_htod_async(d_inputs[engine.get_binding_index("token_type_ids")], segment_ids, stream)
                cuda.memcpy_htod_async(d_inputs[engine.get_binding_index("attention_mask")], input_mask, stream)

                # Run inference
                self.context.execute_async_v2(bindings=[int(d_inp) for d_inp in d_inputs] + [int(d_output)],
                                         stream_handle=stream.handle)
                # Synchronize the stream
                stream.synchronize()
                eval_time_elapsed += (time.time() - eval_start_time)

                # Transfer predictions back from GPU
                cuda.memcpy_dtoh_async(h_output, d_output, stream)
                stream.synchronize()

                for index, batch in enumerate(h_output):
                    # Data Post-processing
                    predicted_classes = np.argmax(batch, axis=0)
                    print(predicted_classes)
                    # selected_labels = [label_list[i] for i in predicted_classes]
                    print({f"predictions={label_list[predicted_classes]}"})
                    outputs.append(label_list[predicted_classes])

            finally:
                # Unregister host memory: delete the references so they can be garbage-collected.
                del input_ids
                del segment_ids
                del input_mask

        return outputs

label_list = ["finance", "realty", "stocks", "education", "science", "society", "politics", "sports", "game", "entertainment", ]

dir = "/var/log/model_repository/bert_classification_tensorrt_843/1/"
engine_file_path = dir+"model.plan"
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
logger = trt.Logger(trt.Logger.INFO)
model_all_names = []
with open(engine_file_path, "rb") as f, trt.Runtime(logger) as runtime:
    engine=runtime.deserialize_cuda_engine(f.read())

trt_model = TRTModule(engine, ["input_ids", "attention_mask", "token_type_ids"],['logits']).to(device)
tokenizer = BertTokenizer.from_pretrained(dir)
@app.get('/predictSingle')
def predict():
    try:
        costs = []
        t0 = time.time()
        t1 = time.time()

        sentences = ["股票情况", "大学教授"]

        data_org = tokenizer(sentences,
                         max_length=32,
                         padding="max_length",
                         truncation=True,
                         return_tensors="pt")
        costs.append(f"token={(time.time() - t1):.3f}")
        t1 = time.time()
        # Convert the tokenizer outputs to int32, which the TensorRT engine expects.
        data = {k: v.to(torch.int32) for k, v in data_org.items()}
        input_ids = data['input_ids']
        attention_mask = data['attention_mask']
        token_type_ids = data['token_type_ids']

        features = []

        features.append(_Feature(
            input_ids=input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            batch_size=len(sentences)
        ))

        outputs = trt_model(features)
        costs.append(f"trt={(time.time() - t1):.3f}")
        costs.append(f"all={(time.time() - t0):.3f}")
        return f"predictions={outputs}  costs={costs}"
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))

if __name__ == '__main__':
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument('-p', '--port', default="8580", help='port')
    args, _ = parser.parse_known_args()
    print(args.port)
    app.run(host='0.0.0.0', port=int(args.port), threaded=False)
    # uvicorn.run(app, host='0.0.0.0', port=int(args.port), workers=1)
zerollzeng commented 8 months ago

How many iterations are you using? The first few iterations take longer due to GPU warm-up and initialization. I would highly recommend using our trtexec tool to test the perf.
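
For reference, a minimal warm-up-and-average timing sketch (an illustration only; it reuses the sess and input_dict names from the ONNX snippet above, and the same pattern applies to the TensorRT path):

import time
import numpy as np

def benchmark(run_fn, warmup=20, iters=100):
    # Warm-up: the first calls include lazy initialization and GPU clock ramp-up.
    for _ in range(warmup):
        run_fn()
    # Time many iterations and report mean and p99 latency rather than a single call.
    times = []
    for _ in range(iters):
        t0 = time.time()
        run_fn()
        times.append(time.time() - t0)
    return np.mean(times), np.percentile(times, 99)

mean_s, p99_s = benchmark(lambda: sess.run(None, input_dict))
print(f"mean={mean_s * 1000:.2f} ms  p99={p99_s * 1000:.2f} ms")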

ttyio commented 7 months ago

Closing since there has been no activity for more than 3 weeks. Please reopen if you still have questions, thanks!

geraldstanje1 commented 6 months ago

Hi, can sentence-transformers models, e.g. https://huggingface.co/sentence-transformers/all-MiniLM-L6-v2, already be used with TensorRT-LLM? My goal is to compile a sentence-transformers/all-MiniLM-L6-v2 model without quantization using TensorRT-LLM and serve it with Triton. Are there any docs on how to make the model ready for TensorRT as well as ONNX? cc @ttyio @zerollzeng
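
For the ONNX half of that question, a minimal export sketch (assumptions only: a plain torch.onnx.export of the Hugging Face AutoModel, with input/output names and opset chosen here for illustration, not taken from any TensorRT-LLM documentation):

import torch
from transformers import AutoTokenizer, AutoModel

model_id = "sentence-transformers/all-MiniLM-L6-v2"
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModel.from_pretrained(model_id).eval()

class Wrapper(torch.nn.Module):
    # Return a single tensor so torch.onnx.export sees a fixed output structure.
    def __init__(self, m):
        super().__init__()
        self.m = m
    def forward(self, input_ids, attention_mask, token_type_ids):
        out = self.m(input_ids=input_ids, attention_mask=attention_mask,
                     token_type_ids=token_type_ids)
        return out.last_hidden_state

dummy = tokenizer(["hello world"], padding="max_length", max_length=32,
                  truncation=True, return_tensors="pt")

torch.onnx.export(
    Wrapper(model),
    (dummy["input_ids"], dummy["attention_mask"], dummy["token_type_ids"]),
    "all-MiniLM-L6-v2.onnx",
    input_names=["input_ids", "attention_mask", "token_type_ids"],
    output_names=["last_hidden_state"],
    dynamic_axes={"input_ids": {0: "batch", 1: "seq"},
                  "attention_mask": {0: "batch", 1: "seq"},
                  "token_type_ids": {0: "batch", 1: "seq"},
                  "last_hidden_state": {0: "batch", 1: "seq"}},
    opset_version=17,
)

The resulting ONNX file can then be benchmarked with onnxruntime or converted to a TensorRT engine (e.g. with trtexec), as in the snippets above.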