triton-inference-server / pytriton

PyTriton is a Flask/FastAPI-like interface that simplifies Triton's deployment in Python environments.
https://triton-inference-server.github.io/pytriton/
Apache License 2.0

pytriton is slower than triton #52

Closed yan123456jie closed 8 months ago

yan123456jie commented 9 months ago

My model is a BERT classification model. Triton inference takes 17 ms per request, while PyTriton takes 34 ms per request. Could someone help me?

Below is the PyTriton server code:

import os

import numpy as np
import torch
from transformers import BertConfig, BertTokenizer, BertForSequenceClassification

from pytriton.decorators import batch
from pytriton.model_config import ModelConfig, Tensor
from pytriton.triton import Triton

label_list = ["finance", "realty", "stocks", "education", "science", "society", "politics", "sports", "game",
                  "entertainment", ]

pretrained_bert_dir = "/models"
tokenizer = BertTokenizer.from_pretrained(pretrained_bert_dir)
bert_config = BertConfig.from_pretrained(pretrained_bert_dir, num_labels=len(label_list))

model = BertForSequenceClassification(bert_config)
model.load_state_dict(torch.load(os.path.join(pretrained_bert_dir, "bert_model.pth")))
model.to("cuda")
model.eval()

@batch
def _infer_fn(sentence: np.ndarray):
    print(sentence)
    sequences_batch = np.char.decode(sentence.astype("bytes"), "utf-8")
    print(sequences_batch)

    labels = []
    for s in sequences_batch:
        inputs = tokenizer(s[0], max_length=32, truncation="longest_first", return_tensors="pt")
        inputs = inputs.to("cuda")

        outputs = model(**inputs)
        print(outputs)

        logits = outputs[0]
        print(logits)
        label_pro = torch.max(logits.data, 1)[1].tolist()
        print(label_pro)
        label = label_list[label_pro[0]]
        print(label)
        labels.append(label)
    return {"label": np.char.encode(labels, "utf-8")}

with Triton() as triton:
    triton.bind(
        model_name="BERT",
        infer_func=_infer_fn,
        inputs=[
            Tensor(name="sentence", dtype=np.bytes_, shape=(1,)),
        ],
        outputs=[
            Tensor(name="label", dtype=np.bytes_, shape=(1,)),
        ],
        config=ModelConfig(max_batch_size=10)
    )
    triton.serve()

Below is the PyTriton client code:

import requests

sequence = ["明天要考试了"]
d = {
  "id": "0",
  "inputs": [
    {
      "name": "sentence",
      "shape": [1,1],
      "datatype": "BYTES",
      "data": sequence
    }
  ]
}
res = requests.post(url="http://114.116.17.243:31800/v2/models/BERT/infer",json=d).json()
r = res["outputs"][0]["data"][0]
print(r)
ptarasiewiczNV commented 9 months ago

Hi @yan123456jie,

thank you for raising the issue. Could you please share with us your Triton server code that is running the model so that we can try to reproduce this problem?

yan123456jie commented 8 months ago

@ptarasiewiczNV Thanks for the reply. My model training process is below:

1. Download the base model from https://huggingface.co/bert-base-chinese/tree/main
2. git clone https://github.com/zejunwang1/bert_text_classification and change "torch.save(model.state_dict(), config.saved_model)" in train.py to "model.save_pretrained(config.saved_model_dir)"
3. Train the model: python main.py --mode train --data_dir ./data --pretrained_bert_dir ./pretrained_bert
4. Copy vocab.txt and config.json to ./data/model/
5. Export to ONNX: python -m transformers.onnx --model=./data/model/ --feature=sequence-classification --framework=pt onnx_output/

Then start the Triton server using onnx_output/model.onnx. The model repository layout is:

/var/log/model_repository/
    bert_classification_v1
        1
            model.onnx
        config.pbtxt

config.pbtxt is:

platform: "onnxruntime_onnx"
max_batch_size: 1
input [
  {
    name: "token_type_ids"
    data_type: TYPE_INT64
    dims: [ -1 ]
  },
  {
    name: "attention_mask"
    data_type: TYPE_INT64
    dims: [ -1 ]
  },
  {
    name: "input_ids"
    data_type: TYPE_INT64
    dims: [ -1 ]
  }
]
output [
  {
    name: "logits"
    data_type: TYPE_FP32
    dims: [ 10 ]
  }
]

Start Triton:

docker run --rm --gpus 1 \
-p8100:8000 -p8101:8001 -p8102:8002 \
-v /var/log/model_repository:/models \
nvcr.io/nvidia/tritonserver:22.05-py3 \
tritonserver --strict-model-config=true --model-repository=/models

The HTTP request is:

import numpy as np
import tritonclient.http as httpclient
from transformers import BertTokenizer
import requests

tokenizer = BertTokenizer.from_pretrained("/Users/nali/Downloads/onnx_model",do_lower_case= True)
sentence = "明天要考试了"
inputs = tokenizer(sentence, max_length=32, truncation="longest_first", return_tensors="pt")
input_ids = inputs["input_ids"].numpy().tolist()
token_type_ids = inputs["token_type_ids"].numpy().tolist()
attention_mask = inputs["attention_mask"].numpy().tolist()
print(input_ids)
shape_list = list(inputs["input_ids"].shape)
print(shape_list)
# input_dict = {"input_ids":inputs["input_ids"].numpy(), "token_type_ids":inputs["token_type_ids"].numpy(), "attention_mask":inputs["attention_mask"].numpy()}

request_data = {
"inputs": [
    {
        "name": "input_ids",
        "shape": shape_list,
        "datatype": "INT64",
        "data": input_ids
    },
    {
        "name": "token_type_ids",
        "shape": shape_list,
        "datatype": "INT64",
        "data": token_type_ids
    },
    {
        "name": "attention_mask",
        "shape": shape_list,
        "datatype": "INT64",
        "data": attention_mask
    },
],
"outputs": [{"name": "logits"}]
}
label_list = ["finance", "realty", "stocks", "education", "science", "society", "politics", "sports", "game",
                  "entertainment", ]
res = requests.post(url="http://192.168.41.59:8100/v2/models/bert_classification_v1/versions/1/infer",json=request_data).json()
outs = res["outputs"][0]["data"]

num = np.argmax(outs)
print(num)
print(label_list[num])
ptarasiewiczNV commented 8 months ago

@yan123456jie Thank you for this. I was able to reproduce your deployment process, and here are my conclusions:

  1. The main difference between your PyTriton and Triton implementations is that on Triton you deploy the ONNX model, which is run by the onnxruntime library, while on PyTriton you deploy the vanilla PyTorch model. You can easily deploy the ONNX model in PyTriton as well using another tool we develop called Model Navigator. All you need to add to your script are these few lines under your model definition:
import model_navigator as nav

# set up the optimization config
# we can skip the config and let the navigator choose the best formats
optimize_config = nav.OptimizeConfig(
    target_formats=(nav.Format.ONNX,),
    runners=("OnnxCUDA",)
    # target_formats=(nav.Format.TENSORRT,),
    # runners=("TensorRT",)
)

# wrap model with nav.Module
model = nav.Module(model, optimize_config=optimize_config)

# run inference example
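batch_size = 1  # assumed example value; batch_size is not defined in the original snippet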
dummy_input = ["明天要考试了"] * batch_size
inputs = tokenizer(dummy_input, max_length=32, truncation="longest_first", return_tensors="pt")
inputs = inputs.to("cuda")
outputs_torch = model(**inputs)

# optimize the model
nav.optimize()

# confirm that the optimized model is working
outputs_optimized = model(**inputs)

Now the model will be run with onnxruntime as well. You can also use the commented lines to optimize the model to a TensorRT engine for even better performance.
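
For reference, a minimal sketch of the TensorRT variant from the commented lines above (it assumes a container where TensorRT is available):

import model_navigator as nav

# same as the config above, but targeting a TensorRT engine instead of ONNX
optimize_config = nav.OptimizeConfig(
    target_formats=(nav.Format.TENSORRT,),
    runners=("TensorRT",),
)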

  2. Your _infer_fn does not take advantage of batching: you iterate over each sample in the batch and run multiple inferences when they could be merged into a single model call. I have slightly rewritten your inference function to do that:
@batch
def _infer_fn(sentence: np.ndarray):
    sequences_batch = np.char.decode(sentence.astype("bytes"), "utf-8")
    sequences_batch = [s[0] for s in sequences_batch]
    inputs = tokenizer(sequences_batch, max_length=32, truncation="longest_first", return_tensors="pt")
    inputs = inputs.to("cuda")
    outputs = model(**inputs) # single model call
    logits = outputs["logits"] # changed to use dict interface instead of list so it works with optimized model
    labels_pro = torch.max(logits.data, 1)[1].tolist()
    labels = [label_list[label_pro] for label_pro in labels_pro]
    return {"label": np.char.encode(labels, "utf-8")}
  3. There are two factors that affect the performance difference between PyTriton and Triton:

    1. When using Triton with the ONNX backend, you have to write the postprocessing code that extracts the labels on the client side. The server returns all of the logits from the model, which is quite a large tensor for larger batch sizes. In the PyTriton case everything is done on the server side and the I/O is smaller. This means that for larger batch sizes you might get better performance with PyTriton in your case (a sketch of this client-side step is shown after the timing code below).

    2. PyTriton adds a Python layer to Triton, which introduces a small overhead of a few milliseconds. This means that when the computation is light (e.g. a small model with a small batch size) you may get slightly better performance with pure Triton.

  4. When measuring performance, remember to run a few "warm up" requests and then measure latency over multiple requests to get reliable numbers. Here is my very simple modification to the code calling the PyTriton server:

import time
import requests

batch_size = 1024

sequence = [["明天要考试了"]] * batch_size
d = {
  "id": "0",
  "inputs": [
    {
      "name": "sentence",
      "shape": [batch_size,1],
      "datatype": "BYTES",
      "data": sequence
    }
  ]
}

# warm up
for _ in range(10):
    res = requests.post(url="http://0.0.0.0:8000/v2/models/BERT/infer",json=d).json()
    r = res["outputs"][0]["data"][0]

start_time = time.monotonic()
num_requests = 100
for _ in range(num_requests):
    res = requests.post(url="http://0.0.0.0:8000/v2/models/BERT/infer",json=d).json()
end_time = time.monotonic()

print("time: ", (end_time - start_time) / num_requests * 1000, "ms")

I have run your model in all configurations and also get the best performance with Triton for batch size 1, but PyTriton quickly catches up and even gets ahead as the batch size grows. You also get the best performance with a TensorRT engine instead of ONNX models. You can run your model with the modifications I mentioned for different batch sizes and decide which configuration is optimal for your specific use case. I hope this is helpful.

yan123456jie commented 8 months ago
@ptarasiewiczNV Thank you very much for the reply. Your recommendations were helpful. I tested your code and got good results with the ONNX model type. (I did not use model_navigator; it gave two errors.) The times above were long because I was sending requests from my local computer. After copying my request code to the server machine, the time costs are below:

Parallel requests    Triton (ms)    PyTriton (ms)
1                    10             14
2                    8              17
3                    12             17
5                    21             20
10                   29             29
20                   66             36

Triton is faster when the number of parallel requests is small. I guess the reason may be the Python web layer: when I send requests to an empty Python FastAPI endpoint, it also costs about 4 ms.
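
A minimal sketch of such an empty FastAPI endpoint (assuming fastapi and uvicorn are installed; the file name, route, and port are placeholders), used only to measure the HTTP plus Python-framework overhead:

# empty_app.py -- run with: python empty_app.py
import uvicorn
from fastapi import FastAPI

app = FastAPI()

@app.post("/noop")
def noop():
    # does no model work at all, so any latency measured against this endpoint
    # is pure HTTP + Python web framework overhead
    return {"ok": True}

if __name__ == "__main__":
    uvicorn.run(app, host="0.0.0.0", port=8080)

Timing POST requests against /noop with the same requests loop as above gives this framework-only baseline (the ~4 ms mentioned).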

The two model_navigator errors are below:

1. nav.optimize() error, with triton-model-navigator==0.7.5 and nvidia-pytriton==0.2.5:

    nav.optimize()
  File "/opt/conda/lib/python3.8/site-packages/model_navigator/inplace/__init__.py", line 21, in optimize
    module_registry.optimize()
  File "/opt/conda/lib/python3.8/site-packages/model_navigator/inplace/registry.py", line 57, in optimize
    module.load_optimized()
  File "/opt/conda/lib/python3.8/site-packages/model_navigator/inplace/wrapper.py", line 147, in load_optimized
    self._wrapper = OptimizedModule(
  File "/opt/conda/lib/python3.8/site-packages/model_navigator/inplace/model.py", line 255, in __init__
    runner = package.get_runner(return_type=TensorType.TORCH, strategy=inplace_config.strategy)
  File "/opt/conda/lib/python3.8/site-packages/model_navigator/package/package.py", line 169, in get_runner
    runtime_result = self._get_best_runtime(strategy=strategy, include_source=include_source)
  File "/opt/conda/lib/python3.8/site-packages/model_navigator/package/package.py", line 271, in _get_best_runtime
    runtime_result = RuntimeAnalyzer.get_runtime(self.status.models_status, strategy=strategy, formats=formats)
  File "/opt/conda/lib/python3.8/site-packages/model_navigator/runtime_analyzer/analyzer.py", line 122, in get_runtime
    raise ModelNavigatorRuntimeAnalyzerError("No matching results found.")
model_navigator.exceptions.ModelNavigatorRuntimeAnalyzerError: No matching results found.

2. When I remove nav.optimize(), the code can run, but when my requests reach 100 there is the error below:

INFO:pytriton.triton:Infer function available as model: `/v2/models/BERT`
INFO:pytriton.triton:  Status:         `GET  /v2/models/BERT/ready/`
INFO:pytriton.triton:  Model config:   `GET  /v2/models/BERT/config/`
INFO:pytriton.triton:  Inference:      `POST /v2/models/BERT/infer/`
INFO:pytriton.triton:Read more about configuring and serving models in documentation: https://triton-inference-server.github.io/pytriton.
2024-01-12 17:38:11 INFO     Navigator: Removing exiting workspace at /root/.cache/model_navigator/transformers.models.bert.modeling_bert.BertForSequenceClassification/0
2024-01-12 17:38:11 INFO     Navigator: Creating workspace at /root/.cache/model_navigator/transformers.models.bert.modeling_bert.BertForSequenceClassification/0
2024-01-12 17:38:11 INFO     Navigator: Initializing log file.
/opt/conda/lib/python3.8/site-packages/model_navigator/pipelines/validation.py:85: ModelNavigatorConfigurationWarning: Custom configuration for format Format.TENSORRT is provided, but Format.TENSORRT is not in target formats. Custom configuration will be ignored.
  warnings.warn(
/opt/conda/lib/python3.8/site-packages/model_navigator/pipelines/validation.py:85: ModelNavigatorConfigurationWarning: Custom configuration for format Format.TORCH_TRT is provided, but Format.TORCH_TRT is not in target formats. Custom configuration will be ignored.
  warnings.warn(
2024-01-12 17:38:14 INFO     Navigator: ========================================== Pipeline 'Preprocessing' started ==========================================
2024-01-12 17:38:14 INFO     Navigator: ======================================== Command 'InferInputMetadata' started ========================================
2024-01-12 17:38:14 INFO     Navigator: ======================================= Command 'FetchInputModelData' started ========================================
2024-01-12 17:38:14 WARNING  Navigator: Requested sample_count (100) is larger than the number of available samples (1). Using 1 samples.
2024-01-12 17:38:14 INFO     Navigator: Collecting input samples for model.
2024-01-12 17:38:14 INFO     Navigator: Saving samples into the workspace.
2024-01-12 17:38:14 INFO     Navigator: ======================================= Command 'InferOutputMetadata' started ========================================
2024-01-12 17:38:14 INFO     Navigator: ======================================= Command 'FetchOutputModelData' started =======================================
2024-01-12 17:38:15 INFO     Navigator: ========================================= Pipeline 'PyTorch Export' started ==========================================
2024-01-12 17:38:15 INFO     Navigator: ========================================= Command 'ExportTorch2ONNX' started =========================================
2024-01-12 17:38:15 INFO     Navigator: PyTorch to ONNX export started
2024-01-12 17:38:15 INFO     Navigator: Command: /opt/conda/bin/python onnx/reproduce_export.py --exported_model_path 'onnx/model.onnx' --opset '17' --input_metadata '{"metadata": [{"name": "input__1", "shape": (-1, 32), "dtype": "int64"}, {"name": "input__2", "shape": (-1, 32), "dtype": "int64"}, {"name": "input__0", "shape": (-1, 32), "dtype": "int64"}], "pytree_metadata": {"metadata": ({"attention_mask": "input__0", "input_ids": "input__1", "token_type_ids": "input__2"},), "tensor_type": "torch"}, "is_legacy": False}' --input_names '["input__1", "input__2", "input__0"]' --output_names '["output__0"]' --dynamic_axes '{"input__0": [0], "input__1": [0], "input__2": [0]}' --batch_dim '0' --target_device 'cuda' --custom_args '{}'
2024-01-12 17:38:15 WARNING  Navigator: Command finished with ModelNavigatorUserInputError. The error is considered as external error. Usually caused by incompatibilities between the model and the target formats and/or runtimes. Please review the command output.
Traceback (most recent call last):
  File "/opt/conda/lib/python3.8/site-packages/model_navigator/commands/execution_context.py", line 169, in execute_local_runtime_script
    fire.Fire(func, unwrapped_args)
  File "/opt/conda/lib/python3.8/site-packages/fire/core.py", line 141, in Fire
    component_trace = _Fire(component, args, parsed_flag_args, context, name)
  File "/opt/conda/lib/python3.8/site-packages/fire/core.py", line 475, in _Fire
    component, remaining_args = _CallAndUpdateTrace(
  File "/opt/conda/lib/python3.8/site-packages/fire/core.py", line 691, in _CallAndUpdateTrace
    component = fn(*varargs, **kwargs)
  File "/opt/conda/lib/python3.8/site-packages/model_navigator/commands/export/exporters/torch2onnx.py", line 96, in export
    torch.onnx.export(
  File "/opt/conda/lib/python3.8/site-packages/torch/onnx/__init__.py", line 350, in export
    return utils.export(
  File "/opt/conda/lib/python3.8/site-packages/torch/onnx/utils.py", line 163, in export
    _export(
  File "/opt/conda/lib/python3.8/site-packages/torch/onnx/utils.py", line 1051, in _export
    symbolic_helper._set_opset_version(opset_version)
  File "/opt/conda/lib/python3.8/site-packages/torch/onnx/symbolic_helper.py", line 1309, in _set_opset_version
    GLOBALS.export_onnx_opset_version = opset_version
  File "/opt/conda/lib/python3.8/site-packages/torch/onnx/_globals.py", line 40, in export_onnx_opset_version
    raise ValueError(f"Unsupported ONNX opset version: {value}")
ValueError: Unsupported ONNX opset version: 17

The above exception was the direct cause of the following exception:

Traceback (most recent call last):
  File "/opt/conda/lib/python3.8/site-packages/model_navigator/pipelines/pipeline.py", line 108, in _execute_unit
    command_output = execution_unit.command().run(**input_parameters)  # pytype: disable=not-instantiable
  File "/opt/conda/lib/python3.8/site-packages/model_navigator/commands/base.py", line 116, in run
    output = self._run(*args, **_filter_dict_for_func(kwargs, self._run))
  File "/opt/conda/lib/python3.8/site-packages/model_navigator/commands/export/torch.py", line 226, in _run
    context.execute_local_runtime_script(exporters.torch2onnx.__file__, exporters.torch2onnx.export, args)
  File "/opt/conda/lib/python3.8/site-packages/model_navigator/commands/execution_context.py", line 172, in execute_local_runtime_script
    raise ModelNavigatorUserInputError(f"Command to reproduce error: {' '.join(cmd)}") from e
model_navigator.exceptions.ModelNavigatorUserInputError: Command to reproduce error: /bin/bash onnx/reproduce_export.sh

My server code is below:

# -*- coding: UTF-8 -*-
import os

import model_navigator as nav
import numpy as np
import torch
from transformers import BertConfig, BertTokenizer, BertForSequenceClassification

from pytriton.decorators import batch
from pytriton.model_config import ModelConfig, Tensor
from pytriton.triton import Triton

label_list = ["finance", "realty", "stocks", "education", "science", "society", "politics", "sports", "game",
                  "entertainment", ]

pretrained_bert_dir = "/models"
tokenizer = BertTokenizer.from_pretrained(pretrained_bert_dir)
bert_config = BertConfig.from_pretrained(pretrained_bert_dir, num_labels=len(label_list))

# set up the optimization config
# we can skip the config and let the navigator choose the best formats
optimize_config = nav.OptimizeConfig(
    target_formats=(nav.Format.ONNX,),
    runners=("OnnxCUDA",)
    # target_formats=(nav.Format.TENSORRT,),
    # runners=("TensorRT",)
)
model = BertForSequenceClassification(bert_config)
model.load_state_dict(torch.load(os.path.join(pretrained_bert_dir, "bert_model.pth")))
model.to("cuda")
model.eval()
# wrap model with nav.Module
model = nav.Module(model, optimize_config=optimize_config)
# optimize the model
# nav.optimize()

@batch
def _infer_fn(sentence: np.ndarray):
    sequences_batch = np.char.decode(sentence.astype("bytes"), "utf-8")
    sequences_batch = [s[0] for s in sequences_batch]
    inputs = tokenizer(sequences_batch, max_length=32, padding="max_length", truncation=True, return_tensors="pt")
    inputs = inputs.to("cuda")
    outputs = model(**inputs) # single model call
    logits = outputs["logits"] # changed to use dict interface instead of list so it works with optimized model
    labels_pro = torch.max(logits.data, 1)[1].tolist()
    labels = [label_list[label_pro] for label_pro in labels_pro]
    return {"label": np.char.encode(labels, "utf-8")}

with Triton() as triton:
    triton.bind(
        model_name="BERT",
        infer_func=_infer_fn,
        inputs=[
            Tensor(name="sentence", dtype=np.bytes_, shape=(1,)),
        ],
        outputs=[
            Tensor(name="label", dtype=np.bytes_, shape=(1,)),
        ],
        config=ModelConfig(max_batch_size=1024)
    )
    triton.serve()
ptarasiewiczNV commented 8 months ago

Hi,

it looks like a problem with the environment and an old ONNX version.

I would recommend using the latest NGC container.
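
As a quick diagnostic (just a sketch, not part of Model Navigator), you can check whether your local PyTorch build supports a given ONNX opset before running the optimization; the output file name is arbitrary:

import torch

# export a trivial module with the opset you want; an old torch build will raise
# "Unsupported ONNX opset version" here, just like in the traceback above
dummy = torch.nn.Linear(4, 2)
try:
    torch.onnx.export(dummy, torch.randn(1, 4), "opset_check.onnx", opset_version=17)
    print("opset 17 is supported")
except Exception as e:
    print("opset 17 is not supported:", e)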

You could also downgrade the opset version manually with this modification:

optimize_config = nav.OptimizeConfig(
    target_formats=(nav.Format.ONNX,),
    runners=("OnnxCUDA",),
    custom_configs=nav.OnnxConfig(opset=15) # you would have to try which one works
)

I would also recommend running Model Navigator without a specified target format so that it can choose the best one:

optimize_config = nav.OptimizeConfig(
    custom_configs=nav.OnnxConfig(opset=15) # you would have to try which one works
)

As the PyTriton issue seems to be resolved, I would like to close this issue. If you have any more problems with Model Navigator, please open an issue in its repository.

yan123456jie commented 8 months ago

Thanks, the issue has been resolved.