triton-inference-server / pytriton

PyTriton is a Flask/FastAPI-like interface that simplifies Triton's deployment in Python environments.
https://triton-inference-server.github.io/pytriton/
Apache License 2.0

nav.optimize() bug #59

Closed. Pobby321 closed this issue 4 months ago.

Pobby321 commented 5 months ago

optimize_config = nav.OptimizeConfig(
    target_formats=(nav.Format.ONNX,),
    runners=("OnnxCUDA",),
    custom_configs=nav.OnnxConfig(opset=15)  # you would have to try which one works
)

model = nav.Module(model, optimize_config=optimize_config)

nav.optimize()

[screenshot attached]

jkosek commented 5 months ago

Hi @Pobby321. Could you please provide the full steps to reproduce the issue? Thanks.

Pobby321 commented 5 months ago

My code:

# -*- coding: UTF-8 -*-

import torch
import os
import json

import numpy as np
import requests

from transformers import BertConfig, BertTokenizer, BertForSequenceClassification

from pytriton.decorators import batch
from pytriton.model_config import ModelConfig, Tensor
from pytriton.triton import Triton

import model_navigator as nav

label_list = ["finance", "realty", "stocks", "education", "science", "society", "politics", "sports", "game",
              "entertainment", ]

pretrained_bert_dir = "/models"
tokenizer = BertTokenizer.from_pretrained(pretrained_bert_dir)
bert_config = BertConfig.from_pretrained(pretrained_bert_dir, num_labels=len(label_list))

optimize_config = nav.OptimizeConfig(
    target_formats=(nav.Format.ONNX,),
    runners=("OnnxCUDA",)
)

model = BertForSequenceClassification(bert_config)
model.load_state_dict(torch.load(os.path.join(pretrained_bert_dir, "bert_model.pth")))
model.to("cuda")
model.eval()

model = nav.Module(model, optimize_config=optimize_config)

# optimize the model
nav.optimize()

@batch
def _infer_fn(sentence: np.ndarray):
    sequences_batch = np.char.decode(sentence.astype("bytes"), "utf-8")
    sequences_batch = [s[0] for s in sequences_batch]
    inputs = tokenizer(sequences_batch, max_length=32, padding="max_length", truncation=True, return_tensors="pt")
    inputs = inputs.to("cuda")
    outputs = model(**inputs)  # single model call
    logits = outputs["logits"]  # changed to use dict interface instead of list so it works with optimized model
    labels_pro = torch.max(logits.data, 1)[1].tolist()
    labels = [label_list[label_pro] for label_pro in labels_pro]
    return {"label": np.char.encode(labels, "utf-8")}

with Triton() as triton:
    triton.bind(
        model_name="BERT",
        infer_func=_infer_fn,
        inputs=[
            Tensor(name="sentence", dtype=np.bytes_, shape=(1,)),
        ],
        outputs=[
            Tensor(name="label", dtype=np.bytes_, shape=(1,)),
        ],
        config=ModelConfig(max_batch_size=1024)
    )
    triton.serve()

jkosek commented 5 months ago

Hey @Pobby321,

Instead of an explicit call to nav.optimize, you can simply use the OPTIMIZE and RUN modes. When using OPTIMIZE, Model Navigator will automatically start optimizing the model to ONNX after collecting 100 data samples during inference. Then you can re-run the server in RUN mode, and the optimized version will be used automatically.

In the example below, the mode is controlled through the MODEL_NAVIGATOR_MODE environment variable. Example:

export MODEL_NAVIGATOR_MODE=OPTIMIZE
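
For example, a rough two-phase workflow could look like the sketch below (hypothetical: it assumes the adjusted script further down is saved as server.py, and that inference requests are sent during the first run so samples can be collected):

# hypothetical two-phase run; server.py is a placeholder name for the script below
export MODEL_NAVIGATOR_MODE=OPTIMIZE
python server.py   # first run: serve, send requests, collect samples, trigger ONNX optimization
export MODEL_NAVIGATOR_MODE=RUN
python server.py   # second run: serve again, now using the optimized model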

Here is your code with the adjustments:

import torch
import os
import json
import requests

import numpy as np

from transformers import BertConfig, BertTokenizer, BertForSequenceClassification

import model_navigator as nav

from pytriton.decorators import batch
from pytriton.model_config import ModelConfig, Tensor
from pytriton.triton import Triton

# Control the inplace mode
if os.environ.get("MODEL_NAVIGATOR_MODE") == "OPTIMIZE":
    nav.inplace_config.mode = nav.Mode.OPTIMIZE
else:
    nav.inplace_config.mode = nav.Mode.RUN

label_list = ["finance", "realty", "stocks", "education", "science", "society", "politics", "sports", "game",
"entertainment", ]

tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
bert_config = BertConfig.from_pretrained("bert-base-uncased", num_labels=len(label_list))

optimize_config = nav.OptimizeConfig(
    target_formats=(nav.Format.ONNX,),
    runners=("OnnxCUDA",),
    verbose=True
)

model = BertForSequenceClassification(bert_config)
model.to("cuda")
model.eval()

model = nav.Module(model, name="bert-base-uncased-for-sequence", optimize_config=optimize_config)

@batch
def _infer_fn(sentence: np.ndarray):
    print("Infer")
    sequences_batch = np.char.decode(sentence.astype("bytes"), "utf-8")
    sequences_batch = [s[0] for s in sequences_batch]
    inputs = tokenizer(sequences_batch, max_length=32, padding="max_length", truncation=True, return_tensors="pt")
    inputs = inputs.to("cuda")
    outputs = model(**inputs) # single model call
    logits = outputs["logits"] # changed to use dict interface instead of list so it works with optimized model
    labels_pro = torch.max(logits.data, 1)[1].tolist()
    labels = [label_list[label_pro] for label_pro in labels_pro]
    print(labels)
    return {"label": np.char.encode(labels, "utf-8")}

with Triton() as triton:
    triton.bind(
        model_name="BERT",
        infer_func=_infer_fn,
        inputs=[
            Tensor(name="sentence", dtype=np.bytes_, shape=(1,)),
        ],
        outputs=[
            Tensor(name="label", dtype=np.bytes_, shape=(1,)),
        ],
        config=ModelConfig(max_batch_size=1024)
    )
    triton.serve()

And here is a client that can be used to test the flow (run it once the server side is up and running):

import argparse
import logging

import numpy as np

from pytriton.client import ModelClient

logger = logging.getLogger("examples.huggingface_bart_pytorch.client")

def main():
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument(
        "--url",
        default="localhost",
        help=(
            "Url to Triton server (ex. grpc://localhost:8001)."
            "HTTP protocol with default port is used if parameter is not provided"
        ),
        required=False,
    )
    parser.add_argument(
        "--init-timeout-s",
        type=float,
        default=600.0,
        help="Server and model ready state timeout in seconds",
        required=False,
    )
    parser.add_argument(
        "--iterations",
        type=int,
        default=100,
        help="Number of requests per client.",
        required=False,
    )
    parser.add_argument(
        "--verbose",
        action="store_true",
        default=False,
    )
    args = parser.parse_args()

    log_level = logging.DEBUG if args.verbose else logging.INFO
    logging.basicConfig(level=log_level, format="%(asctime)s - %(levelname)s - %(name)s: %(message)s")

    sequence = np.array(
        [
            ["one day I will see the world"],
            ["I would love to learn cook the Asian street food"],
            ["Carnival in Rio de Janeiro"],
            ["William Shakespeare was a great writer"],
        ]
    )
    sequence = np.char.encode(sequence, "utf-8")
    logger.info(f"Sequence: {sequence}")

    with ModelClient(args.url, "BERT", init_timeout_s=args.init_timeout_s) as client:
        for req_idx in range(1, args.iterations + 1):
            logger.info(f"Sending request ({req_idx}).")
            result_dict = client.infer_batch(sequence)
            for output_name, output_data in result_dict.items():
                output_data = np.array2string(
                    output_data, threshold=np.inf, max_line_width=np.inf, separator=","
                ).replace("\n", "")
                logger.info(f"{output_name}: {output_data} for request ({req_idx}).")

if __name__ == "__main__":
    main()

Let me know if that helps.

Pobby321 commented 5 months ago

Thanks, but now I am hitting another bug!

WARNING:pytriton.proxy.inference:Exception while performing inference on requests=00000064:
Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/pytriton/proxy/inference.py", line 386, in _handle_requests
    async for responses in self._model_callable(requests):
  File "/usr/local/lib/python3.10/dist-packages/pytriton/proxy/inference.py", line 80, in _callable
    yield inference_callable(requests)
  File "/usr/local/lib/python3.10/dist-packages/pytriton/decorators.py", line 206, in batch
    outputs = wrapped(*args, **new_kwargs)
  File "/workspace/main/test2.py", line 138, in _infer_fn
    output = model(text_iter).data.cpu()
  File "/usr/local/lib/python3.10/dist-packages/model_navigator/inplace/wrapper.py", line 116, in __call__
    output = self._wrapper(*args, **kwargs)
  File "/usr/local/lib/python3.10/dist-packages/model_navigator/inplace/model.py", line 232, in __call__
    module_registry.optimize()
  File "/usr/local/lib/python3.10/dist-packages/model_navigator/inplace/registry.py", line 57, in optimize
    module.load_optimized()
  File "/usr/local/lib/python3.10/dist-packages/model_navigator/inplace/wrapper.py", line 147, in load_optimized
    self._wrapper = OptimizedModule(
  File "/usr/local/lib/python3.10/dist-packages/model_navigator/inplace/model.py", line 255, in __init__
    runner = package.get_runner(return_type=TensorType.TORCH, strategy=inplace_config.strategy)
  File "/usr/local/lib/python3.10/dist-packages/model_navigator/package/package.py", line 169, in get_runner
    runtime_result = self._get_best_runtime(strategy=strategy, include_source=include_source)
  File "/usr/local/lib/python3.10/dist-packages/model_navigator/package/package.py", line 271, in _get_best_runtime
    runtime_result = RuntimeAnalyzer.get_runtime(self.status.models_status, strategy=strategy, formats=formats)
  File "/usr/local/lib/python3.10/dist-packages/model_navigator/runtime_analyzer/analyzer.py", line 122, in get_runtime
    raise ModelNavigatorRuntimeAnalyzerError("No matching results found.")
model_navigator.exceptions.ModelNavigatorRuntimeAnalyzerError: No matching results found.

jkosek commented 5 months ago

Hey @Pobby321. Could you share a bit more info about your environment and the full log? The issue you may be experiencing is ONNX Runtime with CUDA 12: there is no official build at the moment, so the runtime fails during initialization.

What I can suggest in such a case is to use the nvcr.io/nvidia/pytorch:22.12-py3 container, which is the latest NVIDIA container released with CUDA 11.8.
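
For reference, a rough sketch of starting that container and checking which ONNX Runtime execution providers are available (the mount path and the presence of onnxruntime-gpu inside the container are assumptions):

# start the suggested container with GPU access (the mount path is just an example)
docker run --gpus all -it --rm -v ${PWD}:/workspace nvcr.io/nvidia/pytorch:22.12-py3
# inside the container, once the dependencies are installed, CUDAExecutionProvider should be listed here
python -c "import onnxruntime; print(onnxruntime.get_available_providers())"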

github-actions[bot] commented 5 months ago

This issue is stale because it has been open 21 days with no activity. Remove stale label or comment or this will be closed in 7 days.

github-actions[bot] commented 4 months ago

This issue was closed because it has been stalled for 7 days with no activity.