triton-inference-server / pytriton

PyTriton is a Flask/FastAPI-like interface that simplifies Triton's deployment in Python environments.
https://triton-inference-server.github.io/pytriton/
Apache License 2.0

[Bug] Fail to deploy serving model on the Azure Machine Learning Platform. Exited with failure (confusing error information and exit code) #66

Closed — keli-wen closed this issue 4 months ago

keli-wen commented 6 months ago

Description

I really appreciate the PyTriton framework for its ease of use. I've successfully deployed it in a local environment.

Subsequently, I attempted to deploy a model on the Azure Machine Learning Platform. However, the deployment failed with an uninformative error message (Triton Inference Server exited with failure. Please wait.).

After enabling debug mode, I got an exit code of -11, but my attempts to find documentation on what this code means turned up nothing.

I think knowing what -11 signifies would help us figure out the error.

2024-04-01 18:18:31,262 - DEBUG - pytriton.client.utils: Creating InferenceServerClient for http://127.0.0.1:8015 with {}
2024-04-01 18:18:31,264 - DEBUG - pytriton.client.utils: Waiting for server to be ready (timeout=119.99996995925903)
2024-04-01 18:18:31,964 - WARNING - pytriton.server.triton_server: Triton Inference Server exited with failure. Please wait.
2024-04-01 18:18:31,964 - DEBUG - pytriton.server.triton_server: Triton Inference Server exit code -11
2024-04-01 18:18:31,964 - DEBUG - pytriton.triton: Got callback that tritonserver process finished
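
The closest I got is the usual POSIX/Python convention, where a child process killed by signal N is reported with a return code of -N; under that reading, -11 would be SIGSEGV (a segmentation fault inside the tritonserver process). I am not certain PyTriton follows this convention, so treat the sketch below as a guess:

# A minimal sketch: decoding a negative exit code as "killed by signal",
# assuming returncode == -signal_number (the usual POSIX/Python convention).
import signal

exit_code = -11  # value reported in the debug log above
if exit_code < 0:
    sig = signal.Signals(-exit_code)
    print(f"tritonserver terminated by {sig.name} (signal {sig.value})")  # SIGSEGV for -11
else:
    print(f"tritonserver exited with status {exit_code}")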

To reproduce

Sorry, I may not be able to provide fully reproducible code because of the model and environment constraints, but here is a snapshot of my server- and client-side code.

# Server
import argparse
import logging
from pathlib import Path
from typing import Any, Dict

import numpy as np
import torch
from pytriton.decorators import batch
from pytriton.model_config import ModelConfig, Tensor
from pytriton.triton import Triton, TritonConfig

...

logger = logging.getLogger("serving.model1")

model1: Model1 = None  # type: ignore
model2: Model2 = None  # type: ignore
temperature = C.model_serving.temperature

class ModelFactory:
    """Utility class to load model."""

    @staticmethod
    def load_model(model_type: str) -> Any:
        """Utility function to load model."""
        if model_type == "model1":
            return ModelFactory._load_model1()
        elif model_type == "model2":
            return ModelFactory._load_model2()
        else:
            raise ValueError(f"Unknown model type: {model_type}")

    @staticmethod
    def _load_model1() -> Model1:
        """Load model1."""
        ...

    @staticmethod
    def _load_model2() -> Model2:
        """Load model2."""
        ...

@torch.no_grad()
@batch
def _infer_fn_1(input: np.ndarray) -> Dict[str, np.ndarray]:
    """Inference function for Model1 Triton Inference Server."""
    ...

@torch.no_grad()
@batch
def _infer_fn_2(input: np.ndarray) -> Dict[str, np.ndarray]:
    """Inference function for Model2 Triton Inference Server."""
    ...

if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="PyTriton Inference Server Config.")
    parser.add_argument("--model1", action="store_true")
    parser.add_argument("--model2", action="store_true")
    args = parser.parse_args()

    # We should have at least one model to serve.
    if not (args.model1 or args.model2):
        parser.error("No action requested, add --model1 or --model2")

    # DEBUG level is enabled here for troubleshooting; use logging.WARN to avoid verbose logs from Triton.
    logging.basicConfig(level=logging.DEBUG, format="%(asctime)s - %(levelname)s - %(name)s: %(message)s", force=True)

    if args.model1:
        model1 = ModelFactory.load_model("model1").cuda().eval()
    if args.model2:
        model2 = ModelFactory.load_model("model2").cuda().eval()

    # Triton supports two protocols: HTTP and gRPC. Here we use HTTP.
    triton_config = TritonConfig(
        allow_http=True, allow_grpc=True, exit_on_error=True, log_verbose=True, http_port=8015, grpc_port=8016, metrics_port=8017
    )
    with Triton(config=triton_config) as triton:
        if args.model1:
            triton.bind(
                model_name="model1",
                infer_func=_infer_fn_1,
                inputs=[
                    Tensor(name="input", dtype=np.int32, shape=(C.pretrain2.max_seq_len * 5,)),
                ],
                outputs=[
                    Tensor(name="output", dtype=np.int32, shape=(1,)),
                ],
                config=ModelConfig(batching=True, max_batch_size=16),
                strict=True,
            )

        if args.model2:
            triton.bind(
                model_name="model2",
                infer_func=_infer_fn_2,
                inputs=[
                    Tensor(name="input", dtype=np.float32, shape=(-1, 3, 32, 32)),
                ],
                outputs=[
                    Tensor(name="output", dtype=np.int64, shape=(3, 32, 32)),
                ],
                config=ModelConfig(batching=True, max_batch_size=16),
                strict=True,
            )
        triton.serve()
# Client1
class Client1:
    """Inference client for tritonserver."""

    def __init__(self) -> None:
        # `lazy_init=False` makes the client connect to the server immediately (with `lazy_init=True` it would not connect until the first request).
        self.client = ModelClient(url="localhost:8015", model_name="model1", lazy_init=False, ensure_model_is_ready=True)

    def get_prediction(self, state: np.ndarray):
        """Get predictin from torchserve."""
        output_dict = self.client.infer_batch(input=np.array([state], dtype=np.int32))  # type: ignore
        output: npt.NDArray[np.int32] = output_dict["output"]
        output = output.reshape(-1)
        assert len(output) == 1
        order_index = output[0]
        return order_index

# Client2
class Client2:
    """Inference client for tritonserver."""

    def __init__(self) -> None:
        self.client = ModelClient(url="localhost:8015", model_name="model2", lazy_init=False, ensure_model_is_ready=True)

    def get_prediction(self, state: np.ndarray):
        """Get predictin from torchserve."""
        # The shape of state is [S x 3 x 32 x 32]
        output_dict = self.client.infer_batch(input=np.array([state], dtype=np.float32))  # type: ignore
        output: npt.NDArray[np.int64] = output_dict["output"]
        return output[0]

Observed results and expected behavior

I am seeing the server time out on the client side, but I believe the real problem is in the initialization of the Triton Inference Server itself.

// Error in client.
Traceback (most recent call last):
  File "infer_serving/triton_inference_serving.py", line 122, in <module>
    with Triton(config=triton_config) as triton:
  File "/home/aiscuser/.local/lib/python3.8/site-packages/pytriton/triton.py", line 597, in __enter__
    super().__enter__()
  File "/home/aiscuser/.local/lib/python3.8/site-packages/pytriton/triton.py", line 465, in __enter__
    self.connect()
  File "/home/aiscuser/.local/lib/python3.8/site-packages/pytriton/triton.py", line 402, in connect
    self._wait_for_server()
  File "/home/aiscuser/.local/lib/python3.8/site-packages/pytriton/triton.py", line 487, in _wait_for_server
    self._log_level_checker.check()
  File "/home/aiscuser/.local/lib/python3.8/site-packages/pytriton/triton.py", line 305, in check
    wait_for_server_ready(client, timeout_s=DEFAULT_TRITON_STARTUP_TIMEOUT_S)
  File "/home/aiscuser/.local/lib/python3.8/site-packages/pytriton/client/utils.py", line 240, in wait_for_server_ready
    raise PyTritonClientTimeoutError("Waiting for server to be ready timed out.")
pytriton.client.exceptions.PyTritonClientTimeoutError: Waiting for server to be ready timed out.
Exception ignored in: <function InferenceServerClient.__del__ at 0x7fa5032294c0>
Traceback (most recent call last):
  File "/home/aiscuser/.local/lib/python3.8/site-packages/tritonclient/http/_client.py", line 199, in __del__
    self.close()
  File "/home/aiscuser/.local/lib/python3.8/site-packages/tritonclient/http/_client.py", line 206, in close
    self._pool.join()
  File "/home/aiscuser/.local/lib/python3.8/site-packages/gevent/pool.py", line 430, in join
    result = self._empty_event.wait(timeout=timeout)
  File "src/gevent/event.py", line 163, in gevent._gevent_cevent.Event.wait
  File "src/gevent/_abstract_linkable.py", line 509, in gevent._gevent_c_abstract_linkable.AbstractLinkable._wait
  File "src/gevent/_abstract_linkable.py", line 206, in gevent._gevent_c_abstract_linkable.AbstractLinkable._capture_hub
gevent.exceptions.InvalidThreadUseError: (<Hub '' at 0x7fa26dd90db0 epoll pending=0 ref=0 fileno=69 resolver=<gevent.resolver.thread.Resolver at 0x7fa280107ee0 pool=<ThreadPool at 0x7fa26de989e0 tasks=0 size=1 maxsize=10 hub=<Hub at 0x7fa26dd90db0 thread_ident=0x7fa4baff4640>>> threadpool=<ThreadPool at 0x7fa26de989e0 tasks=0 size=1 maxsize=10 hub=<Hub at 0x7fa26dd90db0 thread_ident=0x7fa4baff4640>> thread_ident=0x7fa4baff4640>, <Hub '' at 0x7fa26de7e9f0 epoll default pending=0 ref=0 fileno=67 resolver=<gevent.resolver.thread.Resolver at 0x7fa26de78b20 pool=<ThreadPool at 0x7fa26de982e0 tasks=0 size=1 maxsize=10 hub=<Hub at 0x7fa26de7e9f0 thread_ident=0x7fa52aa00740>>> threadpool=<ThreadPool at 0x7fa26de982e0 tasks=0 size=1 maxsize=10 hub=<Hub at 0x7fa26de7e9f0 thread_ident=0x7fa52aa00740>> thread_ident=0x7fa52aa00740>, <greenlet.greenlet object at 0x7fa26de047c0 (otid=0x7fa26ddfd900) current active started main>)
// Error in server. (Non-debug mode; the single-GPU experiment produces the same log.)
2024-04-01 17:10:48,945 - infer_serving/triton_inference_serving.py:64 - INFO - Loaded model from /home/aiscuser/data/serving-models/model/1/model.pt.
I0401 09:10:49.605169 2591 pinned_memory_manager.cc:241] Pinned memory pool is created at '0x7fef80000000' with size 268435456
I0401 09:10:49.608547 2591 cuda_memory_manager.cc:107] CUDA memory pool is created on device 0 with size 67108864
I0401 09:10:49.608563 2591 cuda_memory_manager.cc:107] CUDA memory pool is created on device 1 with size 67108864
I0401 09:10:49.608572 2591 cuda_memory_manager.cc:107] CUDA memory pool is created on device 2 with size 67108864
I0401 09:10:49.608580 2591 cuda_memory_manager.cc:107] CUDA memory pool is created on device 3 with size 67108864
2024-04-01 17:10:50,974 - /home/aiscuser/.local/lib/python3.8/site-packages/pytriton/server/triton_server.py:270 - WARNING - Triton Inference Server exited with failure. Please wait.

Environment

piotrm-nvidia commented 6 months ago

I appreciate your efforts in troubleshooting the issue with Triton server. To further assist, I recommend starting by ensuring your PyTriton is updated to the latest version (0.5.3).

To debug, let's start with inspecting your PyTriton installation and then proceed with steps to manually run the Triton server.

Inspect PyTriton Installation

Run pip show -f nvidia-pytriton to list the files in your installation. This helps verify the installation and locate the Triton server binary.

Output can look like this:

Name: nvidia-pytriton
Version: 0.5.3
Summary: PyTriton - Flask/FastAPI-like interface to simplify Triton's deployment in Python environments.
Home-page:
Author:
Author-email:
License: Apache 2.0
Location: <YOUR LOCATION>/site-packages
Requires: numpy, protobuf, pyzmq, sh, tritonclient, typing-inspect, wrapt
Required-by:
Files:
  nvidia_pytriton-0.5.3.dist-info/INSTALLER
  ...
  nvidia_pytriton.libs/libacl-b35d4bbf.so.1.1.2301
  ...
  nvidia_pytriton.libs/libzstd-5df4f4df.so.1.4.8
  pytriton/__init__.py
  ...
  pytriton/triton.py
  ...
  pytriton/tritonserver/bin/tritonserver
  pytriton/tritonserver/caches/local/libtritoncache_local.so
  pytriton/tritonserver/python_backend_stubs/3.10/triton_python_backend_stub
  pytriton/tritonserver/python_backend_stubs/3.11/triton_python_backend_stub
  pytriton/tritonserver/python_backend_stubs/3.8/triton_python_backend_stub
  pytriton/tritonserver/python_backend_stubs/3.9/triton_python_backend_stub
  ...
  pytriton/utils/workspace.py

The tritonserver binary is located in the pytriton/tritonserver/bin directory.
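
If it is more convenient, you can also resolve that path programmatically. A minimal sketch, assuming pytriton is importable in the environment you are debugging (it simply mirrors the package layout from the pip show output above):

# Locate the tritonserver binary bundled with the installed pytriton package.
import pathlib
import pytriton

tritonserver_bin = pathlib.Path(pytriton.__file__).parent / "tritonserver" / "bin" / "tritonserver"
print(tritonserver_bin, "exists:", tritonserver_bin.exists())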

Manually Run Triton Server

Using the path from the output, start Triton server with enhanced logging to capture more detailed information about its operation:

<YOUR LOCATION>/site-packages/pytriton/tritonserver/bin/tritonserver --exit-on-error false --log-verbose=1 --http-port 8015 --grpc-port 8016 --metrics-port 8017

This run disables exit-on-error and enables verbose logging. The server will listen on ports 8015, 8016, and 8017 for HTTP, gRPC, and metrics, respectively, so you can verify that the server is running by querying http://localhost:8015/v2/health/ready with curl:

root@be23d00f0b57:/opt/workspace# curl -v http://localhost:8015/v2/health/ready
*   Trying 127.0.0.1:8015...
* Connected to localhost (127.0.0.1) port 8015 (#0)
> GET /v2/health/ready HTTP/1.1
> Host: localhost:8015
> User-Agent: curl/7.81.0
> Accept: */*
>
I0405 18:02:56.609648 2229 http_server.cc:4523] HTTP request: 0 /v2/health/ready
* Mark bundle as not supporting multiuse
< HTTP/1.1 400 Bad Request
< Content-Length: 0
< Content-Type: text/plain
<
* Connection #0 to host localhost left intact
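
As an alternative to curl, you can probe the server with the tritonclient package that PyTriton already depends on. A minimal sketch, assuming the server is listening on localhost:8015:

# Check liveness and readiness over HTTP using tritonclient
# (installed as a dependency of nvidia-pytriton).
import tritonclient.http as httpclient

client = httpclient.InferenceServerClient(url="localhost:8015")
print("live: ", client.is_server_live())
print("ready:", client.is_server_ready())

If these calls raise a connection error or time out, the problem is on the tritonserver side rather than in your PyTriton bind/serve code.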

If this fails for you, then it is a pure Triton server issue; please consider posting an issue on the Triton Inference Server GitHub repository.

If tritonserver works for you, then you can proceed with the next steps.

Modify PyTriton for Debugging

To prevent PyTriton from removing the workspace after failure, add a time.sleep(1000000) call in the triton_server.py file as shown. This is a diagnostic step, allowing you to manually inspect the Triton server's behavior without PyTriton's interference.

Code

https://github.com/triton-inference-server/pytriton/blob/54b85de6723c010065d94f9d772c6b58c8d596e1/pytriton/server/triton_server.py#L157

Change

            self._tritonserver_running_cmd = tritonserver_cmd(
                *tritonserver_args,
                _env=env,
                _err_to_out=True,
                _out=self._record_logs,
                _out_bufsize=0,
                _err_bufsize=0,
                _bg=True,
                _bg_exc=False,
                _done=self._handle_exit,
                _preexec_fn=_preexec_fn,
            )
            time.sleep(1000000)  # large sleep to keep the workspace around for inspection

This will prevent PyTriton from removing the workspace after a Triton failure. (If `time` is not already imported in triton_server.py, add `import time` at the top of the file.)

Utilize Diagnostic Tools

If Tritonserver runs but you still encounter issues, consider using strace or gdb to debug the process. These tools can provide insights into system calls and potential errors at a lower level.

Comparing Logs

I'll share my logs for you to compare. Look for discrepancies in initialization sequences, error messages, or warnings that might point towards the cause of your issue.

Remember, sharing more detailed logs or configurations (while ensuring no sensitive information is disclosed) can greatly improve the chances of identifying the problem. The community or I might spot something specific to your setup that's causing the issue.

My local reproduction

I can't reproduce the error on my machine, but I can share my logs as a reference.

root@edc23c727574:/opt/tritonserver# python3 azure_model.py --model1
2024-04-05 18:43:53,358 - DEBUG - pytriton.utils.workspace: Workspace path /root/.cache/pytriton/workspace_behuk4ac
2024-04-05 18:43:53,360 - DEBUG - pytriton.triton: Preparing Triton Inference Server binaries and libs for execution.
2024-04-05 18:43:53,368 - DEBUG - pytriton.triton: Triton Inference Server binaries copied to /root/.cache/pytriton/workspace_behuk4ac/tritonserver without stubs.
2024-04-05 18:43:53,368 - DEBUG - pytriton.utils.distribution: Obtained pytriton module path: /usr/local/lib/python3.10/dist-packages/pytriton
2024-04-05 18:43:53,368 - DEBUG - pytriton.utils.distribution: Obtained pytriton stubs path for 3.10: /usr/local/lib/python3.10/dist-packages/pytriton/tritonserver/python_backend_stubs/3.10/triton_python_backend_stub
2024-04-05 18:43:53,368 - DEBUG - pytriton.triton: Copying stub for version 3.10 from /usr/local/lib/python3.10/dist-packages/pytriton/tritonserver/python_backend_stubs/3.10/triton_python_backend_stub to /root/.cache/pytriton/workspace_behuk4ac/tritonserver/backends/python/triton_python_backend_stub
2024-04-05 18:43:53,369 - DEBUG - pytriton.triton: Triton Inference Server binaries ready in /root/.cache/pytriton/workspace_behuk4ac/tritonserver
2024-04-05 18:43:53,369 - DEBUG - pytriton.utils.distribution: Obtained pytriton module path: /usr/local/lib/python3.10/dist-packages/pytriton
2024-04-05 18:43:53,369 - DEBUG - pytriton.utils.distribution: Obtained pytriton module path: /usr/local/lib/python3.10/dist-packages/pytriton
2024-04-05 18:43:53,369 - DEBUG - pytriton.utils.distribution: pytriton is installed in editable mode: False
2024-04-05 18:43:53,369 - DEBUG - pytriton.utils.distribution: Obtained nvidia_pytriton.libs path: /usr/local/lib/python3.10/dist-packages/nvidia_pytriton.libs
2024-04-05 18:43:53,369 - DEBUG - pytriton.triton: Starting Triton Inference
2024-04-05 18:43:53,369 - DEBUG - pytriton.server.triton_server: Triton Server binary /root/.cache/pytriton/workspace_behuk4ac/tritonserver/bin/tritonserver. Environment:
{
    "NPP_VERSION": "12.2.5.2",
    "SHELL": "/bin/bash",
    "NVIDIA_VISIBLE_DEVICES": "all",
    "DALI_BUILD": "12768324",
    "CUSOLVER_VERSION": "11.6.0.99",
    "CUBLAS_VERSION": "12.4.2.65",
    "HOSTNAME": "edc23c727574",
    "DCGM_VERSION": "3.2.6",
    "NVIDIA_REQUIRE_CUDA": "cuda>=9.0",
    "CUFFT_VERSION": "11.2.0.44",
    "CUDA_CACHE_DISABLE": "1",
    "NCCL_VERSION": "2.20.5",
    "CUSPARSE_VERSION": "12.3.0.142",
    "ENV": "/etc/shinit_v2",
    "PWD": "/opt/tritonserver",
    "OPENUCX_VERSION": "1.16.0",
    "NSIGHT_SYSTEMS_VERSION": "2024.2.1.38",
    "NVIDIA_DRIVER_CAPABILITIES": "compute,utility,video",
    "POLYGRAPHY_VERSION": "0.49.7",
    "TF_ENABLE_WINOGRAD_NONFUSED": "1",
    "TRT_VERSION": "8.6.3.1+cuda12.2.2.009",
    "NVIDIA_PRODUCT_NAME": "Triton Server",
    "RDMACORE_VERSION": "39.0",
    "HOME": "/root",
    "LS_COLORS": "rs=0:di=01;34:ln=01;36:mh=00:pi=40;33:so=01;35:do=01;35:bd=40;33;01:cd=40;33;01:or=40;31;01:mi=00:su=37;41:sg=30;43:ca=30;41:tw=30;42:ow=34;42:st=37;44:ex=01;32:*.tar=01;31:*.tgz=01;31:*.arc=01;31:*.arj=01;31:*.taz=01;31:*.lha=01;31:*.lz4=01;31:*.lzh=01;31:*.lzma=01;31:*.tlz=01;31:*.txz=01;31:*.tzo=01;31:*.t7z=01;31:*.zip=01;31:*.z=01;31:*.dz=01;31:*.gz=01;31:*.lrz=01;31:*.lz=01;31:*.lzo=01;31:*.xz=01;31:*.zst=01;31:*.tzst=01;31:*.bz2=01;31:*.bz=01;31:*.tbz=01;31:*.tbz2=01;31:*.tz=01;31:*.deb=01;31:*.rpm=01;31:*.jar=01;31:*.war=01;31:*.ear=01;31:*.sar=01;31:*.rar=01;31:*.alz=01;31:*.ace=01;31:*.zoo=01;31:*.cpio=01;31:*.7z=01;31:*.rz=01;31:*.cab=01;31:*.wim=01;31:*.swm=01;31:*.dwm=01;31:*.esd=01;31:*.jpg=01;35:*.jpeg=01;35:*.mjpg=01;35:*.mjpeg=01;35:*.gif=01;35:*.bmp=01;35:*.pbm=01;35:*.pgm=01;35:*.ppm=01;35:*.tga=01;35:*.xbm=01;35:*.xpm=01;35:*.tif=01;35:*.tiff=01;35:*.png=01;35:*.svg=01;35:*.svgz=01;35:*.mng=01;35:*.pcx=01;35:*.mov=01;35:*.mpg=01;35:*.mpeg=01;35:*.m2v=01;35:*.mkv=01;35:*.webm=01;35:*.webp=01;35:*.ogm=01;35:*.mp4=01;35:*.m4v=01;35:*.mp4v=01;35:*.vob=01;35:*.qt=01;35:*.nuv=01;35:*.wmv=01;35:*.asf=01;35:*.rm=01;35:*.rmvb=01;35:*.flc=01;35:*.avi=01;35:*.fli=01;35:*.flv=01;35:*.gl=01;35:*.dl=01;35:*.xcf=01;35:*.xwd=01;35:*.yuv=01;35:*.cgm=01;35:*.emf=01;35:*.ogv=01;35:*.ogx=01;35:*.aac=00;36:*.au=00;36:*.flac=00;36:*.m4a=00;36:*.mid=00;36:*.midi=00;36:*.mka=00;36:*.mp3=00;36:*.mpc=00;36:*.ogg=00;36:*.ra=00;36:*.wav=00;36:*.oga=00;36:*.opus=00;36:*.spx=00;36:*.xspf=00;36:",
    "CUDA_VERSION": "12.4.0.041",
    "CURAND_VERSION": "10.3.5.119",
    "TCMALLOC_RELEASE_RATE": "200",
    "CUTENSOR_VERSION": "2.0.1.2",
    "TRITON_SERVER_GPU_ENABLED": "1",
    "HPCX_VERSION": "2.18",
    "LESSCLOSE": "/usr/bin/lesspipe %s %s",
    "TERM": "xterm",
    "TRITON_SERVER_VERSION": "2.44.0",
    "GDRCOPY_VERSION": "2.3.1-1",
    "LESSOPEN": "| /usr/bin/lesspipe %s",
    "OPENMPI_VERSION": "4.1.7",
    "NVJPEG_VERSION": "12.3.1.89",
    "LIBRARY_PATH": "/usr/local/cuda/lib64/stubs:",
    "SHLVL": "1",
    "BASH_ENV": "/etc/bash.bashrc",
    "TF_AUTOTUNE_THRESHOLD": "2",
    "CUDNN_VERSION": "9.0.0.306+cuda12.3",
    "NVIDIA_TRITON_SERVER_BASE_VERSION": "24.03",
    "NSIGHT_COMPUTE_VERSION": "2024.1.0.13",
    "DALI_VERSION": "1.35.0",
    "NVIDIA_TRITON_SERVER_VERSION": "24.03",
    "LD_LIBRARY_PATH": "/opt/hpcx/ucc/lib/:/opt/hpcx/ucx/lib/:/usr/local/cuda/compat/lib:/usr/local/nvidia/lib:/usr/local/nvidia/lib64:/usr/local/lib/python3.10/dist-packages/nvidia_pytriton.libs",
    "NVIDIA_BUILD_ID": "86102629",
    "OMPI_MCA_coll_hcoll_enable": "0",
    "OPAL_PREFIX": "/opt/hpcx/ompi",
    "CUDA_DRIVER_VERSION": "550.54.14",
    "TRANSFORMER_ENGINE_VERSION": "1.4",
    "_CUDA_COMPAT_PATH": "/usr/local/cuda/compat",
    "NVIDIA_REQUIRE_JETPACK_HOST_MOUNTS": "",
    "PATH": "/usr/bin:/opt/tritonserver/bin:/usr/local/mpi/bin:/usr/local/nvidia/bin:/usr/local/cuda/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:/usr/local/ucx/bin",
    "TRITON_SERVER_USER": "triton-server",
    "MOFED_VERSION": "5.4-rdmacore39.0",
    "TRTOSS_VERSION": "23.11",
    "DEBIAN_FRONTEND": "noninteractive",
    "TF_ADJUST_HUE_FUSED": "1",
    "TF_ADJUST_SATURATION_FUSED": "1",
    "UCX_MEM_EVENTS": "no",
    "_": "/usr/bin/python3",
    "LC_CTYPE": "C.UTF-8"
}
2024-04-05 18:43:53,381 - DEBUG - pytriton.client.utils: Creating InferenceServerClient for http://127.0.0.1:8015 with {}
2024-04-05 18:43:53,382 - DEBUG - pytriton.client.utils: Waiting for server to be ready (timeout=119.99999451637268)
I0405 18:43:53.386108 191 cache_manager.cc:480] Create CacheManager with cache_dir: '/opt/tritonserver/caches'
I0405 18:43:55.092733 191 pinned_memory_manager.cc:275] Pinned memory pool is created at '0x205000000' with size 268435456
I0405 18:43:55.092790 191 cuda_memory_manager.cc:107] CUDA memory pool is created on device 0 with size 67108864
I0405 18:43:55.096790 191 server.cc:607]
+------------------+------+
| Repository Agent | Path |
+------------------+------+
+------------------+------+

I0405 18:43:55.096825 191 server.cc:634]
+---------+------+--------+
| Backend | Path | Config |
+---------+------+--------+
+---------+------+--------+

I0405 18:43:55.096848 191 server.cc:677]
+-------+---------+--------+
| Model | Version | Status |
+-------+---------+--------+
+-------+---------+--------+

I0405 18:43:55.135342 191 metrics.cc:877] Collecting metrics for GPU 0: NVIDIA GeForce RTX 3060 Laptop GPU
I0405 18:43:55.138485 191 metrics.cc:770] Collecting CPU metrics
I0405 18:43:55.138632 191 tritonserver.cc:2508]
+----------------------------------+------------------------------------------+
| Option                           | Value                                    |
+----------------------------------+------------------------------------------+
| server_id                        | triton                                   |
| server_version                   | 2.43.0                                   |
| server_extensions                | classification sequence model_repository |
|                                  |  model_repository(unload_dependents) sch |
|                                  | edule_policy model_configuration system_ |
|                                  | shared_memory cuda_shared_memory binary_ |
|                                  | tensor_data parameters statistics trace  |
|                                  | logging                                  |
| model_repository_path[0]         | /root/.cache/pytriton/workspace_behuk4ac |
|                                  | /model-store                             |
| model_control_mode               | MODE_EXPLICIT                            |
| startup_models_0                 | *                                        |
| strict_model_config              | 0                                        |
| rate_limit                       | OFF                                      |
| pinned_memory_pool_byte_size     | 268435456                                |
| cuda_memory_pool_byte_size{0}    | 67108864                                 |
| min_supported_compute_capability | 6.0                                      |
| strict_readiness                 | 1                                        |
| exit_timeout                     | 30                                       |
| cache_enabled                    | 0                                        |
+----------------------------------+------------------------------------------+

I0405 18:43:55.139038 191 grpc_server.cc:2426]
+----------------------------------------------+---------+
| GRPC KeepAlive Option                        | Value   |
+----------------------------------------------+---------+
| keepalive_time_ms                            | 7200000 |
| keepalive_timeout_ms                         | 20000   |
| keepalive_permit_without_calls               | 0       |
| http2_max_pings_without_data                 | 2       |
| http2_min_recv_ping_interval_without_data_ms | 300000  |
| http2_max_ping_strikes                       | 2       |
+----------------------------------------------+---------+

I0405 18:43:55.139427 191 grpc_server.cc:102] Ready for RPC 'Check', 0
I0405 18:43:55.139451 191 grpc_server.cc:102] Ready for RPC 'ServerLive', 0
I0405 18:43:55.139456 191 grpc_server.cc:102] Ready for RPC 'ServerReady', 0
I0405 18:43:55.139459 191 grpc_server.cc:102] Ready for RPC 'ModelReady', 0
I0405 18:43:55.139472 191 grpc_server.cc:102] Ready for RPC 'ServerMetadata', 0
I0405 18:43:55.139485 191 grpc_server.cc:102] Ready for RPC 'ModelMetadata', 0
I0405 18:43:55.139489 191 grpc_server.cc:102] Ready for RPC 'ModelConfig', 0
I0405 18:43:55.139503 191 grpc_server.cc:102] Ready for RPC 'SystemSharedMemoryStatus', 0
I0405 18:43:55.139507 191 grpc_server.cc:102] Ready for RPC 'SystemSharedMemoryRegister', 0
I0405 18:43:55.139520 191 grpc_server.cc:102] Ready for RPC 'SystemSharedMemoryUnregister', 0
I0405 18:43:55.139526 191 grpc_server.cc:102] Ready for RPC 'CudaSharedMemoryStatus', 0
I0405 18:43:55.139538 191 grpc_server.cc:102] Ready for RPC 'CudaSharedMemoryRegister', 0
I0405 18:43:55.139541 191 grpc_server.cc:102] Ready for RPC 'CudaSharedMemoryUnregister', 0
I0405 18:43:55.139545 191 grpc_server.cc:102] Ready for RPC 'RepositoryIndex', 0
I0405 18:43:55.139548 191 grpc_server.cc:102] Ready for RPC 'RepositoryModelLoad', 0
I0405 18:43:55.139550 191 grpc_server.cc:102] Ready for RPC 'RepositoryModelUnload', 0
I0405 18:43:55.139553 191 grpc_server.cc:102] Ready for RPC 'ModelStatistics', 0
I0405 18:43:55.139557 191 grpc_server.cc:102] Ready for RPC 'Trace', 0
I0405 18:43:55.139560 191 grpc_server.cc:102] Ready for RPC 'Logging', 0
I0405 18:43:55.139584 191 grpc_server.cc:359] Thread started for CommonHandler
I0405 18:43:55.139671 191 infer_handler.cc:680] New request handler for ModelInferHandler, 0
I0405 18:43:55.139699 191 infer_handler.h:1312] Thread started for ModelInferHandler
I0405 18:43:55.139760 191 infer_handler.cc:680] New request handler for ModelInferHandler, 0
I0405 18:43:55.139807 191 infer_handler.h:1312] Thread started for ModelInferHandler
I0405 18:43:55.139871 191 stream_infer_handler.cc:128] New request handler for ModelStreamInferHandler, 0
I0405 18:43:55.139913 191 infer_handler.h:1312] Thread started for ModelStreamInferHandler
I0405 18:43:55.139935 191 grpc_server.cc:2519] Started GRPCInferenceService at 0.0.0.0:8016
I0405 18:43:55.140128 191 http_server.cc:4637] Started HTTPService at 0.0.0.0:8015
I0405 18:43:55.256652 191 http_server.cc:320] Started Metrics Service at 0.0.0.0:8017
I0405 18:43:55.389600 191 http_server.cc:4523] HTTP request: 0 /v2/health/ready
2024-04-05 18:43:55,394 - WARNING - pytriton.triton: Triton Inference Server is running with enabled verbose logs (log_verbose_level=1). It may affect inference performance.
2024-04-05 18:43:55,394 - DEBUG - pytriton.client.utils: Creating InferenceServerClient for http://127.0.0.1:8015 with {}
2024-04-05 18:43:55,394 - DEBUG - pytriton.client.utils: Waiting for server to be ready (timeout=119.999995470047)
I0405 18:43:55.389879 191 http_server.cc:4523] HTTP request: 0 /v2/health/live
2024-04-05 18:43:55,401 - WARNING - pytriton.triton: Triton Inference Server is running with enabled verbose logs (log_verbose_level=1). It may affect inference performance.
I0405 18:43:55.390333 191 http_server.cc:4523] HTTP request: 0 /v2/logging
2024-04-05 18:43:55,402 - INFO - pytriton.triton: Read more about configuring and serving models in documentation: https://triton-inference-server.github.io/pytriton.
2024-04-05 18:43:55,403 - INFO - pytriton.triton: (Press CTRL+C or use the command `kill -SIGINT 175` to send a SIGINT signal and quit)
2024-04-05 18:43:55,403 - DEBUG - pytriton.models.manager: Adding model1 (1) to registry under ('model1', 1).
2024-04-05 18:43:55,406 - DEBUG - pytriton.models.manager: Creating model model1 with version 1.
I0405 18:43:55.397234 191 http_server.cc:4523] HTTP request: 0 /v2/health/ready
I0405 18:43:55.398766 191 http_server.cc:4523] HTTP request: 0 /v2/health/live
2024-04-05 18:43:55,410 - INFO - pytriton.client.client: Patch ModelClient http
2024-04-05 18:43:55,410 - DEBUG - pytriton.client.client: Creating InferenceServerClient for http://127.0.0.1:8015 with {'network_timeout': 60.0, 'connection_timeout': 60.0}
2024-04-05 18:43:55,410 - DEBUG - pytriton.client.client: Creating InferenceServerClient for http://127.0.0.1:8015 with {'network_timeout': 60.0, 'connection_timeout': 60.0}
2024-04-05 18:43:55,411 - DEBUG - pytriton.client.utils: Waiting for server to be ready (timeout=119.99999666213989)
I0405 18:43:55.411645 191 http_server.cc:4523] HTTP request: 0 /v2/health/ready
I0405 18:43:55.411983 191 http_server.cc:4523] HTTP request: 0 /v2/health/live
I0405 18:43:55.414709 191 http_server.cc:4523] HTTP request: 2 /v2/repository/models/model1/load
I0405 18:43:55.426823 191 model_config_utils.cc:680] Server side auto-completed config: name: "model1"
max_batch_size: 16
input {
  name: "input"
  data_type: TYPE_INT32
  dims: 640
}
output {
  name: "output"
  data_type: TYPE_INT32
  dims: 1
}
instance_group {
  count: 1
  kind: KIND_CPU
}
default_model_filename: "model.py"
dynamic_batching {
}
parameters {
  key: "workspace-path"
  value {
    string_value: "/root/.cache/pytriton/workspace_behuk4ac"
  }
}
backend: "python"

I0405 18:43:55.426898 191 model_lifecycle.cc:469] loading: model1:1
I0405 18:43:55.427023 191 backend_model.cc:502] Adding default backend config setting: default-max-batch-size,4
I0405 18:43:55.427060 191 shared_library.cc:112] OpenLibraryHandle: /root/.cache/pytriton/workspace_behuk4ac/tritonserver/backends/python/libtriton_python.so
I0405 18:43:55.427878 191 python_be.cc:2075] 'python' TRITONBACKEND API version: 1.18
I0405 18:43:55.427907 191 python_be.cc:2097] backend configuration:
{"cmdline":{"auto-complete-config":"true","backend-directory":"/root/.cache/pytriton/workspace_behuk4ac/tritonserver/backends","min-compute-capability":"6.000000","shm-default-byte-size":"4194304","shm-growth-byte-size":"1048576","shm-region-prefix-name":"pytrtion175-1c1552ad","default-max-batch-size":"4"}}
I0405 18:43:55.427926 191 python_be.cc:2236] Shared memory configuration is shm-default-byte-size=4194304,shm-growth-byte-size=1048576,stub-timeout-seconds=30
I0405 18:43:55.428087 191 python_be.cc:2559] TRITONBACKEND_GetBackendAttribute: setting attributes
I0405 18:43:55.428128 191 python_be.cc:2337] TRITONBACKEND_ModelInitialize: model1 (version 1)
I0405 18:43:55.428424 191 model_config_utils.cc:1902] ModelConfig 64-bit fields:
I0405 18:43:55.428446 191 model_config_utils.cc:1904]   ModelConfig::dynamic_batching::default_priority_level
I0405 18:43:55.428449 191 model_config_utils.cc:1904]   ModelConfig::dynamic_batching::default_queue_policy::default_timeout_microseconds
I0405 18:43:55.428463 191 model_config_utils.cc:1904]   ModelConfig::dynamic_batching::max_queue_delay_microseconds
I0405 18:43:55.428465 191 model_config_utils.cc:1904]   ModelConfig::dynamic_batching::priority_levels
I0405 18:43:55.428467 191 model_config_utils.cc:1904]   ModelConfig::dynamic_batching::priority_queue_policy::key
I0405 18:43:55.428469 191 model_config_utils.cc:1904]   ModelConfig::dynamic_batching::priority_queue_policy::value::default_timeout_microseconds
I0405 18:43:55.428471 191 model_config_utils.cc:1904]   ModelConfig::ensemble_scheduling::step::model_version
I0405 18:43:55.428473 191 model_config_utils.cc:1904]   ModelConfig::input::dims
I0405 18:43:55.428476 191 model_config_utils.cc:1904]   ModelConfig::input::reshape::shape
I0405 18:43:55.428478 191 model_config_utils.cc:1904]   ModelConfig::instance_group::secondary_devices::device_id
I0405 18:43:55.428479 191 model_config_utils.cc:1904]   ModelConfig::model_warmup::inputs::value::dims
I0405 18:43:55.428481 191 model_config_utils.cc:1904]   ModelConfig::optimization::cuda::graph_spec::graph_lower_bound::input::value::dim
I0405 18:43:55.428482 191 model_config_utils.cc:1904]   ModelConfig::optimization::cuda::graph_spec::input::value::dim
I0405 18:43:55.428484 191 model_config_utils.cc:1904]   ModelConfig::output::dims
I0405 18:43:55.428486 191 model_config_utils.cc:1904]   ModelConfig::output::reshape::shape
I0405 18:43:55.428487 191 model_config_utils.cc:1904]   ModelConfig::sequence_batching::direct::max_queue_delay_microseconds
I0405 18:43:55.428489 191 model_config_utils.cc:1904]   ModelConfig::sequence_batching::max_sequence_idle_microseconds
I0405 18:43:55.428491 191 model_config_utils.cc:1904]   ModelConfig::sequence_batching::oldest::max_queue_delay_microseconds
I0405 18:43:55.428492 191 model_config_utils.cc:1904]   ModelConfig::sequence_batching::state::dims
I0405 18:43:55.428494 191 model_config_utils.cc:1904]   ModelConfig::sequence_batching::state::initial_state::dims
I0405 18:43:55.428505 191 model_config_utils.cc:1904]   ModelConfig::version_policy::specific::versions
I0405 18:43:55.429231 191 stub_launcher.cc:388] Starting Python backend stub:  exec /root/.cache/pytriton/workspace_behuk4ac/tritonserver/backends/python/triton_python_backend_stub /tmp/folderfvQ0o3/1/model.py pytrtion175-1c1552ad1 4194304 1048576 191 /root/.cache/pytriton/workspace_behuk4ac/tritonserver/backends/python 336 model1 DEFAULT
W0405 18:43:56.140971 191 metrics.cc:631] Unable to get power limit for GPU 0. Status:Success, value:0.000000
I0405 18:43:56.770503 191 python_be.cc:2031] model configuration:
{
    "name": "model1",
    "platform": "",
    "backend": "python",
    "runtime": "",
    "version_policy": {
        "latest": {
            "num_versions": 1
        }
    },
    "max_batch_size": 16,
    "input": [
        {
            "name": "input",
            "data_type": "TYPE_INT32",
            "format": "FORMAT_NONE",
            "dims": [
                640
            ],
            "is_shape_tensor": false,
            "allow_ragged_batch": false,
            "optional": false
        }
    ],
    "output": [
        {
            "name": "output",
            "data_type": "TYPE_INT32",
            "dims": [
                1
            ],
            "label_filename": "",
            "is_shape_tensor": false
        }
    ],
    "batch_input": [],
    "batch_output": [],
    "optimization": {
        "priority": "PRIORITY_DEFAULT",
        "input_pinned_memory": {
            "enable": true
        },
        "output_pinned_memory": {
            "enable": true
        },
        "gather_kernel_buffer_threshold": 0,
        "eager_batching": false
    },
    "dynamic_batching": {
        "preferred_batch_size": [
            16
        ],
        "max_queue_delay_microseconds": 0,
        "preserve_ordering": false,
        "priority_levels": 0,
        "default_priority_level": 0,
        "priority_queue_policy": {}
    },
    "instance_group": [
        {
            "name": "model1_0",
            "kind": "KIND_CPU",
            "count": 1,
            "gpus": [],
            "secondary_devices": [],
            "profile": [],
            "passive": false,
            "host_policy": ""
        }
    ],
    "default_model_filename": "model.py",
    "cc_model_filenames": {},
    "metric_tags": {},
    "parameters": {
        "workspace-path": {
            "string_value": "/root/.cache/pytriton/workspace_behuk4ac"
        }
    },
    "model_warmup": []
}
I0405 18:43:56.770674 191 python_be.cc:2381] TRITONBACKEND_ModelInstanceInitialize: model1_0_0 (CPU device 0)
I0405 18:43:56.770699 191 backend_model_instance.cc:69] Creating instance model1_0_0 on CPU using artifact 'model.py'
I0405 18:43:56.771113 191 stub_launcher.cc:388] Starting Python backend stub:  exec /root/.cache/pytriton/workspace_behuk4ac/tritonserver/backends/python/triton_python_backend_stub /tmp/folderfvQ0o3/1/model.py pytrtion175-1c1552ad2 4194304 1048576 191 /root/.cache/pytriton/workspace_behuk4ac/tritonserver/backends/python 336 model1_0_0 DEFAULT
I0405 18:43:56.980122 191 model.py:378] Model instance name: model1_0_0
I0405 18:43:56.980249 191 model.py:379] Decoupled model: False
I0405 18:43:56.980380 191 model.py:380] Workspace path: /root/.cache/pytriton/workspace_behuk4ac
I0405 18:43:56.980531 191 model.py:381] Model inputs: {'input'}
I0405 18:43:56.980677 191 model.py:382] Model outputs: {'output': {'name': 'output', 'data_type': 'TYPE_INT32', 'dims': [1], 'label_filename': '', 'is_shape_tensor': False}}
I0405 18:43:57.110983 191 data.py:560] Started remote block store at /root/.cache/pytriton/workspace_behuk4ac/model1-data.sock (pid=327)
I0405 18:43:57.112423 191 model.py:408] Using BatchResponsesHandler for handling responses
I0405 18:43:57.118279 191 python_be.cc:2402] TRITONBACKEND_ModelInstanceInitialize: instance initialization successful model1_0_0 (device 0)
2024-04-05 18:43:57,124 - DEBUG - pytriton.client.client: Closing ModelClient
2024-04-05 18:43:57,124 - DEBUG - pytriton.models.manager: Done.
I0405 18:43:57.118824 191 communication.py:149] Binding socket to url='ipc:///root/.cache/pytriton/workspace_behuk4ac/model1_0_0-server.sock'
2024-04-05 18:43:57,126 - DEBUG - pytriton.proxy.communication: Waiting for config file /root/.cache/pytriton/workspace_behuk4ac/model1-config.sock
I0405 18:43:57.123385 191 backend_model_instance.cc:772] Starting backend thread for model1_0_0 at nice 0 on device 0...
I0405 18:43:57.123640 191 dynamic_batch_scheduler.cc:297] Starting dynamic-batcher thread for model1 at nice 0...
I0405 18:43:57.123646 191 model_lifecycle.cc:835] successfully loaded 'model1'
2024-04-05 18:43:57,141 - DEBUG - pytriton.proxy.data: Connected to remote block store at /root/.cache/pytriton/workspace_behuk4ac/model1-data.sock)
2024-04-05 18:43:57,141 - DEBUG - asyncio: Using selector: EpollSelector
2024-04-05 18:43:57,142 - DEBUG - asyncio: Using selector: EpollSelector
2024-04-05 18:43:57,143 - DEBUG - pytriton.proxy.communication.client: Connecting requests_server_client-59b6 to server listening on ipc:///root/.cache/pytriton/workspace_behuk4ac/model1_0_0-server.sock
2024-04-05 18:43:57,143 - DEBUG - pytriton.triton: Triton Inference already connected.
W0405 18:43:57.143991 191 metrics.cc:631] Unable to get power limit for GPU 0. Status:Success, value:0.000000
W0405 18:43:58.144410 191 metrics.cc:631] Unable to get power limit for GPU 0. Status:Success, value:0.000000

My script azure_model.py:

import argparse
import logging
from pathlib import Path
from typing import Any, Dict

import numpy as np
from pytriton.decorators import batch
from pytriton.model_config import ModelConfig, Tensor
from pytriton.triton import Triton, TritonConfig

logger = logging.getLogger("serving.model1")

class Model1:
    pass

class Model2:
    pass

model1 = 1  # type: ignore
model2 = 2  # type: ignore
temperature = 0.3  # C.model_serving.temperature

class ModelFactory:
    """Utility class to load model."""

    @staticmethod
    def load_model(model_type: str) -> Any:
        """Utility function to load model."""
        if model_type == "model1":
            return ModelFactory._load_model1()
        elif model_type == "model2":
            return ModelFactory._load_model2()
        else:
            raise ValueError(f"Unknown model type: {model_type}")

    @staticmethod
    def _load_model1() -> Model1:
        """Load model1."""
        return 1

    @staticmethod
    def _load_model2() -> Model2:
        """Load model2."""
        return 2

@batch
def _infer_fn_1(input: np.ndarray) -> Dict[str, np.ndarray]:
    """Inference function for Model1 Triton Inference Server."""
    pass

@batch
def _infer_fn_2(input: np.ndarray) -> Dict[str, np.ndarray]:
    """Inference function for Model1 Triton Inference Server."""
    pass

if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="PyTriton Inference Server Config.")
    parser.add_argument("--model1", action="store_true")
    parser.add_argument("--model2", action="store_true")
    args = parser.parse_args()

    # We should have at least one model to serve.
    if not (args.model1 or args.model2):
        parser.error("No action requested, add --model1 or --model2")

    # DEBUG level is enabled here for troubleshooting; use logging.WARN to avoid verbose logs from Triton.
    logging.basicConfig(level=logging.DEBUG, format="%(asctime)s - %(levelname)s - %(name)s: %(message)s", force=True)

    if args.model1:
        model1 = 1 #ModelFactory.load_model("model1").cuda().eval()
    if args.model2:
        model2 = 2 #ModelFactory.load_model("model2").cuda().eval()

    # Triton supports two protocols: HTTP and gRPC. Here we use HTTP.
    triton_config = TritonConfig(
        allow_http=True, allow_grpc=True, exit_on_error=True, log_verbose=True, http_port=8015, grpc_port=8016, metrics_port=8017
    )
    with Triton(config=triton_config) as triton:
        if args.model1:
            triton.bind(
                model_name="model1",
                infer_func=_infer_fn_1,
                inputs=[
                    Tensor(name="input", dtype=np.int32, shape=(128 * 5,)),
                ],
                outputs=[
                    Tensor(name="output", dtype=np.int32, shape=(1,)),
                ],
                config=ModelConfig(batching=True, max_batch_size=16),
                strict=True,
            )

        if args.model2:
            triton.bind(
                model_name="model2",
                infer_func=_infer_fn_2,
                inputs=[
                    Tensor(name="input", dtype=np.float32, shape=(-1, 3, 32, 32)),
                ],
                outputs=[
                    Tensor(name="output", dtype=np.int64, shape=(3, 32, 32)),
                ],
                config=ModelConfig(batching=True, max_batch_size=16),
                strict=True,
            )
        triton.serve()
github-actions[bot] commented 4 months ago

This issue is stale because it has been open 21 days with no activity. Remove stale label or comment or this will be closed in 7 days.

github-actions[bot] commented 4 months ago

This issue was closed because it has been stalled for 7 days with no activity.