zchenyu opened this issue 11 months ago
Hi @zchenyu, can you try replicating the issue on a pre-built Triton container? We may not be able to provide support for a custom-built Triton server. You can pull one from our NGC catalog: https://catalog.ngc.nvidia.com/orgs/nvidia/containers/tritonserver/tags.
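For example, something along these lines (the model repository path is just a placeholder):

docker pull nvcr.io/nvidia/tritonserver:23.10-py3
docker run --gpus=all --rm -p8000:8000 -p8001:8001 -p8002:8002 \
  -v /path/to/model_repository:/models \
  nvcr.io/nvidia/tritonserver:23.10-py3 \
  tritonserver --model-repository=/models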
Interesting, I can't reproduce this with nvcr.io/nvidia/tritonserver:23.10-py3.
I don't have the exact commit that I built the image with, but I have a rough timestamp. Here's some debug information:
Build command:
python build.py \
--backend=python \
--repoagent=checksum \
--cache=local \
--endpoint=grpc \
--enable-gpu \
--enable-logging \
--enable-stats \
--enable-metrics \
--enable-gpu-metrics \
--enable-cpu-metrics \
--enable-tracing \
--enable-nvtx
$ docker inspect xxx
...
"Created": "2023-11-13T22:11:21.355410303Z",
"DockerVersion": "23.0.3",
"Config": {
...
"Env": [
"PATH=/opt/tritonserver/bin:/usr/local/mpi/bin:/usr/local/nvidia/bin:/usr/local/cuda/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:/usr/local/ucx/bin",
"CUDA_VERSION=12.2.2.009",
"CUDA_DRIVER_VERSION=535.104.05",
"CUDA_CACHE_DISABLE=1",
"NVIDIA_REQUIRE_JETPACK_HOST_MOUNTS=",
"_CUDA_COMPAT_PATH=/usr/local/cuda/compat",
"ENV=/etc/shinit_v2",
"BASH_ENV=/etc/bash.bashrc",
"SHELL=/bin/bash",
"NVIDIA_REQUIRE_CUDA=cuda>=9.0",
"NCCL_VERSION=2.19.3",
"CUBLAS_VERSION=12.2.5.6",
"CUFFT_VERSION=11.0.8.103",
"CURAND_VERSION=10.3.3.141",
"CUSPARSE_VERSION=12.1.2.141",
"CUSOLVER_VERSION=11.5.2.141",
"CUTENSOR_VERSION=1.7.0.1",
"NPP_VERSION=12.2.1.4",
"NVJPEG_VERSION=12.2.2.4",
"CUDNN_VERSION=8.9.5.29",
"TRT_VERSION=8.6.1.6+cuda12.0.1.011",
"TRTOSS_VERSION=23.10",
"NSIGHT_SYSTEMS_VERSION=2023.3.1.92",
"NSIGHT_COMPUTE_VERSION=2023.2.2.3",
"DALI_VERSION=1.30.0",
"DALI_BUILD=9783408",
"POLYGRAPHY_VERSION=0.49.0",
"TRANSFORMER_ENGINE_VERSION=0.12",
"LD_LIBRARY_PATH=/usr/local/cuda/compat/lib:/usr/local/nvidia/lib:/usr/local/nvidia/lib64",
"NVIDIA_VISIBLE_DEVICES=all",
"NVIDIA_DRIVER_CAPABILITIES=compute,utility,video",
"NVIDIA_PRODUCT_NAME=Triton Server",
"GDRCOPY_VERSION=2.3",
"HPCX_VERSION=2.16rc4",
"MOFED_VERSION=5.4-rdmacore39.0",
"OPENUCX_VERSION=1.15.0",
"OPENMPI_VERSION=4.1.5rc2",
"RDMACORE_VERSION=39.0",
"OPAL_PREFIX=/opt/hpcx/ompi",
"OMPI_MCA_coll_hcoll_enable=0",
"LIBRARY_PATH=/usr/local/cuda/lib64/stubs:",
"NVIDIA_TRITON_SERVER_BASE_VERSION=23.10",
"NVIDIA_BUILD_ID=<unknown>",
"TRITON_SERVER_VERSION=2.41.0dev",
"NVIDIA_TRITON_SERVER_VERSION=23.12dev",
"UCX_MEM_EVENTS=no",
"TF_ADJUST_HUE_FUSED=1",
"TF_ADJUST_SATURATION_FUSED=1",
"TF_ENABLE_WINOGRAD_NONFUSED=1",
"TF_AUTOTUNE_THRESHOLD=2",
"TRITON_SERVER_GPU_ENABLED=1",
"TRITON_SERVER_USER=triton-server",
"DEBIAN_FRONTEND=noninteractive",
"TCMALLOC_RELEASE_RATE=200",
"DCGM_VERSION=3.2.6"
],
"Image": "sha256:93942c41b1e9a43d902d758a5f4faab1c47e816895ab6a6c52cc23346e64beec",
"Labels": {
"com.nvidia.build.id": "<unknown>",
"com.nvidia.build.ref": "",
"com.nvidia.cublas.version": "12.2.5.6",
"com.nvidia.cuda.version": "9.0",
"com.nvidia.cudnn.version": "8.9.5.29",
"com.nvidia.cufft.version": "11.0.8.103",
"com.nvidia.curand.version": "10.3.3.141",
"com.nvidia.cusolver.version": "11.5.2.141",
"com.nvidia.cusparse.version": "12.1.2.141",
"com.nvidia.cutensor.version": "1.7.0.1",
"com.nvidia.nccl.version": "2.19.3",
"com.nvidia.npp.version": "12.2.1.4",
"com.nvidia.nsightcompute.version": "2023.2.2.3",
"com.nvidia.nsightsystems.version": "2023.3.1.92",
"com.nvidia.nvjpeg.version": "12.2.2.4",
"com.nvidia.tensorrt.version": "8.6.1.6+cuda12.0.1.011",
"com.nvidia.tensorrtoss.version": "23.10",
"com.nvidia.tritonserver.version": "2.41.0dev",
"com.nvidia.volumes.needed": "nvidia_driver",
"org.opencontainers.image.ref.name": "ubuntu",
"org.opencontainers.image.version": "22.04"
}
},
"Architecture": "amd64",
"Os": "linux",
"Size": 9401638502,
"VirtualSize": 9401638502,
"GraphDriver": {
"Data": {
"LowerDir": "/var/lib/docker/overlay2/d75813542e3e7b15f52c3c0c6fb641d9f2bc337a2cacf2038e4213eb4cd0886e/diff:/var/lib/docker/overlay2/c1282e41843a03fc801713e21e9064633658ea5a884fba3ba337f039909255f0/diff:/var/lib/docker/overlay2/5f1a4cd28dfc0be0d82d7469a42b5a7339d634685d8c0779106f42aeff4348fe/diff:/var/lib/docker/overlay2/450e3dc583323e273d2430e8733b15e757a187b5006b44e156e88b2dd92ef72a/diff:/var/lib/docker/overlay2/dbbff2a12bf5ef2f8c0130fd0333512760231313a1a4281566e8e4eefbb87a45/diff:/var/lib/docker/overlay2/c5472de0761a7efdb700e86a64a6cc93054124e09c27380102a6d119e170b67c/diff:/var/lib/docker/overlay2/983abd7136948d8ab6d59997c73a05526d9b3d3e7402e3670b82bb4a32af1cf8/diff:/var/lib/docker/overlay2/210d322c12846d5ec02fb9ba16396fbeb1c383be0e058eeea33f499ce50c7c26/diff:/var/lib/docker/overlay2/be06ef52bdaa2cf8790fb5d0f1ed343d468fd7dcf181fa8739c09667fd569a6c/diff:/var/lib/docker/overlay2/ec9fa773be1a2e420114e6d2691ef9f7c0bc857a5cc54f2134c7cafd5704f936/diff:/var/lib/docker/overlay2/1fa153d3254394bc272cd2a06965eedc7e5142864044e23bc057370b82a3078a/diff:/var/lib/docker/overlay2/29de04475bb4f4da2e3978b1765eb7f86b11e20d3878ff9b7939bad8969195a6/diff:/var/lib/docker/overlay2/21556fc658849fd12f08f93f25e6698adb5cd26f31dfcdcd44bde10e32957765/diff:/var/lib/docker/overlay2/abe3a3aa52ccad7b7a15df930732bd203b543d19ac622cc6152e7dd7f4aa66da/diff:/var/lib/docker/overlay2/6fc7c812b1db034372e837b6f14ed5a81fa56fee316e7d5994cc92588543b5f5/diff:/var/lib/docker/overlay2/7cc2edfaa36054a101d5cf8c1d9698ab5194f0c6c9479b20622c00a88aed50ad/diff:/var/lib/docker/overlay2/9769f387c96aa5f11aaae6380dbc23abbf198000fe58939874c4163dbc00d700/diff:/var/lib/docker/overlay2/fc8ba61ddd51ecc3bbfee57f6f5075f48ae2939bf97ba98375d5231c3417d644/diff:/var/lib/docker/overlay2/59de1b5b1244e06a8847bd0afc8cba1d2c2131903cd1bf5b8e196ac4a55c0e18/diff:/var/lib/docker/overlay2/42574c2d7fa9eb78f023f49aea2bc40c2e5f486e6293159ab5a712eec9f8be6d/diff:/var/lib/docker/overlay2/5368bd3a8f358af8f2ae913c3a47fb377cbc14d560388a1b7bbf9bd5c5049374/diff:/var/lib/docker/overlay2/78c9533874522fecf81a6d5a245ca21e07b7415df9311d45a3f1615fcc090fb2/diff:/var/lib/docker/overlay2/bafa55c90a3758f7c8c4dcb99b3495bb4bdabc1582822c17c576107a36b346f9/diff:/var/lib/docker/overlay2/f56ddd480b5d89eaca181b8679b2d7a72fb85b2f019a5cc71208f5215e64dafc/diff",
"MergedDir": "/var/lib/docker/overlay2/13b180020399dbfc5d6c1c51db27f3eb83b32c9863ba0b10511523728eec35e7/merged",
"UpperDir": "/var/lib/docker/overlay2/13b180020399dbfc5d6c1c51db27f3eb83b32c9863ba0b10511523728eec35e7/diff",
"WorkDir": "/var/lib/docker/overlay2/13b180020399dbfc5d6c1c51db27f3eb83b32c9863ba0b10511523728eec35e7/work"
},
"Name": "overlay2"
},
"RootFS": {
"Type": "layers",
"Layers": [
"sha256:01d4e4b4f381ac5a9964a14a650d7c074a2aa6e0789985d843f8eb3070b58f7d",
"sha256:28dedf274dd82e720c1e5c651c5fdbcafff8f9687d4a3d95af147e0e3caa8bb2",
"sha256:d12750ac86fb921cdc38acb4d5154a52de2e4e70f42330837f01bc421194ba6c",
"sha256:f12ed123586ee818842e5bd0db946e04f39d8d679c18b275df913498feb4281c",
"sha256:13b41d13cbee5b050a1d526c2ee6b2643ad86b4998e69a5348dc43819f3fb8e0",
"sha256:dddc962cd9c168b2b79181f7c266e35ff0cc87701ab6d9fb676f9bf6561513b0",
"sha256:5f8dc6c4d9ac310ba9c68b79b12a46409e112490eeae38019187ac4a83ea6199",
"sha256:f70bf143e96789fb387d63da983c432d2717de7f2df20c90cfe0c66aeedb3041",
"sha256:f7f51f3bc96eea52cf5019d3cce2900b02797f48e7ce3ac947b74e9662ea9dd6",
"sha256:0161fdcfae36218edd6eed2c11675216b65bb5d2d0a0aef3b8c51a626ecb60c1",
"sha256:54788d2d82c90da0d14e3aa9852003bc801d68836f19b9f64d4386b8c14b2349",
"sha256:6bcd3616240bb782802144d3b62423cf7f40dedf28772113620fa294887b4ec0",
"sha256:09e624be0b72bb2a021c9d78329d9715881af55036a1cef08dd2f0a2ba5c3410",
"sha256:1db7c5886680d09dfb46e0364b27f30e29e853b64c412ed4133faf090dee8a92",
"sha256:17d01486a53eed207a7583c37e093e44ad7867c9fc3d5f39c2ba73ddf2fda447",
"sha256:338bf418d8108fc4767eab3d499b826f08dbcd655d8f211f144d3dc83c576118",
"sha256:4e7b1830b19e20994594e13200b8f34ef934bb765156700c8f3c8cc5aceebfaa",
"sha256:2f692708fcd456117d4add4bef906864317f3bbcdd1cfb787303bf1587796218",
"sha256:8e17adb029ca5f860b1f67d4b665b0e22b5decb92f6b8cf7a36d0dd95854359b",
"sha256:be9535508c17c3a28c660f3ce02dfa3cc52ce3af7a3823a26bccf725ccb7c204",
"sha256:83eab2f33db5c2072b53e4781bb4fb6acab8fa1523532fbd5a00087389fe701a",
"sha256:f9f338fcea43073298e5108ebafc74c2239ace0da58447f79529f5bde8a59616",
"sha256:f23b0b1e8134ca1451ae2838e017a6716f4d41fa4ecaa4d2e25266392f373a70",
"sha256:606e4be5087966c9f32c68a31429d1bcf350c4908c1cb51d84b2c0606d58f936",
"sha256:d009aede8f5fdb65caf00162bafa3510499f85c54b0ecbdeb230276ae937d75c"
]
},
Okay, I spoke a little too soon. With the 23.10-py3 base image, I'm still getting the segfault, but only at the end of my load test.
Maybe there's some small bug in how I'm implementing my Python backend?
Here's my full model.py, mostly modeled on https://github.com/triton-inference-server/vllm_backend/blob/main/src/model.py:
import asyncio
import json
import numpy as np
import requests
import threading
import time
import triton_python_backend_utils as pb_utils
from typing import Dict, List
import aiohttp
class TritonPythonModel:
@staticmethod
def auto_complete_config(auto_complete_model_config: pb_utils.ModelConfig) -> pb_utils.ModelConfig:
inputs = [] # We use parameters instead of tensors for inputs for ease of use.
outputs = [{
"name": "text_output",
"data_type": "TYPE_STRING",
"dims": [1],
}]
for input in inputs:
auto_complete_model_config.add_input(input)
for output in outputs:
auto_complete_model_config.add_output(output)
# We handle batching on our own.
auto_complete_model_config.set_max_batch_size(0)
# Generated results may be returned out of order.
#auto_complete_model_config.set_model_transaction_policy({"decoupled": True})
return auto_complete_model_config
def initialize(self, args: Dict[str, str]):
self._logger = pb_utils.Logger
self.model_config = model_config = json.loads(args["model_config"])
using_decoupled = pb_utils.using_decoupled_model_transaction_policy(
model_config
)
assert using_decoupled
while True:
print("Probing...")
try:
time.sleep(1)
response = requests.post("http://localhost:80/v1/completions", json={
"prompt": "The sky is",
})
if response.status_code == 200:
break
print(response)
except Exception as e:
print(e)
self._loop = asyncio.get_event_loop()
self._loop_thread = threading.Thread(
target=self._engine_loop, args=(self._loop,)
)
self._shutdown_event = asyncio.Event()
self._loop_thread.start()
def execute(self, requests: List["pb_utils.InferenceRequest"]) -> List["pb_utils.InferenceResponse"]:
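        # Decoupled mode: each request's response is sent later from the engine
        # loop via its response sender, so execute() returns None here rather
        # than a list of InferenceResponse objects.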
for request in requests:
asyncio.run_coroutine_threadsafe(
self._completions(request),
self._loop)
return None
def finalize(self):
self._shutdown_event.set()
if self._loop_thread is not None:
self._loop_thread.join()
self._loop_thread = None
def _engine_loop(self, loop):
asyncio.set_event_loop(loop)
self._loop.run_until_complete(self._await_shutdown())
async def _await_shutdown(self):
while self._shutdown_event.is_set() is False:
await asyncio.sleep(5)
for task in asyncio.all_tasks(loop=self._loop):
if task is not asyncio.current_task():
task.cancel()
async def _completions(self, request: "pb_utils.InferenceRequest"):
response_sender = request.get_response_sender()
params = json.loads(request.parameters())
try:
async with aiohttp.ClientSession() as session:
async with session.post("http://localhost:80/v1/completions", json=params) as response:
assert response.status == 200
response_json = await response.json()
response_str = json.dumps(response_json)
response_sender.send(pb_utils.InferenceResponse(output_tensors=[
pb_utils.Tensor("text_output", np.asarray(response_str, dtype=np.object_)),
]))
except Exception as e:
print(e)
finally:
response_sender.send(flags=pb_utils.TRITONSERVER_RESPONSE_COMPLETE_FINAL)
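For completeness: the set_model_transaction_policy call is commented out above, and initialize asserts decoupled mode, so the decoupled policy comes from the model's config.pbtxt. The relevant part looks roughly like this (a sketch, not the exact file):

backend: "python"
model_transaction_policy {
  decoupled: True
}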
Thanks for the additional information on the reproduction. I have filed a ticket for us to investigate further.
I'm having the same issue when sending multiple async requests to descendant models.
Description
I'm getting the following error when sending multiple async requests to my Python backend.
On the client side I get this error:
On the server side I'm getting:
Triton Information
I built Triton myself last week, so it's pretty up to date.
To Reproduce
The Python backend I'm using is pretty trivial; all it does is send an HTTP request to another endpoint with the request parameters.
My client code is something like:
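It is roughly modeled on the vllm_backend sample client. A simplified sketch, not the exact code (the model name, request count, and prompt are placeholders, and passing a "parameters" field through stream_infer assumes a tritonclient version that supports it):

import asyncio
import tritonclient.grpc.aio as grpcclient

MODEL_NAME = "my_model"   # placeholder
NUM_REQUESTS = 64         # placeholder

async def request_iterator():
    # The model declares no input tensors; the completion parameters are
    # attached as request parameters (assumes client-side parameters support).
    for i in range(NUM_REQUESTS):
        yield {
            "model_name": MODEL_NAME,
            "inputs": [],
            "request_id": str(i),
            "parameters": {"prompt": "The sky is", "max_tokens": 16},
        }

async def main():
    client = grpcclient.InferenceServerClient(url="localhost:8001")
    # The model is decoupled, so it has to be called over the gRPC streaming API.
    async for result, error in client.stream_infer(inputs_iterator=request_iterator()):
        if error is not None:
            print(error)
        else:
            print(result.as_numpy("text_output"))

asyncio.run(main())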
Expected behavior
No segfault
This thread looks related, but the proposed root cause doesn't look relevant to my case: https://github.com/triton-inference-server/server/issues/4491