triton-inference-server / server

The Triton Inference Server provides an optimized cloud and edge inferencing solution.
https://docs.nvidia.com/deeplearning/triton-inference-server/user-guide/docs/index.html
BSD 3-Clause "New" or "Revised" License

Server goes down trying to predict on certain BERT-based, TensorRT-optimized model in TensorFlow SavedModel format #3781

Closed BorisPolonsky closed 1 year ago

BorisPolonsky commented 2 years ago

**Description**
I tried serving a model optimized with tensorflow.python.compiler.tensorrt.trt_convert.TrtGraphConverterV2. Upon receiving a prediction request from the client for that model, Triton went down with no log output on the server side. However, the original model in TensorFlow SavedModel format (before optimization with TrtGraphConverterV2) works fine on Triton Inference Server.

**Triton Information**
What version of Triton are you using? nvcr.io/nvidia/tritonserver:21.09-py3

Update: I just tested the TensorRT-optimized model with nvcr.io/nvidia/tritonserver:21.12-py3 and the problem is reproducible there as well.

Are you using the Triton container or did you build it yourself? I'm using the official container from NGC.

**To Reproduce**
Steps to reproduce the behavior:
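For context, the model under test was produced with TrtGraphConverterV2. A rough sketch of that kind of conversion step is shown below; the paths and the FP16 precision setting are illustrative assumptions (the issue does not include the actual conversion script), and the exact parameter names vary slightly across TensorFlow versions.

from tensorflow.python.compiler.tensorrt import trt_convert as trt

# Convert an existing SavedModel with TF-TRT (TF 2.x API); FP16 is assumed here.
params = trt.TrtConversionParams(precision_mode="FP16")
converter = trt.TrtGraphConverterV2(
    input_saved_model_dir="/path/to/original/savedmodel",        # placeholder path
    conversion_params=params)
converter.convert()
converter.save("/path/to/test_trt_fp16/1/model.savedmodel")      # placeholder path

The client script below then sends an inference request for the converted model: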

import argparse
import sys
import time

import numpy as np

import tritonclient.grpc as grpcclient


class InputFeature:
    def __init__(self, input_ids, segment_ids, input_mask):
        self.input_ids = input_ids
        self.segment_ids = segment_ids
        self.input_mask = input_mask


class Feature(InputFeature):
    def __init__(self, tokens, input_ids, segment_ids, input_mask, valid_length, clipped):
        super(Feature, self).__init__(input_ids, segment_ids, input_mask)
        self.tokens = tokens
        self.valid_length = valid_length
        self.clipped = clipped


if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('-v', '--verbose', action="store_true", required=False, default=False,
                        help='Enable verbose output')
    parser.add_argument('-u', '--url', type=str, required=False, default='localhost:8001',
                        help='Inference server URL. Default is localhost:8001.')
    parser.add_argument('-s', '--ssl', action="store_true", required=False, default=False,
                        help='Enable SSL encrypted channel to the server')
    parser.add_argument('-t', '--client-timeout', type=float, required=False, default=None,
                        help='Client timeout in seconds. Default is None.')
    parser.add_argument('-r', '--root-certificates', type=str, required=False, default=None,
                        help='File holding PEM-encoded root certificates. Default is None.')
    parser.add_argument('-p', '--private-key', type=str, required=False, default=None,
                        help='File holding PEM-encoded private key. Default is None.')
    parser.add_argument('-x', '--certificate-chain', type=str, required=False, default=None,
                        help='File holding PEM-encoded certificate chain. Default is None.')
    parser.add_argument('-C', '--grpc-compression-algorithm', type=str, required=False, default=None,
                        help='The compression algorithm to be used when sending request to server. Default is None.')

FLAGS = parser.parse_args()
try:
    triton_client = grpcclient.InferenceServerClient(
        url=FLAGS.url,
        verbose=FLAGS.verbose,
        ssl=FLAGS.ssl,
        root_certificates=FLAGS.root_certificates,
        private_key=FLAGS.private_key,
        certificate_chain=FLAGS.certificate_chain)
except Exception as e:
    print("channel creation failed: " + str(e))
    sys.exit()

model_name = "test_trt_fp16"
max_seq_length = 128
N_TAG = 61
N_PREDICATE = 49
PREDICATE_LABELS = [str(i) for i in range(N_PREDICATE)]
TOKEN_LABELS = [str(i) for i in range(N_TAG)]
input_ids = [101, 517, 6375, 2094, 2486, 7607, 518, 3221, 4507, 2002, 3152, 2809, 2193, 8024, 2002, 3152, 510, 1453,
             3883, 1355,
             510, 5867, 831, 510, 1155, 1649, 4386, 510, 7357, 1787, 510, 1453, 7510, 510, 2445, 1127, 510, 2002,
             3636, 5023,
             712, 4028, 4638, 1196, 2658, 4275, 8024, 754, 8166, 2399, 8110, 3299, 8121, 3189, 1762, 704, 1744,
             1920, 7355, 677,
             3216, 511, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
             0, 0, 0, 0,
             0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
segment_ids = [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
               0, 0, 0, 0,
               0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
               0, 0, 0, 0,
               0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
               0, 0, 0, 0,
               0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
input_mask = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
              1, 1, 1, 1,
              1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
              0, 0, 0, 0,
              0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
              0, 0, 0, 0,
              0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
features = [
    Feature(tokens=None, input_ids=input_ids, segment_ids=segment_ids, input_mask=input_mask, valid_length=61,
            clipped=False)]

batch_size = len(features)
input_ids, segment_ids, input_mask = [], [], []
for feature in features:
    input_ids.append(feature.input_ids)
    segment_ids.append(feature.segment_ids)
    input_mask.append(feature.input_mask)
input_ids_val = np.array(input_ids, dtype=np.int64)
segment_ids_val = np.array(segment_ids, dtype=np.int64)
input_mask_val = np.array(input_mask, dtype=np.int64)

# Infer
input_ids = grpcclient.InferInput('input_ids', [batch_size, max_seq_length], "INT64")
segment_ids = grpcclient.InferInput('segment_ids', [batch_size, max_seq_length], "INT64")
input_mask = grpcclient.InferInput('input_mask', [batch_size, max_seq_length], "INT64")
input_ids.set_data_from_numpy(input_ids_val)
segment_ids.set_data_from_numpy(segment_ids_val)
input_mask.set_data_from_numpy(input_mask_val)
inputs = [input_ids, segment_ids, input_mask]
del input_ids_val, segment_ids_val, input_mask_val

outputs = []
outputs.append(grpcclient.InferRequestedOutput('predicate_head_probabilities'))
outputs.append(grpcclient.InferRequestedOutput('token_label_predictions'))

# Test with outputs
tic = time.time()
results = triton_client.infer(
    model_name=model_name,
    inputs=inputs,
    outputs=outputs,
    client_timeout=FLAGS.client_timeout,
    headers={'test': '1'},
    compression_algorithm=FLAGS.grpc_compression_algorithm)
toc = time.time()
print("throughput: {}/s".format(batch_size / (toc - tic)))
statistics = triton_client.get_inference_statistics(model_name=model_name)
print(statistics)
if len(statistics.model_stats) != 1:
    print("FAILED: Inference Statistics")
    sys.exit(1)

# Test with no outputs
results = triton_client.infer(
    model_name=model_name,
    inputs=inputs,
    outputs=None,
    compression_algorithm=FLAGS.grpc_compression_algorithm)

# Get the output arrays from the results
predicate_head_prob = results.as_numpy('predicate_head_probabilities')
token_labels = results.as_numpy('token_label_predictions')

print('PASS: infer')
Log on client side:

Traceback (most recent call last):
  File "/home/polonsky/Documents/Multiple-Relations-Extraction-Only-Look-Once/client/test_infer_hangup.py", line 154, in <module>
    results = triton_client.infer(
  File "/usr/local/lib/python3.8/dist-packages/tritonclient/grpc/__init__.py", line 1146, in infer
    raise_error_grpc(rpc_error)
  File "/usr/local/lib/python3.8/dist-packages/tritonclient/grpc/__init__.py", line 62, in raise_error_grpc
    raise get_error_grpc(rpc_error) from None
tritonclient.utils.InferenceServerException: [StatusCode.UNAVAILABLE] failed to connect to all addresses

Process finished with exit code 1

Log on server side

=============================
== Triton Inference Server ==
=============================

NVIDIA Release 21.09 (build 27443074)

Copyright (c) 2018-2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved.

Various files include modifications (c) NVIDIA CORPORATION. All rights reserved.

This container image and its contents are governed by the NVIDIA Deep Learning Container License. By pulling and using the container, you accept the terms and conditions of this license: https://developer.nvidia.com/ngc/nvidia-deep-learning-container-license

I0109 03:46:03.006015 1 metrics.cc:290] Collecting metrics for GPU 0: NVIDIA GeForce GTX 1050
I0109 03:46:03.535745 1 libtorch.cc:1030] TRITONBACKEND_Initialize: pytorch
I0109 03:46:03.535772 1 libtorch.cc:1040] Triton TRITONBACKEND API version: 1.5
I0109 03:46:03.535777 1 libtorch.cc:1046] 'pytorch' TRITONBACKEND API version: 1.5
2022-01-09 11:46:03.774912: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcudart.so.11.0
I0109 03:46:03.899955 1 tensorflow.cc:2170] TRITONBACKEND_Initialize: tensorflow
I0109 03:46:03.900016 1 tensorflow.cc:2180] Triton TRITONBACKEND API version: 1.5
I0109 03:46:03.900209 1 tensorflow.cc:2186] 'tensorflow' TRITONBACKEND API version: 1.5
I0109 03:46:03.900246 1 tensorflow.cc:2210] backend configuration: {"cmdline":{"allow-soft-placement":"true"}}
I0109 03:46:03.910667 1 onnxruntime.cc:1997] TRITONBACKEND_Initialize: onnxruntime
I0109 03:46:03.910692 1 onnxruntime.cc:2007] Triton TRITONBACKEND API version: 1.5
I0109 03:46:03.910874 1 onnxruntime.cc:2013] 'onnxruntime' TRITONBACKEND API version: 1.5
I0109 03:46:03.979783 1 openvino.cc:1193] TRITONBACKEND_Initialize: openvino
I0109 03:46:03.979859 1 openvino.cc:1203] Triton TRITONBACKEND API version: 1.5
I0109 03:46:03.979876 1 openvino.cc:1209] 'openvino' TRITONBACKEND API version: 1.5
I0109 03:46:04.143979 1 pinned_memory_manager.cc:240] Pinned memory pool is created at '0x7f4174000000' with size 268435456
I0109 03:46:04.144792 1 cuda_memory_manager.cc:105] CUDA memory pool is created on device 0 with size 67108864
I0109 03:46:04.403606 1 model_repository_manager.cc:1022] loading: test_trt_fp16:1
I0109 03:46:04.527032 1 tensorflow.cc:2270] TRITONBACKEND_ModelInitialize: test_trt_fp16 (version 1)
I0109 03:46:04.532030 1 tensorflow.cc:2319] TRITONBACKEND_ModelInstanceInitialize: test_trt_fp16_0 (MODEL device 0)
2022-01-09 11:46:04.532290: I tensorflow/cc/saved_model/reader.cc:31] Reading SavedModel from: /home/model-repo/test_trt_fp16/1/model.savedmodel
W0109 03:46:05.007673 1 metrics.cc:396] Unable to get power limit for GPU 0. Status:Success, value:0.000000
W0109 03:46:05.007751 1 metrics.cc:414] Unable to get power usage for GPU 0. Status:Success, value:0.000000
W0109 03:46:05.007768 1 metrics.cc:438] Unable to get energy consumption for GPU 0. Status:Success, value:0
W0109 03:46:07.008555 1 metrics.cc:396] Unable to get power limit for GPU 0. Status:Success, value:0.000000
W0109 03:46:07.008631 1 metrics.cc:414] Unable to get power usage for GPU 0. Status:Success, value:0.000000
W0109 03:46:07.008650 1 metrics.cc:438] Unable to get energy consumption for GPU 0. Status:Success, value:0
W0109 03:46:09.009542 1 metrics.cc:396] Unable to get power limit for GPU 0. Status:Success, value:0.000000
W0109 03:46:09.009617 1 metrics.cc:414] Unable to get power usage for GPU 0. Status:Success, value:0.000000
W0109 03:46:09.009635 1 metrics.cc:438] Unable to get energy consumption for GPU 0. Status:Success, value:0
2022-01-09 11:46:20.218543: I tensorflow/cc/saved_model/reader.cc:54] Reading meta graph with tags { serve }
2022-01-09 11:46:20.448866: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcuda.so.1
2022-01-09 11:46:20.449068: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:1082] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-01-09 11:46:20.449459: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1666] Found device 0 with properties: name: NVIDIA GeForce GTX 1050 major: 6 minor: 1 memoryClockRate(GHz): 1.493 pciBusID: 0000:01:00.0
2022-01-09 11:46:20.449530: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcudart.so.11.0
2022-01-09 11:46:20.449589: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcublas.so.11
2022-01-09 11:46:20.449621: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcufft.so.10
2022-01-09 11:46:20.449647: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcurand.so.10
2022-01-09 11:46:20.449680: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcusolver.so.11
2022-01-09 11:46:20.449698: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcusparse.so.11
2022-01-09 11:46:20.449757: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcudnn.so.8
2022-01-09 11:46:20.449831: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:1082] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-01-09 11:46:20.450155: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:1082] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-01-09 11:46:20.450427: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1794] Adding visible gpu devices: 0
2022-01-09 11:46:25.775391: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1206] Device interconnect StreamExecutor with strength 1 edge matrix:
2022-01-09 11:46:25.775430: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1212] 0
2022-01-09 11:46:25.775456: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1225] 0: N
2022-01-09 11:46:25.775644: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:1082] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-01-09 11:46:25.775983: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:1082] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-01-09 11:46:25.776369: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:1082] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-01-09 11:46:25.776624: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1351] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 2592 MB memory) -> physical GPU (device: 0, name: NVIDIA GeForce GTX 1050, pci bus id: 0000:01:00.0, compute capability: 6.1)
2022-01-09 11:46:25.794300: I tensorflow/compiler/xla/service/service.cc:168] XLA service 0x7f40d03de410 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
2022-01-09 11:46:25.794356: I tensorflow/compiler/xla/service/service.cc:176] StreamExecutor device (0): NVIDIA GeForce GTX 1050, Compute Capability 6.1
2022-01-09 11:46:25.815131: I tensorflow/core/platform/profile_utils/cpu_utils.cc:94] CPU Frequency: 2499950000 Hz
2022-01-09 11:46:25.815905: I tensorflow/compiler/xla/service/service.cc:168] XLA service 0x7f40d0a61320 initialized for platform Host (this does not guarantee that XLA will be used). Devices:
2022-01-09 11:46:25.815980: I tensorflow/compiler/xla/service/service.cc:176] StreamExecutor device (0): Host, Default Version
2022-01-09 11:46:27.075906: I tensorflow/cc/saved_model/loader.cc:251] Restoring SavedModel bundle.
2022-01-09 11:46:27.075995: I tensorflow/cc/saved_model/loader.cc:261] The specified SavedModel has no variables; no checkpoints were restored. File does not exist: /home/model-repo/test_trt_fp16/1/model.savedmodel/variables/variables.index
2022-01-09 11:46:27.076026: I tensorflow/cc/saved_model/loader.cc:200] Running initialization op on SavedModel bundle at path: /home/model-repo/test_trt_fp16/1/model.savedmodel
2022-01-09 11:46:28.276605: W tensorflow/core/framework/cpu_allocator_impl.cc:81] Allocation of 64905216 exceeds 10% of system memory.
2022-01-09 11:46:28.324683: W tensorflow/core/framework/cpu_allocator_impl.cc:81] Allocation of 64905216 exceeds 10% of system memory.
2022-01-09 11:46:29.360931: I tensorflow/cc/saved_model/loader.cc:379] SavedModel load for tags { serve }; Status: success. Took 24828486 microseconds.
2022-01-09 11:46:29.361024: W triton/tensorflow_backend_tf.cc:986] unable to find serving signature 'predict
2022-01-09 11:46:29.361032: W triton/tensorflow_backend_tf.cc:988] using signature 'serving_default'
I0109 03:46:29.363139 1 model_repository_manager.cc:1183] successfully loaded 'test_trt_fp16' version 1
I0109 03:46:29.365622 1 server.cc:519]
+------------------+------+
| Repository Agent | Path |
+------------------+------+
+------------------+------+

I0109 03:46:29.366083 1 server.cc:546]
+-------------+------------------------------------------------------------------+---------------------------------------------+
| Backend     | Path                                                             | Config                                      |
+-------------+------------------------------------------------------------------+---------------------------------------------+
| pytorch     | /opt/tritonserver/backends/pytorch/libtriton_pytorch.so         | {}                                          |
| tensorflow  | /opt/tritonserver/backends/tensorflow1/libtriton_tensorflow1.so | {"cmdline":{"allow-soft-placement":"true"}} |
| onnxruntime | /opt/tritonserver/backends/onnxruntime/libtriton_onnxruntime.so | {}                                          |
| openvino    | /opt/tritonserver/backends/openvino/libtriton_openvino.so       | {}                                          |
+-------------+------------------------------------------------------------------+---------------------------------------------+

I0109 03:46:29.366406 1 server.cc:589]
+---------------+---------+--------+
| Model         | Version | Status |
+---------------+---------+--------+
| test_trt_fp16 | 1       | READY  |
+---------------+---------+--------+

I0109 03:46:29.366642 1 tritonserver.cc:1836]
+----------------------------------+------------------------------------------------------------------------------------------+
| Option                           | Value                                                                                    |
+----------------------------------+------------------------------------------------------------------------------------------+
| server_id                        | triton |
| server_version                   | 2.14.0 |
| server_extensions                | classification sequence model_repository model_repository(unload_dependents) schedule_policy model_configuration system_shared_memory cuda_shared_memory binary_tensor_data statistics |
| model_repository_path[0]         | /home/model-repo |
| model_control_mode               | MODE_NONE |
| strict_model_config              | 1 |
| rate_limit                       | OFF |
| pinned_memory_pool_byte_size     | 268435456 |
| cuda_memory_pool_byte_size{0}    | 67108864 |
| min_supported_compute_capability | 6.0 |
| strict_readiness                 | 1 |
| exit_timeout                     | 30 |
+----------------------------------+------------------------------------------------------------------------------------------+

I0109 03:46:29.373515 1 grpc_server.cc:4111] Started GRPCInferenceService at 0.0.0.0:8001
I0109 03:46:29.374974 1 http_server.cc:2803] Started HTTPService at 0.0.0.0:8000
I0109 03:46:29.420320 1 http_server.cc:162] Started Metrics Service at 0.0.0.0:800


**Expected behavior**
The model server should either produce outputs normally or raise an exception describing the problem with the model. At a minimum, the server should stay alive and continue to serve prediction requests for other models instead of crashing.
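One way to verify that expectation after sending the failing request is to probe server and model health from a separate client. The following is a minimal sketch, not part of the original report; the endpoint and model name are taken from the reproduction script above.

import tritonclient.grpc as grpcclient

# Assumed endpoint; adjust to match the deployment.
client = grpcclient.InferenceServerClient(url="localhost:8001")

# If the server process has crashed, these calls raise a connection error
# instead of returning False.
try:
    print("server live: ", client.is_server_live())
    print("server ready:", client.is_server_ready())
    print("model ready: ", client.is_model_ready("test_trt_fp16"))
except Exception as e:
    print("server unreachable:", e)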
ali-mirza1 commented 2 years ago

Facing the same issue.

KyloRen1 commented 2 years ago

Me as well

BorisPolonsky commented 2 years ago

> Facing the same issue.

> Me as well

May I ask if you have new models that replicate the issue, or are you using the model provided in this post?

KyloRen1 commented 2 years ago

> Facing the same issue.

> Me as well

> May I ask if you have new models that replicate the issue, or are you using the model provided in this post?

New model with the same issue.

ali-mirza1 commented 2 years ago

> Facing the same issue.

> Me as well

> May I ask if you have new models that replicate the issue, or are you using the model provided in this post?

New models, but exactly the same issue.

v1nc3nt27 commented 2 years ago

I've got the same issue, same container version, custom model. HTTP is working fine. Did you find any solution?

Using the code from https://github.com/triton-inference-server/client/blob/main/src/python/examples/grpc_client.py:

https://github.com/triton-inference-server/client/blob/f0f2b89b3b8418a0a79a8dd60329a8b99489268b/src/python/examples/grpc_client.py#L58-L70

I get

grpc._channel._InactiveRpcError: <_InactiveRpcError of RPC that terminated with:
    status = StatusCode.UNAVAILABLE
    details = "failed to connect to all addresses"
    debug_error_string = "{"created":"@1645635102.216682314","description":"Failed to pick subchannel","file":"src/core/ext/filters/client_channel/client_channel.cc","file_line":3093,"referenced_errors":[{"created":"@1645635102.216682027","description":"failed to connect to all addresses","file":"src/core/lib/transport/error_utils.cc","file_line":163,"grpc_status":14}]}"
>

Update: The reason it was not working for me was that I ran the script sending requests to the server and the server itself on the same machine (see also: https://stackoverflow.com/questions/65854022/grpc-failed-to-pick-subchannel-if-server-and-client-are-hosted-on-different-ma)
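For anyone comparing the two protocols, a quick liveness check over HTTP can help isolate connectivity problems from inference problems. A minimal sketch using tritonclient.http, assuming the default HTTP port 8000:

import tritonclient.http as httpclient

# Assumes Triton's default HTTP endpoint on port 8000.
client = httpclient.InferenceServerClient(url="localhost:8000")
print("server live: ", client.is_server_live())
print("server ready:", client.is_server_ready())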

BorisPolonsky commented 2 years ago

> I've got the same issue, same container version, custom model. HTTP is working fine. Did you find any solution?
>
> Using the code from https://github.com/triton-inference-server/client/blob/main/src/python/examples/grpc_client.py:
>
> https://github.com/triton-inference-server/client/blob/f0f2b89b3b8418a0a79a8dd60329a8b99489268b/src/python/examples/grpc_client.py#L58-L70
>
> I get
>
> grpc._channel._InactiveRpcError: <_InactiveRpcError of RPC that terminated with:
>   status = StatusCode.UNAVAILABLE
>   details = "failed to connect to all addresses"
>   debug_error_string = "{"created":"@1645635102.216682314","description":"Failed to pick subchannel","file":"src/core/ext/filters/client_channel/client_channel.cc","file_line":3093,"referenced_errors":[{"created":"@1645635102.216682027","description":"failed to connect to all addresses","file":"src/core/lib/transport/error_utils.cc","file_line":163,"grpc_status":14}]}"
> >
>
> Update: The reason it was not working for me was that I ran the script sending requests to the server and the server itself on the same machine (see also: https://stackoverflow.com/questions/65854022/grpc-failed-to-pick-subchannel-if-server-and-client-are-hosted-on-different-ma)

From my understanding, the server process did not go down in your case, which is not quite the same as what I and the other developers in the posts above reported. Anyway, I'm glad you got it solved. :)

v1nc3nt27 commented 2 years ago

@BorisPolonsky I completely missed that, my bad. But perhaps you can use the lines I posted to get more detailed information about what the gRPC error is.

BorisPolonsky commented 2 years ago

I compared the TRT-optimized model and another model trained with the same code but a slightly different configuration (i.e., the output shape) via the /v1/models/<model_name>/metadata API of tensorflow/serving:2.8.0-gpu. Here's what I got.

Original model

{
"model_spec":{
 "name": "bert_mre",
 "signature_name": "",
 "version": "1"
}
,
"metadata": {"signature_def": {
 "signature_def": {
  "serving_default": {
   "inputs": {
    "segment_ids": {
     "dtype": "DT_INT64",
     "tensor_shape": {
      "dim": [
       {
        "size": "-1",
        "name": ""
       },
       {
        "size": "256",
        "name": ""
       }
      ],
      "unknown_rank": false
     },
     "name": "segment_ids:0"
    },
    "input_mask": {
     "dtype": "DT_INT64",
     "tensor_shape": {
      "dim": [
       {
        "size": "-1",
        "name": ""
       },
       {
        "size": "256",
        "name": ""
       }
      ],
      "unknown_rank": false
     },
     "name": "input_mask:0"
    },
    "input_ids": {
     "dtype": "DT_INT64",
     "tensor_shape": {
      "dim": [
       {
        "size": "-1",
        "name": ""
       },
       {
        "size": "256",
        "name": ""
       }
      ],
      "unknown_rank": false
     },
     "name": "input_ids:0"
    }
   },
   "outputs": {
    "token_label_predictions": {
     "dtype": "DT_INT64",
     "tensor_shape": {
      "dim": [
       {
        "size": "-1",
        "name": ""
       },
       {
        "size": "256",
        "name": ""
       }
      ],
      "unknown_rank": false
     },
     "name": "token_label_loss/ArgMax:0"
    },
    "predicate_head_probabilities": {
     "dtype": "DT_FLOAT",
     "tensor_shape": {
      "dim": [
       {
        "size": "-1",
        "name": ""
       },
       {
        "size": "256",
        "name": ""
       },
       {
        "size": "256",
        "name": ""
       },
       {
        "size": "49",
        "name": ""
       }
      ],
      "unknown_rank": false
     },
     "name": "predicate_head_select_loss/Sigmoid:0"
    }
   },
   "method_name": "tensorflow/serving/predict"
  }
 }
}
}
}

TRT-optimized model

{
"model_spec":{
 "name": "bert_mre_fp16_test",
 "signature_name": "",
 "version": "1"
}
,
"metadata": {"signature_def": {
 "signature_def": {
  "serving_default": {
   "inputs": {
    "segment_ids": {
     "dtype": "DT_INT64",
     "tensor_shape": {
      "dim": [
       {
        "size": "-1",
        "name": ""
       },
       {
        "size": "128",
        "name": ""
       }
      ],
      "unknown_rank": false
     },
     "name": "serving_default_segment_ids:0"
    },
    "input_mask": {
     "dtype": "DT_INT64",
     "tensor_shape": {
      "dim": [
       {
        "size": "-1",
        "name": ""
       },
       {
        "size": "128",
        "name": ""
       }
      ],
      "unknown_rank": false
     },
     "name": "serving_default_input_mask:0"
    },
    "input_ids": {
     "dtype": "DT_INT64",
     "tensor_shape": {
      "dim": [
       {
        "size": "-1",
        "name": ""
       },
       {
        "size": "128",
        "name": ""
       }
      ],
      "unknown_rank": false
     },
     "name": "serving_default_input_ids:0"
    }
   },
   "outputs": {
    "token_label_predictions": {
     "dtype": "DT_INT64",
     "tensor_shape": {
      "dim": [
       {
        "size": "-1",
        "name": ""
       },
       {
        "size": "128",
        "name": ""
       }
      ],
      "unknown_rank": false
     },
     "name": "StatefulPartitionedCall:1"
    },
    "predicate_head_probabilities": {
     "dtype": "DT_FLOAT",
     "tensor_shape": {
      "dim": [
       {
        "size": "-1",
        "name": ""
       },
       {
        "size": "128",
        "name": ""
       },
       {
        "size": "128",
        "name": ""
       },
       {
        "size": "49",
        "name": ""
       }
      ],
      "unknown_rank": false
     },
     "name": "StatefulPartitionedCall:0"
    }
   },
   "method_name": "tensorflow/serving/predict"
  },
  "__saved_model_init_op": {
   "inputs": {},
   "outputs": {
    "__saved_model_init_op": {
     "dtype": "DT_INVALID",
     "tensor_shape": {
      "dim": [],
      "unknown_rank": true
     },
     "name": "NoOp"
    }
   },
   "method_name": ""
  }
 }
}
}
}
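For reference, metadata like the above comes from TensorFlow Serving's REST metadata endpoint. A sketch of such a query, assuming the requests package is installed and TF Serving is listening on its default REST port 8501:

import requests

# Fetch the signature metadata for both models shown above.
for model_name in ("bert_mre", "bert_mre_fp16_test"):
    resp = requests.get(f"http://localhost:8501/v1/models/{model_name}/metadata")
    print(resp.json())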

There is a "__saved_model_init_op" signature whose output has "dtype": "DT_INVALID" in the TRT-optimized version. I don't know whether this is what causes the problem.
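The same signatures can also be dumped without standing up TensorFlow Serving by loading the SavedModel directly. A sketch assuming TF 2.x and a local copy of the model directory (the path is a placeholder):

import tensorflow as tf

saved_model_dir = "/path/to/bert_mre_fp16_test/1"  # placeholder path
loaded = tf.saved_model.load(saved_model_dir, tags=["serve"])
for name, fn in loaded.signatures.items():
    print(name)
    print("  inputs: ", fn.structured_input_signature)
    print("  outputs:", fn.structured_outputs)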

deadeyegoodwin commented 2 years ago

Does the model run correctly outside of Triton? For example, does it run correctly with trtexec?
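For a TF-TRT converted SavedModel (as opposed to a standalone TensorRT engine), one way to exercise it outside Triton is to run it directly with TensorFlow. A sketch with dummy zero inputs matching the int64 [-1, 128] signature reported above; the path is a placeholder:

import numpy as np
import tensorflow as tf

saved_model_dir = "/path/to/test_trt_fp16/1/model.savedmodel"  # placeholder path
model = tf.saved_model.load(saved_model_dir, tags=["serve"])
infer = model.signatures["serving_default"]

# Dummy batch matching the int64 [-1, 128] inputs from the metadata.
batch = {
    name: tf.constant(np.zeros((1, 128), dtype=np.int64))
    for name in ("input_ids", "segment_ids", "input_mask")
}
outputs = infer(**batch)
print({name: tensor.shape for name, tensor in outputs.items()})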

dyastremsky commented 1 year ago

Closing issue due to inactivity. Please let us know if you would like to reopen this issue for follow-up.