import numpy as np
import onnx as ox
import onnx_tensorrt.backend as backend
import threading
model = ox.load("/workspace/256.onnx")
model = backend.prepare(model, device='CUDA:0', fp16_mode=False)
image = np.random.rand(1,3,256,256).astype(np.float32)
image = np.array(image, dtype=image.dtype, order='C')
def do_0():
out = model.run(image)[0]
def do_1():
out = model.run(image)[0]
t1 = threading.Thread(target=do_0)
t2 = threading.Thread(target=do_1)
t1.start()
t2.start()
t1.join()
t2.join()
Exception in thread Thread-11:
Traceback (most recent call last):
File "/usr/lib/python3.7/threading.py", line 926, in _bootstrap_inner
self.run()
File "/usr/lib/python3.7/threading.py", line 870, in run
self._target(*self._args, **self._kwargs)
File "<ipython-input-1-f21df806b74a>", line 12, in do_0
out = model.run(image)[0]
File "/usr/local/lib/python3.7/dist-packages/onnx_tensorrt-8.2.1-py3.7.egg/onnx_tensorrt/backend.py", line 158, in run
outputs = self.engine.run(inputs)
File "/usr/local/lib/python3.7/dist-packages/onnx_tensorrt-8.2.1-py3.7.egg/onnx_tensorrt/tensorrt_engine.py", line 144, in run
self.stream.synchronize()
pycuda._driver.LogicError: cuStreamSynchronize failed: an illegal memory access was encountered
Exception in thread Thread-12:
Traceback (most recent call last):
File "/usr/lib/python3.7/threading.py", line 926, in _bootstrap_inner
self.run()
File "/usr/lib/python3.7/threading.py", line 870, in run
self._target(*self._args, **self._kwargs)
File "<ipython-input-1-f21df806b74a>", line 15, in do_1
out = model.run(image)[0]
File "/usr/local/lib/python3.7/dist-packages/onnx_tensorrt-8.2.1-py3.7.egg/onnx_tensorrt/backend.py", line 158, in run
outputs = self.engine.run(inputs)
File "/usr/local/lib/python3.7/dist-packages/onnx_tensorrt-8.2.1-py3.7.egg/onnx_tensorrt/tensorrt_engine.py", line 144, in run
self.stream.synchronize()
pycuda._driver.LogicError: cuStreamSynchronize failed: an illegal memory access was encountered
Another example
from multiprocessing.pool import ThreadPool as Pool
import numpy as np
import onnx as ox
import onnx_tensorrt.backend as backend
model = ox.load("/workspace/256.onnx")
model = backend.prepare(model, device='CUDA:0', fp16_mode=False)
image = np.random.rand(1,3,256,256).astype(np.float32)
image = np.array(image, dtype=image.dtype, order='C')
items = [1, 2, 3, 4]
pool_size = 2
def worker():
try:
out = model.run(image)[0]
except Exception as e: print(e)
pool = Pool(pool_size)
for item in items:
pool.apply_async(worker, ())
pool.close()
pool.join()
cuStreamSynchronize failed: an illegal memory access was encounteredcuStreamSynchronize failed: an illegal memory access was encountered
cuMemcpyHtoDAsync failed: an illegal memory access was encountered
cuMemcpyHtoDAsync failed: an illegal memory access was encountered
I'm not sure if the onnx_tensorrt backend can support multithreading out of the box like the way you are using it. You probably need to follow the TensorRT APIs as mentioned here.
Description
I want to have multithreaded inference, since normal inference results in about 10% GPU usage, but it does not seem to work.
Steps To Reproduce
Here is my model and code.
256.onnx.zip
Another example
Environment
Tensorrt: 8.2.3-1+cuda11.4 onnx: 1.10.2 onnx-tensorrt: 8.2.1 onnxruntime: 1.10.0 onnxruntime-gpu: 1.10.0