Replace OpenAI GPT with another LLM in your app by changing a single line of code. Xinference gives you the freedom to use any LLM you need. With Xinference, you're empowered to run inference with any open-source language models, speech recognition models, and multimodal models, whether in the cloud, on-premises, or even on your laptop.
Traceback (most recent call last):
File "/home/xxx/miniconda3/envs/xinference-new/lib/python3.10/site-packages/xinference/core/utils.py", line 69, in wrapped
ret = await func(*args, **kwargs)
File "/home/xxx/miniconda3/envs/xinference-new/lib/python3.10/site-packages/xinference/core/model.py", line 560, in chat
response = await self._call_wrapper_json(
File "/home/xxx/miniconda3/envs/xinference-new/lib/python3.10/site-packages/xinference/core/model.py", line 407, in _call_wrapper_json
return await self._call_wrapper("json", fn, *args, **kwargs)
File "/home/xxx/miniconda3/envs/xinference-new/lib/python3.10/site-packages/xinference/core/model.py", line 120, in _async_wrapper
return await fn(*args, **kwargs)
File "/home/xxx/miniconda3/envs/xinference-new/lib/python3.10/site-packages/xinference/core/model.py", line 418, in _call_wrapper
ret = await asyncio.to_thread(fn, *args, **kwargs)
File "/home/xxx/miniconda3/envs/xinference-new/lib/python3.10/asyncio/threads.py", line 25, in to_thread
return await loop.run_in_executor(None, func_call)
File "/home/xxx/miniconda3/envs/xinference-new/lib/python3.10/concurrent/futures/thread.py", line 58, in run
result = self.fn(*self.args, **self.kwargs)
File "/home/xxx/miniconda3/envs/xinference-new/lib/python3.10/site-packages/xinference/model/llm/transformers/core.py", line 738, in chat
assert self.model_family.chat_template is not None
AssertionError
2024-09-20 14:43:04,622 xinference.api.restful_api 2997556 ERROR Chat completion stream got an error: [address=0.0.0.0:33637, pid=2999911]
Traceback (most recent call last):
File "/home/xxx/miniconda3/envs/xinference-new/lib/python3.10/site-packages/xinference/api/restful_api.py", line 1891, in stream_results
iterator = await model.chat(
File "/home/xxx/miniconda3/envs/xinference-new/lib/python3.10/site-packages/xoscar/backends/context.py", line 231, in send
return self._process_result_message(result)
File "/home/xxx/miniconda3/envs/xinference-new/lib/python3.10/site-packages/xoscar/backends/context.py", line 102, in _process_result_message
raise message.as_instanceof_cause()
File "/home/xxx/miniconda3/envs/xinference-new/lib/python3.10/site-packages/xoscar/backends/pool.py", line 656, in send
result = await self._run_coro(message.message_id, coro)
File "/home/xxx/miniconda3/envs/xinference-new/lib/python3.10/site-packages/xoscar/backends/pool.py", line 367, in _run_coro
return await coro
File "/home/xxx/miniconda3/envs/xinference-new/lib/python3.10/site-packages/xoscar/api.py", line 384, in __on_receive__
return await super().__on_receive__(message)  # type: ignore
File "xoscar/core.pyx", line 558, in __on_receive__
raise ex
File "xoscar/core.pyx", line 520, in xoscar.core._BaseActor.__on_receive__
async with self._lock:
File "xoscar/core.pyx", line 521, in xoscar.core._BaseActor.__on_receive__
with debug_async_timeout('actor_lock_timeout',
File "xoscar/core.pyx", line 526, in xoscar.core._BaseActor.__on_receive__
result = await result
File "/home/xxx/miniconda3/envs/xinference-new/lib/python3.10/site-packages/xinference/core/model.py", line 96, in wrapped_func
ret = await fn(self, *args, **kwargs)
File "/home/xxx/miniconda3/envs/xinference-new/lib/python3.10/site-packages/xoscar/api.py", line 462, in _wrapper
r = await func(self, *args, **kwargs)
File "/home/xxx/miniconda3/envs/xinference-new/lib/python3.10/site-packages/xinference/core/utils.py", line 69, in wrapped
ret = await func(*args, **kwargs)
File "/home/xxx/miniconda3/envs/xinference-new/lib/python3.10/site-packages/xinference/core/model.py", line 560, in chat
response = await self._call_wrapper_json(
File "/home/xxx/miniconda3/envs/xinference-new/lib/python3.10/site-packages/xinference/core/model.py", line 407, in _call_wrapper_json
return await self._call_wrapper("json", fn, *args, **kwargs)
File "/home/xxx/miniconda3/envs/xinference-new/lib/python3.10/site-packages/xinference/core/model.py", line 120, in _async_wrapper
return await fn(*args, **kwargs)
File "/home/xxx/miniconda3/envs/xinference-new/lib/python3.10/site-packages/xinference/core/model.py", line 418, in _call_wrapper
ret = await asyncio.to_thread(fn, *args, **kwargs)
File "/home/xxx/miniconda3/envs/xinference-new/lib/python3.10/asyncio/threads.py", line 25, in to_thread
return await loop.run_in_executor(None, func_call)
File "/home/xxx/miniconda3/envs/xinference-new/lib/python3.10/concurrent/futures/thread.py", line 58, in run
result = self.fn(*self.args, **self.kwargs)
File "/home/xxx/miniconda3/envs/xinference-new/lib/python3.10/site-packages/xinference/model/llm/transformers/core.py", line 738, in chat
assert self.model_family.chat_template is not None
AssertionError: [address=0.0.0.0:33637, pid=2999911]
Traceback (most recent call last):
File "/home/xxx/miniconda3/envs/xinference-new/lib/python3.10/site-packages/gradio/queueing.py", line 536, in process_events
response = await route_utils.call_process_api(
File "/home/xxx/miniconda3/envs/xinference-new/lib/python3.10/site-packages/gradio/route_utils.py", line 322, in call_process_api
output = await app.get_blocks().process_api(
File "/home/xxx/miniconda3/envs/xinference-new/lib/python3.10/site-packages/gradio/blocks.py", line 1935, in process_api
result = await self.call_function(
File "/home/xxx/miniconda3/envs/xinference-new/lib/python3.10/site-packages/gradio/blocks.py", line 1532, in call_function
prediction = await utils.async_iteration(iterator)
File "/home/xxx/miniconda3/envs/xinference-new/lib/python3.10/site-packages/gradio/utils.py", line 671, in async_iteration
return await iterator.__anext__()
File "/home/xxx/miniconda3/envs/xinference-new/lib/python3.10/site-packages/gradio/utils.py", line 776, in asyncgen_wrapper
response = await iterator.__anext__()
File "/home/xxx/miniconda3/envs/xinference-new/lib/python3.10/site-packages/gradio/chat_interface.py", line 653, in _stream_fn
first_response = await async_iteration(generator)
File "/home/xxx/miniconda3/envs/xinference-new/lib/python3.10/site-packages/gradio/utils.py", line 671, in async_iteration
return await iterator.__anext__()
File "/home/xxx/miniconda3/envs/xinference-new/lib/python3.10/site-packages/gradio/utils.py", line 664, in __anext__
return await anyio.to_thread.run_sync(
File "/home/xxx/miniconda3/envs/xinference-new/lib/python3.10/site-packages/anyio/to_thread.py", line 56, in run_sync
return await get_async_backend().run_sync_in_worker_thread(
File "/home/xxx/miniconda3/envs/xinference-new/lib/python3.10/site-packages/anyio/_backends/_asyncio.py", line 2357, in run_sync_in_worker_thread
return await future
File "/home/xxx/miniconda3/envs/xinference-new/lib/python3.10/site-packages/anyio/_backends/_asyncio.py", line 864, in run
result = context.run(func, *args)
File "/home/xxx/miniconda3/envs/xinference-new/lib/python3.10/site-packages/gradio/utils.py", line 647, in run_sync_iterator_async
return next(iterator)
File "/home/xxx/miniconda3/envs/xinference-new/lib/python3.10/site-packages/xinference/core/chat_interface.py", line 122, in generate_wrapper
for chunk in model.chat(
File "/home/xxx/miniconda3/envs/xinference-new/lib/python3.10/site-packages/xinference/client/common.py", line 51, in streaming_response_iterator
raise Exception(str(error))
Exception: [address=0.0.0.0:33637, pid=2999911]
具体终端错误输出如下:
Traceback (most recent call last):
File "/home/xxx/miniconda3/envs/xinference-new/lib/python3.10/site-packages/xinference/core/utils.py", line 69, in wrapped
ret = await func(*args, **kwargs)
File "/home/xxx/miniconda3/envs/xinference-new/lib/python3.10/site-packages/xinference/core/model.py", line 570, in chat
response = await self._call_wrapper_json(
File "/home/xxx/miniconda3/envs/xinference-new/lib/python3.10/site-packages/xinference/core/model.py", line 407, in _call_wrapper_json
return await self._call_wrapper("json", fn, *args, **kwargs)
File "/home/xxx/miniconda3/envs/xinference-new/lib/python3.10/site-packages/xinference/core/model.py", line 120, in _async_wrapper
return await fn(*args, **kwargs)
File "/home/xxx/miniconda3/envs/xinference-new/lib/python3.10/site-packages/xinference/core/model.py", line 416, in _call_wrapper
ret = await fn(*args, **kwargs)
File "/home/xxx/miniconda3/envs/xinference-new/lib/python3.10/site-packages/xinference/model/llm/vllm/core.py", line 674, in async_chat
assert self.model_family.chat_template is not None
AssertionError
2024-09-20 14:46:23,043 xinference.api.restful_api 2997556 ERROR Chat completion stream got an error: [address=0.0.0.0:41467, pid=3006589]
Traceback (most recent call last):
File "/home/xxx/miniconda3/envs/xinference-new/lib/python3.10/site-packages/xinference/api/restful_api.py", line 1891, in stream_results
iterator = await model.chat(
File "/home/xxx/miniconda3/envs/xinference-new/lib/python3.10/site-packages/xoscar/backends/context.py", line 231, in send
return self._process_result_message(result)
File "/home/xxx/miniconda3/envs/xinference-new/lib/python3.10/site-packages/xoscar/backends/context.py", line 102, in _process_result_message
raise message.as_instanceof_cause()
File "/home/xxx/miniconda3/envs/xinference-new/lib/python3.10/site-packages/xoscar/backends/pool.py", line 656, in send
result = await self._run_coro(message.message_id, coro)
File "/home/xxx/miniconda3/envs/xinference-new/lib/python3.10/site-packages/xoscar/backends/pool.py", line 367, in _run_coro
return await coro
File "/home/xxx/miniconda3/envs/xinference-new/lib/python3.10/site-packages/xoscar/api.py", line 384, in __on_receive__
return await super().__on_receive__(message)  # type: ignore
File "xoscar/core.pyx", line 558, in __on_receive__
raise ex
File "xoscar/core.pyx", line 520, in xoscar.core._BaseActor.__on_receive__
async with self._lock:
File "xoscar/core.pyx", line 521, in xoscar.core._BaseActor.__on_receive__
with debug_async_timeout('actor_lock_timeout',
File "xoscar/core.pyx", line 526, in xoscar.core._BaseActor.__on_receive__
result = await result
File "/home/xxx/miniconda3/envs/xinference-new/lib/python3.10/site-packages/xinference/core/model.py", line 96, in wrapped_func
ret = await fn(self, *args, **kwargs)
File "/home/xxx/miniconda3/envs/xinference-new/lib/python3.10/site-packages/xoscar/api.py", line 462, in _wrapper
r = await func(self, *args, **kwargs)
File "/home/xxx/miniconda3/envs/xinference-new/lib/python3.10/site-packages/xinference/core/utils.py", line 69, in wrapped
ret = await func(*args, **kwargs)
File "/home/xxx/miniconda3/envs/xinference-new/lib/python3.10/site-packages/xinference/core/model.py", line 570, in chat
response = await self._call_wrapper_json(
File "/home/xxx/miniconda3/envs/xinference-new/lib/python3.10/site-packages/xinference/core/model.py", line 407, in _call_wrapper_json
return await self._call_wrapper("json", fn, *args, **kwargs)
File "/home/xxx/miniconda3/envs/xinference-new/lib/python3.10/site-packages/xinference/core/model.py", line 120, in _async_wrapper
return await fn(*args, **kwargs)
File "/home/xxx/miniconda3/envs/xinference-new/lib/python3.10/site-packages/xinference/core/model.py", line 416, in _call_wrapper
ret = await fn(*args, **kwargs)
File "/home/xxx/miniconda3/envs/xinference-new/lib/python3.10/site-packages/xinference/model/llm/vllm/core.py", line 674, in async_chat
assert self.model_family.chat_template is not None
AssertionError: [address=0.0.0.0:41467, pid=3006589]
Traceback (most recent call last):
File "/home/xxx/miniconda3/envs/xinference-new/lib/python3.10/site-packages/gradio/queueing.py", line 536, in process_events
response = await route_utils.call_process_api(
File "/home/xxx/miniconda3/envs/xinference-new/lib/python3.10/site-packages/gradio/route_utils.py", line 322, in call_process_api
output = await app.get_blocks().process_api(
File "/home/xxx/miniconda3/envs/xinference-new/lib/python3.10/site-packages/gradio/blocks.py", line 1935, in process_api
result = await self.call_function(
File "/home/xxx/miniconda3/envs/xinference-new/lib/python3.10/site-packages/gradio/blocks.py", line 1532, in call_function
prediction = await utils.async_iteration(iterator)
File "/home/xxx/miniconda3/envs/xinference-new/lib/python3.10/site-packages/gradio/utils.py", line 671, in async_iteration
return await iterator.__anext__()
File "/home/xxx/miniconda3/envs/xinference-new/lib/python3.10/site-packages/gradio/utils.py", line 776, in asyncgen_wrapper
response = await iterator.__anext__()
File "/home/xxx/miniconda3/envs/xinference-new/lib/python3.10/site-packages/gradio/chat_interface.py", line 653, in _stream_fn
first_response = await async_iteration(generator)
File "/home/xxx/miniconda3/envs/xinference-new/lib/python3.10/site-packages/gradio/utils.py", line 671, in async_iteration
return await iterator.__anext__()
File "/home/xxx/miniconda3/envs/xinference-new/lib/python3.10/site-packages/gradio/utils.py", line 664, in __anext__
return await anyio.to_thread.run_sync(
File "/home/xxx/miniconda3/envs/xinference-new/lib/python3.10/site-packages/anyio/to_thread.py", line 56, in run_sync
return await get_async_backend().run_sync_in_worker_thread(
File "/home/xxx/miniconda3/envs/xinference-new/lib/python3.10/site-packages/anyio/_backends/_asyncio.py", line 2357, in run_sync_in_worker_thread
return await future
File "/home/xxx/miniconda3/envs/xinference-new/lib/python3.10/site-packages/anyio/_backends/_asyncio.py", line 864, in run
result = context.run(func, *args)
File "/home/xxx/miniconda3/envs/xinference-new/lib/python3.10/site-packages/gradio/utils.py", line 647, in run_sync_iterator_async
return next(iterator)
File "/home/xxx/miniconda3/envs/xinference-new/lib/python3.10/site-packages/xinference/core/chat_interface.py", line 122, in generate_wrapper
for chunk in model.chat(
File "/home/xxx/miniconda3/envs/xinference-new/lib/python3.10/site-packages/xinference/client/common.py", line 51, in streaming_response_iterator
raise Exception(str(error))
Exception: [address=0.0.0.0:41467, pid=3006589]
上述两个问题中都出现了 assert self.model_family.chat_template is not None 这个断言错误,怀疑可能是这个断言引起的?
System Info / 系統信息
CUDA==12.1 transformers == 4.44.2 llama_cpp_python == 0.2.90 vllm == 0.6.1.post2 vllm-flash-attn == 2.6.1
Python==3.10.14 Ubuntu==24.04
Running Xinference with Docker? / 是否使用 Docker 运行 Xinference?
Version info / 版本信息
Xinference==0.15.1
The command used to start Xinference / 用以启动 xinference 的命令
xinference-local --host 0.0.0.0 --port 9997
Reproduction / 复现过程
1、xinference-local --host 0.0.0.0 --port 9997 启动xinference 2、进入localhost:9997 前端界面 3、启动 Qwen2-7B-instruct (transformer启动模式)并进入 xinference 自带的推理前端后,输入任意文字报错 error [address=0.0.0.0:33637, pid=2999911] 终端输出如下:
Traceback (most recent call last): File "/home/xxx/miniconda3/envs/xinference-new/lib/python3.10/site-packages/xinference/core/utils.py", line 69, in wrapped ret = await func(*args, kwargs) File "/home/xxx/miniconda3/envs/xinference-new/lib/python3.10/site-packages/xinference/core/model.py", line 560, in chat response = await self._call_wrapper_json( File "/home/xxx/miniconda3/envs/xinference-new/lib/python3.10/site-packages/xinference/core/model.py", line 407, in _call_wrapper_json return await self._call_wrapper("json", fn, *args, *kwargs) File "/home/xxx/miniconda3/envs/xinference-new/lib/python3.10/site-packages/xinference/core/model.py", line 120, in _async_wrapper return await fn(args, kwargs) File "/home/xxx/miniconda3/envs/xinference-new/lib/python3.10/site-packages/xinference/core/model.py", line 418, in _call_wrapper ret = await asyncio.to_thread(fn, *args, kwargs) File "/home/xxx/miniconda3/envs/xinference-new/lib/python3.10/asyncio/threads.py", line 25, in to_thread return await loop.run_in_executor(None, func_call) File "/home/xxx/miniconda3/envs/xinference-new/lib/python3.10/concurrent/futures/thread.py", line 58, in run result = self.fn(*self.args, *self.kwargs) File "/home/xxx/miniconda3/envs/xinference-new/lib/python3.10/site-packages/xinference/model/llm/transformers/core.py", line 738, in chat assert self.model_family.chat_template is not None AssertionError 2024-09-20 14:43:04,622 xinference.api.restful_api 2997556 ERROR Chat completion stream got an error: [address=0.0.0.0:33637, pid=2999911] Traceback (most recent call last): File "/home/xxx/miniconda3/envs/xinference-new/lib/python3.10/site-packages/xinference/api/restful_api.py", line 1891, in stream_results iterator = await model.chat( File "/home/xxx/miniconda3/envs/xinference-new/lib/python3.10/site-packages/xoscar/backends/context.py", line 231, in send return self._process_result_message(result) File 
"/home/xxx/miniconda3/envs/xinference-new/lib/python3.10/site-packages/xoscar/backends/context.py", line 102, in _process_result_message raise message.as_instanceof_cause() File "/home/xxx/miniconda3/envs/xinference-new/lib/python3.10/site-packages/xoscar/backends/pool.py", line 656, in send result = await self._run_coro(message.message_id, coro) File "/home/xxx/miniconda3/envs/xinference-new/lib/python3.10/site-packages/xoscar/backends/pool.py", line 367, in _run_coro return await coro File "/home/xxx/miniconda3/envs/xinference-new/lib/python3.10/site-packages/xoscar/api.py", line 384, in on_receive return await super().on_receive(message) # type: ignore File "xoscar/core.pyx", line 558, in on_receive__ raise ex File "xoscar/core.pyx", line 520, in xoscar.core._BaseActor.on_receive async with self._lock: File "xoscar/core.pyx", line 521, in xoscar.core._BaseActor.on_receive with debug_async_timeout('actor_lock_timeout', File "xoscar/core.pyx", line 526, in xoscar.core._BaseActor.__on_receive result = await result File "/home/xxx/miniconda3/envs/xinference-new/lib/python3.10/site-packages/xinference/core/model.py", line 96, in wrapped_func ret = await fn(self, args, kwargs) File "/home/xxx/miniconda3/envs/xinference-new/lib/python3.10/site-packages/xoscar/api.py", line 462, in _wrapper r = await func(self, *args, kwargs) File "/home/xxx/miniconda3/envs/xinference-new/lib/python3.10/site-packages/xinference/core/utils.py", line 69, in wrapped ret = await func(*args, *kwargs) File "/home/xxx/miniconda3/envs/xinference-new/lib/python3.10/site-packages/xinference/core/model.py", line 560, in chat response = await self._call_wrapper_json( File "/home/xxx/miniconda3/envs/xinference-new/lib/python3.10/site-packages/xinference/core/model.py", line 407, in _call_wrapper_json return await self._call_wrapper("json", fn, args, kwargs) File "/home/xxx/miniconda3/envs/xinference-new/lib/python3.10/site-packages/xinference/core/model.py", line 120, in _async_wrapper return await 
fn(*args, kwargs) File "/home/xxx/miniconda3/envs/xinference-new/lib/python3.10/site-packages/xinference/core/model.py", line 418, in _call_wrapper ret = await asyncio.to_thread(fn, *args, *kwargs) File "/home/xxx/miniconda3/envs/xinference-new/lib/python3.10/asyncio/threads.py", line 25, in to_thread return await loop.run_in_executor(None, func_call) File "/home/xxx/miniconda3/envs/xinference-new/lib/python3.10/concurrent/futures/thread.py", line 58, in run result = self.fn(self.args, self.kwargs) File "/home/xxx/miniconda3/envs/xinference-new/lib/python3.10/site-packages/xinference/model/llm/transformers/core.py", line 738, in chat assert self.model_family.chat_template is not None AssertionError: [address=0.0.0.0:33637, pid=2999911] Traceback (most recent call last): File "/home/xxx/miniconda3/envs/xinference-new/lib/python3.10/site-packages/gradio/queueing.py", line 536, in process_events response = await route_utils.call_process_api( File "/home/xxx/miniconda3/envs/xinference-new/lib/python3.10/site-packages/gradio/route_utils.py", line 322, in call_process_api output = await app.get_blocks().process_api( File "/home/xxx/miniconda3/envs/xinference-new/lib/python3.10/site-packages/gradio/blocks.py", line 1935, in process_api result = await self.call_function( File "/home/xxx/miniconda3/envs/xinference-new/lib/python3.10/site-packages/gradio/blocks.py", line 1532, in call_function prediction = await utils.async_iteration(iterator) File "/home/xxx/miniconda3/envs/xinference-new/lib/python3.10/site-packages/gradio/utils.py", line 671, in async_iteration return await iterator.anext() File "/home/xxx/miniconda3/envs/xinference-new/lib/python3.10/site-packages/gradio/utils.py", line 776, in asyncgen_wrapper response = await iterator.anext() File "/home/xxx/miniconda3/envs/xinference-new/lib/python3.10/site-packages/gradio/chat_interface.py", line 653, in _stream_fn first_response = await async_iteration(generator) File 
"/home/xxx/miniconda3/envs/xinference-new/lib/python3.10/site-packages/gradio/utils.py", line 671, in async_iteration return await iterator.anext() File "/home/xxx/miniconda3/envs/xinference-new/lib/python3.10/site-packages/gradio/utils.py", line 664, in anext return await anyio.to_thread.run_sync( File "/home/xxx/miniconda3/envs/xinference-new/lib/python3.10/site-packages/anyio/to_thread.py", line 56, in run_sync return await get_async_backend().run_sync_in_worker_thread( File "/home/xxx/miniconda3/envs/xinference-new/lib/python3.10/site-packages/anyio/_backends/_asyncio.py", line 2357, in run_sync_in_worker_thread return await future File "/home/xxx/miniconda3/envs/xinference-new/lib/python3.10/site-packages/anyio/_backends/_asyncio.py", line 864, in run result = context.run(func, *args) File "/home/xxx/miniconda3/envs/xinference-new/lib/python3.10/site-packages/gradio/utils.py", line 647, in run_sync_iterator_async return next(iterator) File "/home/xxx/miniconda3/envs/xinference-new/lib/python3.10/site-packages/xinference/core/chat_interface.py", line 122, in generate_wrapper for chunk in model.chat( File "/home/xxx/miniconda3/envs/xinference-new/lib/python3.10/site-packages/xinference/client/common.py", line 51, in streaming_response_iterator raise Exception(str(error)) Exception: [address=0.0.0.0:33637, pid=2999911]
4、以VLLM模式启动 Qwen2-7b-instruct 后进入自带的前端对话页面,输入任意消息后仍然报错,无法得到正常回复
具体终端错误输出如下: Traceback (most recent call last): File "/home/xxx/miniconda3/envs/xinference-new/lib/python3.10/site-packages/xinference/core/utils.py", line 69, in wrapped ret = await func(*args, kwargs) File "/home/xxx/miniconda3/envs/xinference-new/lib/python3.10/site-packages/xinference/core/model.py", line 570, in chat response = await self._call_wrapper_json( File "/home/xxx/miniconda3/envs/xinference-new/lib/python3.10/site-packages/xinference/core/model.py", line 407, in _call_wrapper_json return await self._call_wrapper("json", fn, *args, *kwargs) File "/home/xxx/miniconda3/envs/xinference-new/lib/python3.10/site-packages/xinference/core/model.py", line 120, in _async_wrapper return await fn(args, kwargs) File "/home/xxx/miniconda3/envs/xinference-new/lib/python3.10/site-packages/xinference/core/model.py", line 416, in _call_wrapper ret = await fn(*args, kwargs) File "/home/xxx/miniconda3/envs/xinference-new/lib/python3.10/site-packages/xinference/model/llm/vllm/core.py", line 674, in async_chat assert self.model_family.chat_template is not None AssertionError 2024-09-20 14:46:23,043 xinference.api.restful_api 2997556 ERROR Chat completion stream got an error: [address=0.0.0.0:41467, pid=3006589] Traceback (most recent call last): File "/home/xxx/miniconda3/envs/xinference-new/lib/python3.10/site-packages/xinference/api/restful_api.py", line 1891, in stream_results iterator = await model.chat( File "/home/xxx/miniconda3/envs/xinference-new/lib/python3.10/site-packages/xoscar/backends/context.py", line 231, in send return self._process_result_message(result) File "/home/xxx/miniconda3/envs/xinference-new/lib/python3.10/site-packages/xoscar/backends/context.py", line 102, in _process_result_message raise message.as_instanceof_cause() File "/home/xxx/miniconda3/envs/xinference-new/lib/python3.10/site-packages/xoscar/backends/pool.py", line 656, in send result = await self._run_coro(message.message_id, coro) File 
"/home/xxx/miniconda3/envs/xinference-new/lib/python3.10/site-packages/xoscar/backends/pool.py", line 367, in _run_coro return await coro File "/home/xxx/miniconda3/envs/xinference-new/lib/python3.10/site-packages/xoscar/api.py", line 384, in on_receive return await super().on_receive(message) # type: ignore File "xoscar/core.pyx", line 558, in on_receive__ raise ex File "xoscar/core.pyx", line 520, in xoscar.core._BaseActor.on_receive async with self._lock: File "xoscar/core.pyx", line 521, in xoscar.core._BaseActor.on_receive with debug_async_timeout('actor_lock_timeout', File "xoscar/core.pyx", line 526, in xoscar.core._BaseActor.__on_receive result = await result File "/home/xxx/miniconda3/envs/xinference-new/lib/python3.10/site-packages/xinference/core/model.py", line 96, in wrapped_func ret = await fn(self, *args, *kwargs) File "/home/xxx/miniconda3/envs/xinference-new/lib/python3.10/site-packages/xoscar/api.py", line 462, in _wrapper r = await func(self, args, kwargs) File "/home/xxx/miniconda3/envs/xinference-new/lib/python3.10/site-packages/xinference/core/utils.py", line 69, in wrapped ret = await func(*args, kwargs) File "/home/xxx/miniconda3/envs/xinference-new/lib/python3.10/site-packages/xinference/core/model.py", line 570, in chat response = await self._call_wrapper_json( File "/home/xxx/miniconda3/envs/xinference-new/lib/python3.10/site-packages/xinference/core/model.py", line 407, in _call_wrapper_json return await self._call_wrapper("json", fn, *args, *kwargs) File "/home/xxx/miniconda3/envs/xinference-new/lib/python3.10/site-packages/xinference/core/model.py", line 120, in _async_wrapper return await fn(args, kwargs) File "/home/xxx/miniconda3/envs/xinference-new/lib/python3.10/site-packages/xinference/core/model.py", line 416, in _call_wrapper ret = await fn(*args, *kwargs) File "/home/xxx/miniconda3/envs/xinference-new/lib/python3.10/site-packages/xinference/model/llm/vllm/core.py", line 674, in async_chat assert self.model_family.chat_template 
is not None AssertionError: [address=0.0.0.0:41467, pid=3006589] Traceback (most recent call last): File "/home/xxx/miniconda3/envs/xinference-new/lib/python3.10/site-packages/gradio/queueing.py", line 536, in process_events response = await route_utils.call_process_api( File "/home/xxx/miniconda3/envs/xinference-new/lib/python3.10/site-packages/gradio/route_utils.py", line 322, in call_process_api output = await app.get_blocks().process_api( File "/home/xxx/miniconda3/envs/xinference-new/lib/python3.10/site-packages/gradio/blocks.py", line 1935, in process_api result = await self.call_function( File "/home/xxx/miniconda3/envs/xinference-new/lib/python3.10/site-packages/gradio/blocks.py", line 1532, in call_function prediction = await utils.async_iteration(iterator) File "/home/xxx/miniconda3/envs/xinference-new/lib/python3.10/site-packages/gradio/utils.py", line 671, in async_iteration return await iterator.anext() File "/home/xxx/miniconda3/envs/xinference-new/lib/python3.10/site-packages/gradio/utils.py", line 776, in asyncgen_wrapper response = await iterator.anext() File "/home/xxx/miniconda3/envs/xinference-new/lib/python3.10/site-packages/gradio/chat_interface.py", line 653, in _stream_fn first_response = await async_iteration(generator) File "/home/xxx/miniconda3/envs/xinference-new/lib/python3.10/site-packages/gradio/utils.py", line 671, in async_iteration return await iterator.anext() File "/home/xxx/miniconda3/envs/xinference-new/lib/python3.10/site-packages/gradio/utils.py", line 664, in anext return await anyio.to_thread.run_sync( File "/home/xxx/miniconda3/envs/xinference-new/lib/python3.10/site-packages/anyio/to_thread.py", line 56, in run_sync return await get_async_backend().run_sync_in_worker_thread( File "/home/xxx/miniconda3/envs/xinference-new/lib/python3.10/site-packages/anyio/_backends/_asyncio.py", line 2357, in run_sync_in_worker_thread return await future File 
"/home/xxx/miniconda3/envs/xinference-new/lib/python3.10/site-packages/anyio/_backends/_asyncio.py", line 864, in run result = context.run(func, args) File "/home/xxx/miniconda3/envs/xinference-new/lib/python3.10/site-packages/gradio/utils.py", line 647, in run_sync_iterator_async return next(iterator) File "/home/xxx/miniconda3/envs/xinference-new/lib/python3.10/site-packages/xinference/core/chat_interface.py", line 122, in generate_wrapper for chunk in model.chat( File "/home/xxx/miniconda3/envs/xinference-new/lib/python3.10/site-packages/xinference/client/common.py", line 51, in streaming_response_iterator raise Exception(str(error)) Exception: [address=0.0.0.0:41467, pid=3006589]
上述两个问题中都出现了 assert self.model_family.chat_template is not None 这个断言错误,怀疑可能是这个断言引起的?
我用 xinference=0.14.3 可以正常加载 Qwen2-7b-instruct ,但是由于0.14.3对于 openai 和 fastapi 版本的限制无法使用VLLM和SGLang模式进行推理。
Expected behavior / 期待表现
期望解决该bug