2024-05-13 15:18:22,138 xinference.api.restful_api 7136 ERROR Chat completion stream got an error: [address=192.168.17.20:53022, pid=15324] Tensor on device meta is not on the expected device cuda:0!
Traceback (most recent call last):
  File "D:\anaconda\envs\xinference-vl\lib\site-packages\xinference\api\restful_api.py", line 1365, in stream_results
    async for item in iterator:
  File "D:\anaconda\envs\xinference-vl\lib\site-packages\xoscar\api.py", line 340, in __anext__
    return await self._actor_ref.__xoscar_next__(self._uid)
  File "D:\anaconda\envs\xinference-vl\lib\site-packages\xoscar\backends\context.py", line 227, in send
    return self._process_result_message(result)
  File "D:\anaconda\envs\xinference-vl\lib\site-packages\xoscar\backends\context.py", line 102, in _process_result_message
    raise message.as_instanceof_cause()
  File "D:\anaconda\envs\xinference-vl\lib\site-packages\xoscar\backends\pool.py", line 659, in send
    result = await self._run_coro(message.message_id, coro)
  File "D:\anaconda\envs\xinference-vl\lib\site-packages\xoscar\backends\pool.py", line 370, in _run_coro
    return await coro
  File "D:\anaconda\envs\xinference-vl\lib\site-packages\xoscar\api.py", line 384, in __on_receive__
    return await super().__on_receive__(message) # type: ignore
  File "xoscar\\core.pyx", line 558, in __on_receive__
    raise ex
  File "xoscar\\core.pyx", line 520, in xoscar.core._BaseActor.__on_receive__
    async with self._lock:
  File "xoscar\\core.pyx", line 521, in xoscar.core._BaseActor.__on_receive__
    with debug_async_timeout('actor_lock_timeout',
  File "xoscar\\core.pyx", line 526, in xoscar.core._BaseActor.__on_receive__
    result = await result
  File "D:\anaconda\envs\xinference-vl\lib\site-packages\xoscar\api.py", line 431, in __xoscar_next__
    raise e
  File "D:\anaconda\envs\xinference-vl\lib\site-packages\xoscar\api.py", line 417, in __xoscar_next__
    r = await asyncio.to_thread(_wrapper, gen)
  File "D:\anaconda\envs\xinference-vl\lib\asyncio\threads.py", line 25, in to_thread
    return await loop.run_in_executor(None, func_call)
  File "D:\anaconda\envs\xinference-vl\lib\concurrent\futures\thread.py", line 58, in run
    result = self.fn(*self.args, **self.kwargs)
  File "D:\anaconda\envs\xinference-vl\lib\site-packages\xoscar\api.py", line 402, in _wrapper
    return next(_gen)
  File "D:\anaconda\envs\xinference-vl\lib\site-packages\xinference\core\model.py", line 257, in _to_json_generator
    for v in gen:
  File "D:\anaconda\envs\xinference-vl\lib\site-packages\xinference\model\llm\utils.py", line 521, in _to_chat_completion_chunks
    for i, chunk in enumerate(chunks):
  File "D:\anaconda\envs\xinference-vl\lib\site-packages\xinference\model\llm\pytorch\qwen_vl.py", line 173, in _generate_stream
    for response in response_generator:
  File "C:\Users\zzj\.cache\huggingface\modules\transformers_modules\qwen-vl-chat-pytorch-7b\modeling_qwen.py", line 1021, in stream_generator
    for token in self.generate_stream(
  File "D:\anaconda\envs\xinference-vl\lib\site-packages\torch\utils\_contextlib.py", line 35, in generator_context
    response = gen.send(None)
  File "D:\anaconda\envs\xinference-vl\lib\site-packages\transformers_stream_generator\main.py", line 934, in sample_stream
    outputs = self(
  File "D:\anaconda\envs\xinference-vl\lib\site-packages\torch\nn\modules\module.py", line 1511, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
  File "D:\anaconda\envs\xinference-vl\lib\site-packages\torch\nn\modules\module.py", line 1520, in _call_impl
    return forward_call(*args, **kwargs)
  File "D:\anaconda\envs\xinference-vl\lib\site-packages\accelerate\hooks.py", line 166, in new_forward
    output = module._old_forward(*args, **kwargs)
  File "C:\Users\zzj\.cache\huggingface\modules\transformers_modules\qwen-vl-chat-pytorch-7b\modeling_qwen.py", line 856, in forward
    transformer_outputs = self.transformer(
  File "D:\anaconda\envs\xinference-vl\lib\site-packages\torch\nn\modules\module.py", line 1511, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
  File "D:\anaconda\envs\xinference-vl\lib\site-packages\torch\nn\modules\module.py", line 1520, in _call_impl
    return forward_call(*args, **kwargs)
  File "C:\Users\zzj\.cache\huggingface\modules\transformers_modules\qwen-vl-chat-pytorch-7b\modeling_qwen.py", line 565, in forward
    images = self.visual.encode(images)
  File "C:\Users\zzj\.cache\huggingface\modules\transformers_modules\qwen-vl-chat-pytorch-7b\visual.py", line 426, in encode
    return self(images)
  File "D:\anaconda\envs\xinference-vl\lib\site-packages\torch\nn\modules\module.py", line 1511, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
  File "D:\anaconda\envs\xinference-vl\lib\site-packages\torch\nn\modules\module.py", line 1520, in _call_impl
    return forward_call(*args, **kwargs)
  File "C:\Users\zzj\.cache\huggingface\modules\transformers_modules\qwen-vl-chat-pytorch-7b\visual.py", line 410, in forward
    x = self.attn_pool(x)
  File "D:\anaconda\envs\xinference-vl\lib\site-packages\torch\nn\modules\module.py", line 1511, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
  File "D:\anaconda\envs\xinference-vl\lib\site-packages\torch\nn\modules\module.py", line 1520, in _call_impl
    return forward_call(*args, **kwargs)
  File "C:\Users\zzj\.cache\huggingface\modules\transformers_modules\qwen-vl-chat-pytorch-7b\visual.py", line 148, in forward
    out = self.attn(
  File "D:\anaconda\envs\xinference-vl\lib\site-packages\torch\nn\modules\module.py", line 1511, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
  File "D:\anaconda\envs\xinference-vl\lib\site-packages\torch\nn\modules\module.py", line 1520, in _call_impl
    return forward_call(*args, **kwargs)
  File "D:\anaconda\envs\xinference-vl\lib\site-packages\accelerate\hooks.py", line 166, in new_forward
    output = module._old_forward(*args, **kwargs)
  File "D:\anaconda\envs\xinference-vl\lib\site-packages\torch\nn\modules\activation.py", line 1241, in forward
    attn_output, attn_output_weights = F.multi_head_attention_forward(
  File "D:\anaconda\envs\xinference-vl\lib\site-packages\torch\nn\functional.py", line 5449, in multi_head_attention_forward
    attn_output = linear(attn_output, out_proj_weight, out_proj_bias)
  File "D:\anaconda\envs\xinference-vl\lib\site-packages\torch\_prims_common\wrappers.py", line 250, in _fn
    result = fn(*args, **kwargs)
  File "D:\anaconda\envs\xinference-vl\lib\site-packages\torch\_decomp\decompositions.py", line 72, in inner
    r = f(*tree_map(increase_prec, args), **tree_map(increase_prec, kwargs))
  File "D:\anaconda\envs\xinference-vl\lib\site-packages\torch\_decomp\decompositions.py", line 1328, in addmm
    return out + beta * self
  File "D:\anaconda\envs\xinference-vl\lib\site-packages\torch\_prims_common\wrappers.py", line 250, in _fn
    result = fn(*args, **kwargs)
  File "D:\anaconda\envs\xinference-vl\lib\site-packages\torch\_prims_common\wrappers.py", line 137, in _fn
    result = fn(**bound.arguments)
  File "D:\anaconda\envs\xinference-vl\lib\site-packages\torch\_refs\__init__.py", line 1082, in add
    output = prims.add(a, b)
  File "D:\anaconda\envs\xinference-vl\lib\site-packages\torch\_ops.py", line 513, in __call__
    return self._op(*args, **(kwargs or {}))
  File "D:\anaconda\envs\xinference-vl\lib\site-packages\torch\_prims\__init__.py", line 358, in _prim_elementwise_meta
    utils.check_same_device(*args_, allow_cpu_scalar_tensors=True)
  File "D:\anaconda\envs\xinference-vl\lib\site-packages\torch\_prims_common\__init__.py", line 696, in check_same_device
    raise RuntimeError(msg)
RuntimeError: [address=192.168.17.20:53022, pid=15324] Tensor on device meta is not on the expected device cuda:0!
The error above is raised when chatting with a multimodal model such as qwen-vl, while plain-text LLMs on the same deployment work fine.
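For context, this is roughly how the failing request is issued; a minimal reproduction sketch rather than the exact client code, assuming qwen-vl-chat is served through Xinference's OpenAI-compatible endpoint, with the host, port, and image URL below being placeholders:

```python
# Reproduction sketch (assumptions: server at http://192.168.17.20:9997,
# qwen-vl-chat launched in pytorch format; image URL is a placeholder).
from openai import OpenAI

client = OpenAI(
    base_url="http://192.168.17.20:9997/v1",  # Xinference's OpenAI-compatible endpoint (port assumed)
    api_key="not-used",                       # the key is not checked by default
)

# Streaming chat request against the vision model; the same kind of request to a
# text-only model succeeds, while qwen-vl fails with the device-mismatch error above.
stream = client.chat.completions.create(
    model="qwen-vl-chat",
    messages=[
        {
            "role": "user",
            "content": [
                {"type": "text", "text": "Describe this image."},
                {"type": "image_url", "image_url": {"url": "https://example.com/test.jpg"}},
            ],
        }
    ],
    stream=True,
)
for chunk in stream:
    print(chunk.choices[0].delta.content or "", end="")
```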