bentoml / OpenLLM

Run any open-source LLM, such as Llama or Gemma, as an OpenAI-compatible API endpoint in the cloud.
https://bentoml.com
Apache License 2.0

RuntimeError: "addmm_impl_cpu_" not implemented for 'Half' #785

Closed: seanbenhur closed this issue 11 months ago

seanbenhur commented 11 months ago

Describe the bug

I am hosting a flan-t5 model on CPU and I am getting the above error.

To reproduce

openllm start google/flan-t5-small --port 3000 --do-not-track --api_workers 17

Logs

your model, please set `tokenizer.chat_template` to an appropriate template. See https://huggingface.co/docs/transformers/main/chat_templating for more information.

2023-12-16T06:30:34+0000 [ERROR] [runner:llm-flan-t5-runner:1] Exception in ASGI application
+ Exception Group Traceback (most recent call last):
|   File "/opt/conda/envs/pytorch/lib/python3.10/site-packages/uvicorn/protocols/http/h11_impl.py", line 428, in run_asgi
|     result = await app(  # type: ignore[func-returns-value]
|   File "/opt/conda/envs/pytorch/lib/python3.10/site-packages/uvicorn/middleware/proxy_headers.py", line 78, in __call__
|     return await self.app(scope, receive, send)
|   File "/opt/conda/envs/pytorch/lib/python3.10/site-packages/uvicorn/middleware/message_logger.py", line 86, in __call__
|     raise exc from None
|   File "/opt/conda/envs/pytorch/lib/python3.10/site-packages/uvicorn/middleware/message_logger.py", line 82, in __call__
|     await self.app(scope, inner_receive, inner_send)
|   File "/opt/conda/envs/pytorch/lib/python3.10/site-packages/starlette/applications.py", line 122, in __call__
|     await self.middleware_stack(scope, receive, send)
|   File "/opt/conda/envs/pytorch/lib/python3.10/site-packages/starlette/middleware/errors.py", line 184, in __call__
|     raise exc
|   File "/opt/conda/envs/pytorch/lib/python3.10/site-packages/starlette/middleware/errors.py", line 162, in __call__
|     await self.app(scope, receive, _send)
|   File "/opt/conda/envs/pytorch/lib/python3.10/site-packages/bentoml/_internal/server/http/traffic.py", line 23, in __call__
|     async with anyio.create_task_group():
|   File "/opt/conda/envs/pytorch/lib/python3.10/site-packages/anyio/_backends/_asyncio.py", line 658, in __aexit__
|     raise BaseExceptionGroup(
| exceptiongroup.ExceptionGroup: unhandled errors in a TaskGroup (1 sub-exception)
+-+---------------- 1 ----------------
| Traceback (most recent call last):
|   File "/opt/conda/envs/pytorch/lib/python3.10/site-packages/starlette/responses.py", line 277, in __call__
|     await wrap(partial(self.listen_for_disconnect, receive))
|   File "/opt/conda/envs/pytorch/lib/python3.10/site-packages/starlette/responses.py", line 273, in wrap
|     await func()
|   File "/opt/conda/envs/pytorch/lib/python3.10/site-packages/starlette/responses.py", line 250, in listen_for_disconnect
|     message = await receive()
|   File "/opt/conda/envs/pytorch/lib/python3.10/site-packages/opentelemetry/instrumentation/asgi/__init__.py", line 634, in otel_receive
|     message = await receive()
|   File "/opt/conda/envs/pytorch/lib/python3.10/site-packages/uvicorn/middleware/message_logger.py", line 62, in inner_receive
|     message = await receive()
|   File "/opt/conda/envs/pytorch/lib/python3.10/site-packages/uvicorn/protocols/http/h11_impl.py", line 564, in receive
|     await self.message_event.wait()
|   File "/opt/conda/envs/pytorch/lib/python3.10/asyncio/locks.py", line 214, in wait
|     await fut
| asyncio.exceptions.CancelledError: Cancelled by cancel scope 7fd8d62ad570
|
| During handling of the above exception, another exception occurred:
|
| Exception Group Traceback (most recent call last):
|   File "/opt/conda/envs/pytorch/lib/python3.10/site-packages/bentoml/_internal/server/http/traffic.py", line 26, in __call__
|     await self.app(scope, receive, send)
|   File "/opt/conda/envs/pytorch/lib/python3.10/site-packages/opentelemetry/instrumentation/asgi/__init__.py", line 596, in __call__
|     await self.app(scope, otel_receive, otel_send)
|   File "/opt/conda/envs/pytorch/lib/python3.10/site-packages/bentoml/_internal/server/http/instruments.py", line 252, in __call__
|     await self.app(scope, receive, wrapped_send)
|   File "/opt/conda/envs/pytorch/lib/python3.10/site-packages/bentoml/_internal/server/http/access.py", line 126, in __call__
|     await self.app(scope, receive, wrapped_send)
|   File "/opt/conda/envs/pytorch/lib/python3.10/site-packages/starlette/middleware/exceptions.py", line 79, in __call__
|     raise exc
|   File "/opt/conda/envs/pytorch/lib/python3.10/site-packages/starlette/middleware/exceptions.py", line 68, in __call__
|     await self.app(scope, receive, sender)
|   File "/opt/conda/envs/pytorch/lib/python3.10/site-packages/starlette/routing.py", line 718, in __call__
|     await route.handle(scope, receive, send)
|   File "/opt/conda/envs/pytorch/lib/python3.10/site-packages/starlette/routing.py", line 276, in handle
|     await self.app(scope, receive, send)
|   File "/opt/conda/envs/pytorch/lib/python3.10/site-packages/starlette/routing.py", line 69, in app
|     await response(scope, receive, send)
|   File "/opt/conda/envs/pytorch/lib/python3.10/site-packages/starlette/responses.py", line 270, in __call__
|     async with anyio.create_task_group() as task_group:
|   File "/opt/conda/envs/pytorch/lib/python3.10/site-packages/anyio/_backends/_asyncio.py", line 658, in __aexit__
|     raise BaseExceptionGroup(
| exceptiongroup.ExceptionGroup: unhandled errors in a TaskGroup (1 sub-exception)
+-+---------------- 1 ----------------
| Traceback (most recent call last):
|   File "/opt/conda/envs/pytorch/lib/python3.10/site-packages/starlette/responses.py", line 273, in wrap
|     await func()
|   File "/opt/conda/envs/pytorch/lib/python3.10/site-packages/starlette/responses.py", line 262, in stream_response
|     async for chunk in self.body_iterator:
|   File "/opt/conda/envs/pytorch/lib/python3.10/site-packages/bentoml/_internal/server/runner_app.py", line 356, in streamer
|     async for p in payload:
|   File "/opt/conda/envs/pytorch/lib/python3.10/site-packages/bentoml/_internal/server/runner_app.py", line 214, in inner
Traceback (most recent call last):
File "/opt/conda/envs/pytorch/lib/python3.10/site-packages/openllm/_llm.py", line 115, in generate_iterator
async for out in generator:
File "/opt/conda/envs/pytorch/lib/python3.10/site-packages/bentoml/_internal/runner/runner_handle/remote.py", line 330, in async_stream_method
async for b, end_of_http_chunk in resp.content.iter_chunks():
File "/opt/conda/envs/pytorch/lib/python3.10/site-packages/aiohttp/streams.py", line 60, in __anext__
rv = await self._stream.readchunk()
File "/opt/conda/envs/pytorch/lib/python3.10/site-packages/aiohttp/streams.py", line 431, in readchunk
await self._wait("readchunk")
File "/opt/conda/envs/pytorch/lib/python3.10/site-packages/aiohttp/streams.py", line 302, in _wait
await waiter
aiohttp.client_exceptions.ClientPayloadError: Response payload is not completed

The above exception was the direct cause of the following exception:

Traceback (most recent call last):
File "/opt/conda/envs/pytorch/lib/python3.10/site-packages/openllm/entrypoints/openai.py", line 224, in chat_completions
async for res in result_generator:
File "/opt/conda/envs/pytorch/lib/python3.10/site-packages/openllm/_llm.py", line 125, in generate_iterator
raise RuntimeError(f'Exception caught during generation: {err}') from err
RuntimeError: Exception caught during generation: Response payload is not completed
|     async for data in ret:
|   File "/opt/conda/envs/pytorch/lib/python3.10/site-packages/openllm/_runners.py", line 200, in generate_iterator
|     encoder_output = self.model.encoder(input_ids=torch.as_tensor([prompt_token_ids], device=self.device))[0]
|   File "/opt/conda/envs/pytorch/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1518, in _wrapped_call_impl
|     return self._call_impl(*args, **kwargs)
|   File "/opt/conda/envs/pytorch/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1527, in _call_impl
|     return forward_call(*args, **kwargs)
|   File "/opt/conda/envs/pytorch/lib/python3.10/site-packages/transformers/models/t5/modeling_t5.py", line 1110, in forward
|     layer_outputs = layer_module(
|   File "/opt/conda/envs/pytorch/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1518, in _wrapped_call_impl
|     return self._call_impl(*args, **kwargs)
|   File "/opt/conda/envs/pytorch/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1527, in _call_impl
|     return forward_call(*args, **kwargs)
|   File "/opt/conda/envs/pytorch/lib/python3.10/site-packages/transformers/models/t5/modeling_t5.py", line 694, in forward
|     self_attention_outputs = self.layer[0](
|   File "/opt/conda/envs/pytorch/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1518, in _wrapped_call_impl
|     return self._call_impl(*args, **kwargs)
|   File "/opt/conda/envs/pytorch/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1527, in _call_impl
|     return forward_call(*args, **kwargs)
|   File "/opt/conda/envs/pytorch/lib/python3.10/site-packages/transformers/models/t5/modeling_t5.py", line 601, in forward
|     attention_output = self.SelfAttention(
|   File "/opt/conda/envs/pytorch/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1518, in _wrapped_call_impl
|     return self._call_impl(*args, **kwargs)
|   File "/opt/conda/envs/pytorch/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1527, in _call_impl
|     return forward_call(*args, **kwargs)
|   File "/opt/conda/envs/pytorch/lib/python3.10/site-packages/transformers/models/t5/modeling_t5.py", line 520, in forward
|     query_states = shape(self.q(hidden_states))  # (batch_size, n_heads, seq_length, dim_per_head)
|   File "/opt/conda/envs/pytorch/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1518, in _wrapped_call_impl
|     return self._call_impl(*args, **kwargs)
|   File "/opt/conda/envs/pytorch/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1527, in _call_impl
|     return forward_call(*args, **kwargs)
|   File "/opt/conda/envs/pytorch/lib/python3.10/site-packages/torch/nn/modules/linear.py", line 114, in forward
|     return F.linear(input, self.weight, self.bias)
| RuntimeError: "addmm_impl_cpu_" not implemented for 'Half'
+------------------------------------

Environment

openllm==0.4.40

System information (Optional)

AWS Instance t2.large

aarnphm commented 11 months ago

You need to set DTYPE=float32 for this.
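For example, prefixing the reproduce command from above with the environment variable:

DTYPE=float32 openllm start google/flan-t5-small --port 3000 --do-not-track --api_workers 17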

Maybe I can set the dtype automatically when there is no GPU.

Currently the behaviour always defaults to bfloat16, which requires a GPU.
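A minimal sketch of what that automatic selection might look like (hypothetical, not the current OpenLLM code): choose the default dtype from device availability, since the half-precision addmm kernel in the traceback above is not implemented on CPU.

import torch

def default_dtype() -> torch.dtype:
    # Hypothetical helper: use reduced precision only when a CUDA device exists,
    # otherwise fall back to float32 so the CPU kernels (e.g. addmm) are available.
    return torch.bfloat16 if torch.cuda.is_available() else torch.float32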

Closing this because it is not a bug. I will update the README about this.

zhangxinyang97 commented 8 months ago

I got the same error here, starting ChatGLM from a local model, and setting DTYPE=float32 doesn't work for me.

| Traceback (most recent call last):
|   File "/opt/buildtools/python-3.9.2/lib/python3.9/site-packages/starlette/responses.py", line 261, in wrap
|     await func()
|   File "/opt/buildtools/python-3.9.2/lib/python3.9/site-packages/starlette/responses.py", line 250, in stream_response
|     async for chunk in self.body_iterator:
|   File "/opt/buildtools/python-3.9.2/lib/python3.9/site-packages/bentoml/_internal/server/runner_app.py", line 373, in stream_encoder
|     async for p in payload:
|   File "/opt/buildtools/python-3.9.2/lib/python3.9/site-packages/bentoml/_internal/server/runner_app.py", line 214, in inner
|     async for data in ret:
|   File "/opt/buildtools/python-3.9.2/lib/python3.9/site-packages/openllm/_runners.py", line 222, in generate_iterator
|     out = self.model(input_ids=start_ids, use_cache=True)
|   File "/opt/buildtools/python-3.9.2/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1511, in _wrapped_call_impl
|     return self._call_impl(*args, **kwargs)
|   File "/opt/buildtools/python-3.9.2/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1520, in _call_impl
|     return forward_call(*args, **kwargs)
|   File "/root/.cache/huggingface/modules/transformers_modules/7d451dccbae3196be9b8efcdffe6a47c8c028687/modeling_chatglm.py", line 1190, in forward
|     transformer_outputs = self.transformer(
|   File "/opt/buildtools/python-3.9.2/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1511, in _wrapped_call_impl
|     return self._call_impl(*args, **kwargs)
|   File "/opt/buildtools/python-3.9.2/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1520, in _call_impl
|     return forward_call(*args, **kwargs)
|   File "/root/.cache/huggingface/modules/transformers_modules/7d451dccbae3196be9b8efcdffe6a47c8c028687/modeling_chatglm.py", line 996, in forward
|     layer_ret = layer(
|   File "/opt/buildtools/python-3.9.2/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1511, in _wrapped_call_impl
|     return self._call_impl(*args, **kwargs)
|   File "/opt/buildtools/python-3.9.2/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1520, in _call_impl
|     return forward_call(*args, **kwargs)
|   File "/root/.cache/huggingface/modules/transformers_modules/7d451dccbae3196be9b8efcdffe6a47c8c028687/modeling_chatglm.py", line 624, in forward
|     attention_input = self.input_layernorm(hidden_states)
|   File "/opt/buildtools/python-3.9.2/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1511, in _wrapped_call_impl
|     return self._call_impl(*args, **kwargs)
|   File "/opt/buildtools/python-3.9.2/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1520, in _call_impl
|     return forward_call(*args, **kwargs)
|   File "/opt/buildtools/python-3.9.2/lib/python3.9/site-packages/torch/nn/modules/normalization.py", line 201, in forward
|     return F.layer_norm(
|   File "/opt/buildtools/python-3.9.2/lib/python3.9/site-packages/torch/nn/functional.py", line 2546, in layer_norm
|     return torch.layer_norm(input, normalized_shape, weight, bias, eps, torch.backends.cudnn.enabled)
Traceback (most recent call last):
| RuntimeError: "LayerNormKernelImpl" not implemented for 'Half'

cmd: DTYPE=float32 TRUST_REMOTE_CODE=True openllm start /usr1/models/chatglm-6b --backend pt
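
A quick way to check whether float32 actually reaches the model is to load the same local checkpoint directly with transformers (a hypothetical diagnostic sketch; the path comes from the command above):

import torch
from transformers import AutoModel

# Hypothetical check: load the local ChatGLM checkpoint in float32 and
# inspect the parameter dtype; if it still reports torch.float16, half
# precision is being forced somewhere else despite DTYPE=float32.
model = AutoModel.from_pretrained(
    "/usr1/models/chatglm-6b",
    torch_dtype=torch.float32,
    trust_remote_code=True,
)
print(next(model.parameters()).dtype)  # expect torch.float32 for CPU inference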