What happened?

When using litellm to interact with Ollama models and fallbacks are configured, the fallback mechanism does not function correctly when the stream=True option is used.

Steps to Reproduce

Configure litellm with one Ollama model (or more, load-balanced) as the primary model and a fallback model (e.g., another Ollama model or an OpenAI model). Relevant config.yaml:

router_settings:
  num_retries: 0
  retry_after: 0
  allowed_fails: 1
  cooldown_time: 300
  fallbacks:

litellm_settings:
  json_logs: true

Send a request with stream=True and make the primary model fail so the fallback logic is triggered. The fallback is not invoked; the request errors out and also triggers the TypeError exception shown in PR #6281. A reproduction sketch follows below.
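For reference, a minimal client-side sketch of the kind of streaming request that exercises this path. The proxy URL, API key, and model alias are placeholders, not taken from the original report:

# Hypothetical reproduction sketch: stream a chat completion through a litellm
# proxy whose primary deployment is an unreachable Ollama model, so the router's
# fallback logic should be exercised. Endpoint, key, and model alias are assumptions.
import openai

client = openai.OpenAI(
    base_url="http://localhost:4000",  # litellm proxy address (assumed default port)
    api_key="sk-anything",             # placeholder key
)

# With stream=True the fallback should kick in transparently; in the reported
# behavior the request fails instead (OllamaError: b'' in the log below).
stream = client.chat.completions.create(
    model="my-ollama-model",  # hypothetical primary model alias from config.yaml
    messages=[{"role": "user", "content": "Hello"}],
    stream=True,
)
for chunk in stream:
    delta = chunk.choices[0].delta.content
    if delta:
        print(delta, end="", flush=True)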
Expected behavior

When a request triggers the fallback logic, even with stream=True, the fallback model should be seamlessly invoked, and the response should be streamed from the fallback model.

Environment:
litellm version: 1.49.6 (from 2024-10-17)

Notes:
The problem does not occur with stream = False; in that case the fallback works as expected.

Relevant log output
{"message": "litellm.proxy.proxy_server.async_data_generator(): Exception occured - b''", "level": "ERROR", "timestamp": "2024-10-17T19:29:21.683280"}
Exception in ASGI application
Traceback (most recent call last):
File "/usr/local/lib/python3.11/site-packages/starlette/responses.py", line 265, in __call__
await wrap(partial(self.listen_for_disconnect, receive))
File "/usr/local/lib/python3.11/site-packages/starlette/responses.py", line 261, in wrap
await func()
File "/usr/local/lib/python3.11/site-packages/starlette/responses.py", line 238, in listen_for_disconnect
message = await receive()
^^^^^^^^^^^^^^^
File "/usr/local/lib/python3.11/site-packages/uvicorn/protocols/http/httptools_impl.py", line 568, in receive
await self.message_event.wait()
File "/usr/local/lib/python3.11/asyncio/locks.py", line 213, in wait
await fut
asyncio.exceptions.CancelledError: Cancelled by cancel scope 7fbce820db90
During handling of the above exception, another exception occurred:
+ Exception Group Traceback (most recent call last):
| File "/usr/local/lib/python3.11/site-packages/uvicorn/protocols/http/httptools_impl.py", line 411, in run_asgi
| result = await app( # type: ignore[func-returns-value]
| ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
| File "/usr/local/lib/python3.11/site-packages/uvicorn/middleware/proxy_headers.py", line 69, in __call__
| return await self.app(scope, receive, send)
| ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
| File "/usr/local/lib/python3.11/site-packages/fastapi/applications.py", line 1054, in __call__
| await super().__call__(scope, receive, send)
| File "/usr/local/lib/python3.11/site-packages/starlette/applications.py", line 123, in __call__
| await self.middleware_stack(scope, receive, send)
| File "/usr/local/lib/python3.11/site-packages/starlette/middleware/errors.py", line 186, in __call__
| raise exc
| File "/usr/local/lib/python3.11/site-packages/starlette/middleware/errors.py", line 164, in __call__
| await self.app(scope, receive, _send)
| File "/usr/local/lib/python3.11/site-packages/starlette/middleware/cors.py", line 85, in __call__
| await self.app(scope, receive, send)
| File "/usr/local/lib/python3.11/site-packages/starlette/middleware/exceptions.py", line 65, in __call__
| await wrap_app_handling_exceptions(self.app, conn)(scope, receive, send)
| File "/usr/local/lib/python3.11/site-packages/starlette/_exception_handler.py", line 64, in wrapped_app
| raise exc
| File "/usr/local/lib/python3.11/site-packages/starlette/_exception_handler.py", line 53, in wrapped_app
| await app(scope, receive, sender)
| File "/usr/local/lib/python3.11/site-packages/starlette/routing.py", line 756, in __call__
| await self.middleware_stack(scope, receive, send)
| File "/usr/local/lib/python3.11/site-packages/starlette/routing.py", line 776, in app
| await route.handle(scope, receive, send)
| File "/usr/local/lib/python3.11/site-packages/starlette/routing.py", line 297, in handle
| await self.app(scope, receive, send)
| File "/usr/local/lib/python3.11/site-packages/starlette/routing.py", line 77, in app
| await wrap_app_handling_exceptions(app, request)(scope, receive, send)
| File "/usr/local/lib/python3.11/site-packages/starlette/_exception_handler.py", line 64, in wrapped_app
| raise exc
| File "/usr/local/lib/python3.11/site-packages/starlette/_exception_handler.py", line 53, in wrapped_app
| await app(scope, receive, sender)
| File "/usr/local/lib/python3.11/site-packages/starlette/routing.py", line 75, in app
| await response(scope, receive, send)
| File "/usr/local/lib/python3.11/site-packages/starlette/responses.py", line 258, in __call__
| async with anyio.create_task_group() as task_group:
| File "/usr/local/lib/python3.11/site-packages/anyio/_backends/_asyncio.py", line 680, in __aexit__
| raise BaseExceptionGroup(
| ExceptionGroup: unhandled errors in a TaskGroup (1 sub-exception)
+-+---------------- 1 ----------------
| Traceback (most recent call last):
| File "/usr/local/lib/python3.11/site-packages/litellm/proxy/proxy_server.py", line 2579, in async_data_generator
| async for chunk in response:
| File "/usr/local/lib/python3.11/site-packages/litellm/llms/ollama.py", line 443, in ollama_async_streaming
| raise e # don't use verbose_logger.exception, if exception is raised
| ^^^^^^^
| File "/usr/local/lib/python3.11/site-packages/litellm/llms/ollama.py", line 386, in ollama_async_streaming
| raise OllamaError(
| litellm.llms.ollama.OllamaError: b''
|
| During handling of the above exception, another exception occurred:
|
| Traceback (most recent call last):
| File "/usr/local/lib/python3.11/site-packages/starlette/responses.py", line 261, in wrap
| await func()
| File "/usr/local/lib/python3.11/site-packages/starlette/responses.py", line 250, in stream_response
| async for chunk in self.body_iterator:
| File "/usr/local/lib/python3.11/site-packages/litellm/proxy/proxy_server.py", line 2620, in async_data_generator
| proxy_exception = ProxyException(
| ^^^^^^^^^^^^^^^
| File "/usr/local/lib/python3.11/site-packages/litellm/proxy/_types.py", line 1839, in __init__
| "No healthy deployment available" in self.message
| TypeError: a bytes-like object is required, not 'str'
+------------------------------------
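The secondary TypeError at the end of the trace appears to come from ProxyException comparing a str literal against the OllamaError message, which here is a bytes object (b''). A minimal sketch of that failing check, independent of litellm:

# Illustration only (not litellm code): mixing str and bytes in a substring check.
message = b""  # the OllamaError in the trace carries a bytes body, b''

try:
    # Mirrors the check shown in ProxyException.__init__ in the traceback:
    #     "No healthy deployment available" in self.message
    "No healthy deployment available" in message
except TypeError as e:
    print(e)  # a bytes-like object is required, not 'str'

# Decoding the message first (an assumption about a possible fix, not litellm's)
# avoids the TypeError:
print("No healthy deployment available" in message.decode("utf-8", errors="replace"))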
Twitter / LinkedIn details
No response