Traceback (most recent call last):
  File "/world/data-gpu-107/github/vllm/vllm/engine/async_llm_engine.py", line 604, in run_engine_loop
    done, _ = await asyncio.wait(
  File "/usr/lib/python3.10/asyncio/tasks.py", line 384, in wait
    return await _wait(fs, timeout, return_when, loop)
  File "/usr/lib/python3.10/asyncio/tasks.py", line 491, in _wait
    await waiter
asyncio.exceptions.CancelledError

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/uvicorn/protocols/http/httptools_impl.py", line 399, in run_asgi
    result = await app(  # type: ignore[func-returns-value]
  File "/usr/local/lib/python3.10/dist-packages/uvicorn/middleware/proxy_headers.py", line 70, in __call__
    return await self.app(scope, receive, send)
  File "/usr/local/lib/python3.10/dist-packages/fastapi/applications.py", line 1054, in __call__
    await super().__call__(scope, receive, send)
  File "/usr/local/lib/python3.10/dist-packages/starlette/applications.py", line 123, in __call__
    await self.middleware_stack(scope, receive, send)
  File "/usr/local/lib/python3.10/dist-packages/starlette/middleware/errors.py", line 186, in __call__
    raise exc
  File "/usr/local/lib/python3.10/dist-packages/starlette/middleware/errors.py", line 164, in __call__
    await self.app(scope, receive, _send)
  File "/usr/local/lib/python3.10/dist-packages/starlette/middleware/cors.py", line 85, in __call__
    await self.app(scope, receive, send)
  File "/usr/local/lib/python3.10/dist-packages/starlette/middleware/exceptions.py", line 65, in __call__
    await wrap_app_handling_exceptions(self.app, conn)(scope, receive, send)
  File "/usr/local/lib/python3.10/dist-packages/starlette/_exception_handler.py", line 64, in wrapped_app
    await app(scope, receive, sender)
  File "/usr/local/lib/python3.10/dist-packages/starlette/routing.py", line 756, in __call__
    await self.middleware_stack(scope, receive, send)
  File "/usr/local/lib/python3.10/dist-packages/starlette/routing.py", line 776, in app
    await route.handle(scope, receive, send)
  File "/usr/local/lib/python3.10/dist-packages/starlette/routing.py", line 297, in handle
    await self.app(scope, receive, send)
  File "/usr/local/lib/python3.10/dist-packages/starlette/routing.py", line 77, in app
    await wrap_app_handling_exceptions(app, request)(scope, receive, send)
  File "/usr/local/lib/python3.10/dist-packages/starlette/_exception_handler.py", line 64, in wrapped_app
    raise exc
  File "/usr/local/lib/python3.10/dist-packages/starlette/_exception_handler.py", line 53, in wrapped_app
    await app(scope, receive, sender)
  File "/usr/local/lib/python3.10/dist-packages/starlette/routing.py", line 72, in app
    response = await func(request)
  File "/usr/local/lib/python3.10/dist-packages/fastapi/routing.py", line 278, in app
    raw_response = await run_endpoint_function(
  File "/usr/local/lib/python3.10/dist-packages/fastapi/routing.py", line 191, in run_endpoint_function
    return await dependant.call(**values)
  File "/world/data-gpu-107/github/vllm/vllm/entrypoints/openai/api_server.py", line 132, in create_chat_completion
    generator = await openai_serving_chat.create_chat_completion(
  File "/world/data-gpu-107/github/vllm/vllm/entrypoints/openai/serving_chat.py", line 305, in create_chat_completion
    return await self.chat_completion_full_generator(
  File "/world/data-gpu-107/github/vllm/vllm/entrypoints/openai/serving_chat.py", line 505, in chat_completion_full_generator
    async for res in result_generator:
  File "/world/data-gpu-107/github/vllm/vllm/engine/async_llm_engine.py", line 765, in generate
    async for output in self._process_request(
  File "/world/data-gpu-107/github/vllm/vllm/engine/async_llm_engine.py", line 881, in _process_request
    raise e
  File "/world/data-gpu-107/github/vllm/vllm/engine/async_llm_engine.py", line 877, in _process_request
    async for request_output in stream:
  File "/world/data-gpu-107/github/vllm/vllm/engine/async_llm_engine.py", line 91, in __anext__
    raise result
  File "/world/data-gpu-107/github/vllm/vllm/engine/async_llm_engine.py", line 44, in _log_task_completion
    return_value = task.result()
  File "/world/data-gpu-107/github/vllm/vllm/engine/async_llm_engine.py", line 603, in run_engine_loop
    async with asyncio_timeout(ENGINE_ITERATION_TIMEOUT_S):
  File "/world/data-gpu-107/github/vllm/vllm/engine/async_timeout.py", line 95, in __aexit__
    self._do_exit(exc_type)
  File "/world/data-gpu-107/github/vllm/vllm/engine/async_timeout.py", line 178, in _do_exit
    raise asyncio.TimeoutError
asyncio.exceptions.TimeoutError
INFO 07-19 16:09:21 metrics.py:295] Avg prompt throughput: 84.8 tokens/s, Avg generation throughput: 0.1 tokens/s, Running: 1 reqs, Swapped: 0 reqs, Pending: 0 reqs, GPU KV cache usage: 0.3%, CPU KV cache usage: 0.0%.
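From the second traceback, the request fails because a single engine-loop iteration exceeds ENGINE_ITERATION_TIMEOUT_S inside run_engine_loop; the timeout context manager then raises asyncio.TimeoutError, which propagates to the waiting chat-completion request. Below is a minimal sketch of that pattern, illustrative only: it uses plain asyncio.wait_for instead of vLLM's vendored async_timeout, and a deliberately short timeout value.

```python
import asyncio

# Sketch of the failure mode seen in the traceback above (not vLLM's code):
# each engine-loop iteration runs under a timeout, so a stalled iteration
# surfaces as asyncio.TimeoutError instead of hanging the API server forever.
ENGINE_ITERATION_TIMEOUT_S = 2  # shortened for the demo; the real value is larger


async def engine_step() -> None:
    """Stand-in for a single engine iteration that never completes."""
    await asyncio.sleep(3600)


async def run_engine_loop() -> None:
    while True:
        # Same pattern as run_engine_loop in async_llm_engine.py.
        await asyncio.wait_for(engine_step(), timeout=ENGINE_ITERATION_TIMEOUT_S)


if __name__ == "__main__":
    try:
        asyncio.run(run_engine_loop())
    except asyncio.TimeoutError:
        print("engine iteration exceeded ENGINE_ITERATION_TIMEOUT_S")
```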
Your current environment
🐛 Describe the bug
I am trying to use openai_api_server to serve Qwen2-7B-Instruct on a P40. When serving GPTQ models, vLLM runs into errors with long prompts.
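For context, the server is launched roughly as follows. This is only a sketch: the model path, quantization and dtype flags, and port are illustrative assumptions, not my exact commands.

```python
import subprocess

# Sketch only: model path, flags, and port are illustrative assumptions,
# not the exact commands used for the fp16 and gptq-int8 runs below.
subprocess.run([
    "python", "-m", "vllm.entrypoints.openai.api_server",
    "--model", "Qwen/Qwen2-7B-Instruct-GPTQ-Int8",
    "--quantization", "gptq",
    "--dtype", "float16",
    "--port", "8000",
])
```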
fp16 command:
fp16 works fine with both short and long prompts:
gptq-int8 command:
Short prompts work fine:
Long prompts run into the error; the asyncio.TimeoutError traceback at the top of this report is what I get.
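To make the failing case concrete, here is a minimal client-side sketch of a long-prompt request against the server. The base URL, served model name, and prompt length are placeholders, not the exact values I used.

```python
from openai import OpenAI

# Minimal sketch of the failing long-prompt request.
# base_url, model name, and prompt length are placeholders.
client = OpenAI(base_url="http://localhost:8000/v1", api_key="EMPTY")

# Build an artificially long prompt for illustration.
long_prompt = "Summarize the following text:\n" + ("some long document text " * 2000)

response = client.chat.completions.create(
    model="Qwen/Qwen2-7B-Instruct-GPTQ-Int8",
    messages=[{"role": "user", "content": long_prompt}],
    max_tokens=256,
)
print(response.choices[0].message.content)
```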