Closed: kannon92 closed this issue 5 months ago.

🐛 Describe the bug

I am having trouble running the OpenAI-compatible API server on CPU. (I can't confirm whether this also affects GPU.) I follow the instructions for building the code and then run the API server; requests fail with a TypeError from CPUWorker.execute_model (full traceback reproduced in the comments below).
I think this PR fixes it: https://github.com/vllm-project/vllm/pull/4396
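If you want to check whether a given build already includes such a fix, one rough diagnostic is to inspect whether CPUWorker.execute_model accepts the offending keyword. This assumes the fix simply extends that method's signature, and the module path vllm.worker.cpu_worker is my guess for the layout around this commit; both may differ in your version:

```python
# Rough diagnostic: does the installed CPUWorker.execute_model accept
# num_lookahead_slots? (Module path is an assumption and may vary by version.)
import inspect

from vllm.worker.cpu_worker import CPUWorker

sig = inspect.signature(CPUWorker.execute_model)
print(sig)
print("accepts num_lookahead_slots:", "num_lookahead_slots" in sig.parameters)
```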
This issue remains active for me when building from Dockerfile.cpu on the latest commit, b31a1fb63c98fa1c64666aaae15579439af60d95:
python3 -m vllm.entrypoints.openai.api_server --model microsoft/Phi-3-mini-128k-instruct --trust-remote-code --max-model-len 8000
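The server itself starts; the exception below is raised once a request reaches the /v1/chat/completions endpoint. Any OpenAI-compatible chat request triggers it. For illustration, a client call like the following (this snippet is my own, not from the thread; it uses the standard openai>=1.0 client pointed at the local server on the default port 8000):

```python
# Illustrative client call that triggers the traceback below; any chat
# completion request against the server started above behaves the same.
from openai import OpenAI

client = OpenAI(base_url="http://localhost:8000/v1", api_key="EMPTY")

response = client.chat.completions.create(
    model="microsoft/Phi-3-mini-128k-instruct",
    messages=[{"role": "user", "content": "Hello!"}],
)
print(response.choices[0].message.content)
```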
ERROR: Exception in ASGI application
Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/uvicorn/protocols/http/httptools_impl.py", line 411, in run_asgi
    result = await app( # type: ignore[func-returns-value]
  File "/usr/local/lib/python3.10/dist-packages/uvicorn/middleware/proxy_headers.py", line 69, in __call__
    return await self.app(scope, receive, send)
  File "/usr/local/lib/python3.10/dist-packages/fastapi/applications.py", line 1054, in __call__
    await super().__call__(scope, receive, send)
  File "/usr/local/lib/python3.10/dist-packages/starlette/applications.py", line 123, in __call__
    await self.middleware_stack(scope, receive, send)
  File "/usr/local/lib/python3.10/dist-packages/starlette/middleware/errors.py", line 186, in __call__
    raise exc
  File "/usr/local/lib/python3.10/dist-packages/starlette/middleware/errors.py", line 164, in __call__
    await self.app(scope, receive, _send)
  File "/usr/local/lib/python3.10/dist-packages/starlette/middleware/cors.py", line 93, in __call__
    await self.simple_response(scope, receive, send, request_headers=headers)
  File "/usr/local/lib/python3.10/dist-packages/starlette/middleware/cors.py", line 148, in simple_response
    await self.app(scope, receive, send)
  File "/usr/local/lib/python3.10/dist-packages/starlette/middleware/exceptions.py", line 65, in __call__
    await wrap_app_handling_exceptions(self.app, conn)(scope, receive, send)
  File "/usr/local/lib/python3.10/dist-packages/starlette/_exception_handler.py", line 64, in wrapped_app
    raise exc
  File "/usr/local/lib/python3.10/dist-packages/starlette/_exception_handler.py", line 53, in wrapped_app
    await app(scope, receive, sender)
  File "/usr/local/lib/python3.10/dist-packages/starlette/routing.py", line 756, in __call__
    await self.middleware_stack(scope, receive, send)
  File "/usr/local/lib/python3.10/dist-packages/starlette/routing.py", line 776, in app
    await route.handle(scope, receive, send)
  File "/usr/local/lib/python3.10/dist-packages/starlette/routing.py", line 297, in handle
    await self.app(scope, receive, send)
  File "/usr/local/lib/python3.10/dist-packages/starlette/routing.py", line 77, in app
    await wrap_app_handling_exceptions(app, request)(scope, receive, send)
  File "/usr/local/lib/python3.10/dist-packages/starlette/_exception_handler.py", line 64, in wrapped_app
    raise exc
  File "/usr/local/lib/python3.10/dist-packages/starlette/_exception_handler.py", line 53, in wrapped_app
    await app(scope, receive, sender)
  File "/usr/local/lib/python3.10/dist-packages/starlette/routing.py", line 72, in app
    response = await func(request)
  File "/usr/local/lib/python3.10/dist-packages/fastapi/routing.py", line 278, in app
    raw_response = await run_endpoint_function(
  File "/usr/local/lib/python3.10/dist-packages/fastapi/routing.py", line 191, in run_endpoint_function
    return await dependant.call(**values)
  File "/workspace/vllm/vllm/entrypoints/openai/api_server.py", line 90, in create_chat_completion
    generator = await openai_serving_chat.create_chat_completion(
  File "/workspace/vllm/vllm/entrypoints/openai/serving_chat.py", line 128, in create_chat_completion
    return await self.chat_completion_full_generator(
  File "/workspace/vllm/vllm/entrypoints/openai/serving_chat.py", line 290, in chat_completion_full_generator
    async for res in result_generator:
  File "/workspace/vllm/vllm/engine/async_llm_engine.py", line 663, in generate
    raise e
  File "/workspace/vllm/vllm/engine/async_llm_engine.py", line 657, in generate
    async for request_output in stream:
  File "/workspace/vllm/vllm/engine/async_llm_engine.py", line 77, in __anext__
    raise result
  File "/workspace/vllm/vllm/engine/async_llm_engine.py", line 38, in _raise_exception_on_finish
    task.result()
  File "/workspace/vllm/vllm/engine/async_llm_engine.py", line 498, in run_engine_loop
    has_requests_in_progress = await asyncio.wait_for(
  File "/usr/lib/python3.10/asyncio/tasks.py", line 445, in wait_for
    return fut.result()
  File "/workspace/vllm/vllm/engine/async_llm_engine.py", line 472, in engine_step
    request_outputs = await self.engine.step_async()
  File "/workspace/vllm/vllm/engine/async_llm_engine.py", line 213, in step_async
    output = await self.model_executor.execute_model_async(
  File "/workspace/vllm/vllm/executor/cpu_executor.py", line 114, in execute_model_async
    output = await make_async(self.driver_worker.execute_model)(
  File "/usr/lib/python3.10/concurrent/futures/thread.py", line 58, in run
    result = self.fn(*self.args, **self.kwargs)
  File "/usr/local/lib/python3.10/dist-packages/torch/utils/_contextlib.py", line 115, in decorate_context
    return func(*args, **kwargs)
TypeError: CPUWorker.execute_model() got an unexpected keyword argument 'num_lookahead_slots'
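The failure mode is a plain signature mismatch: the engine-side call to execute_model was extended with a num_lookahead_slots keyword (as far as I can tell, added for speculative-decoding scheduling), but CPUWorker.execute_model was never updated to accept it. A minimal sketch of the mechanism, using simplified stand-in classes rather than vLLM's actual code:

```python
# Simplified stand-ins; not vLLM's actual classes. Shows how an engine-side
# call site gaining a new keyword argument breaks a worker whose method
# signature was never updated to match.

class CPUWorker:
    def execute_model(self, seq_group_metadata_list=None):
        # CPU path: this signature predates the new scheduling argument.
        return []

worker = CPUWorker()

# Engine-side call, extended with the new keyword. This raises:
# TypeError: CPUWorker.execute_model() got an unexpected keyword argument 'num_lookahead_slots'
worker.execute_model(seq_group_metadata_list=[], num_lookahead_slots=0)
```

The obvious fix, and presumably what the linked PR amounts to, is to add the parameter to the CPU worker's signature and ignore it where lookahead slots don't apply.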
My mistake; it's a similar setup but different error
@andysalerno I am still facing it. Did you resolve this issue? I also built from Dockerfile.cpu.