Your current environment

🐛 Describe the bug

If one sends a request with n>1 to a server with speculative decoding enabled, the request will fail with an unhelpful error message.

To reproduce, start an inference server with speculative decoding enabled, and then send a request via:

from openai import OpenAI
# Modify OpenAI's API key and API base to use vLLM's API server.
openai_api_key = "EMPTY"
openai_api_base = "http://localhost:8000/v1"
client = OpenAI(
    # defaults to os.environ.get("OPENAI_API_KEY")
    api_key=openai_api_key,
    base_url=openai_api_base,
)
models = client.models.list()
model = models.data[0].id
# Completion API: request n=2 completions with streaming enabled
stream = True
completion = client.completions.create(
    model=model,
    prompt="A robot may not injure a human being",
    echo=False,
    n=2,
    stream=stream)
print("Completion results:")
if stream:
    for c in completion:
        print(c)
else:
    print(completion)
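A server with speculative decoding enabled can be launched along these lines; this is only a sketch using vLLM's standard speculative-decoding engine arguments, with placeholder model names rather than the exact command behind this report:

# model names below are placeholders; flags are vLLM's speculative-decoding engine arguments
python -m vllm.entrypoints.openai.api_server \
    --model meta-llama/Llama-2-7b-chat-hf \
    --speculative-model JackFram/llama-68m \
    --num-speculative-tokens 5 \
    --use-v2-block-manager

With such a server running, the request above fails.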
The client will see:
Traceback (most recent call last):
File "/home/user/vllm/send_request.py", line 27, in <module>
for c in completion:
File "/home/user/miniforge3/envs/dev-env/lib/python3.11/site-packages/openai/_streaming.py", line 46, in __iter__
for item in self._iterator:
File "/home/user/miniforge3/envs/dev-env/lib/python3.11/site-packages/openai/_streaming.py", line 58, in __stream__
for sse in iterator:
File "/home/user/miniforge3/envs/dev-env/lib/python3.11/site-packages/openai/_streaming.py", line 50, in _iter_events
yield from self._decoder.iter_bytes(self.response.iter_bytes())
File "/home/user/miniforge3/envs/dev-env/lib/python3.11/site-packages/openai/_streaming.py", line 280, in iter_bytes
for chunk in self._iter_chunks(iterator):
File "/home/user/miniforge3/envs/dev-env/lib/python3.11/site-packages/openai/_streaming.py", line 291, in _iter_chunks
for chunk in iterator:
File "/home/user/miniforge3/envs/dev-env/lib/python3.11/site-packages/httpx/_models.py", line 829, in iter_bytes
for raw_bytes in self.iter_raw():
File "/home/user/miniforge3/envs/dev-env/lib/python3.11/site-packages/httpx/_models.py", line 883, in iter_raw
for raw_stream_bytes in self.stream:
File "/home/user/miniforge3/envs/dev-env/lib/python3.11/site-packages/httpx/_client.py", line 126, in __iter__
for chunk in self._stream:
File "/home/user/miniforge3/envs/dev-env/lib/python3.11/site-packages/httpx/_transports/default.py", line 112, in __iter__
with map_httpcore_exceptions():
File "/home/user/miniforge3/envs/dev-env/lib/python3.11/contextlib.py", line 158, in __exit__
self.gen.throw(typ, value, traceback)
File "/home/user/miniforge3/envs/dev-env/lib/python3.11/site-packages/httpx/_transports/default.py", line 86, in map_httpcore_exceptions
raise mapped_exc(message) from exc
httpx.RemoteProtocolError: peer closed connection without sending complete message body (incomplete chunked read)
and the error on the server-side is:
| Traceback (most recent call last):
| File "/home/user/miniforge3/envs/dev-env/lib/python3.11/site-packages/starlette/responses.py", line 261, in wrap
| await func()
| File "/home/user/miniforge3/envs/dev-env/lib/python3.11/site-packages/starlette/responses.py", line 250, in stream_response
| async for chunk in self.body_iterator:
| File "/home/user/vllm/vllm/entrypoints/openai/serving_completion.py", line 222, in completion_stream_generator
| async for prompt_idx, res in result_generator:
| File "/home/user/vllm/vllm/utils.py", line 319, in consumer
| raise e
| File "/home/user/vllm/vllm/utils.py", line 310, in consumer
| raise item
| File "/home/user/vllm/vllm/utils.py", line 294, in producer
| async for item in iterator:
| File "/home/user/vllm/vllm/engine/async_llm_engine.py", line 746, in generate
| async for output in self._process_request(
| File "/home/user/vllm/vllm/engine/async_llm_engine.py", line 859, in _process_request
| raise e
| File "/home/user/vllm/vllm/engine/async_llm_engine.py", line 855, in _process_request
| async for request_output in stream:
| File "/home/user/vllm/vllm/engine/async_llm_engine.py", line 90, in __anext__
| raise result
| File "/home/user/vllm/vllm/engine/async_llm_engine.py", line 43, in _log_task_completion
| return_value = task.result()
| ^^^^^^^^^^^^^
| File "/home/user/vllm/vllm/engine/async_llm_engine.py", line 595, in run_engine_loop
| result = task.result()
| ^^^^^^^^^^^^^
| File "/home/user/vllm/vllm/engine/async_llm_engine.py", line 540, in engine_step
| request_outputs = await self.engine.step_async(virtual_engine)
| ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
| File "/home/user/vllm/vllm/engine/async_llm_engine.py", line 241, in step_async
| output = await self.model_executor.execute_model_async(
| ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
| File "/home/user/vllm/vllm/executor/gpu_executor.py", line 122, in execute_model_async
| output = await make_async(self.driver_worker.execute_model
| ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
| File "/home/user/miniforge3/envs/dev-env/lib/python3.11/concurrent/futures/thread.py", line 58, in run
| result = self.fn(*self.args, **self.kwargs)
| ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
| File "/home/user/miniforge3/envs/dev-env/lib/python3.11/site-packages/torch/utils/_contextlib.py", line 115, in decorate_context
| return func(*args, **kwargs)
| ^^^^^^^^^^^^^^^^^^^^^
| File "/home/user/vllm/vllm/spec_decode/spec_decode_worker.py", line 338, in execute_model
| return self._run_no_spec(execute_model_req,
| ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
| File "/home/user/miniforge3/envs/dev-env/lib/python3.11/contextlib.py", line 81, in inner
| return func(*args, **kwds)
| ^^^^^^^^^^^^^^^^^^^
| File "/home/user/vllm/vllm/spec_decode/spec_decode_worker.py", line 386, in _run_no_spec
| sampler_output = self.scorer_worker.execute_model(execute_model_req)
| ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
| File "/home/user/vllm/vllm/worker/worker_base.py", line 271, in execute_model
| output = self.model_runner.execute_model(
| ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
| File "/home/user/miniforge3/envs/dev-env/lib/python3.11/site-packages/torch/utils/_contextlib.py", line 115, in decorate_context
| return func(*args, **kwargs)
| ^^^^^^^^^^^^^^^^^^^^^
| File "/home/user/vllm/vllm/worker/model_runner.py", line 1245, in execute_model
| output: SamplerOutput = self.model.sample(
| ^^^^^^^^^^^^^^^^^^
| File "/home/user/vllm/vllm/model_executor/models/llama.py", line 416, in sample
| next_tokens = self.sampler(logits, sampling_metadata)
| ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
| File "/home/user/miniforge3/envs/dev-env/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
| return self._call_impl(*args, **kwargs)
| ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
| File "/home/user/miniforge3/envs/dev-env/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
| return forward_call(*args, **kwargs)
| ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
| File "/home/user/vllm/vllm/model_executor/layers/sampler.py", line 96, in forward
| sample_results, maybe_sampled_tokens_tensor = _sample(
| ^^^^^^^^
| File "/home/user/vllm/vllm/model_executor/layers/sampler.py", line 658, in _sample
| return _sample_with_torch(
| ^^^^^^^^^^^^^^^^^^^
| File "/home/user/vllm/vllm/model_executor/layers/sampler.py", line 528, in _sample_with_torch
| sampled_token_ids_tensor[
| RuntimeError: shape mismatch: value tensor of shape [2] cannot be broadcast to indexing result of shape [1, 1]
+------------------------------------
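The failing assignment at the bottom of the trace is an ordinary advanced-indexing broadcast error, which is consistent with the n=2 request producing two sampled tokens while the pre-allocated output slot expects one. A minimal standalone PyTorch sketch that raises the same kind of RuntimeError (illustrative shapes only, not vLLM's sampler code):

import torch

# Destination laid out for a single sampled token.
sampled_token_ids = torch.zeros(1, 1, dtype=torch.long)
# But the request (n=2) produced two sampled tokens.
new_tokens = torch.tensor([101, 202])

try:
    # Advanced-index assignment cannot broadcast a value tensor of shape [2]
    # into an indexing result of shape [1, 1].
    sampled_token_ids[torch.tensor([0])] = new_tokens
except RuntimeError as err:
    print(err)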