Open · kezouke opened this issue 4 months ago
Server Code:
import argparse
import uvicorn
from fastapi import FastAPI, Request
from fastapi.responses import JSONResponse, Response
from vllm.engine.arg_utils import AsyncEngineArgs
from vllm.engine.async_llm_engine import AsyncLLMEngine
from vllm.sampling_params import SamplingParams
from vllm.utils import random_uuid
from vllm import TokensPrompt
from aiostream import stream
import torch
TIMEOUT_KEEP_ALIVE = 5 # seconds.
TIMEOUT_TO_PREVENT_DEADLOCK = 1 # seconds.
app = FastAPI()
engine = None
async def fast_logprobs(common_part, tokenized_candidates, k, tokenizer):
    # Initialize sampling parameters for generating text
    sampling_params = SamplingParams(max_tokens=1, prompt_logprobs=1)
    # Dictionary to store unique prompts and corresponding tokenized candidates
    input_prompts = {}
    # Placeholder for results
    results = None
    # Iterate over each tokenized candidate
    for candidate in tokenized_candidates:
        # Calculate the length of the candidate excluding padding tokens
        candidate_length = torch.sum(candidate != tokenizer.pad_token_id, dim=-1)
        # Combine the common part with the candidate up to its actual length
        prompt_token_ids = common_part + candidate.tolist()[:candidate_length]
        # Store the combined prompt and the trimmed candidate in the dictionary
        input_prompts[tuple(prompt_token_ids)] = candidate.tolist()[:candidate_length]
    # Iterate over each unique prompt
    for input_prompt in input_prompts.keys():
        # Generate a unique identifier for the request
        request_id = random_uuid()
        # Prepare the prompt for the model
        tokens_prompt: TokensPrompt = {
            'prompt_token_ids': list(input_prompt),
            'multi_modal_data': None
        }
        # Use the model to generate logits for the prompt
        res = engine.generate(request_id=request_id,
                              inputs=tokens_prompt,
                              sampling_params=sampling_params)
        # Merge the new result with existing results
        if not results:
            results = res
        else:
            results = stream.merge(results, res)
    # Convert the results into a list asynchronously
    results = [item async for item in results]
    # Map each candidate to its cumulative log probability
    candidate_log_prob_map = {}
    # Iterate over each result
    for result in results:
        # Retrieve the prompt token IDs from the result
        prompt_token_ids = result.prompt_token_ids
        # Get the corresponding candidate from the input_prompts dictionary
        candidate = input_prompts[tuple(prompt_token_ids)]
        # Initialize the cumulative log probability
        cum_log_prob = 0
        # Calculate the cumulative log probability for the candidate
        for token_id, log_prob in zip(candidate, result.prompt_logprobs[-len(candidate):]):
            cum_log_prob += log_prob[token_id].logprob
        # Store the cumulative log probability in the map
        candidate_log_prob_map[tuple(candidate)] = cum_log_prob
    # Calculate the length-normalized scores for each candidate
    scores = []
    for candidate in tokenized_candidates:
        # Calculate the length of the candidate excluding padding tokens
        candidate_length = torch.sum(candidate != tokenizer.pad_token_id, dim=-1)
        # Retrieve the cumulative log probability for the candidate
        cum_log_probability = candidate_log_prob_map[tuple(candidate.tolist()[:candidate_length])]
        # Normalize the score by dividing by the candidate length
        scores.append(cum_log_probability / candidate_length)
    # Convert the scores into a tensor
    scores = torch.tensor(scores)
    # Find the indices of the top k scores
    top_k_indices = torch.topk(scores, k=k, largest=True, sorted=True).indices
    # Return the top k candidates based on their scores
    return [tokenized_candidates[i] for i in top_k_indices]
@app.post("/generate")
async def generate(request: Request) -> Response:
"""Generate completion for the request.
The request should be a JSON object with the following fields:
- prompt: the prompt to use for the generation.
- schema: the JSON schema to use for the generation (if regex is not provided).
- regex: the regex to use for the generation (if schema is not provided).
- stream: whether to stream the results or not.
- other fields: the sampling parameters (See `SamplingParams` for details).
"""
assert engine is not None
request_dict = await request.json()
prompt = request_dict.pop('prompt',None)
substring_context = request_dict.pop('substring',None)
if substring_context is not None:
tokenizer = engine.engine.get_tokenizer()
if not tokenizer.pad_token:
tokenizer.pad_token = tokenizer.eos_token
tokenized_prompt = tokenizer(prompt,
return_tensors="pt",
padding=True,
add_special_tokens=False)['input_ids']
last_part = tokenizer.decode(tokenized_prompt[0, -2:])
common_part = tokenizer.decode(tokenized_prompt[0, :-2])
substring_candidates = set()
restriction = min(len(substring_context) + 1, 16)
substring_context_splitted = substring_context.split(' ')
word_lengths = [len(word) for word in substring_context_splitted]
avg_length = (sum(word_lengths) + len(word_lengths))/len(word_lengths)
step_size = int(restriction // avg_length) + 1
# Iterate over the text to generate all possible substrings within the restriction
for i in range(len(substring_context_splitted)):
for j in range(i+1, min(len(substring_context_splitted) + 1, i + 1 + step_size)):
candidate = " ".join(substring_context_splitted[i:j])
substring_candidates.add(candidate)
# Sort the set of substring candidates for reproducibility
substring_candidates = sorted(substring_candidates)
# Prefix each candidate with the prompt to form complete sequences
substring_candidates = [last_part + candidate for candidate in substring_candidates]
tokenized_s_cand = tokenizer(substring_candidates,
return_tensors="pt",
padding=True,
add_special_tokens=False)['input_ids']
common_part_encoded = tokenizer.encode(common_part)
chosed_options = await fast_logprobs(common_part_encoded,
tokenized_s_cand,
1,
tokenizer)
res_text = []
for option in chosed_options:
substirng_choice = tokenizer.decode(option, skip_special_tokens=True)
res_text.append(substirng_choice[len(last_part):])
return JSONResponse(res_text)
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument("--host", type=str, default=None)
parser.add_argument("--port", type=int, default=8000)
parser.add_argument("--ssl-keyfile", type=str, default=None)
parser.add_argument("--ssl-certfile", type=str, default=None)
parser = AsyncEngineArgs.add_cli_args(parser)
args = parser.parse_args()
engine_args: AsyncEngineArgs = AsyncEngineArgs.from_cli_args(args) # type: ignore
# Sets default for the model (`facebook/opt-125m`)
engine = AsyncLLMEngine.from_engine_args(engine_args)
uvicorn.run(
app,
host=args.host,
port=args.port,
log_level="debug",
timeout_keep_alive=TIMEOUT_KEEP_ALIVE,
ssl_keyfile=args.ssl_keyfile,
ssl_certfile=args.ssl_certfile,
)
Example request to the server:
import requests
import time

url = "http://10.100.30.239:1222/generate"  # server address, matching the Uvicorn log below
prompt = '''There is a request from the user to create a task in task management system. Please, extract the status of this task that will be placed in Task Management System.
Example 1: Update the status of the pentest task to completed.
Answer: “completed”
Example 2: “Create a code refactoring task for Vanya. And set the status to “in progress”
Answer: “in progress”.
Example 3: “Show me the executors of the task of charting”.
Answer: “null”.
Example 4: “Create a task to buy candy, status - in progress”.
Answer: “'''
substirng = "Create a task to buy candy, status - in progress"
substring_example = {'prompt': prompt, 'substring': substirng}
s = time.time()
x = requests.post(url, json = substring_example)
e = time.time()
print(x.text)
e - s
I'm not sure exactly what's causing your problem but I have two comments that might help.
Firstly, I notice you are using --tokenizer-pool-size=100, but your CPU only has 48 cores. This means you are creating 100 Ray processes for tokenization, which could be clogging up your CPU. I would experiment with much smaller numbers (the tests use 1 and 2, for example) and see if you still get good performance, as sketched below.
Secondly, the KV cache is always enabled; --enable-prefix-caching enables the reuse of the KV cache from previous requests. Please see the following images from this tweet for some explanation.
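For instance, the server could be relaunched with a much smaller pool (a hypothetical invocation; keep the rest of your flags as they are):
python3 issue_code.py --model=meta-llama/Meta-Llama-3-8B --tokenizer-pool-size=2 --enable-prefix-caching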
Thank you so much for your comment! You're right, I had the wrong title for this issue originally, but I've now renamed it
Could you also try giving vLLM more memory?
0.3 × 80 GB = 24 GB, which leaves only ~8 GB for the KV cache since the weights take up ~16 GB.
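Roughly, as a back-of-the-envelope sketch (the 80 GB total is assumed from the arithmetic above):
gpu_total_gb = 80    # assumed 80 GB GPU
gpu_mem_util = 0.3   # --gpu-memory-utilization=0.3
weights_gb = 16      # Llama-3-8B weights in bfloat16 (~14.96 GB per your log)
kv_cache_gb = gpu_total_gb * gpu_mem_util - weights_gb  # ≈ 8 GB left for KV cache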
Yes. The same code example with --gpu-memory-utilization=0.9 has the exact same problem:
(.venv) (.venv) (base) root@pishhost-g06:/home/projects/beta-guidance/async_engine_test# python3 issue_code.py --host=10.100.30.239 --port=1222 --tensor-parallel-size=1 --model=meta-llama/Meta-Llama-3-8B --gpu-memory-utilization=0.9 --dtype=bfloat16 --max-logprobs=1 --disable-log-requests --enable-prefix-caching
/home/.venv/lib/python3.10/site-packages/huggingface_hub/file_download.py:1132: FutureWarning: `resume_download` is deprecated and will be removed in version 1.0.0. Downloads always resume when possible. If you want to force a new download, use `force_download=True`.
warnings.warn(
INFO 06-01 09:25:34 llm_engine.py:161] Initializing an LLM engine (v0.4.3) with config: model='meta-llama/Meta-Llama-3-8B', speculative_config=None, tokenizer='meta-llama/Meta-Llama-3-8B', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, rope_scaling=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.bfloat16, max_seq_len=8192, download_dir=None, load_format=LoadFormat.AUTO, tensor_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, kv_cache_dtype=auto, quantization_param_path=None, device_config=cuda, decoding_config=DecodingConfig(guided_decoding_backend='outlines'), seed=0, served_model_name=meta-llama/Meta-Llama-3-8B)
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
INFO 06-01 09:25:37 weight_utils.py:207] Using model weights format ['*.safetensors']
INFO 06-01 09:25:41 model_runner.py:146] Loading model weights took 14.9595 GB
INFO 06-01 09:25:42 gpu_executor.py:83] # GPU blocks: 27895, # CPU blocks: 2048
INFO 06-01 09:25:43 model_runner.py:854] Capturing the model for CUDA graphs. This may lead to unexpected consequences if the model is not static. To run the model in eager mode, set 'enforce_eager=True' or use '--enforce-eager' in the CLI.
INFO 06-01 09:25:43 model_runner.py:858] CUDA graphs can take additional 1~3 GiB memory per GPU. If you are running out of memory, consider decreasing `gpu_memory_utilization` or enforcing eager mode. You can also reduce the `max_num_seqs` as needed to decrease memory usage.
INFO 06-01 09:25:46 model_runner.py:924] Graph capturing finished in 3 secs.
INFO 06-01 09:25:46 block_manager_v1.py:247] Automatic prefix caching is enabled.
INFO: Started server process [860480]
INFO: Waiting for application startup.
INFO: Application startup complete.
INFO: Uvicorn running on http://10.100.30.239:1222 (Press CTRL+C to quit)
/home/projects/beta-guidance/async_engine_test/issue_code.py:59: UserWarning: Streamer is iterated outside of its context
results = [item async for item in results]
INFO 06-01 09:25:52 metrics.py:341] Avg prompt throughput: 22.2 tokens/s, Avg generation throughput: 0.2 tokens/s, Running: 0 reqs, Swapped: 0 reqs, Pending: 0 reqs, GPU KV cache usage: 0.0%, CPU KV cache usage: 0.0%.
ERROR 06-01 09:25:52 async_llm_engine.py:45] Engine background task failed
ERROR 06-01 09:25:52 async_llm_engine.py:45] Traceback (most recent call last):
ERROR 06-01 09:25:52 async_llm_engine.py:45] File "/home/vllm/vllm/engine/async_llm_engine.py", line 40, in _raise_exception_on_finish
ERROR 06-01 09:25:52 async_llm_engine.py:45] task.result()
ERROR 06-01 09:25:52 async_llm_engine.py:45] File "/home/vllm/vllm/engine/async_llm_engine.py", line 521, in run_engine_loop
ERROR 06-01 09:25:52 async_llm_engine.py:45] has_requests_in_progress = await asyncio.wait_for(
ERROR 06-01 09:25:52 async_llm_engine.py:45] File "/usr/lib/python3.10/asyncio/tasks.py", line 445, in wait_for
ERROR 06-01 09:25:52 async_llm_engine.py:45] return fut.result()
ERROR 06-01 09:25:52 async_llm_engine.py:45] File "/home/vllm/vllm/engine/async_llm_engine.py", line 495, in engine_step
ERROR 06-01 09:25:52 async_llm_engine.py:45] request_outputs = await self.engine.step_async()
ERROR 06-01 09:25:52 async_llm_engine.py:45] File "/home/vllm/vllm/engine/async_llm_engine.py", line 226, in step_async
ERROR 06-01 09:25:52 async_llm_engine.py:45] output = await self.model_executor.execute_model_async(
ERROR 06-01 09:25:52 async_llm_engine.py:45] File "/home/vllm/vllm/executor/gpu_executor.py", line 117, in execute_model_async
ERROR 06-01 09:25:52 async_llm_engine.py:45] output = await make_async(self.driver_worker.execute_model
ERROR 06-01 09:25:52 async_llm_engine.py:45] File "/usr/lib/python3.10/concurrent/futures/thread.py", line 58, in run
ERROR 06-01 09:25:52 async_llm_engine.py:45] result = self.fn(*self.args, **self.kwargs)
ERROR 06-01 09:25:52 async_llm_engine.py:45] File "/home/.venv/lib/python3.10/site-packages/torch/utils/_contextlib.py", line 115, in decorate_context
ERROR 06-01 09:25:52 async_llm_engine.py:45] return func(*args, **kwargs)
ERROR 06-01 09:25:52 async_llm_engine.py:45] File "/home/vllm/vllm/worker/worker.py", line 272, in execute_model
ERROR 06-01 09:25:52 async_llm_engine.py:45] output = self.model_runner.execute_model(seq_group_metadata_list,
ERROR 06-01 09:25:52 async_llm_engine.py:45] File "/home/.venv/lib/python3.10/site-packages/torch/utils/_contextlib.py", line 115, in decorate_context
ERROR 06-01 09:25:52 async_llm_engine.py:45] return func(*args, **kwargs)
ERROR 06-01 09:25:52 async_llm_engine.py:45] File "/home/vllm/vllm/worker/model_runner.py", line 738, in execute_model
ERROR 06-01 09:25:52 async_llm_engine.py:45] output = self.model.sample(
ERROR 06-01 09:25:52 async_llm_engine.py:45] File "/home/vllm/vllm/model_executor/models/llama.py", line 378, in sample
ERROR 06-01 09:25:52 async_llm_engine.py:45] next_tokens = self.sampler(logits, sampling_metadata)
ERROR 06-01 09:25:52 async_llm_engine.py:45] File "/home/.venv/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
ERROR 06-01 09:25:52 async_llm_engine.py:45] return self._call_impl(*args, **kwargs)
ERROR 06-01 09:25:52 async_llm_engine.py:45] File "/home/.venv/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
ERROR 06-01 09:25:52 async_llm_engine.py:45] return forward_call(*args, **kwargs)
ERROR 06-01 09:25:52 async_llm_engine.py:45] File "/home/vllm/vllm/model_executor/layers/sampler.py", line 112, in forward
ERROR 06-01 09:25:52 async_llm_engine.py:45] prompt_logprobs, sample_logprobs = _get_logprobs(
ERROR 06-01 09:25:52 async_llm_engine.py:45] File "/home/vllm/vllm/model_executor/layers/sampler.py", line 760, in _get_logprobs
ERROR 06-01 09:25:52 async_llm_engine.py:45] assert len(next_token_ids) == len(query_indices)
ERROR 06-01 09:25:52 async_llm_engine.py:45] AssertionError
Exception in callback functools.partial(<function _raise_exception_on_finish at 0x7f83b0939bd0>, error_callback=<bound method AsyncLLMEngine._error_callback of <vllm.engine.async_llm_engine.AsyncLLMEngine object at 0x7f83ae414dc0>>)
handle: <Handle functools.partial(<function _raise_exception_on_finish at 0x7f83b0939bd0>, error_callback=<bound method AsyncLLMEngine._error_callback of <vllm.engine.async_llm_engine.AsyncLLMEngine object at 0x7f83ae414dc0>>)>
Traceback (most recent call last):
File "/home/vllm/vllm/engine/async_llm_engine.py", line 40, in _raise_exception_on_finish
task.result()
File "/home/vllm/vllm/engine/async_llm_engine.py", line 521, in run_engine_loop
has_requests_in_progress = await asyncio.wait_for(
File "/usr/lib/python3.10/asyncio/tasks.py", line 445, in wait_for
return fut.result()
File "/home/vllm/vllm/engine/async_llm_engine.py", line 495, in engine_step
request_outputs = await self.engine.step_async()
File "/home/vllm/vllm/engine/async_llm_engine.py", line 226, in step_async
output = await self.model_executor.execute_model_async(
File "/home/vllm/vllm/executor/gpu_executor.py", line 117, in execute_model_async
output = await make_async(self.driver_worker.execute_model
File "/usr/lib/python3.10/concurrent/futures/thread.py", line 58, in run
result = self.fn(*self.args, **self.kwargs)
File "/home/.venv/lib/python3.10/site-packages/torch/utils/_contextlib.py", line 115, in decorate_context
return func(*args, **kwargs)
File "/home/vllm/vllm/worker/worker.py", line 272, in execute_model
output = self.model_runner.execute_model(seq_group_metadata_list,
File "/home/.venv/lib/python3.10/site-packages/torch/utils/_contextlib.py", line 115, in decorate_context
return func(*args, **kwargs)
File "/home/vllm/vllm/worker/model_runner.py", line 738, in execute_model
output = self.model.sample(
File "/home/vllm/vllm/model_executor/models/llama.py", line 378, in sample
next_tokens = self.sampler(logits, sampling_metadata)
File "/home/.venv/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
File "/home/.venv/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
return forward_call(*args, **kwargs)
File "/home/vllm/vllm/model_executor/layers/sampler.py", line 112, in forward
prompt_logprobs, sample_logprobs = _get_logprobs(
File "/home/vllm/vllm/model_executor/layers/sampler.py", line 760, in _get_logprobs
assert len(next_token_ids) == len(query_indices)
AssertionError
The above exception was the direct cause of the following exception:
Traceback (most recent call last):
File "uvloop/cbhandles.pyx", line 63, in uvloop.loop.Handle._run
File "/home/vllm/vllm/engine/async_llm_engine.py", line 47, in _raise_exception_on_finish
raise AsyncEngineDeadError(
vllm.engine.async_llm_engine.AsyncEngineDeadError: Task finished unexpectedly. This should never happen! Please open an issue on Github. See stack trace above for the actual cause.
INFO: 10.100.30.239:58040 - "POST /generate HTTP/1.1" 500 Internal Server Error
ERROR: Exception in ASGI application
Traceback (most recent call last):
File "/home/.venv/lib/python3.10/site-packages/uvicorn/protocols/http/httptools_impl.py", line 399, in run_asgi
result = await app( # type: ignore[func-returns-value]
File "/home/.venv/lib/python3.10/site-packages/uvicorn/middleware/proxy_headers.py", line 70, in __call__
return await self.app(scope, receive, send)
File "/home/.venv/lib/python3.10/site-packages/fastapi/applications.py", line 1054, in __call__
await super().__call__(scope, receive, send)
File "/home/.venv/lib/python3.10/site-packages/starlette/applications.py", line 123, in __call__
await self.middleware_stack(scope, receive, send)
File "/home/.venv/lib/python3.10/site-packages/starlette/middleware/errors.py", line 186, in __call__
raise exc
File "/home/.venv/lib/python3.10/site-packages/starlette/middleware/errors.py", line 164, in __call__
await self.app(scope, receive, _send)
File "/home/.venv/lib/python3.10/site-packages/starlette/middleware/exceptions.py", line 65, in __call__
await wrap_app_handling_exceptions(self.app, conn)(scope, receive, send)
File "/home/.venv/lib/python3.10/site-packages/starlette/_exception_handler.py", line 64, in wrapped_app
raise exc
File "/home/.venv/lib/python3.10/site-packages/starlette/_exception_handler.py", line 53, in wrapped_app
await app(scope, receive, sender)
File "/home/.venv/lib/python3.10/site-packages/starlette/routing.py", line 756, in __call__
await self.middleware_stack(scope, receive, send)
File "/home/.venv/lib/python3.10/site-packages/starlette/routing.py", line 776, in app
await route.handle(scope, receive, send)
File "/home/.venv/lib/python3.10/site-packages/starlette/routing.py", line 297, in handle
await self.app(scope, receive, send)
File "/home/.venv/lib/python3.10/site-packages/starlette/routing.py", line 77, in app
await wrap_app_handling_exceptions(app, request)(scope, receive, send)
File "/home/.venv/lib/python3.10/site-packages/starlette/_exception_handler.py", line 64, in wrapped_app
raise exc
File "/home/.venv/lib/python3.10/site-packages/starlette/_exception_handler.py", line 53, in wrapped_app
await app(scope, receive, sender)
File "/home/.venv/lib/python3.10/site-packages/starlette/routing.py", line 72, in app
response = await func(request)
File "/home/.venv/lib/python3.10/site-packages/fastapi/routing.py", line 278, in app
raw_response = await run_endpoint_function(
File "/home/.venv/lib/python3.10/site-packages/fastapi/routing.py", line 191, in run_endpoint_function
return await dependant.call(**values)
File "/home/projects/beta-guidance/async_engine_test/issue_code.py", line 156, in generate
chosed_options = await fast_logprobs(common_part_encoded,
File "/home/projects/beta-guidance/async_engine_test/issue_code.py", line 59, in fast_logprobs
results = [item async for item in results]
File "/home/projects/beta-guidance/async_engine_test/issue_code.py", line 59, in <listcomp>
results = [item async for item in results]
File "/home/.venv/lib/python3.10/site-packages/aiostream/stream/advanced.py", line 75, in base_combine
result = task.result()
File "/home/.venv/lib/python3.10/site-packages/aiostream/stream/advanced.py", line 75, in base_combine
result = task.result()
File "/home/vllm/vllm/engine/async_llm_engine.py", line 662, in generate
async for output in self._process_request(
File "/home/vllm/vllm/engine/async_llm_engine.py", line 769, in _process_request
raise e
File "/home/vllm/vllm/engine/async_llm_engine.py", line 765, in _process_request
async for request_output in stream:
File "/home/vllm/vllm/engine/async_llm_engine.py", line 80, in __anext__
raise result
File "/home/vllm/vllm/engine/async_llm_engine.py", line 40, in _raise_exception_on_finish
task.result()
File "/home/vllm/vllm/engine/async_llm_engine.py", line 521, in run_engine_loop
has_requests_in_progress = await asyncio.wait_for(
File "/usr/lib/python3.10/asyncio/tasks.py", line 445, in wait_for
return fut.result()
File "/home/vllm/vllm/engine/async_llm_engine.py", line 495, in engine_step
request_outputs = await self.engine.step_async()
File "/home/vllm/vllm/engine/async_llm_engine.py", line 226, in step_async
output = await self.model_executor.execute_model_async(
File "/home/vllm/vllm/executor/gpu_executor.py", line 117, in execute_model_async
output = await make_async(self.driver_worker.execute_model
File "/usr/lib/python3.10/concurrent/futures/thread.py", line 58, in run
result = self.fn(*self.args, **self.kwargs)
File "/home/.venv/lib/python3.10/site-packages/torch/utils/_contextlib.py", line 115, in decorate_context
return func(*args, **kwargs)
File "/home/vllm/vllm/worker/worker.py", line 272, in execute_model
output = self.model_runner.execute_model(seq_group_metadata_list,
File "/home/.venv/lib/python3.10/site-packages/torch/utils/_contextlib.py", line 115, in decorate_context
return func(*args, **kwargs)
File "/home/vllm/vllm/worker/model_runner.py", line 738, in execute_model
output = self.model.sample(
File "/home/vllm/vllm/model_executor/models/llama.py", line 378, in sample
next_tokens = self.sampler(logits, sampling_metadata)
File "/home/.venv/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
File "/home/.venv/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
return forward_call(*args, **kwargs)
File "/home/vllm/vllm/model_executor/layers/sampler.py", line 112, in forward
prompt_logprobs, sample_logprobs = _get_logprobs(
File "/home/vllm/vllm/model_executor/layers/sampler.py", line 760, in _get_logprobs
assert len(next_token_ids) == len(query_indices)
AssertionError
I've encountered an inconsistency where smaller prompts work fine but larger ones trigger an unexpected error. Here's a breakdown of my observations:
Using the following command:
python3 issue_code.py --host=10.100.30.239 --port=1222 --tensor-parallel-size=1 --model=meta-llama/Meta-Llama-3-8B --gpu-memory-utilization=0.9 --dtype=bfloat16 --max-logprobs=1 --disable-log-requests --enable-prefix-caching
And sending requests with small prompts like this:
import time
import requests

url = "http://10.100.30.239:1222/generate"  # as in the earlier example
data = {'prompt': 'The weather is amazing It is so ', 'substring': 'good day bad day, I hate it'}
x = requests.post(url, json=data)
Repeatedly executing these requests does not produce any errors:
for _ in range(100):
    status_code = requests.post(url, json=data).status_code
The output shows normal operation, but with GPU KV cache usage at 0.0% (strange, because the prompts share the same prefix):
INFO 06-01 09:55:30 metrics.py:341] Avg prompt throughput: 2956.0 tokens/s, Avg generation throughput: 245.4 tokens/s, Running: 0 reqs, Swapped: 0 reqs, Pending: 0 reqs, GPU KV cache usage: 0.0%, CPU KV cache usage: 0.0%.
When using a larger prompt (already shown above):
prompt = '''There is a request from the user to create a task in task management system. Please, extract the status of this task that will be placed in Task Management System.
Example 1: Update the status of the pentest task to completed.
Answer: “completed”
Example 2: “Create a code refactoring task for Vanya. And set the status to “in progress”
Answer: “in progress”.
Example 3: “Show me the executors of the task of charting”.
Answer: “null”.
Example 4: “Create a task to buy candy, status - in progress”.
Answer: “'''
substirng = "Create a task to buy candy, status - in progress"
substring_example = {'prompt': prompt, 'substring': substring}
x = requests.post(url, json=substring_example)
I encounter the following exception:
vllm.engine.async_llm_engine.AsyncEngineDeadError: Task finished unexpectedly. This should never happen! Please open an issue on Github. See stack trace above for the actual cause.
The exception happens in vllm/model_executor/layers/sampler.py, in the _get_logprobs() function:
def _get_logprobs(
    logprobs: torch.Tensor,
    sampling_metadata: SamplingMetadata,
    sample_results: SampleResultType,
) -> Tuple[List[Optional[PromptLogprobs]], List[SampleLogprobs]]:
    """Return sample lobprobs and prompt logprobs.

    The logic consists of 3 parts.
    - Select indices to compute logprob from, ranks of token ids, and
      the top k token ids from logprobs.
    - Compute prompt logprobs if required.
    - Compute sample logprobs if required.

    Args:
        logprobs: (num_query_tokens_across_batch, num_vocab). Each query token's
            logprob per vocab. Sequence groups' query tokens are batched in a
            single flattened tensor. For example, assuming there are N
            seq groups, it is sorted by prefill tokens for seq_group_1 (if
            prompt logprob is enabled), decode tokens for seq_group_1 (if
            sampling is required), prefill tokens for seq_group_2, ...
        sampling_metadata: The sampling metadata.
        sample_results: (num_seq_groups) The tuple of (next_token_ids,
            parent_ids) for each sequence group. When beam search is enabled,
            sample_results can contain different number of seq_ids from
            sampling_metadata.seq_groups. It is because beam search creates
            2 * BEAM_WIDTH number of samples (whereas there are only up to
            BEAM_WIDTH number of seq_ids).

    Returns:
        A tuple of prompt and sample logprobs per sequence group in a batch.
    """
    # The index of query token to calculate logprobs. It includes both
    # prompt and sample logprob indices.
    query_indices: List[int] = []
    # The next token ids to get the logprob value from.
    next_token_ids: List[int] = []
    # The largest requested number of logprobs. We find logprobs as many as the
    # largest num logprobs in this API.
    largest_num_logprobs = 1

    # Select indices to compute logprob from, ranks of token ids, and the top
    # k token ids from logprobs.
    for (seq_group, sample_result) in zip(sampling_metadata.seq_groups,
                                          sample_results):
        sampling_params = seq_group.sampling_params

        # Update indices and tokens for prompt logprobs.
        if (seq_group.is_prompt
                and sampling_params.prompt_logprobs is not None):
            largest_num_logprobs = max(largest_num_logprobs,
                                       sampling_params.prompt_logprobs)
            next_prompt_tokens = _get_next_prompt_tokens(seq_group)
            query_indices.extend(seq_group.prompt_logprob_indices)
            next_token_ids.extend(next_prompt_tokens)

        # Update indices and next tokenes for sample logprob.
        if seq_group.do_sample:
            token_ids, parent_seq_ids = sample_result
            # NOTE: We cannot directly use sample_indices because
            # sample_indices only contain parent seq_ids of a previous step.
            # The current step may have different number of seq_ids, and
            # we can obtain it from `sample_result[1]`.
            query_idx = seq_group.sample_indices[0]
            query_indices.extend(
                [query_idx + parent_id for parent_id in parent_seq_ids])
            next_token_ids.extend(token_ids)

            if sampling_params.logprobs is not None:
                largest_num_logprobs = max(largest_num_logprobs,
                                           sampling_params.logprobs)

    assert len(next_token_ids) == len(query_indices)
    # there is also code below....
I tried to add the following lines of code:
print(len(next_token_ids))
print(len(query_indices))
print(next_token_ids)
print(query_indices)
Here is what I get as a result:
... some output above ...
131
131
[3947, 374, 264, 1715, 505, 279, 1217, 311, 1893, 264, 3465, 304, 3465, 6373, 1887, 13, 5321, 11, 8819, 279, 2704, 315, 420, 3465, 430, 690, 387, 9277, 304, 5546, 9744, 744, 627, 13617, 220, 16, 25, 5666, 279, 2704, 315, 279, 20801, 478, 3465, 311, 8308, 627, 16533, 25, 1054, 35835, 7663, 13617, 220, 17, 25, 1054, 4110, 264, 2082, 2098, 76507, 3465, 369, 650, 25041, 13, 1628, 743, 279, 2704, 311, 1054, 258, 5208, 89874, 16533, 25, 1054, 258, 5208, 15397, 13617, 220, 18, 25, 1054, 7968, 757, 279, 3969, 9663, 315, 279, 3465, 315, 9676, 287, 113068, 16533, 25, 1054, 2994, 15397, 13617, 220, 19, 25, 1054, 4110, 264, 3465, 311, 3780, 32656, 11, 2704, 482, 304, 5208, 113068, 16533, 25, 1054, 998, 3780, 32656, 11, 2704, 482]
[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130]
INFO 06-01 10:57:03 metrics.py:341] Avg prompt throughput: 24.7 tokens/s, Avg generation throughput: 0.2 tokens/s, Running: 0 reqs, Swapped: 0 reqs, Pending: 0 reqs, GPU KV cache usage: 0.0%, CPU KV cache usage: 0.0%.
19
18
[3947, 374, 264, 1715, 505, 279, 1217, 311, 1893, 264, 3465, 304, 3465, 6373, 1887, 13, 5321, 11, 2704]
[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17]
This is followed by the ERROR (the assertion failed):
... some output above ...
If you decode the last next_token_ids list with the Llama 3 tokenizer, you get:
There is a request from the user to create a task in task management system. Please, status
The last token is not part of the prompt; it is the result of LLM generation (token 2704, i.e. "status"). The prompt is the same as in my previous comment.
It appears there is an issue with how query_indices and next_token_ids are populated: a fragment of the prompt seems to be missing.
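To make the suspected mismatch concrete, here is an illustrative sketch using the lengths from the printout above (the names mirror _get_logprobs; the numbers are just the observed 19 vs. 18):
# Illustrative only: prompt logprobs were requested for 19 prompt tokens,
# but only 18 query positions were scheduled for logprob computation.
next_token_ids = list(range(19))  # one "next token" per prompt position
query_indices = list(range(18))   # one index per *computed* prompt position
assert len(next_token_ids) == len(query_indices)  # AssertionError, as observed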
I'd appreciate any help in identifying where the mistake was made, and I'll work on making a pull request to fix it.
I think there is an issue with the combination of Prefix Caching and Logprobs if the user requests prompt logprobs. I do not think we have a mechanism right now to handle this.
Could you make a simple example with one of our officially supported APIs (like LLM or the server we support) so we can isolate the issue?
Yes of course!
Server code:
import argparse
import uvicorn
from fastapi import FastAPI, Request
from fastapi.responses import JSONResponse, Response
from vllm.engine.arg_utils import AsyncEngineArgs
from vllm.engine.async_llm_engine import AsyncLLMEngine
from vllm.sampling_params import SamplingParams
from vllm.utils import random_uuid
from vllm import TokensPrompt
from aiostream import stream
import torch
TIMEOUT_KEEP_ALIVE = 5 # seconds.
TIMEOUT_TO_PREVENT_DEADLOCK = 1 # seconds.
app = FastAPI()
engine = None
@app.post("/generate")
async def generate(request: Request) -> Response:
assert engine is not None
request_dict = await request.json()
substring_context = request_dict.pop('prompt_token_ids', None)
max_tokens = request_dict.pop('max_tokens', None)
prompt_logprobs = request_dict.pop('prompt_logprobs', None)
request_id = random_uuid()
# Prepare the prompt for the model
tokens_prompt: TokensPrompt = {
'prompt_token_ids': substring_context,
'multi_modal_data': None
}
sampling_params = SamplingParams(
max_tokens = max_tokens,
prompt_logprobs = prompt_logprobs
)
# Use the model to generate logits for the prompt
res = engine.generate(request_id=request_id,
inputs=tokens_prompt,
sampling_params=sampling_params)
res = [item async for item in res]
return res
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument("--host", type=str, default=None)
parser.add_argument("--port", type=int, default=8000)
parser.add_argument("--ssl-keyfile", type=str, default=None)
parser.add_argument("--ssl-certfile", type=str, default=None)
parser = AsyncEngineArgs.add_cli_args(parser)
args = parser.parse_args()
engine_args: AsyncEngineArgs = AsyncEngineArgs.from_cli_args(args) # type: ignore
# Sets default for the model (`facebook/opt-125m`)
engine = AsyncLLMEngine.from_engine_args(engine_args)
uvicorn.run(
app,
host=args.host,
port=args.port,
log_level="debug",
timeout_keep_alive=TIMEOUT_KEEP_ALIVE,
ssl_keyfile=args.ssl_keyfile,
ssl_certfile=args.ssl_certfile,
)
Client code:
import requests

url = "http://10.100.30.239:1222/generate"  # assumed; adjust to your server address

data1 = {
'prompt_token_ids': [128000, 3947, 374, 264, 1715, 505, 279, 1217, 311, 1893, 264, 3465, 304, 3465, 6373, 1887, 13, 5321, 11, 8819, 279, 2704, 315, 420, 3465, 430, 690, 387, 9277, 304, 5546, 9744, 744, 627, 13617, 220, 16, 25, 5666, 279, 2704, 315, 279, 20801, 478, 3465, 311, 8308, 627, 16533, 25, 1054, 35835, 7663, 13617, 220, 17, 25, 1054, 4110, 264, 2082, 2098, 76507, 3465, 369, 650, 25041, 13, 1628, 743, 279, 2704, 311, 1054, 258, 5208, 89874, 16533, 25, 1054, 258, 5208, 15397, 13617, 220, 18, 25, 1054, 7968, 757, 279, 3969, 9663, 315, 279, 3465, 315, 9676, 287, 113068, 16533, 25, 1054, 2994, 15397, 13617, 220, 19, 25, 1054, 4110, 264, 3465, 311, 3780, 32656, 11, 2704, 482, 304, 5208, 113068, 16533, 25, 1054, 12],
'max_tokens': 1,
'prompt_logprobs': 1
}
data2 = {
'prompt_token_ids': [128000, 3947, 374, 264, 1715, 505, 279, 1217, 311, 1893, 264, 3465, 304, 3465, 6373, 1887, 13, 5321, 11, 8819, 279, 2704, 315, 420, 3465, 430, 690, 387, 9277, 304, 5546, 9744, 744, 627, 13617, 220, 16, 25, 5666, 279, 2704, 315, 279, 20801, 478, 3465, 311, 8308, 627, 16533, 25, 1054, 35835, 7663, 13617, 220, 17, 25, 1054, 4110, 264, 2082, 2098, 76507, 3465, 369, 650, 25041, 13, 1628, 743, 279, 2704, 311, 1054, 258, 5208, 89874, 16533, 25, 1054, 258, 5208, 15397, 13617, 220, 18, 25, 1054, 7968, 757, 279, 3969, 9663, 315, 279, 3465, 315, 9676, 287, 113068, 16533, 25, 1054, 2994, 15397, 13617, 220, 19, 25, 1054, 4110, 264, 3465, 311, 3780, 32656, 11, 2704, 482, 304, 5208, 113068, 16533, 25, 1054, 12, 304],
'max_tokens': 1,
'prompt_logprobs': 1
}
data = [data1, data2]
for request_data in data:
    requests.post(url, json=request_data)
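For reference, a minimal offline reproduction with the supported LLM API might look like the sketch below (untested here; the model and flags mirror my server setup, and the two prompts share a prefix so the second request can hit the prefix cache while also requesting prompt logprobs):
from vllm import LLM, SamplingParams

llm = LLM(model="meta-llama/Meta-Llama-3-8B",
          dtype="bfloat16",
          enable_prefix_caching=True,
          max_logprobs=1)
params = SamplingParams(max_tokens=1, prompt_logprobs=1)

shared_prefix = "There is a request from the user to create a task"
prompts = [shared_prefix + " in task management system.",
           shared_prefix + " and set its status to done."]
# Generate sequentially so the second prompt can reuse cached prefix blocks
for p in prompts:
    out = llm.generate(p, params)[0]
    print(out.prompt_logprobs)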
Thanks! Would you be willing to help work on the issue if I provide some guidance?
Yes, sounds great!
I would like to share intermediate results of my investigation of this error. Running the last code snippets provided by @kezouke has led to a different error from the one reported at the root of the issue.
1) The original error was the failed assertion len(next_token_ids) == len(query_indices) in sampler.py.
2) However, the currently provided code (at least on my identical setup) leads to another error:
RuntimeError: CUDA error: device-side assert triggered
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1.
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.
Hopefully, this new error is the root cause of the original problem. The actual CUDA error that I see here is:
...
../aten/src/ATen/native/cuda/Indexing.cu:1289: indexSelectLargeIndex: block: [466,0,0], thread: [89,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
../aten/src/ATen/native/cuda/Indexing.cu:1289: indexSelectLargeIndex: block: [466,0,0], thread: [90,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
../aten/src/ATen/native/cuda/Indexing.cu:1289: indexSelectLargeIndex: block: [466,0,0], thread: [91,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
../aten/src/ATen/native/cuda/Indexing.cu:1289: indexSelectLargeIndex: block: [466,0,0], thread: [92,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
../aten/src/ATen/native/cuda/Indexing.cu:1289: indexSelectLargeIndex: block: [466,0,0], thread: [93,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
../aten/src/ATen/native/cuda/Indexing.cu:1289: indexSelectLargeIndex: block: [466,0,0], thread: [94,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
../aten/src/ATen/native/cuda/Indexing.cu:1289: indexSelectLargeIndex: block: [466,0,0], thread: [95,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
../aten/src/ATen/native/cuda/Indexing.cu:1289: indexSelectLargeIndex: block: [462,0,0], thread: [0,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
../aten/src/ATen/native/cuda/Indexing.cu:1289: indexSelectLargeIndex: block: [462,0,0], thread: [1,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
../aten/src/ATen/native/cuda/Indexing.cu:1289: indexSelectLargeIndex: block: [462,0,0], thread: [2,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
../aten/src/ATen/native/cuda/Indexing.cu:1289: indexSelectLargeIndex: block: [462,0,0], thread: [3,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
...
This error seems similar to the one from #4019: at least the end of the path is much the same, suggesting that this one could be related to PyTorch (...pytorch/aten/src/ATen/native/hip...).
If the server is run without the --enforce-eager flag, the error is reported in the following place:
[SHARED TRACEBACK]
...
File "/home/bot_env/lib/python3.10/site-packages/vllm/worker/model_runner.py", line 738, in execute_model
output = self.model.sample(
...
File "/home/bot_env/lib/python3.10/site-packages/vllm/model_executor/layers/sampler.py", line 655, in _sample
return _sample_with_torch(
File "/home/bot_env/lib/python3.10/site-packages/vllm/model_executor/layers/sampler.py", line 544, in _sample_with_torch
sample_results = _random_sample(seq_groups,
File "/home/bot_env/lib/python3.10/site-packages/vllm/model_executor/layers/sampler.py", line 324, in _random_sample
random_samples = random_samples.cpu()
If the server is run with the --enforce-eager flag, the error is reported earlier, during the attention computation:
[SHARED TRACEBACK]
...
File "/home/bot_env/lib/python3.10/site-packages/vllm/worker/model_runner.py", line 728, in execute_model
hidden_states = model_executable(**execute_model_kwargs)
...
File "/home/bot_env/lib/python3.10/site-packages/vllm/attention/layer.py", line 89, in forward
return self.impl.forward(query, key, value, kv_cache, attn_metadata,
File "/home/bot_env/lib/python3.10/site-packages/vllm/attention/backends/flash_attn.py", line 320, in forward
out = flash_attn_varlen_func(
File "/home/bot_env/lib/python3.10/site-packages/vllm_flash_attn/flash_attn_interface.py", line 1066, in flash_attn_varlen_func
return FlashAttnVarlenFunc.apply(
File "/home/bot_env/lib/python3.10/site-packages/torch/autograd/function.py", line 598, in apply
return super().apply(*args, **kwargs) # type: ignore[misc]
File "/home/bot_env/lib/python3.10/site-packages/vllm_flash_attn/flash_attn_interface.py", line 581, in forward
out, q, k, v, out_padded, softmax_lse, S_dmask, rng_state = _flash_attn_varlen_forward(
File "/home/bot_env/lib/python3.10/site-packages/vllm_flash_attn/flash_attn_interface.py", line 86, in _flash_attn_varlen_forward
out, q, k, v, out_padded, softmax_lse, S_dmask, rng_state = flash_attn_cuda.varlen_fwd(
I would like to emphasize that the error is the same in both cases, and it is CUDA-related.
I have also noticed that the error can be reproduced even with a single client request and a much smaller token list:
import requests
url = "http://127.0.0.1:8000/generate"
data = {
    'prompt_token_ids': [128000, 3947, 374, 264],
    'max_tokens': 1,
    'prompt_logprobs': 1
}
requests.post(url, json=data)
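To get a more useful stack for the device-side assert, the same server can also be relaunched with blocking kernel launches, as the CUDA message itself suggests (other flags as before):
CUDA_LAUNCH_BLOCKING=1 python3 issue_code.py --model=meta-llama/Meta-Llama-3-8B --enable-prefix-caching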
Moreover, we can add logging on the server side so that software-level feedback is easier to analyze (the CUDA error shared above will, unfortunately, not appear there):
import argparse
import uvicorn
from fastapi import FastAPI, Request
from fastapi.responses import JSONResponse, Response
from vllm.engine.arg_utils import AsyncEngineArgs
from vllm.engine.async_llm_engine import AsyncLLMEngine
from vllm.sampling_params import SamplingParams
from vllm.utils import random_uuid
from vllm import TokensPrompt
from aiostream import stream
import logging
import torch
# Configure logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s %(levelname)s %(message)s',
    handlers=[
        logging.FileHandler("fastapi_tracebacks.log"),
        logging.StreamHandler()
    ]
)

TIMEOUT_KEEP_ALIVE = 5  # seconds.
TIMEOUT_TO_PREVENT_DEADLOCK = 1  # seconds.

app = FastAPI()
engine = None


@app.exception_handler(Exception)
async def unicorn_exception_handler(request: Request, exc: Exception):
    logging.error(f"Exception occurred: {exc}", exc_info=True)
    return JSONResponse(
        status_code=500,
        content={"message": f"An error occurred: {exc}"},
    )


@app.post("/generate")
async def generate(request: Request) -> Response:
    request_dict = await request.json()
    substring_context = request_dict.pop('prompt_token_ids', None)
    max_tokens = request_dict.pop('max_tokens', None)
    prompt_logprobs = request_dict.pop('prompt_logprobs', None)
    request_id = random_uuid()
    # Prepare the prompt for the model
    tokens_prompt: TokensPrompt = {
        'prompt_token_ids': substring_context,
        'multi_modal_data': None
    }
    sampling_params = SamplingParams(
        max_tokens=max_tokens,
        prompt_logprobs=prompt_logprobs
    )
    # Use the model to generate logits for the prompt
    res = engine.generate(request_id=request_id,
                          inputs=tokens_prompt,
                          sampling_params=sampling_params)
    res = [item async for item in res]
    # RequestOutput objects are not JSON-serializable; return their reprs
    return JSONResponse([str(item) for item in res])


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--host", type=str, default=None)
    parser.add_argument("--port", type=int, default=8100)
    parser.add_argument("--ssl-keyfile", type=str, default=None)
    parser.add_argument("--ssl-certfile", type=str, default=None)
    parser = AsyncEngineArgs.add_cli_args(parser)
    args = parser.parse_args()
    engine_args: AsyncEngineArgs = AsyncEngineArgs.from_cli_args(args)  # type: ignore
    # Defaults to the `facebook/opt-125m` model
    engine = AsyncLLMEngine.from_engine_args(engine_args)
    uvicorn.run(
        app,
        host=args.host,
        port=args.port,
        log_level="debug",
        timeout_keep_alive=TIMEOUT_KEEP_ALIVE,
        ssl_keyfile=args.ssl_keyfile,
        ssl_certfile=args.ssl_certfile,
    )
@robertgshaw2-neuralmagic I am ready to continue investigating and resolving this error together with @kezouke. At this point your guidance is essential, as I am not quite sure what exactly I/we should review next to finally localize the error. Thank you in advance for your answer!
@robertgshaw2-neuralmagic Could you please share the guidance so I can proceed with the work?
🐛 Describe the bug
When attempting to enable the KV cache in the vLLM library by adding the --enable-prefix-caching parameter to the AsyncLLMEngine, I encountered the exception shown above. This issue occurs despite following the documentation's instructions. The full console output is given above.