Closed: maxin9966 closed this issue 1 month ago
I have a similar issue.
vllm 0.4.3, CUDA Driver Version: 535.129.03, RTX 4090
cmd: export CUDA_VISIBLE_DEVICES=0 && nohup python -m vllm.entrypoints.openai.api_server --model models/Qwen2-7B-Instruct-GPTQ-Int4 --host 192.168.168.242 --port 8001 --served-model-name "gpt-3.5-turbo" --tensor-parallel-size 1 --api-key "sk-FqMzXBwjitG7X3xMtVFuKEYDJ4dwQ9iD" --max-model-len 19000 --enable-prefix-caching > llm.log 2>&1 &
ERROR 06-12 08:59:48 async_llm_engine.py:45] Engine background task failed
ERROR 06-12 08:59:48 async_llm_engine.py:45] Traceback (most recent call last):
ERROR 06-12 08:59:48 async_llm_engine.py:45] File "/home/tlserver/workspace/imitater/.venv/lib/python3.11/site-packages/vllm/engine/async_llm_engine.py", line 40, in _raise_exception_on_finish
ERROR 06-12 08:59:48 async_llm_engine.py:45] task.result()
ERROR 06-12 08:59:48 async_llm_engine.py:45] File "/home/tlserver/workspace/imitater/.venv/lib/python3.11/site-packages/vllm/engine/async_llm_engine.py", line 521, in run_engine_loop
ERROR 06-12 08:59:48 async_llm_engine.py:45] has_requests_in_progress = await asyncio.wait_for(
ERROR 06-12 08:59:48 async_llm_engine.py:45] ^^^^^^^^^^^^^^^^^^^^^^^
ERROR 06-12 08:59:48 async_llm_engine.py:45] File "/home/tlserver/.pyenv/versions/3.11.8/lib/python3.11/asyncio/tasks.py", line 489, in wait_for
ERROR 06-12 08:59:48 async_llm_engine.py:45] return fut.result()
ERROR 06-12 08:59:48 async_llm_engine.py:45] ^^^^^^^^^^^^
ERROR 06-12 08:59:48 async_llm_engine.py:45] File "/home/tlserver/workspace/imitater/.venv/lib/python3.11/site-packages/vllm/engine/async_llm_engine.py", line 495, in engine_step
ERROR 06-12 08:59:48 async_llm_engine.py:45] request_outputs = await self.engine.step_async()
ERROR 06-12 08:59:48 async_llm_engine.py:45] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
ERROR 06-12 08:59:48 async_llm_engine.py:45] File "/home/tlserver/workspace/imitater/.venv/lib/python3.11/site-packages/vllm/engine/async_llm_engine.py", line 226, in step_async
ERROR 06-12 08:59:48 async_llm_engine.py:45] output = await self.model_executor.execute_model_async(
ERROR 06-12 08:59:48 async_llm_engine.py:45] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
ERROR 06-12 08:59:48 async_llm_engine.py:45] File "/home/tlserver/workspace/imitater/.venv/lib/python3.11/site-packages/vllm/executor/gpu_executor.py", line 117, in execute_model_async
ERROR 06-12 08:59:48 async_llm_engine.py:45] output = await make_async(self.driver_worker.execute_model
ERROR 06-12 08:59:48 async_llm_engine.py:45] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
ERROR 06-12 08:59:48 async_llm_engine.py:45] File "/home/tlserver/.pyenv/versions/3.11.8/lib/python3.11/concurrent/futures/thread.py", line 58, in run
ERROR 06-12 08:59:48 async_llm_engine.py:45] result = self.fn(*self.args, **self.kwargs)
ERROR 06-12 08:59:48 async_llm_engine.py:45] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
ERROR 06-12 08:59:48 async_llm_engine.py:45] File "/home/tlserver/workspace/imitater/.venv/lib/python3.11/site-packages/torch/utils/_contextlib.py", line 115, in decorate_context
ERROR 06-12 08:59:48 async_llm_engine.py:45] return func(*args, **kwargs)
ERROR 06-12 08:59:48 async_llm_engine.py:45] ^^^^^^^^^^^^^^^^^^^^^
ERROR 06-12 08:59:48 async_llm_engine.py:45] File "/home/tlserver/workspace/imitater/.venv/lib/python3.11/site-packages/vllm/worker/worker.py", line 272, in execute_model
ERROR 06-12 08:59:48 async_llm_engine.py:45] output = self.model_runner.execute_model(seq_group_metadata_list,
ERROR 06-12 08:59:48 async_llm_engine.py:45] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
ERROR 06-12 08:59:48 async_llm_engine.py:45] File "/home/tlserver/workspace/imitater/.venv/lib/python3.11/site-packages/torch/utils/_contextlib.py", line 115, in decorate_context
ERROR 06-12 08:59:48 async_llm_engine.py:45] return func(*args, **kwargs)
ERROR 06-12 08:59:48 async_llm_engine.py:45] ^^^^^^^^^^^^^^^^^^^^^
ERROR 06-12 08:59:48 async_llm_engine.py:45] File "/home/tlserver/workspace/imitater/.venv/lib/python3.11/site-packages/vllm/worker/model_runner.py", line 738, in execute_model
ERROR 06-12 08:59:48 async_llm_engine.py:45] output = self.model.sample(
ERROR 06-12 08:59:48 async_llm_engine.py:45] ^^^^^^^^^^^^^^^^^^
ERROR 06-12 08:59:48 async_llm_engine.py:45] File "/home/tlserver/workspace/imitater/.venv/lib/python3.11/site-packages/vllm/model_executor/models/qwen2.py", line 345, in sample
ERROR 06-12 08:59:48 async_llm_engine.py:45] next_tokens = self.sampler(logits, sampling_metadata)
ERROR 06-12 08:59:48 async_llm_engine.py:45] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
ERROR 06-12 08:59:48 async_llm_engine.py:45] File "/home/tlserver/workspace/imitater/.venv/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
ERROR 06-12 08:59:48 async_llm_engine.py:45] return self._call_impl(*args, **kwargs)
ERROR 06-12 08:59:48 async_llm_engine.py:45] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
ERROR 06-12 08:59:48 async_llm_engine.py:45] File "/home/tlserver/workspace/imitater/.venv/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
ERROR 06-12 08:59:48 async_llm_engine.py:45] return forward_call(*args, **kwargs)
ERROR 06-12 08:59:48 async_llm_engine.py:45] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
ERROR 06-12 08:59:48 async_llm_engine.py:45] File "/home/tlserver/workspace/imitater/.venv/lib/python3.11/site-packages/vllm/model_executor/layers/sampler.py", line 96, in forward
ERROR 06-12 08:59:48 async_llm_engine.py:45] sample_results, maybe_sampled_tokens_tensor = _sample(
ERROR 06-12 08:59:48 async_llm_engine.py:45] ^^^^^^^^
ERROR 06-12 08:59:48 async_llm_engine.py:45] File "/home/tlserver/workspace/imitater/.venv/lib/python3.11/site-packages/vllm/model_executor/layers/sampler.py", line 655, in _sample
ERROR 06-12 08:59:48 async_llm_engine.py:45] return _sample_with_torch(
ERROR 06-12 08:59:48 async_llm_engine.py:45] ^^^^^^^^^^^^^^^^^^^
ERROR 06-12 08:59:48 async_llm_engine.py:45] File "/home/tlserver/workspace/imitater/.venv/lib/python3.11/site-packages/vllm/model_executor/layers/sampler.py", line 544, in _sample_with_torch
ERROR 06-12 08:59:48 async_llm_engine.py:45] sample_results = _random_sample(seq_groups,
ERROR 06-12 08:59:48 async_llm_engine.py:45] ^^^^^^^^^^^^^^^^^^^^^^^^^^
ERROR 06-12 08:59:48 async_llm_engine.py:45] File "/home/tlserver/workspace/imitater/.venv/lib/python3.11/site-packages/vllm/model_executor/layers/sampler.py", line 324, in _random_sample
ERROR 06-12 08:59:48 async_llm_engine.py:45] random_samples = random_samples.cpu()
ERROR 06-12 08:59:48 async_llm_engine.py:45] ^^^^^^^^^^^^^^^^^^^^
ERROR 06-12 08:59:48 async_llm_engine.py:45] RuntimeError: CUDA error: an illegal memory access was encountered
ERROR 06-12 08:59:48 async_llm_engine.py:45] CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
ERROR 06-12 08:59:48 async_llm_engine.py:45] For debugging consider passing CUDA_LAUNCH_BLOCKING=1.
ERROR 06-12 08:59:48 async_llm_engine.py:45] Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.
ERROR 06-12 08:59:48 async_llm_engine.py:45]
Exception in callback functools.partial(<function _raise_exception_on_finish at 0x7fac33874c20>, error_callback=<bound method AsyncLLMEngine._error_callback of <vllm.engine.async_llm_engine.AsyncLLMEngine object at 0x7fac30499f50>>)
handle: <Handle functools.partial(<function _raise_exception_on_finish at 0x7fac33874c20>, error_callback=<bound method AsyncLLMEngine._error_callback of <vllm.engine.async_llm_engine.AsyncLLMEngine object at 0x7fac30499f50>>)>
Traceback (most recent call last):
File "/home/tlserver/workspace/imitater/.venv/lib/python3.11/site-packages/vllm/engine/async_llm_engine.py", line 40, in _raise_exception_on_finish
task.result()
File "/home/tlserver/workspace/imitater/.venv/lib/python3.11/site-packages/vllm/engine/async_llm_engine.py", line 521, in run_engine_loop
has_requests_in_progress = await asyncio.wait_for(
^^^^^^^^^^^^^^^^^^^^^^^
File "/home/tlserver/.pyenv/versions/3.11.8/lib/python3.11/asyncio/tasks.py", line 489, in wait_for
return fut.result()
^^^^^^^^^^^^
File "/home/tlserver/workspace/imitater/.venv/lib/python3.11/site-packages/vllm/engine/async_llm_engine.py", line 495, in engine_step
request_outputs = await self.engine.step_async()
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/home/tlserver/workspace/imitater/.venv/lib/python3.11/site-packages/vllm/engine/async_llm_engine.py", line 226, in step_async
output = await self.model_executor.execute_model_async(
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/home/tlserver/workspace/imitater/.venv/lib/python3.11/site-packages/vllm/executor/gpu_executor.py", line 117, in execute_model_async
output = await make_async(self.driver_worker.execute_model
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/home/tlserver/.pyenv/versions/3.11.8/lib/python3.11/concurrent/futures/thread.py", line 58, in run
result = self.fn(*self.args, **self.kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/home/tlserver/workspace/imitater/.venv/lib/python3.11/site-packages/torch/utils/_contextlib.py", line 115, in decorate_context
return func(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^
File "/home/tlserver/workspace/imitater/.venv/lib/python3.11/site-packages/vllm/worker/worker.py", line 272, in execute_model
output = self.model_runner.execute_model(seq_group_metadata_list,
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/home/tlserver/workspace/imitater/.venv/lib/python3.11/site-packages/torch/utils/_contextlib.py", line 115, in decorate_context
return func(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^
File "/home/tlserver/workspace/imitater/.venv/lib/python3.11/site-packages/vllm/worker/model_runner.py", line 738, in execute_model
output = self.model.sample(
^^^^^^^^^^^^^^^^^^
File "/home/tlserver/workspace/imitater/.venv/lib/python3.11/site-packages/vllm/model_executor/models/qwen2.py", line 345, in sample
next_tokens = self.sampler(logits, sampling_metadata)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/home/tlserver/workspace/imitater/.venv/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/home/tlserver/workspace/imitater/.venv/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
return forward_call(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/home/tlserver/workspace/imitater/.venv/lib/python3.11/site-packages/vllm/model_executor/layers/sampler.py", line 96, in forward
sample_results, maybe_sampled_tokens_tensor = _sample(
^^^^^^^^
File "/home/tlserver/workspace/imitater/.venv/lib/python3.11/site-packages/vllm/model_executor/layers/sampler.py", line 655, in _sample
return _sample_with_torch(
^^^^^^^^^^^^^^^^^^^
File "/home/tlserver/workspace/imitater/.venv/lib/python3.11/site-packages/vllm/model_executor/layers/sampler.py", line 544, in _sample_with_torch
sample_results = _random_sample(seq_groups,
^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/home/tlserver/workspace/imitater/.venv/lib/python3.11/site-packages/vllm/model_executor/layers/sampler.py", line 324, in _random_sample
random_samples = random_samples.cpu()
^^^^^^^^^^^^^^^^^^^^
RuntimeError: CUDA error: an illegal memory access was encountered
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1.
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.
The above exception was the direct cause of the following exception:
Traceback (most recent call last):
File "uvloop/cbhandles.pyx", line 63, in uvloop.loop.Handle._run
File "/home/tlserver/workspace/imitater/.venv/lib/python3.11/site-packages/vllm/engine/async_llm_engine.py", line 47, in _raise_exception_on_finish
raise AsyncEngineDeadError(
vllm.engine.async_llm_engine.AsyncEngineDeadError: Task finished unexpectedly. This should never happen! Please open an issue on Github. See stack trace above for the actual cause.
@JulyFinal Several members in our community have encountered similar issues. Version 0.4.2 appears to function normally; we're unsure whether this is due to a problem with flash-attn2 or an issue specific to version 0.4.3.
@maxin9966 I also reproduced this problem in the latest version 0.5.0.
I have no choice but to revert to version 0.4.2.
I have this problem too.
Hi, can you follow the tips in the error message so that we can get more information on which operation caused the problem?
In addition, does it work with --enforce-eager?
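For reference, a minimal sketch of the launch command from the top of the thread with eager mode enabled; only --enforce-eager is added and all other flags are unchanged (eager mode skips CUDA graph capture, which is also why it costs throughput):
export CUDA_VISIBLE_DEVICES=0 && python -m vllm.entrypoints.openai.api_server --model models/Qwen2-7B-Instruct-GPTQ-Int4 --host 192.168.168.242 --port 8001 --served-model-name "gpt-3.5-turbo" --tensor-parallel-size 1 --api-key "sk-..." --max-model-len 19000 --enable-prefix-caching --enforce-eager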
@maxin9966 I also reproduced this problem in the latest version 0.5.0.
I have no choice but to revert to version 0.4.2.
With version 0.4.2 I get the error 172.17.0.1:45344 - "OPTIONS /v1/chat/completions HTTP/1.1" 401 Unauthorized. What causes this?
docker run --gpus all -d -p 8888:8000 -v D:\docker\huggingface:/root/.cache/huggingface --env "HUGGING_FACE_HUB_TOKEN=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" --ipc=host --name Qwen2-7B-Instruct-32k vllm/vllm-openai:v0.4.2 --model Qwen/Qwen2-7B-Instruct --served-model-name gpt-3.5-turbo --api-key sk-123456 --swap-space 0
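A note on the 401: the server above is started with --api-key sk-123456, so every request must present that key, and a browser's CORS preflight OPTIONS request does not carry an Authorization header; that is one likely reason for a 401 on OPTIONS. A quick sanity check from the command line (a sketch, assuming the 8888:8000 port mapping above):
curl http://localhost:8888/v1/chat/completions \
  -H "Content-Type: application/json" \
  -H "Authorization: Bearer sk-123456" \
  -d '{"model": "gpt-3.5-turbo", "messages": [{"role": "user", "content": "hi"}]}'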
@youkaichao I did not use --enforce-eager; I tried it before and found that it significantly reduced throughput, so I haven't used it since. For a long time, the same parameters always worked fine on versions prior to 0.4.3.
@maxin9966 I also reproduced this problem in the latest version 0.5.0.
I have no choice but to revert to version 0.4.2.
I have the same problem on 0.4.2 as well.
@gaye746560359 Does the error appear once the text length exceeds some limit? Stress testing is fine on my side, but the error shows up after running for a while. Could it be triggered by a long text?
It also fails suddenly at some point here after heavy use, but the requests executed around the failure weren't particularly long...
@gaye746560359 Does the error appear once the text length exceeds some limit? Stress testing is fine on my side, but the error shows up after running for a while. Could it be triggered by a long text?
The cause is unclear; the odd part is that the problem hasn't broken out on a large scale, which makes it feel like black magic. The maintainers haven't explained it either.
@gaye746560359 Does the error appear once the text length exceeds some limit? Stress testing is fine on my side, but the error shows up after running for a while. Could it be triggered by a long text?
The cause is unclear; the odd part is that the problem hasn't broken out on a large scale, which makes it feel like black magic. The maintainers haven't explained it either.
Are you also on a 40-series GPU? If so, the problem may be concentrated on the 40 series. A-series cards don't seem to have this issue.
@JulyFinal So far all reports are on 40-series cards. I'm using flash-attn2 here and am certain something is wrong; I'm just not sure where. I've also found that on a 20-series card (under 0.4.2), the same model supports a much shorter context than on a 40-series card (under 0.4.3+), yet the 20-series has never had a problem.
I'm giving up on the 40 series for now; I've migrated my entire workflow back to the old environment.
Hi, can you follow the tips in the error message so that we can get more information on which operation caused the problem?
In addition, does it work with --enforce-eager?
Hi Youkai, could you please take a look at the error message below to see if it helps?
a800:3651115:3772442 [0] misc/strongstream.cc:395 NCCL WARN Cuda failure 'an illegal memory access was encountered'
a800:3651115:3772442 [0] NCCL INFO init.cc:1899 -> 1
a800:3651115:3772442 [0] init.cc:2030 NCCL WARN commReclaim: comm 0x563f2b7e1450 (rank = 0) in abort, error 1
a800:3651115:3651247 [0] NCCL INFO [Service thread] Connection closed by localRank 0
a800:3651115:3651247 [0] include/alloc.h:248 NCCL WARN Cuda failure 'an illegal memory access was encountered'
a800:3651115:3651247 [0] NCCL INFO transport/net.cc:537 -> 1
a800:3651115:3651247 [0] NCCL INFO transport/net.cc:940 -> 1
a800:3651115:3651247 [0] NCCL INFO proxy.cc:980 -> 1
a800:3651115:3651247 [0] NCCL INFO proxy.cc:996 -> 1
a800:3651115:3772442 [0] include/alloc.h:248 NCCL WARN Cuda failure 'an illegal memory access was encountered'
a800:3651115:3772442 [0] misc/strongstream.cc:110 NCCL WARN Cuda failure 'an illegal memory access was encountered'
a800:3651115:3772442 [0] NCCL INFO init.cc:218 -> 1
a800:3651115:3772442 [0] NCCL INFO init.cc:1933 -> 1
a800:3651115:3772442 [0] init.cc:2065 NCCL WARN commReclaim: cleanup comm 0x563f2b7e1450 rank 0 failed in destroy/abort, error 1
a800:3651115:3772442 [0] NCCL INFO comm 0x563f2b7e1450 rank 0 nranks 1 cudaDev 0 busId 65000 - Abort COMPLETE
Could you try setting the env variable VLLM_ATTENTION_BACKEND=XFORMERS? We'd like to know whether the FlashAttention kernel causes this bug.
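For anyone else who wants to test this, the variable just has to be set in the server's environment before launch; a minimal sketch (substitute your own model and flags):
export VLLM_ATTENTION_BACKEND=XFORMERS
python -m vllm.entrypoints.openai.api_server --model models/Qwen2-7B-Instruct-GPTQ-Int4 --enable-prefix-caching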
@gaye746560359 Does the error appear once the text length exceeds some limit? Stress testing is fine on my side, but the error shows up after running for a while. Could it be triggered by a long text?
The cause is unclear; the odd part is that the problem hasn't broken out on a large scale, which makes it feel like black magic. The maintainers haven't explained it either.
Are you also on a 40-series GPU? If so, the problem may be concentrated on the 40 series. A-series cards don't seem to have this issue.
I'm on an RTX 4090.
@gaye746560359 Does the error appear once the text length exceeds some limit? Stress testing is fine on my side, but the error shows up after running for a while. Could it be triggered by a long text?
The cause is unclear; the odd part is that the problem hasn't broken out on a large scale, which makes it feel like black magic. The maintainers haven't explained it either.
Are you also on a 40-series GPU? If so, the problem may be concentrated on the 40 series. A-series cards don't seem to have this issue.
Hit this error on A800 lol
@WoosukKwon I've been testing it out. The workflow for this test is relatively lightweight because we can't currently access the production environment. I'll run it for a while to see how things go.
I think my error might be the same as this one.
It's a similar error, and it happens with a GPTQ version of Qwen2, but on 0.5.0.post1.
Was running into the very same issue.
Woosuk (@WoosukKwon): setting VLLM_ATTENTION_BACKEND=XFORMERS seems to have resolved it for our runs.
@JulyFinal So far all reports are on 40-series cards. I'm using flash-attn2 here and am certain something is wrong; I'm just not sure where. I've also found that on a 20-series card (under 0.4.2), the same model supports a much shorter context than on a 40-series card (under 0.4.3+), yet the 20-series has never had a problem.
the same model supports a much shorter context than on a 40-series card (under 0.4.3+)
I noticed this too =-= It feels odd that just upgrading a version buys that much more context.
Switching the backend to XFORMERS, I indeed haven't hit the error so far.
Also, I have a feeling that enabling --enable-prefix-caching makes the returned results somewhat unstable; that didn't seem to happen before.
Hi @maxin9966 , have you resolved the issue? If so, what steps did you take to do so? If not, can you please provide steps to reproduce it using the following template (replacing the italicized values with your values)?
GPUs
- 8x A6000
Client code
- python3 benchmarks/benchmark_prefix_caching.py --model meta-llama/Meta-Llama-3-70B-Instruct --dataset-path ShareGPT.json --enable-prefix-caching --num-prompts 20 --repeat-count 5 --input-length-range 128:256
Server code
- python -m vllm.entrypoints.openai.api_server --model meta-llama/Meta-Llama-3-70B-Instruct --gpu-memory-utilization 0.40 --tensor-parallel-size 8 --max-model-len 2048 --trust-remote-code --enable-prefix-caching --max-num-seqs 128
Env Vars
- export VLLM_ATTENTION_BACKEND=FLASH_ATTN
- export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
Version(s) Tested
- v0.5.2
Just change to export VLLM_ATTENTION_BACKEND=XFORMERS and try it.
@JulyFinal Several members in our community have encountered similar issues. Version 0.4.2 appears to function normally; we're unsure whether this is due to a problem with flash-attn2 or an issue specific to version 0.4.3.
Yes, I agree. I was able to run a GPTQ Marlin model with --enable-chunked-prefill on 0.4.2 but not on later versions. From my error log, it is flash attention that causes "RuntimeError: CUDA error: an illegal memory access was encountered".
@maxin9966 Hi, has this problem been fixed in a recent version? I ran 0.5.3 and hit it again.
Still happening in version 0.6.1.post2!
+1 in 0.6.0
+1 in 0.6.0
Is this bug fixed?
Getting the same issue on an L4 GPU with version 0.6.2.
Use --enforce-eager to avoid it.
Your current environment
vllm 0.4.3, CUDA Driver Version: 555.42.02, 2× 4060 Ti Super
VLLM_ATTENTION_BACKEND=FLASH_ATTN CUDA_VISIBLE_DEVICES=0 python -m vllm.entrypoints.openai.api_server --gpu-memory-utilization 0.85 --quantization gptq --host 0.0.0.0 --port 1234 -tp 1 --max-model-len 32768 --served-model-name qwen2 --trust-remote-code --enable-prefix-caching
🐛 Describe the bug
desc: I have tried running it with both a single GPU and dual GPUs, but after running for a period of time it starts to report errors; the issue occurs 100% of the time. The command used is as follows:
VLLM_ATTENTION_BACKEND=FLASH_ATTN CUDA_VISIBLE_DEVICES=0 python -m vllm.entrypoints.openai.api_server --gpu-memory-utilization 0.85 --quantization gptq --host 0.0.0.0 --port 1234 -tp 1 --max-model-len 32768 --served-model-name qwen2 --trust-remote-code --enable-prefix-caching
error:
RuntimeError: CUDA error: an illegal memory access was encountered
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
logs (journald timestamp and host prefixes stripped; repeated FastAPI/Starlette middleware frames elided with "..."):
INFO: 192.168.1.161:38834 - "POST /v1/chat/completions HTTP/1.1" 500 Internal Server Error
ERROR: Exception in ASGI application
Traceback (most recent call last):
  File "/home/ma/miniconda3/envs/myenv/lib/python3.9/site-packages/uvicorn/protocols/http/httptools_impl.py", line 419, in run_asgi
    result = await app(  # type: ignore[func-returns-value]
  ...
  File "/home/ma/miniconda3/envs/myenv/lib/python3.9/site-packages/fastapi/routing.py", line 191, in run_endpoint_function
    return await dependant.call(**values)
  File "/home/ma/miniconda3/envs/myenv/lib/python3.9/site-packages/vllm/entrypoints/openai/api_server.py", line 103, in create_chat_completion
    generator = await openai_serving_chat.create_chat_completion(
  File "/home/ma/miniconda3/envs/myenv/lib/python3.9/site-packages/vllm/entrypoints/openai/serving_chat.py", line 198, in create_chat_completion
    return await self.chat_completion_full_generator(
  File "/home/ma/miniconda3/envs/myenv/lib/python3.9/site-packages/vllm/entrypoints/openai/serving_chat.py", line 360, in chat_completion_full_generator
    async for res in result_generator:
  File "/home/ma/miniconda3/envs/myenv/lib/python3.9/site-packages/vllm/engine/async_llm_engine.py", line 662, in generate
    async for output in self._process_request(
  File "/home/ma/miniconda3/envs/myenv/lib/python3.9/site-packages/vllm/engine/async_llm_engine.py", line 769, in _process_request
    raise e
  File "/home/ma/miniconda3/envs/myenv/lib/python3.9/site-packages/vllm/engine/async_llm_engine.py", line 765, in _process_request
    async for request_output in stream:
  File "/home/ma/miniconda3/envs/myenv/lib/python3.9/site-packages/vllm/engine/async_llm_engine.py", line 80, in __anext__
    raise result
  File "/home/ma/miniconda3/envs/myenv/lib/python3.9/site-packages/vllm/engine/async_llm_engine.py", line 40, in _raise_exception_on_finish
    task.result()
  File "/home/ma/miniconda3/envs/myenv/lib/python3.9/site-packages/vllm/engine/async_llm_engine.py", line 521, in run_engine_loop
    has_requests_in_progress = await asyncio.wait_for(
  File "/home/ma/miniconda3/envs/myenv/lib/python3.9/asyncio/tasks.py", line 479, in wait_for
    return fut.result()
  File "/home/ma/miniconda3/envs/myenv/lib/python3.9/site-packages/vllm/engine/async_llm_engine.py", line 495, in engine_step
    request_outputs = await self.engine.step_async()
  File "/home/ma/miniconda3/envs/myenv/lib/python3.9/site-packages/vllm/engine/async_llm_engine.py", line 226, in step_async
    output = await self.model_executor.execute_model_async(
  File "/home/ma/miniconda3/envs/myenv/lib/python3.9/site-packages/vllm/executor/gpu_executor.py", line 117, in execute_model_async
    output = await make_async(self.driver_worker.execute_model
  File "/home/ma/miniconda3/envs/myenv/lib/python3.9/concurrent/futures/thread.py", line 58, in run
    result = self.fn(*self.args, **self.kwargs)
  File "/home/ma/miniconda3/envs/myenv/lib/python3.9/site-packages/torch/utils/_contextlib.py", line 115, in decorate_context
    return func(*args, **kwargs)
  File "/home/ma/miniconda3/envs/myenv/lib/python3.9/site-packages/vllm/worker/worker.py", line 272, in execute_model
    output = self.model_runner.execute_model(seq_group_metadata_list,
  File "/home/ma/miniconda3/envs/myenv/lib/python3.9/site-packages/torch/utils/_contextlib.py", line 115, in decorate_context
    return func(*args, **kwargs)
  File "/home/ma/miniconda3/envs/myenv/lib/python3.9/site-packages/vllm/worker/model_runner.py", line 738, in execute_model
    output = self.model.sample(
  File "/home/ma/miniconda3/envs/myenv/lib/python3.9/site-packages/vllm/model_executor/models/chatglm.py", line 379, in sample
    next_tokens = self.sampler(logits, sampling_metadata)
  File "/home/ma/miniconda3/envs/myenv/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
  File "/home/ma/miniconda3/envs/myenv/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
    return forward_call(*args, **kwargs)
  File "/home/ma/miniconda3/envs/myenv/lib/python3.9/site-packages/vllm/model_executor/layers/sampler.py", line 96, in forward
    sample_results, maybe_sampled_tokens_tensor = _sample(
  File "/home/ma/miniconda3/envs/myenv/lib/python3.9/site-packages/vllm/model_executor/layers/sampler.py", line 655, in _sample
    return _sample_with_torch(
  File "/home/ma/miniconda3/envs/myenv/lib/python3.9/site-packages/vllm/model_executor/layers/sampler.py", line 544, in _sample_with_torch
    sample_results = _random_sample(seq_groups,
  File "/home/ma/miniconda3/envs/myenv/lib/python3.9/site-packages/vllm/model_executor/layers/sampler.py", line 324, in _random_sample
    random_samples = random_samples.cpu()
RuntimeError: CUDA error: an illegal memory access was encountered
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1.
Compile with TORCH_USE_CUDA_DSA to enable device-side assertions.
The above exception was the direct cause of the following exception:
Traceback (most recent call last):
  File "/home/ma/miniconda3/envs/myenv/lib/python3.9/site-packages/uvicorn/protocols/http/httptools_impl.py", line 419, in run_asgi
    result = await app(  # type: ignore[func-returns-value]
  ...
  File "/home/ma/miniconda3/envs/myenv/lib/python3.9/site-packages/vllm/entrypoints/openai/serving_chat.py", line 360, in chat_completion_full_generator
    async for res in result_generator:
  File "/home/ma/miniconda3/envs/myenv/lib/python3.9/site-packages/vllm/engine/async_llm_engine.py", line 662, in generate
    async for output in self._process_request(
  File "/home/ma/miniconda3/envs/myenv/lib/python3.9/site-packages/vllm/engine/async_llm_engine.py", line 756, in _process_request
    stream = await self.add_request(
  File "/home/ma/miniconda3/envs/myenv/lib/python3.9/site-packages/vllm/engine/async_llm_engine.py", line 561, in add_request
    self.start_background_loop()
  File "/home/ma/miniconda3/envs/myenv/lib/python3.9/site-packages/vllm/engine/async_llm_engine.py", line 431, in start_background_loop
    raise AsyncEngineDeadError(
vllm.engine.async_llm_engine.AsyncEngineDeadError: Background loop has errored already.
INFO 06-10 16:45:06 metrics.py:341] Avg prompt throughput: 0.0 tokens/s, Avg generation throughput: 0.0 tokens/s, Running: 3 reqs, Swapped: 0 reqs, Pending: 0 reqs
systemd[1]: Stopping Uvicorn server for my app...
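For anyone trying to narrow this down further: because the illegal access is reported asynchronously, the frame shown (random_samples.cpu()) is likely just where the error surfaced, not where it occurred. A sketch of a debugging launch that forces synchronous kernel launches so the faulting kernel is blamed correctly (same command as above; expect it to run much slower, so use it only to reproduce):
export CUDA_LAUNCH_BLOCKING=1
VLLM_ATTENTION_BACKEND=FLASH_ATTN CUDA_VISIBLE_DEVICES=0 python -m vllm.entrypoints.openai.api_server --gpu-memory-utilization 0.85 --quantization gptq --host 0.0.0.0 --port 1234 -tp 1 --max-model-len 32768 --served-model-name qwen2 --trust-remote-code --enable-prefix-caching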