vllm本地部署时,vllm engine启动失败 #274

4 months ago

4 months ago

提交前必须检查以下项目 | The following items must be checked before submission

问题类型 | Type of problem

启动命令 | Startup command

操作系统 | Operating system


详细描述问题 | Detailed description of the problem

pip install torch==2.1.0
pip install vllm==0.4.0                #运行之后发现安装了2.1.2的torch
pip install -r requirements.txt    #运行之后发现安装了2.3的torch

pip install torch==2.1.2

import torch


#启动立即遇到报错,初始化vllm engine时报错
python server.py

#系统:ubuntu 20.04,



#llm related

#rag related

#vllm related

#p.s.不使用vllm,用default engine是可以正常运行的


# 请在此处粘贴依赖情况
# Please paste the dependencies here
运行日志或截图 | Runtime logs or screenshots

# 请在此处粘贴运行日志
# Please paste the run log here
(srrat) ruibn@llm1:~/safe/api-for-open-llm$ python server.py
2024-05-17 05:26:43.612 | DEBUG    | api.config:<module>:338 - SETTINGS: {
    "embedding_name": "/home/ruibn/safe/bce-embedding-base_v1",
    "rerank_name": "/home/ruibn/safe/bce-reranker-base_v1",
    "embedding_size": -1,
    "embedding_device": "cuda:0",
    "rerank_device": "cuda:0",
    "trust_remote_code": false,
    "tokenize_mode": "auto",
    "tensor_parallel_size": 1,
    "gpu_memory_utilization": 0.5,
    "max_num_batched_tokens": -1,
    "max_num_seqs": 256,
    "quantization_method": null,
    "enforce_eager": false,
    "max_context_len_to_capture": 8192,
    "max_loras": 1,
    "max_lora_rank": 32,
    "lora_extra_vocab_size": 256,
    "lora_dtype": "auto",
    "max_cpu_loras": -1,
    "lora_modules": "",
    "vllm_disable_log_stats": true,
    "model_name": "qwen2",
    "model_path": "/home/ruibn/safe/Qwen1.5-0.5B-Chat",
    "dtype": "half",
    "load_in_8bit": false,
    "load_in_4bit": false,
    "context_length": -1,
    "chat_template": "qwen2",
    "rope_scaling": null,
    "flash_attn": false,
    "use_streamer_v2": true,
    "interrupt_requests": true,
    "host": "",
    "port": 8000,
    "api_prefix": "/v1",
    "engine": "vllm",
    "tasks": [
    "device_map": "cuda:1",
    "gpus": null,
    "num_gpus": 1,
    "activate_inference": true,
    "model_names": [
    "api_keys": null
/home/ruibn/.conda/envs/srrat/lib/python3.8/site-packages/torch/_utils.py:831: UserWarning: TypedStorage is deprecated. It will be removed in the future and UntypedStorage will be the only storage class. This should only matter to you if you are using storages directly.  To access UntypedStorage directly, use tensor.untyped_storage() instead of tensor.storage()
  return self.fget.__get__(instance, owner)()
2024-05-17 05:26:57.800 | INFO     | api.rag.models.rerank:__init__:45 - Loading from `/home/ruibn/safe/bce-reranker-base_v1`.
ERROR 05-17 05:26:59 pynccl.py:53] Failed to load NCCL library from libnccl.so.2 .It is expected if you are not running on NVIDIA/AMD GPUs.Otherwise please set the environment variable VLLM_NCCL_SO_PATH to point to the correct nccl library path.
INFO 05-17 05:26:59 pynccl_utils.py:17] Failed to import NCCL library: libnccl.so.2: cannot open shared object file: No such file or directory
INFO 05-17 05:26:59 pynccl_utils.py:18] It is expected if you are not running on NVIDIA GPUs.
WARNING 05-17 05:26:59 config.py:748] Casting torch.bfloat16 to torch.float16.
INFO 05-17 05:26:59 llm_engine.py:75] Initializing an LLM engine (v0.4.0) with config: model='/home/ruibn/safe/Qwen1.5-0.5B-Chat', tokenizer='/home/ruibn/safe/Qwen1.5-0.5B-Chat', tokenizer_mode=auto, revision=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.float16, max_seq_len=32768, download_dir=None, load_format=auto, tensor_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, kv_cache_dtype=auto, device_config=cuda, seed=0)
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
INFO 05-17 05:26:59 selector.py:34] Cannot use FlashAttention backend for Volta and Turing GPUs.
INFO 05-17 05:26:59 selector.py:21] Using XFormers backend.
INFO 05-17 05:27:01 model_runner.py:104] Loading model weights took 0.8865 GB
Traceback (most recent call last):
  File "server.py", line 2, in <module>
    from api.models import (
  File "/home/ruibn/safe/api-for-open-llm/api/models.py", line 199, in <module>
    LLM_ENGINE = create_vllm_engine()
  File "/home/ruibn/safe/api-for-open-llm/api/models.py", line 125, in create_vllm_engine
    engine = AsyncLLMEngine.from_engine_args(engine_args)
  File "/home/ruibn/.conda/envs/srrat/lib/python3.8/site-packages/vllm/engine/async_llm_engine.py", line 348, in from_engine_args
    engine = cls(
  File "/home/ruibn/.conda/envs/srrat/lib/python3.8/site-packages/vllm/engine/async_llm_engine.py", line 311, in __init__
    self.engine = self._init_engine(*args, **kwargs)
  File "/home/ruibn/.conda/envs/srrat/lib/python3.8/site-packages/vllm/engine/async_llm_engine.py", line 422, in _init_engine
    return engine_class(*args, **kwargs)
  File "/home/ruibn/.conda/envs/srrat/lib/python3.8/site-packages/vllm/engine/llm_engine.py", line 111, in __init__
    self.model_executor = executor_class(model_config, cache_config,
  File "/home/ruibn/.conda/envs/srrat/lib/python3.8/site-packages/vllm/executor/gpu_executor.py", line 40, in __init__
  File "/home/ruibn/.conda/envs/srrat/lib/python3.8/site-packages/vllm/executor/gpu_executor.py", line 80, in _init_cache
  File "/home/ruibn/.conda/envs/srrat/lib/python3.8/site-packages/torch/utils/_contextlib.py", line 115, in decorate_context
    return func(*args, **kwargs)
  File "/home/ruibn/.conda/envs/srrat/lib/python3.8/site-packages/vllm/worker/worker.py", line 131, in profile_num_available_blocks
  File "/home/ruibn/.conda/envs/srrat/lib/python3.8/site-packages/torch/utils/_contextlib.py", line 115, in decorate_context
    return func(*args, **kwargs)
  File "/home/ruibn/.conda/envs/srrat/lib/python3.8/site-packages/vllm/worker/model_runner.py", line 742, in profile_run
    self.execute_model(seqs, kv_caches)
  File "/home/ruibn/.conda/envs/srrat/lib/python3.8/site-packages/torch/utils/_contextlib.py", line 115, in decorate_context
    return func(*args, **kwargs)
  File "/home/ruibn/.conda/envs/srrat/lib/python3.8/site-packages/vllm/worker/model_runner.py", line 663, in execute_model
    hidden_states = model_executable(**execute_model_kwargs)
  File "/home/ruibn/.conda/envs/srrat/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1518, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs) 
  File "/home/ruibn/.conda/envs/srrat/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1527, in _call_impl
    return forward_call(*args, **kwargs)
  File "/home/ruibn/.conda/envs/srrat/lib/python3.8/site-packages/vllm/model_executor/models/qwen2.py", line 317, in forward
    hidden_states = self.model(input_ids, positions, kv_caches,
  File "/home/ruibn/.conda/envs/srrat/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1518, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs) 
  File "/home/ruibn/.conda/envs/srrat/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1527, in _call_impl
    return forward_call(*args, **kwargs)
  File "/home/ruibn/.conda/envs/srrat/lib/python3.8/site-packages/vllm/model_executor/models/qwen2.py", line 254, in forward
    hidden_states, residual = layer(
  File "/home/ruibn/.conda/envs/srrat/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1518, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs) 
  File "/home/ruibn/.conda/envs/srrat/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1527, in _call_impl
    return forward_call(*args, **kwargs)
  File "/home/ruibn/.conda/envs/srrat/lib/python3.8/site-packages/vllm/model_executor/models/qwen2.py", line 207, in forward
    hidden_states = self.self_attn(
  File "/home/ruibn/.conda/envs/srrat/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1518, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs) 
  File "/home/ruibn/.conda/envs/srrat/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1527, in _call_impl
    return forward_call(*args, **kwargs)
  File "/home/ruibn/.conda/envs/srrat/lib/python3.8/site-packages/vllm/model_executor/models/qwen2.py", line 150, in forward
    qkv, _ = self.qkv_proj(hidden_states)   
  File "/home/ruibn/.conda/envs/srrat/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1518, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs) 
  File "/home/ruibn/.conda/envs/srrat/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1527, in _call_impl
    return forward_call(*args, **kwargs)
  File "/home/ruibn/.conda/envs/srrat/lib/python3.8/site-packages/vllm/model_executor/layers/linear.py", line 215, in forward
    output_parallel = self.linear_method.apply_weights(
  File "/home/ruibn/.conda/envs/srrat/lib/python3.8/site-packages/vllm/model_executor/layers/linear.py", line 79, in apply_weights
    return F.linear(x, weight, bias)
RuntimeError: CUDA error: no kernel image is available for execution on the device
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1.
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.
4 months ago


4 months ago


4 months ago

如何修改?请问是用最新的代码,加上使用vllm==0.4.2的版本就可以解决了吗? @xusenlinzy @FreeRotate

4 months ago

如何修改?请问是用最新的代码,加上使用vllm==0.4.2的版本就可以解决了吗? @xusenlinzy @FreeRotate
