Hello, I ran into the following error when running blend.py.
I changed the model to Llama-3-8B-Instruct since I don't have access to the Mixtral models. Could that be the cause of the error?
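For context, the only change I made locally is the model passed to the LLM constructor in example/blend.py. A rough sketch of that edit, with the constructor call matching the one shown in the traceback below (the commented-out Mixtral line is a placeholder, not the actual line from the repo):

from vllm import LLM

# Original line in example/blend.py (placeholder; I did not keep the exact Mixtral path):
# llm = LLM(model="<original Mixtral model path>", gpu_memory_utilization=0.9, tensor_parallel_size=1)

# My local change, matching the call shown in the traceback:
llm = LLM(model="/mnt/data/models/Llama-3-8B-Instruct",
          gpu_memory_utilization=0.9,
          tensor_parallel_size=1)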
Log:
$ python example/blend.py
INFO 08-27 10:29:40 llm_engine.py:98] Initializing an LLM engine (v0.4.1) with config: model='/mnt/data/models/Llama-3-8B-Instruct', speculative_config=None, tokenizer='/mnt/data/models/Llama-3-8B-Instruct', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.bfloat16, max_seq_len=8192, download_dir=None, load_format=auto, tensor_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, kv_cache_dtype=auto, quantization_param_path=None, device_config=cuda, decoding_config=DecodingConfig(guided_decoding_backend='outlines'), seed=0)
INFO 08-27 10:29:40 utils.py:608] Found nccl from library /root/.config/vllm/nccl/cu12/libnccl.so.2.18.1
INFO 08-27 10:29:41 selector.py:28] Using FlashAttention backend.
INFO 08-27 10:29:45 model_runner.py:190] Loading model weights took 14.9595 GB
Traceback (most recent call last):
File "/mnt/data/pgw/CacheBlend/example/blend.py", line 5, in <module>
llm = LLM(model="/mnt/data/models/Llama-3-8B-Instruct", gpu_memory_utilization=0.9, tensor_parallel_size=1)
File "/mnt/data/pgw/CacheBlend/vllm_blend/vllm/entrypoints/llm.py", line 118, in __init__
self.llm_engine = LLMEngine.from_engine_args(
File "/mnt/data/pgw/CacheBlend/vllm_blend/vllm/engine/llm_engine.py", line 277, in from_engine_args
engine = cls(
File "/mnt/data/pgw/CacheBlend/vllm_blend/vllm/engine/llm_engine.py", line 160, in __init__
self._initialize_kv_caches()
File "/mnt/data/pgw/CacheBlend/vllm_blend/vllm/engine/llm_engine.py", line 236, in _initialize_kv_caches
self.model_executor.determine_num_available_blocks())
File "/mnt/data/pgw/CacheBlend/vllm_blend/vllm/executor/gpu_executor.py", line 111, in determine_num_available_blocks
return self.driver_worker.determine_num_available_blocks()
File "/mnt/data/pgw/CacheBlend/venv/lib/python3.10/site-packages/torch/utils/_contextlib.py", line 115, in decorate_context
return func(*args, **kwargs)
File "/mnt/data/pgw/CacheBlend/vllm_blend/vllm/worker/worker.py", line 147, in determine_num_available_blocks
self.model_runner.profile_run()
File "/mnt/data/pgw/CacheBlend/venv/lib/python3.10/site-packages/torch/utils/_contextlib.py", line 115, in decorate_context
return func(*args, **kwargs)
File "/mnt/data/pgw/CacheBlend/vllm_blend/vllm/worker/model_runner.py", line 994, in profile_run
self.execute_model(seqs, kv_caches)
File "/mnt/data/pgw/CacheBlend/venv/lib/python3.10/site-packages/torch/utils/_contextlib.py", line 115, in decorate_context
return func(*args, **kwargs)
File "/mnt/data/pgw/CacheBlend/vllm_blend/vllm/worker/model_runner.py", line 888, in execute_model
hidden_states = model_executable(**execute_model_kwargs)
File "/mnt/data/pgw/CacheBlend/venv/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1511, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
File "/mnt/data/pgw/CacheBlend/venv/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1520, in _call_impl
return forward_call(*args, **kwargs)
File "/mnt/data/pgw/CacheBlend/vllm_blend/vllm/model_executor/models/llama.py", line 444, in forward
hidden_states = self.model(input_ids, positions, kv_caches,
File "/mnt/data/pgw/CacheBlend/venv/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1511, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
File "/mnt/data/pgw/CacheBlend/venv/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1520, in _call_impl
return forward_call(*args, **kwargs)
File "/mnt/data/pgw/CacheBlend/vllm_blend/vllm/model_executor/models/llama.py", line 361, in forward
hidden_states, residual = layer(
File "/mnt/data/pgw/CacheBlend/venv/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1511, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
File "/mnt/data/pgw/CacheBlend/venv/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1520, in _call_impl
return forward_call(*args, **kwargs)
File "/mnt/data/pgw/CacheBlend/vllm_blend/vllm/model_executor/models/llama.py", line 251, in forward
hidden_states = self.self_attn(
File "/mnt/data/pgw/CacheBlend/venv/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1511, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
File "/mnt/data/pgw/CacheBlend/venv/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1520, in _call_impl
return forward_call(*args, **kwargs)
File "/mnt/data/pgw/CacheBlend/vllm_blend/vllm/model_executor/models/llama.py", line 184, in forward
attn_output = self.attn(q, k, v, kv_cache, attn_metadata,
File "/mnt/data/pgw/CacheBlend/venv/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1511, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
File "/mnt/data/pgw/CacheBlend/venv/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1520, in _call_impl
return forward_call(*args, **kwargs)
File "/mnt/data/pgw/CacheBlend/vllm_blend/vllm/attention/layer.py", line 53, in forward
return self.impl.forward(query, key, value, kv_cache, attn_metadata,
TypeError: FlashAttentionImpl.forward() takes 7 positional arguments but 10 were given
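For anyone triaging: the final frame shows vllm/attention/layer.py forwarding more positional arguments to self.impl.forward() than the stock FlashAttention implementation accepts. A minimal, self-contained illustration of that mismatch pattern (the class name and extra arguments below are hypothetical, not copied from vLLM or CacheBlend):

# Hypothetical sketch of the failure mode, not the real vLLM/CacheBlend code.
class StockAttnImpl:
    # Seven positional parameters counting `self`; no *args, so extra arguments raise.
    def forward(self, query, key, value, kv_cache, attn_metadata, kv_scale):
        return query

impl = StockAttnImpl()
# A call site written for a patched backend passes three extra positionals,
# which reproduces the same shape of error as the log above:
impl.forward("q", "k", "v", None, None, 1.0, "extra1", "extra2", "extra3")
# -> TypeError: StockAttnImpl.forward() takes 7 positional arguments but 10 were given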