Your current environment

The output of `python collect_env.py`

🐛 Describe the bug

Reproduction script:
```python
# VLLM_SKIP_WARMUP=true PT_HPU_ENABLE_LAZY_COLLECTIVES=true VLLM_RAY_DISABLE_LOG_TO_DRIVER=1
from vllm import LLM, SamplingParams
from time import sleep
import traceback


def test_llm_model(model):
    tp_size = 1
    llm = LLM(model=model,
              tensor_parallel_size=tp_size,
              block_size=128,
              trust_remote_code=True,
              enforce_eager=True,
              )
    prompts = [
        "The future of AI is",
    ]
    sampling_params = [
        SamplingParams(temperature=0.0, max_tokens=128,) for _ in prompts
    ]
    output = llm.generate(prompts, sampling_params=sampling_params, use_tqdm=False)
    del llm
    return output


if __name__ == "__main__":
    model_list = ['EleutherAI/gpt-j-6b']
    for model in model_list:
        try:
            output = test_llm_model(model=model)
            print(f"- {model} succeed")
            print(f"output is", output)
        except Exception as e:
            print(f" - {model} faied!")
            print(f"Error info: {e}")
            print(traceback.format_exc())
```
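Note that the traceback below shows the failure is hit inside the `LLM(...)` constructor, during the KV-cache profiling run (`determine_num_available_blocks` → `profile_run` → `warmup_scenario`), before `llm.generate` is ever reached. A stripped-down repro therefore only needs to build the engine; a minimal sketch, assuming the same environment variables as above:

```python
from vllm import LLM

# Constructing the engine is enough to trigger the crash: the profiling/warmup
# pass runs a forward + sampling step on HPU, and the graph compile fails there.
llm = LLM(model="EleutherAI/gpt-j-6b",
          tensor_parallel_size=1,
          block_size=128,
          trust_remote_code=True,
          enforce_eager=True)
```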
Error log:
```
============================= HABANA PT BRIDGE CONFIGURATION ===========================
 PT_HPU_LAZY_MODE = 1
 PT_RECIPE_CACHE_PATH =
 PT_CACHE_FOLDER_DELETE = 0
 PT_HPU_RECIPE_CACHE_CONFIG =
 PT_HPU_MAX_COMPOUND_OP_SIZE = 9223372036854775807
 PT_HPU_LAZY_ACC_PAR_MODE = 1
 PT_HPU_ENABLE_REFINE_DYNAMIC_SHAPES = 0
---------------------------: System Configuration :---------------------------
Num CPU Cores : 152
CPU RAM       : 1056440364 KB
------------------------------------------------------------------------------
INFO 08-28 22:57:42 selector.py:85] Using HabanaAttention backend.
INFO 08-28 22:57:42 loader.py:284] Loading weights on hpu ...
INFO 08-28 22:57:43 weight_utils.py:224] Using model weights format ['*.bin']
Loading pt checkpoint shards:   0% Completed | 0/1 [00:00<?, ?it/s]
Loading pt checkpoint shards: 100% Completed | 1/1 [00:22<00:00, 22.98s/it]
Loading pt checkpoint shards: 100% Completed | 1/1 [00:22<00:00, 22.98s/it]
INFO 08-28 22:58:06 habana_model_runner.py:472] Pre-loading model weights on hpu:0 took 11.27 GiB of device memory (11.28 GiB/94.62 GiB used) and 23.58 GiB of host memory (101.3 GiB/1008 GiB used)
INFO 08-28 22:58:06 habana_model_runner.py:517] Wrapping in HPU Graph took 0 B of device memory (11.28 GiB/94.62 GiB used) and -252 KiB of host memory (101.4 GiB/1008 GiB used)
INFO 08-28 22:58:06 habana_model_runner.py:521] Loading model weights took in total 11.27 GiB of device memory (11.28 GiB/94.62 GiB used) and 23.71 GiB of host memory (101.4 GiB/1008 GiB used)
 - EleutherAI/gpt-j-6b faied!
Error info: Graph compile failed. synStatus=synStatus 26 [Generic failure].
Traceback (most recent call last):
  File "/workspace/script/test_llm_generate_modellist.py", line 49, in <module>
    output = test_llm_model(model=model)
  File "/workspace/script/test_llm_generate_modellist.py", line 14, in test_llm_model
    llm = LLM(model=model,
  File "/workspace/vllm/vllm/entrypoints/llm.py", line 155, in __init__
    self.llm_engine = LLMEngine.from_engine_args(
  File "/workspace/vllm/vllm/engine/llm_engine.py", line 456, in from_engine_args
    engine = cls(
  File "/workspace/vllm/vllm/engine/llm_engine.py", line 266, in __init__
    self._initialize_kv_caches()
  File "/workspace/vllm/vllm/engine/llm_engine.py", line 365, in _initialize_kv_caches
    self.model_executor.determine_num_available_blocks())
  File "/workspace/vllm/vllm/executor/habana_executor.py", line 77, in determine_num_available_blocks
    return self.driver_worker.determine_num_available_blocks()
  File "/usr/local/lib/python3.10/dist-packages/torch/utils/_contextlib.py", line 115, in decorate_context
    return func(*args, **kwargs)
  File "/workspace/vllm/vllm/worker/habana_worker.py", line 142, in determine_num_available_blocks
    self.model_runner.profile_run()
  File "/workspace/vllm/vllm/worker/habana_model_runner.py", line 1141, in profile_run
    self.warmup_scenario(max_batch_size,
  File "/workspace/vllm/vllm/worker/habana_model_runner.py", line 1203, in warmup_scenario
    self.execute_model(inputs, kv_caches, warmup_mode=True)
  File "/usr/local/lib/python3.10/dist-packages/torch/utils/_contextlib.py", line 115, in decorate_context
    return func(*args, **kwargs)
  File "/workspace/vllm/vllm/worker/habana_model_runner.py", line 1652, in execute_model
    output = self.model.sample(
  File "/workspace/vllm/vllm/worker/habana_model_runner.py", line 241, in sample
    return self.model.sample(*args, **kwargs)
  File "/workspace/vllm/vllm/model_executor/models/gpt_j.py", line 260, in sample
    next_tokens = self.sampler(logits, sampling_metadata)
  File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1535, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
  File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1585, in _call_impl
    result = forward_call(*args, **kwargs)
  File "/workspace/vllm/vllm/model_executor/layers/sampler.py", line 138, in forward
    sample_results, maybe_sampled_tokens_tensor = _sample(
  File "/workspace/vllm/vllm/model_executor/layers/sampler.py", line 711, in _sample
    return _sample_with_torch(
  File "/workspace/vllm/vllm/model_executor/layers/sampler.py", line 592, in _sample_with_torch
    sample_results = _greedy_sample(seq_groups, greedy_samples)
  File "/workspace/vllm/vllm/model_executor/layers/sampler.py", line 336, in _greedy_sample
    samples_lst = samples.tolist()
RuntimeError: Graph compile failed. synStatus=synStatus 26 [Generic failure].
inc shutdown
inc shutdown
inc shutdown
inc shutdown
```
Fixed with https://github.com/HabanaAI/vllm-fork/pull/245