HabanaAI / vllm-fork

A high-throughput and memory-efficient inference and serving engine for LLMs
https://docs.vllm.ai
Apache License 2.0

[Bug]: llama 405B fp8 fails #140

Open · endomorphosis opened this issue 1 month ago

endomorphosis commented 1 month ago

### Your current environment

#!/bin/bash
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0

# Set default values
default_port=8008
default_model=$LLM_MODEL
default_hw_mode="cpu"
default_parallel_number=1
default_block_size=128
default_max_num_seqs=256
default_max_seq_len_to_capture=2048

# Assign arguments to variables
port_number=${1:-$default_port}
model_name=${2:-$default_model}
hw_mode=${3:-$default_hw_mode}
parallel_number=${4:-$default_parallel_number}
block_size=${5:-$default_block_size}
max_num_seqs=${6:-$default_max_num_seqs}
max_seq_len_to_capture=${7:-$default_max_seq_len_to_capture}

# Print usage information and exit if too many arguments are provided (all arguments are optional)
if [ "$#" -gt 7 ]; then
    echo "Usage: $0 [port_number] [model_name] [hw_mode] [parallel_number] [block_size] [max_num_seqs] [max_seq_len_to_capture]"
    echo "port_number: The port number assigned to the vLLM endpoint, with the default being 8008."
    echo "model_name: The model name used for the LLM, defaulting to the value of \$LLM_MODEL."
    echo "hw_mode: The hardware mode used for the LLM, with the default set to 'cpu'; the optional selection is 'hpu'."
    echo "parallel_number: Number of parallel (tensor-parallel) workers for 'hpu' mode."
    echo "block_size: Defaults to 128 for better performance on HPU."
    echo "max_num_seqs: Defaults to 256 for better performance on HPU."
    echo "max_seq_len_to_capture: Defaults to 2048 for better performance on HPU."
    exit 1
fi

# Set the volume variable
volume=/storage/hf_models

# Build the Docker run command based on hardware mode
if [ "$hw_mode" = "hpu" ]; then
    docker run -d --rm --runtime=habana --name="vllm-service" -p $port_number:80 -e HABANA_VISIBLE_DEVICES=all -e OMPI_MCA_btl_vader_single_copy_mechanism=none --cap-add=sys_nice --ipc=host -e HTTPS_PROXY=$https_proxy -e HTTP_PROXY=$https_proxy -e HF_TOKEN=${HUGGINGFACEHUB_API_TOKEN} vllm:hpu /bin/bash -c "export VLLM_CPU_KVCACHE_SPACE=40 && python3 -m vllm.entrypoints.openai.api_server --enforce-eager --model $model_name --tensor-parallel-size $parallel_number --host 0.0.0.0 --port 80 --block-size $block_size --max-num-seqs $max_num_seqs --max-seq-len-to-capture $max_seq_len_to_capture"
else
    docker run -d --rm --name="vllm-service" -p $port_number:80 --network=host -v $volume:/data -e HTTPS_PROXY=$https_proxy -e HTTP_PROXY=$https_proxy -e HF_TOKEN=${HUGGINGFACEHUB_API_TOKEN} vllm:cpu /bin/bash -c "cd / && export VLLM_CPU_KVCACHE_SPACE=40 && python3 -m vllm.entrypoints.openai.api_server --model $model_name --host 0.0.0.0 --port 80"
fi
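
For reference, an invocation of the script above that matches the server arguments in the logs later in this report would look roughly like this; the script filename is a placeholder, not taken from the report:

# Hypothetical invocation: HPU mode, tensor-parallel size 1, model as shown in the logs
# The script expects HUGGINGFACEHUB_API_TOKEN (and optionally https_proxy) in the environment
bash launch_vllm_service.sh 8008 PrimeIntellect/Meta-Llama-3.1-405B-Instruct-FP8 hpu 1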
root@laion-gaudi2-00:~/GenAIComps/comps/llms/text-generation/vllm/docker# cat Dockerfile.hpu
# FROM vault.habana.ai/gaudi-docker/1.15.1/ubuntu22.04/habanalabs/pytorch-installer-2.2.0:latest
FROM vault.habana.ai/gaudi-docker/1.16.0/ubuntu22.04/habanalabs/pytorch-installer-2.2.2:latest

ENV LANG=en_US.UTF-8

WORKDIR /root

RUN pip install --upgrade-strategy eager optimum[habana]

RUN pip install -v git+https://github.com/HabanaAI/vllm-fork.git@48c2775

RUN pip install setuptools==69.5.1

RUN sed -i 's/#PermitRootLogin prohibit-password/PermitRootLogin yes/' /etc/ssh/sshd_config && \
    service ssh restart

ENV no_proxy=localhost,127.0.0.1

ENV PT_HPU_LAZY_ACC_PAR_MODE=0

ENV PT_HPU_ENABLE_LAZY_COLLECTIVES=true

CMD ["/bin/bash"]
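
For completeness, a sketch of how the vllm:hpu image consumed by the launch script would be built from this Dockerfile; the build command itself is an assumption and is not shown in the report:

# Hypothetical build of the image tagged vllm:hpu, as referenced by the launch script
docker build -f Dockerfile.hpu -t vllm:hpu .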

### 🐛 Describe the bug

root@laion-gaudi2-00:~/GenAIComps/comps/llms/text-generation/vllm# docker logs -f ce742a5b3870b325031329f6e6b2282548fbb38d026e50daa8e75ed2bbcad7ad
WARNING 07-31 01:28:37 _custom_ops.py:14] Failed to import from vllm._C with ModuleNotFoundError("No module named 'vllm._C'")
INFO 07-31 01:28:42 api_server.py:286] vLLM API server version 0.5.3.post1
INFO 07-31 01:28:42 api_server.py:287] args: Namespace(host='0.0.0.0', port=80, uvicorn_log_level='info', allow_credentials=False, allowed_origins=['*'], allowed_methods=['*'], allowed_headers=['*'], api_key=None, lora_modules=None, prompt_adapters=None, chat_template=None, response_role='assistant', ssl_keyfile=None, ssl_certfile=None, ssl_ca_certs=None, ssl_cert_reqs=0, root_path=None, middleware=[], return_tokens_as_token_ids=False, model='PrimeIntellect/Meta-Llama-3.1-405B-Instruct-FP8', tokenizer=None, skip_tokenizer_init=False, revision=None, code_revision=None, tokenizer_revision=None, tokenizer_mode='auto', trust_remote_code=False, download_dir=None, load_format='auto', dtype='auto', kv_cache_dtype='auto', quantization_param_path=None, max_model_len=None, guided_decoding_backend='outlines', distributed_executor_backend=None, worker_use_ray=False, pipeline_parallel_size=1, tensor_parallel_size=1, max_parallel_loading_workers=None, ray_workers_use_nsight=False, block_size=128, enable_prefix_caching=False, disable_sliding_window=False, use_v2_block_manager=False, num_lookahead_slots=0, seed=0, swap_space=4, cpu_offload_gb=0, gpu_memory_utilization=0.9, num_gpu_blocks_override=None, max_num_batched_tokens=None, max_num_seqs=256, max_logprobs=20, disable_log_stats=False, quantization=None, rope_scaling=None, rope_theta=None, enforce_eager=True, max_context_len_to_capture=None, max_seq_len_to_capture=2048, disable_custom_all_reduce=False, tokenizer_pool_size=0, tokenizer_pool_type='ray', tokenizer_pool_extra_config=None, enable_lora=False, max_loras=1, max_lora_rank=16, lora_extra_vocab_size=256, lora_dtype='auto', long_lora_scaling_factors=None, max_cpu_loras=None, fully_sharded_loras=False, enable_prompt_adapter=False, max_prompt_adapters=1, max_prompt_adapter_token=0, device='auto', scheduler_delay_factor=0.0, enable_chunked_prefill=None, speculative_model=None, num_speculative_tokens=None, speculative_draft_tensor_parallel_size=None, speculative_max_model_len=None, speculative_disable_by_batch_size=None, ngram_prompt_lookup_max=None, ngram_prompt_lookup_min=None, spec_decoding_acceptance_method='rejection_sampler', typical_acceptance_sampler_posterior_threshold=None, typical_acceptance_sampler_posterior_alpha=None, disable_logprobs_during_spec_decoding=None, model_loader_extra_config=None, ignore_patterns=[], preemption_mode=None, served_model_name=None, qlora_adapter_name_or_path=None, otlp_traces_endpoint=None, engine_use_ray=False, disable_log_requests=False, max_log_len=None)
WARNING 07-31 01:28:42 arg_utils.py:778] The model has a long context length (131072). This may cause OOM errors during the initial memory profiling phase, or result in low performance due to small KV cache space. Consider setting --max-model-len to a smaller value.
INFO 07-31 01:28:42 llm_engine.py:176] Initializing an LLM engine (v0.5.3.post1) with config: model='PrimeIntellect/Meta-Llama-3.1-405B-Instruct-FP8', speculative_config=None, tokenizer='PrimeIntellect/Meta-Llama-3.1-405B-Instruct-FP8', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, rope_scaling=None, rope_theta=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.bfloat16, max_seq_len=131072, download_dir=None, load_format=LoadFormat.AUTO, tensor_parallel_size=1, pipeline_parallel_size=1, disable_custom_all_reduce=False, quantization=fbgemm_fp8, enforce_eager=True, kv_cache_dtype=auto, quantization_param_path=None, device_config=hpu, decoding_config=DecodingConfig(guided_decoding_backend='outlines'), observability_config=ObservabilityConfig(otlp_traces_endpoint=None), seed=0, served_model_name=PrimeIntellect/Meta-Llama-3.1-405B-Instruct-FP8, use_v2_block_manager=False, enable_prefix_caching=False)
INFO 07-31 01:28:43 profiler.py:62] Profiler enabled for: vllm-instance-c67195d547aa474a8afb62f611c5ef29
WARNING 07-31 01:28:43 utils.py:565] Pin memory is not supported on HPU.
INFO 07-31 01:28:43 selector.py:85] Using HabanaAttention backend.
INFO 07-31 01:28:43 habana_model_runner.py:493] Prompt bucket config (min, step, max_warmup) bs:[1, 32, 64], seq:[128, 128, 1024]
INFO 07-31 01:28:43 habana_model_runner.py:499] Generated 56 prompt buckets: [(1, 128), (1, 256), (1, 384), (1, 512), (1, 640), (1, 768), (1, 896), (1, 1024), (2, 128), (2, 256), (2, 384), (2, 512), (2, 640), (2, 768), (2, 896), (2, 1024), (4, 128), (4, 256), (4, 384), (4, 512), (4, 640), (4, 768), (4, 896), (4, 1024), (8, 128), (8, 256), (8, 384), (8, 512), (8, 640), (8, 768), (8, 896), (8, 1024), (16, 128), (16, 256), (16, 384), (16, 512), (16, 640), (16, 768), (16, 896), (16, 1024), (32, 128), (32, 256), (32, 384), (32, 512), (32, 640), (32, 768), (32, 896), (32, 1024), (64, 128), (64, 256), (64, 384), (64, 512), (64, 640), (64, 768), (64, 896), (64, 1024)]
INFO 07-31 01:28:43 habana_model_runner.py:504] Decode bucket config (min, step, max_warmup) bs:[1, 128, 256], seq:[128, 128, 2048]
INFO 07-31 01:28:43 habana_model_runner.py:509] Generated 144 decode buckets: [(1, 128), (1, 256), (1, 384), (1, 512), (1, 640), (1, 768), (1, 896), (1, 1024), (1, 1152), (1, 1280), (1, 1408), (1, 1536), (1, 1664), (1, 1792), (1, 1920), (1, 2048), (2, 128), (2, 256), (2, 384), (2, 512), (2, 640), (2, 768), (2, 896), (2, 1024), (2, 1152), (2, 1280), (2, 1408), (2, 1536), (2, 1664), (2, 1792), (2, 1920), (2, 2048), (4, 128), (4, 256), (4, 384), (4, 512), (4, 640), (4, 768), (4, 896), (4, 1024), (4, 1152), (4, 1280), (4, 1408), (4, 1536), (4, 1664), (4, 1792), (4, 1920), (4, 2048), (8, 128), (8, 256), (8, 384), (8, 512), (8, 640), (8, 768), (8, 896), (8, 1024), (8, 1152), (8, 1280), (8, 1408), (8, 1536), (8, 1664), (8, 1792), (8, 1920), (8, 2048), (16, 128), (16, 256), (16, 384), (16, 512), (16, 640), (16, 768), (16, 896), (16, 1024), (16, 1152), (16, 1280), (16, 1408), (16, 1536), (16, 1664), (16, 1792), (16, 1920), (16, 2048), (32, 128), (32, 256), (32, 384), (32, 512), (32, 640), (32, 768), (32, 896), (32, 1024), (32, 1152), (32, 1280), (32, 1408), (32, 1536), (32, 1664), (32, 1792), (32, 1920), (32, 2048), (64, 128), (64, 256), (64, 384), (64, 512), (64, 640), (64, 768), (64, 896), (64, 1024), (64, 1152), (64, 1280), (64, 1408), (64, 1536), (64, 1664), (64, 1792), (64, 1920), (64, 2048), (128, 128), (128, 256), (128, 384), (128, 512), (128, 640), (128, 768), (128, 896), (128, 1024), (128, 1152), (128, 1280), (128, 1408), (128, 1536), (128, 1664), (128, 1792), (128, 1920), (128, 2048), (256, 128), (256, 256), (256, 384), (256, 512), (256, 640), (256, 768), (256, 896), (256, 1024), (256, 1152), (256, 1280), (256, 1408), (256, 1536), (256, 1664), (256, 1792), (256, 1920), (256, 2048)]
============================= HABANA PT BRIDGE CONFIGURATION ===========================
 PT_HPU_LAZY_MODE = 1
 PT_RECIPE_CACHE_PATH =
 PT_CACHE_FOLDER_DELETE = 0
 PT_HPU_RECIPE_CACHE_CONFIG =
 PT_HPU_MAX_COMPOUND_OP_SIZE = 9223372036854775807
 PT_HPU_LAZY_ACC_PAR_MODE = 0
 PT_HPU_ENABLE_REFINE_DYNAMIC_SHAPES = 0
---------------------------: System Configuration :---------------------------
Num CPU Cores : 160
CPU RAM       : 1056375276 KB
------------------------------------------------------------------------------
Traceback (most recent call last):
  File "/usr/lib/python3.10/runpy.py", line 196, in _run_module_as_main
    return _run_code(code, main_globals, None,
  File "/usr/lib/python3.10/runpy.py", line 86, in _run_code
    exec(code, run_globals)
  File "/usr/local/lib/python3.10/dist-packages/vllm/entrypoints/openai/api_server.py", line 312, in <module>
    asyncio.run(run_server(args))
  File "/usr/lib/python3.10/asyncio/runners.py", line 44, in run
    return loop.run_until_complete(main)
  File "/usr/lib/python3.10/asyncio/base_events.py", line 649, in run_until_complete
    return future.result()
  File "/usr/local/lib/python3.10/dist-packages/vllm/entrypoints/openai/api_server.py", line 289, in run_server
    app = await init_app(args, llm_engine)
  File "/usr/local/lib/python3.10/dist-packages/vllm/entrypoints/openai/api_server.py", line 229, in init_app
    if llm_engine is not None else AsyncLLMEngine.from_engine_args(
  File "/usr/local/lib/python3.10/dist-packages/vllm/engine/async_llm_engine.py", line 479, in from_engine_args
    engine = cls(
  File "/usr/local/lib/python3.10/dist-packages/vllm/engine/async_llm_engine.py", line 380, in __init__
    self.engine = self._init_engine(*args, **kwargs)
  File "/usr/local/lib/python3.10/dist-packages/vllm/engine/async_llm_engine.py", line 560, in _init_engine
    return engine_class(*args, **kwargs)
  File "/usr/local/lib/python3.10/dist-packages/vllm/engine/llm_engine.py", line 251, in __init__
    self.model_executor = executor_class(
  File "/usr/local/lib/python3.10/dist-packages/vllm/executor/executor_base.py", line 47, in __init__
    self._init_executor()
  File "/usr/local/lib/python3.10/dist-packages/vllm/executor/habana_executor.py", line 27, in _init_executor
    self._init_worker()
  File "/usr/local/lib/python3.10/dist-packages/vllm/executor/habana_executor.py", line 71, in _init_worker
    self.driver_worker.load_model()
  File "/usr/local/lib/python3.10/dist-packages/vllm/worker/habana_worker.py", line 105, in load_model
    self.model_runner.load_model()
  File "/usr/local/lib/python3.10/dist-packages/vllm/worker/habana_model_runner.py", line 418, in load_model
    self.model = get_model(
  File "/usr/local/lib/python3.10/dist-packages/vllm/model_executor/model_loader/__init__.py", line 21, in get_model
    return loader.load_model(model_config=model_config,
  File "/usr/local/lib/python3.10/dist-packages/vllm/model_executor/model_loader/loader.py", line 280, in load_model
    model = _initialize_model(model_config, self.load_config,
  File "/usr/local/lib/python3.10/dist-packages/vllm/model_executor/model_loader/loader.py", line 109, in _initialize_model
    quant_config = _get_quantization_config(model_config, load_config)
  File "/usr/local/lib/python3.10/dist-packages/vllm/model_executor/model_loader/loader.py", line 50, in _get_quantization_config
    quant_config = get_quant_config(model_config, load_config)
  File "/usr/local/lib/python3.10/dist-packages/vllm/model_executor/model_loader/weight_utils.py", line 131, in get_quant_config
    return quant_cls.from_config(hf_quant_config)
  File "/usr/local/lib/python3.10/dist-packages/vllm/model_executor/layers/quantization/fbgemm_fp8.py", line 58, in from_config
    return cls(ignore_list=ignore_list, input_scale_ub=input_scale_ub)
  File "/usr/local/lib/python3.10/dist-packages/vllm/model_executor/layers/quantization/fbgemm_fp8.py", line 34, in __init__
    capability = current_platform.get_device_capability()
  File "/usr/local/lib/python3.10/dist-packages/vllm/platforms/interface.py", line 28, in get_device_capability
    raise NotImplementedError
NotImplementedError
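
The traceback shows the failure happens before any weights are loaded: the checkpoint advertises fbgemm_fp8 quantization, and FbgemmFp8Config queries current_platform.get_device_capability(), a CUDA compute-capability check that the HPU platform interface does not implement. A hypothetical way to confirm what the checkpoint declares (the command, and the expectation that it reports quant_method fbgemm_fp8, are assumptions based on the quantization=fbgemm_fp8 log line, not something from the original report):

# Hypothetical check: inspect the checkpoint's quantization_config in config.json
curl -s https://huggingface.co/PrimeIntellect/Meta-Llama-3.1-405B-Instruct-FP8/raw/main/config.json | grep -A 5 quantization_config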
kzawora-intel commented 1 month ago

Hi, which branch are you using? There is no FP8 support for Gaudi on habana_main currently.

endomorphosis commented 1 month ago

Is there a reason for that? I know the Habana documentation claims FP8 support. I just got Llama 3.1 working on TGI, with a PR that is being considered for Optimum Habana, and I was going to subsequently try to load FP8 models onto the Gaudi 2 system for use at a Stanford Law School hackathon that our group (LAION AI / OPEA) is sponsoring on the 8th.

kzawora-intel commented 2 weeks ago

I could not find any claims of FP8 support for vLLM in the Habana docs; both of our releases and their READMEs explicitly list FP8 in the Unsupported Features section (1.16.0, 1.17.0).

That said, the Gaudi2 hardware definitely does support FP8, but it was not yet enabled on the habana_main branch. We added FP8 support in vLLM for HPU very recently in https://github.com/HabanaAI/vllm-fork/pull/144.