vllm-project / vllm

A high-throughput and memory-efficient inference and serving engine for LLMs
https://docs.vllm.ai
Apache License 2.0

[Bug]: Can't load Gemma2 model (9b-it, 27b-it) #7141

Closed yjkim3 closed 3 months ago

yjkim3 commented 3 months ago

Your current environment

pip install vllm

vllm version: 0.5.3.post1

πŸ› Describe the bug

Trigger

```python
model_params = {
    "model": "google/gemma-2-9b-it",
    "quantization": "fp8",
    "dtype": "auto",
    "tensor_parallel_size": 1,
    "gpu_memory_utilization": 0.95,
    "max_model_len": 4096,
    "max_num_seqs": 256,
    "enable_prefix_caching": False,
    "enforce_eager": True,
    "seed": SEED,
}
llm = LLM(**model_params)
```

Error

```
RuntimeError                              Traceback (most recent call last)
Cell In[13], line 14
      1 model_params = {
      2     "model": CFG.llm_model_id,
      3     "quantization": "fp8",
   (...)
     11     "seed": SEED,
     12 }
---> 14 model_llm = CustomLLM(model_params)

Cell In[12], line 3
      2 def __init__(self, model_params):
----> 3     self.llm = LLM(**model_params)

File ~/vllm-chunking/.venv/lib/python3.10/site-packages/vllm/entrypoints/llm.py:155, in LLM.__init__(self, model, tokenizer, tokenizer_mode, skip_tokenizer_init, trust_remote_code, tensor_parallel_size, dtype, quantization, revision, tokenizer_revision, seed, gpu_memory_utilization, swap_space, cpu_offload_gb, enforce_eager, max_context_len_to_capture, max_seq_len_to_capture, disable_custom_all_reduce, **kwargs)
    132     raise TypeError(
    133         "There is no need to pass vision-related arguments anymore.")
    134 engine_args = EngineArgs(
    135     model=model,
    136     tokenizer=tokenizer,
   (...)
    153     **kwargs,
    154 )
--> 155 self.llm_engine = LLMEngine.from_engine_args(
    156     engine_args, usage_context=UsageContext.LLM_CLASS)
    157 self.request_counter = Counter()

File ~/vllm-chunking/.venv/lib/python3.10/site-packages/vllm/engine/llm_engine.py:441, in LLMEngine.from_engine_args(cls, engine_args, usage_context, stat_loggers)
    439 executor_class = cls._get_executor_cls(engine_config)
    440 # Create the LLM engine.
--> 441 engine = cls(
    442     **engine_config.to_dict(),
    443     executor_class=executor_class,
    444     log_stats=not engine_args.disable_log_stats,
    445     usage_context=usage_context,
    446     stat_loggers=stat_loggers,
    447 )
    449 return engine

File ~/vllm-chunking/.venv/lib/python3.10/site-packages/vllm/engine/llm_engine.py:251, in LLMEngine.__init__(self, model_config, cache_config, parallel_config, scheduler_config, device_config, load_config, lora_config, multimodal_config, speculative_config, decoding_config, observability_config, prompt_adapter_config, executor_class, log_stats, usage_context, stat_loggers)
    245 self.generation_config_fields = _load_generation_config_dict(
    246     model_config)
    248 self.input_processor = INPUT_REGISTRY.create_input_processor(
    249     self.model_config)
--> 251 self.model_executor = executor_class(
    252     model_config=model_config,
    253     cache_config=cache_config,
    254     parallel_config=parallel_config,
    255     scheduler_config=scheduler_config,
    256     device_config=device_config,
    257     lora_config=lora_config,
    258     multimodal_config=multimodal_config,
    259     speculative_config=speculative_config,
    260     load_config=load_config,
    261     prompt_adapter_config=prompt_adapter_config,
    262 )
    264 if not self.model_config.embedding_mode:
    265     self._initialize_kv_caches()

File ~/vllm-chunking/.venv/lib/python3.10/site-packages/vllm/executor/executor_base.py:47, in ExecutorBase.__init__(self, model_config, cache_config, parallel_config, scheduler_config, device_config, load_config, lora_config, multimodal_config, speculative_config, prompt_adapter_config)
     44 self.speculative_config = speculative_config
     45 self.prompt_adapter_config = prompt_adapter_config
---> 47 self._init_executor()

File ~/vllm-chunking/.venv/lib/python3.10/site-packages/vllm/executor/gpu_executor.py:36, in GPUExecutor._init_executor(self)
     34 self.driver_worker = self._create_worker()
     35 self.driver_worker.init_device()
---> 36 self.driver_worker.load_model()

File ~/vllm-chunking/.venv/lib/python3.10/site-packages/vllm/worker/worker.py:139, in Worker.load_model(self)
    138 def load_model(self):
--> 139     self.model_runner.load_model()

File ~/vllm-chunking/.venv/lib/python3.10/site-packages/vllm/worker/model_runner.py:682, in GPUModelRunnerBase.load_model(self)
    680 logger.info("Starting to load model %s...", self.model_config.model)
    681 with CudaMemoryProfiler() as m:
--> 682     self.model = get_model(model_config=self.model_config,
    683                            device_config=self.device_config,
    684                            load_config=self.load_config,
    685                            lora_config=self.lora_config,
    686                            multimodal_config=self.multimodal_config,
    687                            parallel_config=self.parallel_config,
    688                            scheduler_config=self.scheduler_config,
    689                            cache_config=self.cache_config)
    691 self.model_memory_usage = m.consumed_memory
    692 logger.info("Loading model weights took %.4f GB",
    693             self.model_memory_usage / float(2**30))

File ~/vllm-chunking/.venv/lib/python3.10/site-packages/vllm/model_executor/model_loader/__init__.py:21, in get_model(model_config, load_config, device_config, parallel_config, scheduler_config, lora_config, multimodal_config, cache_config)
     14 def get_model(*, model_config: ModelConfig, load_config: LoadConfig,
     15               device_config: DeviceConfig, parallel_config: ParallelConfig,
     16               scheduler_config: SchedulerConfig,
     17               lora_config: Optional[LoRAConfig],
     18               multimodal_config: Optional[MultiModalConfig],
     19               cache_config: CacheConfig) -> nn.Module:
     20     loader = get_model_loader(load_config)
---> 21     return loader.load_model(model_config=model_config,
     22                              device_config=device_config,
     23                              lora_config=lora_config,
     24                              multimodal_config=multimodal_config,
     25                              parallel_config=parallel_config,
     26                              scheduler_config=scheduler_config,
     27                              cache_config=cache_config)

File ~/vllm-chunking/.venv/lib/python3.10/site-packages/vllm/model_executor/model_loader/loader.py:283, in DefaultModelLoader.load_model(self, model_config, device_config, lora_config, multimodal_config, parallel_config, scheduler_config, cache_config)
    279 with torch.device(device_config.device):
    280     model = _initialize_model(model_config, self.load_config,
    281                               lora_config, multimodal_config,
    282                               cache_config, scheduler_config)
--> 283 model.load_weights(
    284     self._get_weights_iterator(model_config.model,
    285                                model_config.revision,
    286                                fall_back_to_pt=getattr(
    287                                    model,
    288                                    "fall_back_to_pt_during_load",
    289                                    True)), )
    291 for _, module in model.named_modules():
    292     quant_method = getattr(module, "quant_method", None)

File ~/vllm-chunking/.venv/lib/python3.10/site-packages/vllm/model_executor/models/gemma2.py:393, in Gemma2ForCausalLM.load_weights(self, weights)
    391 unloaded_params = params_dict.keys() - loaded_params
    392 if unloaded_params:
--> 393     raise RuntimeError(
    394         "Some weights are not initialized from checkpoints: "
    395         f"{unloaded_params}")

RuntimeError: Some weights are not initialized from checkpoints: {'model.layers.14.self_attn.attn.k_scale', 'model.layers.11.self_attn.attn.v_scale', 'model.layers.14.self_attn.attn.v_scale', 'model.layers.13.self_attn.attn.k_scale', 'model.layers.7.self_attn.attn.v_scale', 'model.layers.33.self_attn.attn.v_scale', 'model.layers.31.self_attn.attn.v_scale', 'model.layers.29.self_attn.attn.k_scale', 'model.layers.16.self_attn.attn.v_scale', 'model.layers.24.self_attn.attn.v_scale', 'model.layers.15.self_attn.attn.k_scale', 'model.layers.4.self_attn.attn.k_scale', 'model.layers.41.self_attn.attn.v_scale', 'model.layers.3.self_attn.attn.v_scale', 'model.layers.37.self_attn.attn.k_scale', 'model.layers.30.self_attn.attn.k_scale', 'model.layers.22.self_attn.attn.k_scale', 'model.layers.9.self_attn.attn.k_scale', 'model.layers.4.self_attn.attn.v_scale', 'model.layers.0.self_attn.attn.v_scale', 'model.layers.30.self_attn.attn.v_scale', 'model.layers.8.self_attn.attn.v_scale', 'model.layers.40.self_attn.attn.v_scale', 'model.layers.39.self_attn.attn.v_scale', 'model.layers.41.self_attn.attn.k_scale', 'model.layers.31.self_attn.attn.k_scale', 'model.layers.19.self_attn.attn.k_scale', 'model.layers.23.self_attn.attn.k_scale', 'model.layers.9.self_attn.attn.v_scale', 'model.layers.24.self_attn.attn.k_scale', 'model.layers.5.self_attn.attn.k_scale', 'model.layers.13.self_attn.attn.v_scale', 'model.layers.26.self_attn.attn.v_scale', 'model.layers.27.self_attn.attn.k_scale', 'model.layers.0.self_attn.attn.k_scale', 'model.layers.38.self_attn.attn.k_scale', 'model.layers.40.self_attn.attn.k_scale', 'model.layers.5.self_attn.attn.v_scale', 'model.layers.22.self_attn.attn.v_scale', 'model.layers.27.self_attn.attn.v_scale', 'model.layers.25.self_attn.attn.v_scale', 'model.layers.18.self_attn.attn.k_scale', 'model.layers.10.self_attn.attn.k_scale', 'model.layers.15.self_attn.attn.v_scale', 'model.layers.35.self_attn.attn.k_scale', 'model.layers.18.self_attn.attn.v_scale', 'model.layers.8.self_attn.attn.k_scale', 'model.layers.12.self_attn.attn.v_scale', 'model.layers.36.self_attn.attn.k_scale', 'model.layers.10.self_attn.attn.v_scale', 'model.layers.6.self_attn.attn.k_scale', 'model.layers.35.self_attn.attn.v_scale', 'model.layers.36.self_attn.attn.v_scale', 'model.layers.26.self_attn.attn.k_scale', 'model.layers.19.self_attn.attn.v_scale', 'model.layers.16.self_attn.attn.k_scale', 'model.layers.1.self_attn.attn.v_scale', 'model.layers.23.self_attn.attn.v_scale', 'model.layers.20.self_attn.attn.k_scale', 'model.layers.2.self_attn.attn.k_scale', 'model.layers.21.self_attn.attn.v_scale', 'model.layers.20.self_attn.attn.v_scale', 'model.layers.12.self_attn.attn.k_scale', 'model.layers.32.self_attn.attn.v_scale', 'model.layers.38.self_attn.attn.v_scale', 'model.layers.32.self_attn.attn.k_scale', 'model.layers.6.self_attn.attn.v_scale', 'model.layers.17.self_attn.attn.v_scale', 'model.layers.34.self_attn.attn.v_scale', 'model.layers.39.self_attn.attn.k_scale', 'model.layers.37.self_attn.attn.v_scale', 'model.layers.28.self_attn.attn.k_scale', 'model.layers.29.self_attn.attn.v_scale', 'model.layers.7.self_attn.attn.k_scale', 'model.layers.34.self_attn.attn.k_scale', 'model.layers.25.self_attn.attn.k_scale', 'model.layers.3.self_attn.attn.k_scale', 'model.layers.33.self_attn.attn.k_scale', 'model.layers.28.self_attn.attn.v_scale', 'model.layers.2.self_attn.attn.v_scale', 'model.layers.11.self_attn.attn.k_scale', 'model.layers.1.self_attn.attn.k_scale', 'model.layers.21.self_attn.attn.k_scale', 'model.layers.17.self_attn.attn.k_scale'}
```

mgoin commented 3 months ago

This is a known issue with fp8 quantization for Gemma 2 and has been resolved by https://github.com/vllm-project/vllm/pull/7002
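For anyone still on 0.5.3.post1, here is a minimal sketch of the two workarounds implied above. It assumes that upgrading vllm picks up the fix from PR #7002, and that dropping `quantization="fp8"` avoids creating the unfilled `k_scale` / `v_scale` parameters that trigger the error; neither assumption is confirmed in this thread.

```python
# Sketch only; version availability and exact behavior are assumptions.
from vllm import LLM

# Workaround 1: upgrade to a vllm build that includes PR #7002, then the
# original fp8 config from the bug report should load unchanged:
#   pip install --upgrade vllm

# Workaround 2: stay on 0.5.3.post1 but skip on-the-fly fp8 quantization,
# so Gemma 2 loads with its native bf16 weights and no fp8 scale params.
llm = LLM(
    model="google/gemma-2-9b-it",
    dtype="auto",
    tensor_parallel_size=1,
    gpu_memory_utilization=0.95,
    max_model_len=4096,
    enforce_eager=True,
)
```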