ncsoft / offsetbias

Official implementation of "OffsetBias: Leveraging Debiased Data for Tuning Evaluators"
BSD 3-Clause "New" or "Revised" License

ValueError: Model architectures ['LlamaForSequenceClassification'] are not supported for now. #2

Closed big-bao closed 1 month ago

big-bao commented 3 months ago

Hello, I downloaded the model (NCSOFT/Llama-3-OffsetBias-RM-8B) from Hugging Face and then ran the code below:

pip install -r requirements.txt

and then:

from module import VllmModule

instruction = "explain like im 5"
output_a = "Scientists are studying special cells that could help treat a sickness called prostate cancer. They even tried these cells on mice and it worked!"
output_b = "Sure, I'd be happy to help explain something to you! What would you like me to explain?"

model_name = "/root/offsetbias/Llama-3-OffsetBias-RM-8B"
# model_name = "NCSOFT/Llama-3-OffsetBias-RM-8B"
module = VllmModule(prompt_name="offsetbias", model_name=model_name)

conversation = module.make_conversation(
  instruction=instruction,
  response1=output_a,
  response2=output_b,
  swap=False)

output = module.generate([conversation])
print(output[0])
# The model should output "Output (b)"

but got the error below:

Initializing vllm model...
model args: {'model': '/root/offsetbias/Llama-3-OffsetBias-RM-8B', 'dtype': 'float16'}
WARNING 07-26 21:02:00 config.py:1434] Casting torch.bfloat16 to torch.float16.
INFO 07-26 21:02:00 llm_engine.py:176] Initializing an LLM engine (v0.5.3) with config: model='/root/offsetbias/Llama-3-OffsetBias-RM-8B', speculative_config=None, tokenizer='/root/offsetbias/Llama-3-OffsetBias-RM-8B', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, rope_scaling=None, rope_theta=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.float16, max_seq_len=8192, download_dir=None, load_format=LoadFormat.AUTO, tensor_parallel_size=1, pipeline_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, kv_cache_dtype=auto, quantization_param_path=None, device_config=cuda, decoding_config=DecodingConfig(guided_decoding_backend='outlines'), observability_config=ObservabilityConfig(otlp_traces_endpoint=None), seed=0, served_model_name=/root/offsetbias/Llama-3-OffsetBias-RM-8B, use_v2_block_manager=False, enable_prefix_caching=False)
INFO 07-26 21:02:00 model_runner.py:680] Starting to load model /root/offsetbias/Llama-3-OffsetBias-RM-8B...
---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
Cell In[1], line 9
      7 model_name = "/root/offsetbias/Llama-3-OffsetBias-RM-8B"
      8 # model_name = "NCSOFT/Llama-3-OffsetBias-RM-8B"
----> 9 module = VllmModule(prompt_name="offsetbias", model_name=model_name)
     11 conversation = module.make_conversation(
     12   instruction=instruction,
     13   response1=output_a,
     14   response2=output_b,
     15   swap=False)
     17 output = module.generate([conversation])

File ~/offsetbias/module.py:72, in VllmModule.__init__(self, prompt_name, model_name, dtype, temperature, max_tokens, config)
     70 tokenizer_name = self.config.get("tokenizer", self.model_name)
     71 self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)
---> 72 self.model = LLM(**model_args)
     74 sampling_params_args = dict(temperature=temperature, max_tokens=max_tokens)
     75 sampling_params_args.update(vllm_args.get("sampling_params", {}))

File ~/anaconda3/lib/python3.11/site-packages/vllm/entrypoints/llm.py:155, in LLM.__init__(self, model, tokenizer, tokenizer_mode, skip_tokenizer_init, trust_remote_code, tensor_parallel_size, dtype, quantization, revision, tokenizer_revision, seed, gpu_memory_utilization, swap_space, cpu_offload_gb, enforce_eager, max_context_len_to_capture, max_seq_len_to_capture, disable_custom_all_reduce, **kwargs)
    132     raise TypeError(
    133         "There is no need to pass vision-related arguments anymore.")
    134 engine_args = EngineArgs(
    135     model=model,
    136     tokenizer=tokenizer,
   (...)
    153     **kwargs,
    154 )
--> 155 self.llm_engine = LLMEngine.from_engine_args(
    156     engine_args, usage_context=UsageContext.LLM_CLASS)
    157 self.request_counter = Counter()

File ~/anaconda3/lib/python3.11/site-packages/vllm/engine/llm_engine.py:441, in LLMEngine.from_engine_args(cls, engine_args, usage_context, stat_loggers)
    439 executor_class = cls._get_executor_cls(engine_config)
    440 # Create the LLM engine.
--> 441 engine = cls(
    442     **engine_config.to_dict(),
    443     executor_class=executor_class,
    444     log_stats=not engine_args.disable_log_stats,
    445     usage_context=usage_context,
    446     stat_loggers=stat_loggers,
    447 )
    449 return engine

File ~/anaconda3/lib/python3.11/site-packages/vllm/engine/llm_engine.py:251, in LLMEngine.__init__(self, model_config, cache_config, parallel_config, scheduler_config, device_config, load_config, lora_config, multimodal_config, speculative_config, decoding_config, observability_config, prompt_adapter_config, executor_class, log_stats, usage_context, stat_loggers)
    245 self.generation_config_fields = _load_generation_config_dict(
    246     model_config)
    248 self.input_processor = INPUT_REGISTRY.create_input_processor(
    249     self.model_config)
--> 251 self.model_executor = executor_class(
    252     model_config=model_config,
    253     cache_config=cache_config,
    254     parallel_config=parallel_config,
    255     scheduler_config=scheduler_config,
    256     device_config=device_config,
    257     lora_config=lora_config,
    258     multimodal_config=multimodal_config,
    259     speculative_config=speculative_config,
    260     load_config=load_config,
    261     prompt_adapter_config=prompt_adapter_config,
    262 )
    264 if not self.model_config.embedding_mode:
    265     self._initialize_kv_caches()

File ~/anaconda3/lib/python3.11/site-packages/vllm/executor/executor_base.py:47, in ExecutorBase.__init__(self, model_config, cache_config, parallel_config, scheduler_config, device_config, load_config, lora_config, multimodal_config, speculative_config, prompt_adapter_config)
     44 self.speculative_config = speculative_config
     45 self.prompt_adapter_config = prompt_adapter_config
---> 47 self._init_executor()

File ~/anaconda3/lib/python3.11/site-packages/vllm/executor/gpu_executor.py:36, in GPUExecutor._init_executor(self)
     34 self.driver_worker = self._create_worker()
     35 self.driver_worker.init_device()
---> 36 self.driver_worker.load_model()

File ~/anaconda3/lib/python3.11/site-packages/vllm/worker/worker.py:139, in Worker.load_model(self)
    138 def load_model(self):
--> 139     self.model_runner.load_model()

File ~/anaconda3/lib/python3.11/site-packages/vllm/worker/model_runner.py:682, in GPUModelRunnerBase.load_model(self)
    680 logger.info("Starting to load model %s...", self.model_config.model)
    681 with CudaMemoryProfiler() as m:
--> 682     self.model = get_model(model_config=self.model_config,
    683                            device_config=self.device_config,
    684                            load_config=self.load_config,
    685                            lora_config=self.lora_config,
    686                            multimodal_config=self.multimodal_config,
    687                            parallel_config=self.parallel_config,
    688                            scheduler_config=self.scheduler_config,
    689                            cache_config=self.cache_config)
    691 self.model_memory_usage = m.consumed_memory
    692 logger.info("Loading model weights took %.4f GB",
    693             self.model_memory_usage / float(2**30))

File ~/anaconda3/lib/python3.11/site-packages/vllm/model_executor/model_loader/__init__.py:21, in get_model(model_config, load_config, device_config, parallel_config, scheduler_config, lora_config, multimodal_config, cache_config)
     14 def get_model(*, model_config: ModelConfig, load_config: LoadConfig,
     15               device_config: DeviceConfig, parallel_config: ParallelConfig,
     16               scheduler_config: SchedulerConfig,
     17               lora_config: Optional[LoRAConfig],
     18               multimodal_config: Optional[MultiModalConfig],
     19               cache_config: CacheConfig) -> nn.Module:
     20     loader = get_model_loader(load_config)
---> 21     return loader.load_model(model_config=model_config,
     22                              device_config=device_config,
     23                              lora_config=lora_config,
     24                              multimodal_config=multimodal_config,
     25                              parallel_config=parallel_config,
     26                              scheduler_config=scheduler_config,
     27                              cache_config=cache_config)

File ~/anaconda3/lib/python3.11/site-packages/vllm/model_executor/model_loader/loader.py:280, in DefaultModelLoader.load_model(self, model_config, device_config, lora_config, multimodal_config, parallel_config, scheduler_config, cache_config)
    278 with set_default_torch_dtype(model_config.dtype):
    279     with torch.device(device_config.device):
--> 280         model = _initialize_model(model_config, self.load_config,
    281                                   lora_config, multimodal_config,
    282                                   cache_config, scheduler_config)
    283     model.load_weights(
    284         self._get_weights_iterator(model_config.model,
    285                                    model_config.revision,
   (...)
    288                                        "fall_back_to_pt_during_load",
    289                                        True)), )
    291     for _, module in model.named_modules():

File ~/anaconda3/lib/python3.11/site-packages/vllm/model_executor/model_loader/loader.py:108, in _initialize_model(model_config, load_config, lora_config, multimodal_config, cache_config, scheduler_config)
    100 def _initialize_model(
    101         model_config: ModelConfig,
    102         load_config: LoadConfig,
   (...)
    105         cache_config: CacheConfig,
    106         scheduler_config: Optional[SchedulerConfig] = None) -> nn.Module:
    107     """Initialize a model with the given configurations."""
--> 108     model_class = get_model_architecture(model_config)[0]
    109     quant_config = _get_quantization_config(model_config, load_config)
    111     return model_class(config=model_config.hf_config,
    112                        cache_config=cache_config,
    113                        quant_config=quant_config,
    114                        **_get_model_initialization_kwargs(
    115                            model_class, lora_config, multimodal_config,
    116                            scheduler_config))

File ~/anaconda3/lib/python3.11/site-packages/vllm/model_executor/model_loader/utils.py:35, in get_model_architecture(model_config)
     33     if model_cls is not None:
     34         return (model_cls, arch)
---> 35 raise ValueError(
     36     f"Model architectures {architectures} are not supported for now. "
     37     f"Supported architectures: {ModelRegistry.get_supported_archs()}")

ValueError: Model architectures ['LlamaForSequenceClassification'] are not supported for now. Supported architectures: ['AquilaModel', 'AquilaForCausalLM', 'BaiChuanForCausalLM', 'BaichuanForCausalLM', 'BloomForCausalLM', 'ChameleonForCausalLM', 'ChameleonForConditionalGeneration', 'ChatGLMModel', 'ChatGLMForConditionalGeneration', 'CohereForCausalLM', 'DbrxForCausalLM', 'DeciLMForCausalLM', 'DeepseekForCausalLM', 'DeepseekV2ForCausalLM', 'FalconForCausalLM', 'FuyuForCausalLM', 'GemmaForCausalLM', 'Gemma2ForCausalLM', 'GPT2LMHeadModel', 'GPTBigCodeForCausalLM', 'GPTJForCausalLM', 'GPTNeoXForCausalLM', 'InternLMForCausalLM', 'InternLM2ForCausalLM', 'JAISLMHeadModel', 'LlamaForCausalLM', 'LlavaForConditionalGeneration', 'LlavaNextForConditionalGeneration', 'LLaMAForCausalLM', 'MistralForCausalLM', 'MixtralForCausalLM', 'QuantMixtralForCausalLM', 'MptForCausalLM', 'MPTForCausalLM', 'MiniCPMForCausalLM', 'OlmoForCausalLM', 'OPTForCausalLM', 'OrionForCausalLM', 'PersimmonForCausalLM', 'PaliGemmaForConditionalGeneration', 'PhiForCausalLM', 'Phi3ForCausalLM', 'Phi3VForCausalLM', 'QWenLMHeadModel', 'Qwen2ForCausalLM', 'Qwen2MoeForCausalLM', 'RWForCausalLM', 'StableLMEpochForCausalLM', 'StableLmForCausalLM', 'Starcoder2ForCausalLM', 'ArcticForCausalLM', 'XverseForCausalLM', 'Phi3SmallForCausalLM', 'MedusaModel', 'MLPSpeculatorPreTrainedModel', 'JambaForCausalLM', 'MistralModel']

My vLLM version is 0.5.3. It seems vLLM only supports *ForCausalLM models rather than classification models?

sanghyuk-choi commented 3 months ago

Hi @big-bao, as far as I know, vLLM does not currently support classification models. This repository supports the generation model only, and we plan to update it to support the reward model.

To use the reward model, we suggest using the sample code from Llama-3-OffsetBias-RM-8B, or you can use the Hugging Face reward model pipeline.
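
For anyone hitting the same error, below is a minimal sketch of the Hugging Face route. It is not the official sample code from the model card; it assumes the checkpoint loads as a single-label AutoModelForSequenceClassification and that its chat template accepts a user/assistant conversation:

# Minimal sketch: score both responses with transformers instead of vLLM.
# Assumes the checkpoint exposes a single-label classification head (num_labels=1).
import torch
from transformers import AutoModelForSequenceClassification, AutoTokenizer

model_name = "NCSOFT/Llama-3-OffsetBias-RM-8B"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(
    model_name, torch_dtype=torch.bfloat16, device_map="auto"
)

instruction = "explain like im 5"
output_a = "Scientists are studying special cells that could help treat a sickness called prostate cancer. They even tried these cells on mice and it worked!"
output_b = "Sure, I'd be happy to help explain something to you! What would you like me to explain?"

def reward(response):
    # Format the instruction/response pair with the model's chat template
    # and return the scalar reward from the classification head.
    conv = [
        {"role": "user", "content": instruction},
        {"role": "assistant", "content": response},
    ]
    input_ids = tokenizer.apply_chat_template(
        conv, tokenize=True, return_tensors="pt"
    ).to(model.device)
    with torch.no_grad():
        return model(input_ids).logits[0][0].item()

score_a, score_b = reward(output_a), reward(output_b)
print("Output (b)" if score_b > score_a else "Output (a)")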