Xu-Chen opened this issue 1 week ago
@Xu-Chen Please share your model `config.json` and `quantize_config.json` so I can check the specs of your quantized model.
`config.json`

```json
{
  "_name_or_path": "./DeepSeek-V2-Chat",
  "architectures": [
    "DeepseekV2ForCausalLM"
  ],
  "attention_bias": false,
  "attention_dropout": 0.0,
  "attn_implementation": "flash_attention_2",
  "auto_map": {
    "AutoConfig": "configuration_deepseek.DeepseekV2Config",
    "AutoModel": "modeling_deepseek.DeepseekV2Model",
    "AutoModelForCausalLM": "modeling_deepseek.DeepseekV2ForCausalLM"
  },
  "aux_loss_alpha": 0.001,
  "bos_token_id": 100000,
  "eos_token_id": 100001,
  "ep_size": 1,
  "first_k_dense_replace": 1,
  "hidden_act": "silu",
  "hidden_size": 5120,
  "initializer_range": 0.02,
  "intermediate_size": 12288,
  "kv_lora_rank": 512,
  "max_position_embeddings": 163840,
  "model_type": "deepseek_v2",
  "moe_intermediate_size": 1536,
  "moe_layer_freq": 1,
  "n_group": 8,
  "n_routed_experts": 160,
  "n_shared_experts": 2,
  "norm_topk_prob": false,
  "num_attention_heads": 128,
  "num_experts_per_tok": 6,
  "num_hidden_layers": 60,
  "num_key_value_heads": 128,
  "pretraining_tp": 1,
  "q_lora_rank": 1536,
  "qk_nope_head_dim": 128,
  "qk_rope_head_dim": 64,
  "quantization_config": {
    "bits": 4,
    "checkpoint_format": "gptq",
    "damp_percent": 0.01,
    "desc_act": false,
    "group_size": 32,
    "lm_head": false,
    "meta": {
      "quantizer": "gptqmodel:0.9.1"
    },
    "model_file_base_name": null,
    "model_name_or_path": null,
    "quant_method": "gptq",
    "static_groups": false,
    "sym": true,
    "true_sequential": false
  },
  "rms_norm_eps": 1e-06,
  "rope_scaling": {
    "beta_fast": 32,
    "beta_slow": 1,
    "factor": 40,
    "mscale": 0.707,
    "mscale_all_dim": 0.707,
    "original_max_position_embeddings": 4096,
    "type": "yarn"
  },
  "rope_theta": 10000,
  "routed_scaling_factor": 16.0,
  "scoring_func": "softmax",
  "seq_aux": true,
  "tie_word_embeddings": false,
  "topk_group": 3,
  "topk_method": "group_limited_greedy",
  "torch_dtype": "bfloat16",
  "transformers_version": "4.41.2",
  "use_cache": true,
  "v_head_dim": 128,
  "vocab_size": 102400
}
```
`quantize_config.json`

```json
{
  "bits": 4,
  "group_size": 32,
  "desc_act": false,
  "static_groups": false,
  "sym": true,
  "lm_head": false,
  "damp_percent": 0.01,
  "true_sequential": false,
  "model_name_or_path": "./DeepSeek-V2-Chat-gptq-int4",
  "model_file_base_name": "model",
  "quant_method": "gptq",
  "checkpoint_format": "gptq",
  "meta": {
    "quantizer": "gptqmodel:0.9.1"
  }
}
```
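For reference, the settings above map roughly onto a `QuantizeConfig` like the one below (a sketch assuming the gptqmodel 0.9.x API; the paths and the toy calibration sample are placeholders, not the data actually used here):

```python
from transformers import AutoTokenizer
from gptqmodel import GPTQModel, QuantizeConfig

model_dir = "./DeepSeek-V2-Chat"          # placeholder path
out_dir = "./DeepSeek-V2-Chat-gptq-int4"  # placeholder path

tokenizer = AutoTokenizer.from_pretrained(model_dir, trust_remote_code=True)

# Toy calibration set just to show the call shape; real quantization needs
# a few hundred representative samples.
calibration = [
    tokenizer("DeepSeek-V2 is a Mixture-of-Experts language model.", return_tensors="pt")
]

quant_config = QuantizeConfig(
    bits=4,             # "bits": 4
    group_size=32,      # "group_size": 32
    desc_act=False,     # "desc_act": false
    sym=True,           # "sym": true
    damp_percent=0.01,  # "damp_percent": 0.01
)

model = GPTQModel.from_pretrained(model_dir, quant_config, trust_remote_code=True)
model.quantize(calibration)
model.save_quantized(out_dir)
```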
@Xu-Chen To confirm, did you have the inference problem with `GPTQModel.from_quantized()` too, or is the inference issue localized to vLLM?
The inference issue is localized to vLLM. I will try `GPTQModel.from_quantized()`.
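For reference, a minimal sanity check outside vLLM could look something like this (a sketch assuming the gptqmodel 0.9.x API; the model path and prompt are placeholders):

```python
from transformers import AutoTokenizer
from gptqmodel import GPTQModel

quant_dir = "./DeepSeek-V2-Chat-gptq-int4"  # placeholder path to the quantized checkpoint

tokenizer = AutoTokenizer.from_pretrained(quant_dir, trust_remote_code=True)
model = GPTQModel.from_quantized(quant_dir, device="cuda:0", trust_remote_code=True)

# Ask about something the model plausibly knows, rather than about GPTQModel itself.
inputs = tokenizer("Write a short poem about the sea.", return_tensors="pt").to("cuda:0")
outputs = model.generate(**inputs, max_new_tokens=64)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))
```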
Ref: https://github.com/vllm-project/vllm/issues/5343#issuecomment-2154882419
@Xu-Chen vllm currently does not support quantized MoE models except Mixtral.
I tried `GPTQModel.from_quantized()`, but the quantized model's output is wrong, e.g. "GPTQModel is !!!!!!!!!!!!!!!!!!"; it repeatedly outputs the character "!". I will re-quantize this model.
What calibration dataset did you use, and how many calibration samples? Also, please use something other than "GPTQModel is..." as the test prompt, since the model would just hallucinate on info it doesn't know.
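For context, GPTQ calibration typically uses a few hundred representative text samples. Building such a set might look like this (a sketch; the wikitext corpus and the sample count are assumptions, not the data actually used here):

```python
from datasets import load_dataset
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("./DeepSeek-V2-Chat", trust_remote_code=True)

# Take ~256 non-trivial rows from a public corpus as calibration text.
rows = load_dataset("wikitext", "wikitext-2-raw-v1", split="train")
rows = rows.filter(lambda r: len(r["text"]) > 200).select(range(256))

calibration = [
    tokenizer(r["text"], truncation=True, max_length=2048, return_tensors="pt")
    for r in rows
]
```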
Using vLLM to run inference on the DeepSeek model results in an error