modelscope / dash-infer

DashInfer is a native LLM inference engine aiming to deliver industry-leading performance atop various hardware architectures, including x86 and ARMv9.
Apache License 2.0
137 stars 15 forks source link

flatten stop_words_ids in generation_config to 1 dim array #27

Open yejunjin opened 4 months ago

yejunjin commented 4 months ago

Here is an example config file, located at examples/python/model_config/config_qwen_v10_7b.json:

{
    "model_name": "Qwen-7B-Chat",
    "model_type": "Qwen_v10",
    "model_path": "~/dashinfer_models/",
    "data_type": "float32",
    "device_type": "CPU",
    "device_ids": [
        0
    ],
    "multinode_mode": false,
    "engine_config": {
        "engine_max_length": 2048,
        "engine_max_batch": 8,
        "do_profiling": false,
        "num_threads": 0,
        "matmul_precision": "medium"
    },
    "generation_config": {
        "temperature": 1.0,
        "early_stopping": true,
        "top_k": 1024,
        "top_p": 0.8,
        "repetition_penalty": 1.1,
        "presence_penalty": 0.0,
        "min_length": 0,
        "max_length": 2048,
        "no_repeat_ngram_size": 0,
        "eos_token_id": 151643,
        "seed": 1234,
        "stop_words_ids": [
            [
                151643
            ],
            [
                151644
            ],
            [
                151645
            ]
        ]
    },
    "convert_config": {
        "do_dynamic_quantize_convert": false
    },
    "quantization_config": {
        "activation_type": "bfloat16",
        "weight_type": "uint8",
        "SubChannel": true,
        "GroupSize": 128
    }
}

The item .generation_config.stop_words_ids is a two-dimensional array. Changing it to a one-dimensional array requires modifying the C++ side interface and the Python binding code.

The purpose of this change is to align with OpenAI-style configuration.