ggerganov / llama.cpp


Bug: server GET /props request returns JSON with the last char of chat_template replaced by \x00 #10235

Open kks-imt opened 3 weeks ago

kks-imt commented 3 weeks ago

What happened?

examples/server/utils.hpp:

    static std::string llama_get_chat_template(const struct llama_model * model) {
        std::string template_key = "tokenizer.chat_template";
        // call with NULL buffer to get the total size of the string
        int32_t res = llama_model_meta_val_str(model, template_key.c_str(), NULL, 0);
        if (res < 0) {
            return "";
        } else {
            std::vector<char> model_template(res, 0);
            llama_model_meta_val_str(model, template_key.c_str(), model_template.data(), model_template.size());
            return std::string(model_template.data(), model_template.size());
        }
    }

src/llama.cpp:

    int32_t llama_model_meta_val_str(const struct llama_model * model, const char * key, char * buf, size_t buf_size) {
        const auto & it = model->gguf_kv.find(key);
        if (it == model->gguf_kv.end()) {
            if (buf_size > 0) {
                buf[0] = '\0';
            }
            return -1;
        }
        return snprintf(buf, buf_size, "%s", it->second.c_str());
    }

The C function snprintf always writes a terminating \x00 into the buffer, so with a buffer sized exactly to the string length the last character of chat_template is replaced by \x00.
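For illustration only (a standalone sketch, not llama.cpp code), the following small program shows the snprintf behavior described above: when the buffer is sized exactly to the string length, only buf_size - 1 characters fit and the last byte becomes the terminating '\0':

    #include <cstdio>
    #include <string>
    #include <vector>

    int main() {
        const std::string value = "{% endfor %}";   // stands in for the stored chat template value
        const size_t len = value.size();            // 12

        // buffer sized exactly to the string length, as in llama_get_chat_template
        std::vector<char> buf(len, 0);
        int res = snprintf(buf.data(), buf.size(), "%s", value.c_str());

        // snprintf returns the length it *would* have written (12), but only
        // buf_size - 1 characters are stored; the last byte is the '\0' terminator.
        printf("returned %d, stored \"%s\" (last byte = 0x%02x)\n",
               res, buf.data(), (unsigned char) buf[len - 1]);
        return 0;
    }

This prints `returned 12, stored "{% endfor %" (last byte = 0x00)`, i.e. the closing '}' is lost exactly as in the /props response below.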

Name and Version

    C:\llama.cpp>llama-cli --version
    ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
    ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
    ggml_cuda_init: found 1 CUDA devices:
      Device 0: NVIDIA GeForce RTX 2050, compute capability 8.6, VMM: yes
    version: 4055 (e8921349)
    built with MSVC 19.29.30152.0 for x64

What operating system are you seeing the problem on?

Linux, Windows

Relevant log output

    import requests

    endpoint_url = "http://127.0.0.1:8080/props"

    # ***********************************************************************************
    # execute request

    headers = {
        'Content-Type': 'application/json',
    }

    print("endpoint_url: {}".format(endpoint_url))
    response = requests.request("GET", endpoint_url, headers=headers)
    response.close()

    # ***********************************************************************************
    # handle response

    result = response.json()

    print(result)

============================ RESULT JSON =====================
endpoint_url: http://127.0.0.1:8080/props
{'default_generation_settings': {'n_ctx': 8192, 'n_predict': -1, 'model': 'saiga_nemo_12b', 'seed': -1, 'seed_cur': 0, 'temperature': 0.800000011920929, 'dynatemp_range': 0.0, 'dynatemp_exponent': 1.0, 'top_k': 40, 'top_p': 0.949999988079071, 'min_p': 0.05000000074505806, 'xtc_probability': 0.0, 'xtc_threshold': 0.10000000149011612, 'typical_p': 1.0, 'repeat_last_n': 64, 'repeat_penalty': 1.0, 'presence_penalty': 0.0, 'frequency_penalty': 0.0, 'dry_multiplier': 0.0, 'dry_base': 1.75, 'dry_allowed_length': 2, 'dry_penalty_last_n': -1, 'dry_sequence_breakers': ['\n', ':', '"', '*'], 'mirostat': 0, 'mirostat_tau': 5.0, 'mirostat_eta': 0.10000000149011612, 'penalize_nl': False, 'stop': [], 'max_tokens': -1, 'n_keep': 0, 'n_discard': 0, 'ignore_eos': False, 'stream': True, 'n_probs': 0, 'min_keep': 0, 'grammar': '', 'samplers': ['dry', 'top_k', 'typ_p', 'top_p', 'min_p', 'xtc', 'temperature']}, 'total_slots': 1, 'chat_template': "{% if messages[0]['role'] == 'system' %}{% set system_message = messages[0]['content'] | trim + '\n\n' %}{% set messages = messages[1:] %}{% else %}{% set system_message = '' %}{% endif %}{{- bos_token + system_message}}{% for message in messages %}{% if message['role'] == 'user' %}{{ '[INST] ' + message['content'] | trim + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{{ ' ' + message['content'] | trim + eos_token }}{% endif %}{% endfor %\x00"}
===== at the end of chat_template: =====
{% endfor %\x00"
=====
the last '}' is replaced by \x00
kks-imt commented 3 weeks ago

possible solution: examples/server/utils.hpp

    static std::string llama_get_chat_template(const struct llama_model * model) {
        std::string template_key = "tokenizer.chat_template";
        // call with NULL buffer to get the total size of the string
        int32_t res = llama_model_meta_val_str(model, template_key.c_str(), NULL, 0);
        if (res < 0) {
            return "";
        } else {
            // add 1 char for the \x00 that snprintf will append to the end of buf
            std::vector<char> model_template(res + 1, 0);
            llama_model_meta_val_str(model, template_key.c_str(), model_template.data(), model_template.size());
            return std::string(model_template.data(), model_template.size() - 1);
        }
    }
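A quick standalone sanity check of the proposed sizing (a sketch using plain snprintf, not the actual server code): query the length with a NULL buffer, allocate one extra byte for the terminating '\0', then drop that byte when building the std::string:

    #include <cstdio>
    #include <string>
    #include <vector>

    int main() {
        const std::string value = "{% endfor %}";   // stands in for the stored chat template value

        // first call with a NULL buffer only queries the required length,
        // the same contract the issue relies on for llama_model_meta_val_str
        int res = snprintf(NULL, 0, "%s", value.c_str());

        // add 1 char for the \x00 that snprintf appends to the end of buf
        std::vector<char> buf(res + 1, 0);
        snprintf(buf.data(), buf.size(), "%s", value.c_str());

        // drop the trailing \x00 when constructing the std::string
        std::string out(buf.data(), buf.size() - 1);
        printf("round-trip ok: %s\n", out == value ? "yes" : "no");
        return 0;
    }

With the extra byte the full value survives the copy and the returned string no longer ends in \x00.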