ggerganov / llama.cpp

LLM inference in C/C++
MIT License

Bug: having more than one context doesn't work as expected with the Vulkan backend #7575

Open giladgd opened 1 month ago

giladgd commented 1 month ago

What happened?

There seems to be some kind of memory overlap between contexts created from the same model with the Vulkan backend when both contexts are loaded at the same time. Freeing the first context before creating the second one works as expected, though. Other backends support having multiple contexts loaded at the same time, so I think Vulkan should support it, too.
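For comparison, the sequential pattern below does work; this is only a minimal sketch that reuses model, context_params, text1/text2, and the embed_text helper from the full repro further down.

// Works: each context is freed before the next one is created,
// so only one Vulkan context exists at any given time.
// (Sketch only; model, context_params, text1/text2, and embed_text
// are the same as in the full repro below.)
auto context1 = llama_new_context_with_model(model, context_params);
embed_text(text1, model, context1);
llama_free(context1);

auto context2 = llama_new_context_with_model(model, context_params);
embed_text(text2, model, context2);
llama_free(context2); // no crash in this sequential case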

The following code crashes with signal SIGSEGV, Segmentation fault:

#include <cstdio>
#include <ctime>
#include <vector>

#include "llama.h"
#include "common.h" // provides the llama_tokenize and llama_batch_add helpers

// Tokenizes text, decodes it in a single batch, and prints the resulting embeddings.
void embed_text(const char * text, llama_model * model, llama_context * context) {
    std::vector<llama_token> tokens = llama_tokenize(model, text, false, false);
    auto n_tokens = tokens.size();
    auto batch = llama_batch_init(n_tokens, 0, 1);

    for (size_t i = 0; i < n_tokens; i++) {
        llama_batch_add(batch, tokens[i], i, { 0 }, false);
    }
    batch.logits[batch.n_tokens - 1] = true;

    llama_decode(context, batch);
    llama_synchronize(context);

    const int n_embd = llama_n_embd(model);
    // Prefer the pooled sequence embedding; fall back to the last token's embedding.
    const auto* embeddings = llama_get_embeddings_seq(context, 0);
    if (embeddings == NULL) {
        embeddings = llama_get_embeddings_ith(context, tokens.size() - 1);

        if (embeddings == NULL) {
            printf("Failed to get embedding\n");
        }
    }

    if (embeddings != NULL) {
        printf("Embeddings: ");
        for (int i = 0; i < n_embd; ++i) {
            printf("%f ", embeddings[i]);
        }
        printf("\n");
    }

    llama_batch_free(batch);
}

int main() {
    llama_backend_init();

    auto model_params = llama_model_default_params();
    model_params.n_gpu_layers = 33;

    auto model_path = "/home/user/models/Meta-Llama-3-8B-Instruct.Q4_K_M.gguf";
    auto model = llama_load_model_from_file(model_path, model_params);

    auto text1 = "Hi there";
    auto text2 = "Hello there";

    auto context_params = llama_context_default_params();
    context_params.embeddings = true;
    context_params.seed = time(NULL);
    context_params.n_ctx = 4096;
    context_params.n_threads = 6;
    context_params.n_threads_batch = context_params.n_threads;
    context_params.n_batch = 512;
    context_params.n_ubatch = 512;

    auto context1 = llama_new_context_with_model(model, context_params);
    embed_text(text1, model, context1);

    auto context2 = llama_new_context_with_model(model, context_params);
    embed_text(text2, model, context2);

    llama_free(context1);
    llama_free(context2); // it crashes here

    llama_free_model(model);

    llama_backend_free();
}

Using gdb shows this stack trace:

#0  0x00007fffced544eb in ggml_vk_graph_cleanup(ggml_backend_vk_context*) ()
#1  0x00007fffced54a7e in ggml_backend_vk_free(ggml_backend*) ()
#2  0x00007fffcec172d4 in llama_free ()
#3  0x00007fffcebe8979 in main() ()

This is the model I used with the code above.

Name and Version

I tested the above code with release b3012.

version: 1 (10b1e45)
built with cc (Ubuntu 11.4.0-1ubuntu1~22.04) 11.4.0 for x86_64-linux-gnu

What operating system are you seeing the problem on?

Linux

Relevant log output

llama_model_loader: loaded meta data with 29 key-value pairs and 291 tensors from /home/user/models/Meta-Llama-3-8B-Instruct.Q4_K_M.gguf (version GGUF V3 (latest))
llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
llama_model_loader: - kv   0:                       general.architecture str              = llama
llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
llama_model_loader: - kv   2:                          llama.block_count u32              = 32
llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8
llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000
llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
llama_model_loader: - kv  10:                          general.file_type u32              = 15
llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256
llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128
llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2
llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe
llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000
llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128001
llama_model_loader: - kv  20:                    tokenizer.chat_template str              = {% set loop_messages = messages %}{% ...
llama_model_loader: - kv  21:               general.quantization_version u32              = 2
llama_model_loader: - kv  22:                                general.url str              = https://huggingface.co/mradermacher/M...
llama_model_loader: - kv  23:              mradermacher.quantize_version str              = 2
llama_model_loader: - kv  24:                  mradermacher.quantized_by str              = mradermacher
llama_model_loader: - kv  25:                  mradermacher.quantized_at str              = 2024-05-02T13:57:22+02:00
llama_model_loader: - kv  26:                  mradermacher.quantized_on str              = db2
llama_model_loader: - kv  27:                         general.source.url str              = https://huggingface.co/NousResearch/M...
llama_model_loader: - kv  28:                  mradermacher.convert_type str              = hfhfix
llama_model_loader: - type  f32:   65 tensors
llama_model_loader: - type q4_K:  193 tensors
llama_model_loader: - type q6_K:   33 tensors
llm_load_vocab: special tokens definition check successful ( 256/128256 ).
llm_load_print_meta: format           = GGUF V3 (latest)
llm_load_print_meta: arch             = llama
llm_load_print_meta: vocab type       = BPE
llm_load_print_meta: n_vocab          = 128256
llm_load_print_meta: n_merges         = 280147
llm_load_print_meta: n_ctx_train      = 8192
llm_load_print_meta: n_embd           = 4096
llm_load_print_meta: n_head           = 32
llm_load_print_meta: n_head_kv        = 8
llm_load_print_meta: n_layer          = 32
llm_load_print_meta: n_rot            = 128
llm_load_print_meta: n_embd_head_k    = 128
llm_load_print_meta: n_embd_head_v    = 128
llm_load_print_meta: n_gqa            = 4
llm_load_print_meta: n_embd_k_gqa     = 1024
llm_load_print_meta: n_embd_v_gqa     = 1024
llm_load_print_meta: f_norm_eps       = 0.0e+00
llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
llm_load_print_meta: f_clamp_kqv      = 0.0e+00
llm_load_print_meta: f_max_alibi_bias = 0.0e+00
llm_load_print_meta: f_logit_scale    = 0.0e+00
llm_load_print_meta: n_ff             = 14336
llm_load_print_meta: n_expert         = 0
llm_load_print_meta: n_expert_used    = 0
llm_load_print_meta: causal attn      = 1
llm_load_print_meta: pooling type     = 0
llm_load_print_meta: rope type        = 0
llm_load_print_meta: rope scaling     = linear
llm_load_print_meta: freq_base_train  = 500000.0
llm_load_print_meta: freq_scale_train = 1
llm_load_print_meta: n_yarn_orig_ctx  = 8192
llm_load_print_meta: rope_finetuned   = unknown
llm_load_print_meta: ssm_d_conv       = 0
llm_load_print_meta: ssm_d_inner      = 0
llm_load_print_meta: ssm_d_state      = 0
llm_load_print_meta: ssm_dt_rank      = 0
llm_load_print_meta: model type       = 8B
llm_load_print_meta: model ftype      = Q4_K - Medium
llm_load_print_meta: model params     = 8.03 B
llm_load_print_meta: model size       = 4.58 GiB (4.89 BPW) 
llm_load_print_meta: general.name     = Meta-Llama-3-8B-Instruct
llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
llm_load_print_meta: EOS token        = 128001 '<|end_of_text|>'
llm_load_print_meta: LF token         = 128 'Ä'
llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
ggml_vulkan: Found 1 Vulkan devices:
Vulkan0: NVIDIA RTX A4000 | uma: 0 | fp16: 1 | warp size: 32
llm_load_tensors: ggml ctx size =    0.30 MiB
llm_load_tensors: offloading 32 repeating layers to GPU
llm_load_tensors: offloading non-repeating layers to GPU
llm_load_tensors: offloaded 33/33 layers to GPU
llm_load_tensors:        CPU buffer size =   281.81 MiB
llm_load_tensors:    Vulkan0 buffer size =  4403.49 MiB
........................................................................................
llama_new_context_with_model: n_ctx      = 4096
llama_new_context_with_model: n_batch    = 512
llama_new_context_with_model: n_ubatch   = 512
llama_new_context_with_model: flash_attn = 0
llama_new_context_with_model: freq_base  = 500000.0
llama_new_context_with_model: freq_scale = 1
llama_kv_cache_init:    Vulkan0 KV buffer size =   512.00 MiB
llama_new_context_with_model: KV self size  =  512.00 MiB, K (f16):  256.00 MiB, V (f16):  256.00 MiB
llama_new_context_with_model: Vulkan_Host  output buffer size =     0.50 MiB
llama_new_context_with_model:    Vulkan0 compute buffer size =   296.00 MiB
llama_new_context_with_model: Vulkan_Host compute buffer size =    16.01 MiB
llama_new_context_with_model: graph nodes  = 1030
llama_new_context_with_model: graph splits = 2
Embeddings: -1.410964 -0.499604 0.122247 -2.673030 [truncated due to long text]
llama_new_context_with_model: n_ctx      = 4096
llama_new_context_with_model: n_batch    = 512
llama_new_context_with_model: n_ubatch   = 512
llama_new_context_with_model: flash_attn = 0
llama_new_context_with_model: freq_base  = 500000.0
llama_new_context_with_model: freq_scale = 1
llama_kv_cache_init:    Vulkan0 KV buffer size =   512.00 MiB
llama_new_context_with_model: KV self size  =  512.00 MiB, K (f16):  256.00 MiB, V (f16):  256.00 MiB
llama_new_context_with_model: Vulkan_Host  output buffer size =     0.50 MiB
llama_new_context_with_model:    Vulkan0 compute buffer size =   296.00 MiB
llama_new_context_with_model: Vulkan_Host compute buffer size =    16.01 MiB
llama_new_context_with_model: graph nodes  = 1030
llama_new_context_with_model: graph splits = 2
Embeddings: -1.420700 -1.298250 1.035053 -1.393191 [truncated due to long text]

[1]    16462 segmentation fault (core dumped)

giladgd commented 3 weeks ago

Thank you @0cc4m!

giladgd commented 3 weeks ago

@0cc4m I've tested the latest release, and decoding now works well with more than one context 🚀 However, I've encountered another issue: decoding on multiple contexts in parallel from different threads crashes the process (this happens only with the Vulkan backend). Is it possible to make decoding thread-safe in the Vulkan backend?

This issue can be replicated with this code:

#include <cstdio>
#include <ctime>
#include <thread>
#include <vector>

#include "llama.h"
#include "common.h" // provides the llama_tokenize and llama_batch_add helpers

// Same helper as in the original report: decodes text and prints its embeddings.
void embed_text(const char * text, llama_model * model, llama_context * context) {
    std::vector<llama_token> tokens = llama_tokenize(model, text, false, false);
    auto n_tokens = tokens.size();
    auto batch = llama_batch_init(n_tokens, 0, 1);

    for (size_t i = 0; i < n_tokens; i++) {
        llama_batch_add(batch, tokens[i], i, { 0 }, false);
    }
    batch.logits[batch.n_tokens - 1] = true;

    llama_decode(context, batch);
    llama_synchronize(context);

    const int n_embd = llama_n_embd(model);
    // Prefer the pooled sequence embedding; fall back to the last token's embedding.
    const auto* embeddings = llama_get_embeddings_seq(context, 0);
    if (embeddings == NULL) {
        embeddings = llama_get_embeddings_ith(context, tokens.size() - 1);

        if (embeddings == NULL) {
            printf("Failed to get embedding\n");
        }
    }

    if (embeddings != NULL) {
        printf("Embeddings: ");
        for (int i = 0; i < n_embd; ++i) {
            printf("%f ", embeddings[i]);
        }
        printf("\n");
    }

    llama_batch_free(batch);
}

int main() {
    llama_backend_init();

    auto model_params = llama_model_default_params();
    model_params.n_gpu_layers = 33;

    auto model_path = "/home/user/models/Meta-Llama-3-8B-Instruct.Q4_K_M.gguf";
    auto model = llama_load_model_from_file(model_path, model_params);

    auto text1 = "Hi there";
    auto text2 = "Hello there";

    auto context_params = llama_context_default_params();
    context_params.embeddings = true;
    context_params.seed = time(NULL);
    context_params.n_ctx = 4096;
    context_params.n_threads = 6;
    context_params.n_threads_batch = context_params.n_threads;
    context_params.n_batch = 512;
    context_params.n_ubatch = 512;

    auto context1 = llama_new_context_with_model(model, context_params);
    auto context2 = llama_new_context_with_model(model, context_params);

    // one of these threads causes the process to crash
    std::thread thread1(embed_text, text1, model, context1);
    std::thread thread2(embed_text, text2, model, context2);

    thread1.join();
    thread2.join();

    llama_free(context1);
    llama_free(context2);

    llama_free_model(model);

    llama_backend_free();
}
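
Until that's fixed, a possible interim workaround might be to serialize the GPU work across threads with a process-wide mutex. The sketch below guards the decode/synchronize calls inside embed_text; g_decode_mutex is just an illustrative name (not part of the llama.cpp API), and whether this fully avoids the crash depends on where the race actually is.

#include <mutex>

// Illustrative workaround sketch: allow only one thread to drive the Vulkan
// backend at a time by guarding the decode with a global mutex.
// g_decode_mutex is a name invented for this sketch.
static std::mutex g_decode_mutex;

// Inside embed_text, wrap the decode/synchronize calls in a lock scope:
{
    std::lock_guard<std::mutex> lock(g_decode_mutex);
    llama_decode(context, batch);
    llama_synchronize(context);
}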