ggerganov / llama.cpp

LLM inference in C/C++
MIT License

Bug: Occasional crashes when a connection has been interrupted before completion of computation #9928

Open · sliedes opened this issue 18 hours ago

sliedes commented 18 hours ago

What happened?

I am running llama-server like this:

llama-server -c 102400 -ngl 100 -m Replete-LLM-V2.5-Qwen-14b-IQ3_M.gguf --chat-template chatml --check-tensors -ctk q8_0 -ctv q8_0 -fa --parallel 10

When I make a number of /completion calls and then close those connections without waiting for the response (e.g. by terminating the connecting process), llama-server often crashes with the following error (a short reproduction sketch follows the error output):

/build/source/ggml/src/ggml-cuda.cu:70: CUDA error
CUDA error: an illegal memory access was encountered
  current device: 0, in function ggml_backend_cuda_synchronize at /build/source/ggml/src/ggml-cuda.cu:2446
  cudaStreamSynchronize(cuda_ctx->stream())

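For reference, this is a minimal sketch of what my client side effectively does (illustrative code written for this report, not part of llama.cpp; the prompt size, 2-second delay, and hardcoded host/port are arbitrary choices): it fires ten long /completion requests in parallel, matching --parallel 10, and closes each socket before the response arrives.

// g++ -O2 -pthread interrupt_repro.cpp -o interrupt_repro
#include <arpa/inet.h>
#include <netinet/in.h>
#include <sys/socket.h>
#include <unistd.h>
#include <string>
#include <thread>
#include <vector>

static void interrupted_request() {
    int fd = socket(AF_INET, SOCK_STREAM, 0);
    sockaddr_in addr{};
    addr.sin_family = AF_INET;
    addr.sin_port   = htons(8080);
    inet_pton(AF_INET, "127.0.0.1", &addr.sin_addr);
    if (connect(fd, (sockaddr *) &addr, sizeof(addr)) != 0) { close(fd); return; }

    // a long prompt so the server spends a while in prompt processing
    std::string body = "{\"prompt\": \"";
    for (int i = 0; i < 2000; i++) body += "lorem ipsum ";
    body += "\", \"n_predict\": 512}";

    std::string req =
        "POST /completion HTTP/1.1\r\n"
        "Host: 127.0.0.1\r\n"
        "Content-Type: application/json\r\n"
        "Content-Length: " + std::to_string(body.size()) + "\r\n\r\n" + body;
    send(fd, req.data(), req.size(), 0);

    sleep(2);   // give the server time to start computing on the GPU
    close(fd);  // abandon the connection without reading the response
}

int main() {
    // one interrupted request per slot (--parallel 10)
    std::vector<std::thread> threads;
    for (int i = 0; i < 10; i++) threads.emplace_back(interrupted_request);
    for (auto &t : threads) t.join();
    return 0;
}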

I've tried building with -DCMAKE_BUILD_TYPE=Debug, but for some reason gdb still shows many variables as <optimized out>; I don't quite know what's going on there. Either I or Nix is doing something fishy. The binary is definitely the debug build, since the debug info is present.

GDB output:

$ coredumpctl debug
           PID: 1067626 (llama-server)
           UID: 1000 (sliedes)
           GID: 100 (users)
        Signal: 6 (ABRT)
     Timestamp: Thu 2024-10-17 16:54:42 CEST (5min ago)
  Command Line: llama-server -c 102400 -ngl 100 -m Replete-LLM-V2.5-Qwen-14b-IQ3_M.gguf --chat-template chatml --check-tensors -ctk q8_0 -ctv q8_0 -fa --parallel 10
    Executable: /nix/store/xsjknx60if36j5d8kl393yb23hhf76ic-llama-cpp-3933/bin/llama-server
 Control Group: /user.slice/user-1000.slice/session-12.scope
          Unit: session-12.scope
         Slice: user-1000.slice
       Session: 12
     Owner UID: 1000 (sliedes)
       Boot ID: 6224b3f52c0e45468c99f5f5cc1d17f4
    Machine ID: 13629c48106c49a39ea48f0b10557f82
      Hostname: poyta
       Storage: /var/lib/systemd/coredump/core.llama-server.1000.6224b3f52c0e45468c99f5f5cc1d17f4.1067626.1729176882000000.zst (present)
  Size on Disk: 234.9M
       Message: Process 1067626 (llama-server) of user 1000 dumped core.

                Module libgomp.so.1 without build-id.
                Module libgcc_s.so.1 without build-id.
                Module libstdc++.so.6 without build-id.
                Stack trace of thread 1067626:
                #0  0x00007ffff329b7dc __pthread_kill_implementation (libc.so.6 + 0x927dc)
                #1  0x00007ffff3249516 raise (libc.so.6 + 0x40516)
                #2  0x00007ffff3231935 abort (libc.so.6 + 0x28935)
                #3  0x00007ffff381c7c5 ggml_abort.cold (libggml.so + 0x1c7c5)
                #4  0x00007ffff38ea863 _Z15ggml_cuda_errorPKcS0_S0_iS0_ (libggml.so + 0xea863)
                #5  0x00007ffff38eb80a _ZL29ggml_backend_cuda_synchronizeP12ggml_backend (libggml.so + 0xeb80a)
                #6  0x00007ffff38759e6 ggml_backend_sched_synchronize (libggml.so + 0x759e6)
                #7  0x00007ffff3877873 ggml_backend_sched_reserve (libggml.so + 0x77873)
                #8  0x00007ffff7e90076 _ZL30llama_kv_cache_update_internalR13llama_context (libllama.so + 0x70076)
                #9  0x00007ffff7e96c53 llama_decode (libllama.so + 0x76c53)
                #10 0x000000000049fd82 _ZN14server_context12update_slotsEv (llama-server + 0xa0d82)
                #11 0x0000000000487e99 _ZN12server_queue10start_loopEv (llama-server + 0x88e99)
                #12 0x000000000042644e main (llama-server + 0x2744e)
                #13 0x00007ffff323314e __libc_start_call_main (libc.so.6 + 0x2a14e)
                #14 0x00007ffff3233209 __libc_start_main@@GLIBC_2.34 (libc.so.6 + 0x2a209)
                #15 0x0000000000428095 _start (llama-server + 0x29095)

                Stack trace of thread 1067627:
                #0  0x00007ffff330ad1f __poll (libc.so.6 + 0x101d1f)
                #1  0x00007fffcc254e3f n/a (libcuda.so.1 + 0x254e3f)
                #2  0x00007fffcc327fbf n/a (libcuda.so.1 + 0x327fbf)
                #3  0x00007fffcc251113 n/a (libcuda.so.1 + 0x251113)
                #4  0x00007ffff3299a42 start_thread (libc.so.6 + 0x90a42)
                #5  0x00007ffff331905c __clone3 (libc.so.6 + 0x11005c)

                Stack trace of thread 1067637:
                #0  0x00007ffff32960ce __futex_abstimed_wait_common (libc.so.6 + 0x8d0ce)
                #1  0x00007ffff3298c20 pthread_cond_wait@@GLIBC_2.3.2 (libc.so.6 + 0x8fc20)
                #2  0x0000000000452a46 _ZN7httplib10ThreadPool6workerclEv (llama-server + 0x53a46)
                #3  0x00007ffff34e86d3 execute_native_thread_routine (libstdc++.so.6 + 0xe86d3)
                #4  0x00007ffff3299a42 start_thread (libc.so.6 + 0x90a42)
                #5  0x00007ffff331905c __clone3 (libc.so.6 + 0x11005c)

                Stack trace of thread 1067638:
                #0  0x00007ffff32960ce __futex_abstimed_wait_common (libc.so.6 + 0x8d0ce)
                #1  0x00007ffff3298c20 pthread_cond_wait@@GLIBC_2.3.2 (libc.so.6 + 0x8fc20)
                #2  0x0000000000452a46 _ZN7httplib10ThreadPool6workerclEv (llama-server + 0x53a46)
                #3  0x00007ffff34e86d3 execute_native_thread_routine (libstdc++.so.6 + 0xe86d3)
                #4  0x00007ffff3299a42 start_thread (libc.so.6 + 0x90a42)
                #5  0x00007ffff331905c __clone3 (libc.so.6 + 0x11005c)

                Stack trace of thread 1067636:
                #0  0x00007ffff32960ce __futex_abstimed_wait_common (libc.so.6 + 0x8d0ce)
                #1  0x00007ffff3298c20 pthread_cond_wait@@GLIBC_2.3.2 (libc.so.6 + 0x8fc20)
                #2  0x0000000000452a46 _ZN7httplib10ThreadPool6workerclEv (llama-server + 0x53a46)
                #3  0x00007ffff34e86d3 execute_native_thread_routine (libstdc++.so.6 + 0xe86d3)
                #4  0x00007ffff3299a42 start_thread (libc.so.6 + 0x90a42)
                #5  0x00007ffff331905c __clone3 (libc.so.6 + 0x11005c)

                Stack trace of thread 1067639:
                #0  0x00007ffff32960ce __futex_abstimed_wait_common (libc.so.6 + 0x8d0ce)
                #1  0x00007ffff3298c20 pthread_cond_wait@@GLIBC_2.3.2 (libc.so.6 + 0x8fc20)
                #2  0x0000000000452a46 _ZN7httplib10ThreadPool6workerclEv (llama-server + 0x53a46)
                #3  0x00007ffff34e86d3 execute_native_thread_routine (libstdc++.so.6 + 0xe86d3)
                #4  0x00007ffff3299a42 start_thread (libc.so.6 + 0x90a42)
                #5  0x00007ffff331905c __clone3 (libc.so.6 + 0x11005c)

                Stack trace of thread 1067631:
                #0  0x00007ffff32960ce __futex_abstimed_wait_common (libc.so.6 + 0x8d0ce)
                #1  0x00007ffff3298c20 pthread_cond_wait@@GLIBC_2.3.2 (libc.so.6 + 0x8fc20)
                #2  0x00000000004f662b _ZZN10common_log6resumeEvENKUlvE_clEv (llama-server + 0xf762b)
                #3  0x00007ffff34e86d3 execute_native_thread_routine (libstdc++.so.6 + 0xe86d3)
                #4  0x00007ffff3299a42 start_thread (libc.so.6 + 0x90a42)
                #5  0x00007ffff331905c __clone3 (libc.so.6 + 0x11005c)

                Stack trace of thread 1067643:
                #0  0x00007ffff32960ce __futex_abstimed_wait_common (libc.so.6 + 0x8d0ce)
                #1  0x00007ffff3298c20 pthread_cond_wait@@GLIBC_2.3.2 (libc.so.6 + 0x8fc20)
                #2  0x0000000000452a46 _ZN7httplib10ThreadPool6workerclEv (llama-server + 0x53a46)
                #3  0x00007ffff34e86d3 execute_native_thread_routine (libstdc++.so.6 + 0xe86d3)
                #4  0x00007ffff3299a42 start_thread (libc.so.6 + 0x90a42)
                #5  0x00007ffff331905c __clone3 (libc.so.6 + 0x11005c)

                Stack trace of thread 1067646:
                #0  0x00007ffff32960ce __futex_abstimed_wait_common (libc.so.6 + 0x8d0ce)
                #1  0x00007ffff3298c20 pthread_cond_wait@@GLIBC_2.3.2 (libc.so.6 + 0x8fc20)
                #2  0x0000000000452a46 _ZN7httplib10ThreadPool6workerclEv (llama-server + 0x53a46)
                #3  0x00007ffff34e86d3 execute_native_thread_routine (libstdc++.so.6 + 0xe86d3)
                #4  0x00007ffff3299a42 start_thread (libc.so.6 + 0x90a42)
                #5  0x00007ffff331905c __clone3 (libc.so.6 + 0x11005c)

                Stack trace of thread 1067644:
                #0  0x00007ffff32960ce __futex_abstimed_wait_common (libc.so.6 + 0x8d0ce)
                #1  0x00007ffff3298c20 pthread_cond_wait@@GLIBC_2.3.2 (libc.so.6 + 0x8fc20)
                #2  0x0000000000452a46 _ZN7httplib10ThreadPool6workerclEv (llama-server + 0x53a46)
                #3  0x00007ffff34e86d3 execute_native_thread_routine (libstdc++.so.6 + 0xe86d3)
                #4  0x00007ffff3299a42 start_thread (libc.so.6 + 0x90a42)
                #5  0x00007ffff331905c __clone3 (libc.so.6 + 0x11005c)

                Stack trace of thread 1067647:
                #0  0x00007ffff32960ce __futex_abstimed_wait_common (libc.so.6 + 0x8d0ce)
                #1  0x00007ffff3298c20 pthread_cond_wait@@GLIBC_2.3.2 (libc.so.6 + 0x8fc20)
                #2  0x0000000000452a46 _ZN7httplib10ThreadPool6workerclEv (llama-server + 0x53a46)
                #3  0x00007ffff34e86d3 execute_native_thread_routine (libstdc++.so.6 + 0xe86d3)
                #4  0x00007ffff3299a42 start_thread (libc.so.6 + 0x90a42)
                #5  0x00007ffff331905c __clone3 (libc.so.6 + 0x11005c)

                Stack trace of thread 1067645:
                #0  0x00007ffff32960ce __futex_abstimed_wait_common (libc.so.6 + 0x8d0ce)
                #1  0x00007ffff3298c20 pthread_cond_wait@@GLIBC_2.3.2 (libc.so.6 + 0x8fc20)
                #2  0x0000000000452a46 _ZN7httplib10ThreadPool6workerclEv (llama-server + 0x53a46)
                #3  0x00007ffff34e86d3 execute_native_thread_routine (libstdc++.so.6 + 0xe86d3)
                #4  0x00007ffff3299a42 start_thread (libc.so.6 + 0x90a42)
                #5  0x00007ffff331905c __clone3 (libc.so.6 + 0x11005c)

                Stack trace of thread 1067649:
                #0  0x00007ffff32960ce __futex_abstimed_wait_common (libc.so.6 + 0x8d0ce)
                #1  0x00007ffff3298c20 pthread_cond_wait@@GLIBC_2.3.2 (libc.so.6 + 0x8fc20)
                #2  0x0000000000452a46 _ZN7httplib10ThreadPool6workerclEv (llama-server + 0x53a46)
                #3  0x00007ffff34e86d3 execute_native_thread_routine (libstdc++.so.6 + 0xe86d3)
                #4  0x00007ffff3299a42 start_thread (libc.so.6 + 0x90a42)
                #5  0x00007ffff331905c __clone3 (libc.so.6 + 0x11005c)

                Stack trace of thread 1067635:
                #0  0x00007ffff32960ce __futex_abstimed_wait_common (libc.so.6 + 0x8d0ce)
                #1  0x00007ffff3298c20 pthread_cond_wait@@GLIBC_2.3.2 (libc.so.6 + 0x8fc20)
                #2  0x000000000048683b _ZN15server_response4recvERKSt13unordered_setIiSt4hashIiESt8equal_toIiESaIiEE (llama-server + 0x8783b)
                #3  0x000000000049ea13 _ZN14server_context20receive_cmpl_resultsERKSt13unordered_setIiSt4hashIiESt8equal_toIiESaIiEERKSt8functionIFvRSt6vectorI18server_task_resultSaISB_EEEERKS9_IFvN8nlohmann16json_abi_v3_11_310basic_jsonINSK_11ordered_mapESA_NSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEEEblmdSaNSK_14adl_serializerESA_IhSaIhEEvEEEE (llama-server + 0x9fa13)
                #4  0x000000000043e50c _ZZ4mainENKUl21server_task_cmpl_typeRN8nlohmann16json_abi_v3_11_310basic_jsonINS1_11ordered_mapESt6vectorNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEEEblmdSaNS1_14adl_serializerES4_IhSaIhEEvEERN7httplib8ResponseEE_clES_SF_SI_ (llama-server + 0x3f50c)
                #5  0x000000000043e702 _ZNSt17_Function_handlerIFvRKN7httplib7RequestERNS0_8ResponseEEZ4mainEUlS3_S5_E10_E9_M_invokeERKSt9_Any_dataS3_S5_ (llama-server + 0x3f702)
                #6  0x0000000000446911 _ZNK7httplib6Server16dispatch_requestERNS_7RequestERNS_8ResponseERKSt6vectorISt4pairISt10unique_ptrINS_6detail11MatcherBaseESt14default_deleteIS9_EESt8functionIFvRKS1_S4_EEESaISI_EE.isra.0 (llama-server + 0x47911)
                #7  0x00000000004b164e _ZN7httplib6Server15process_requestERNS_6StreamEbRbRKSt8functionIFvRNS_7RequestEEE (llama-server + 0xb264e)
                #8  0x00000000004b1e9e _ZN7httplib6detail26process_server_socket_coreIZNS0_21process_server_socketIZNS_6Server24process_and_close_socketEiEUlRNS_6StreamEbRbE_EEbRKSt6atomicIiEimlllllT_EUlbS6_E_EEbSB_imlSC_ (llama-server + 0xb2e9e)
                #9  0x00000000004b2178 _ZNSt17_Function_handlerIFvvEZN7httplib6Server15listen_internalEvEUlvE0_E9_M_invokeERKSt9_Any_data (llama-server + 0xb3178)
                #10 0x00000000004529bc _ZN7httplib10ThreadPool6workerclEv (llama-server + 0x539bc)
                #11 0x00007ffff34e86d3 execute_native_thread_routine (libstdc++.so.6 + 0xe86d3)
                #12 0x00007ffff3299a42 start_thread (libc.so.6 + 0x90a42)
                #13 0x00007ffff331905c __clone3 (libc.so.6 + 0x11005c)

                Stack trace of thread 1067648:
                #0  0x00007ffff32960ce __futex_abstimed_wait_common (libc.so.6 + 0x8d0ce)
                #1  0x00007ffff3298c20 pthread_cond_wait@@GLIBC_2.3.2 (libc.so.6 + 0x8fc20)
                #2  0x0000000000452a46 _ZN7httplib10ThreadPool6workerclEv (llama-server + 0x53a46)
                #3  0x00007ffff34e86d3 execute_native_thread_routine (libstdc++.so.6 + 0xe86d3)
                #4  0x00007ffff3299a42 start_thread (libc.so.6 + 0x90a42)
                #5  0x00007ffff331905c __clone3 (libc.so.6 + 0x11005c)

                Stack trace of thread 1067654:
                #0  0x00007ffff32960ce __futex_abstimed_wait_common (libc.so.6 + 0x8d0ce)
                #1  0x00007ffff3298c20 pthread_cond_wait@@GLIBC_2.3.2 (libc.so.6 + 0x8fc20)
                #2  0x0000000000452a46 _ZN7httplib10ThreadPool6workerclEv (llama-server + 0x53a46)
                #3  0x00007ffff34e86d3 execute_native_thread_routine (libstdc++.so.6 + 0xe86d3)
                #4  0x00007ffff3299a42 start_thread (libc.so.6 + 0x90a42)
                #5  0x00007ffff331905c __clone3 (libc.so.6 + 0x11005c)

                Stack trace of thread 1067640:
                #0  0x00007ffff32960ce __futex_abstimed_wait_common (libc.so.6 + 0x8d0ce)
                #1  0x00007ffff3298c20 pthread_cond_wait@@GLIBC_2.3.2 (libc.so.6 + 0x8fc20)
                #2  0x000000000048683b _ZN15server_response4recvERKSt13unordered_setIiSt4hashIiESt8equal_toIiESaIiEE (llama-server + 0x8783b)
                #3  0x000000000049ea13 _ZN14server_context20receive_cmpl_resultsERKSt13unordered_setIiSt4hashIiESt8equal_toIiESaIiEERKSt8functionIFvRSt6vectorI18server_task_resultSaISB_EEEERKS9_IFvN8nlohmann16json_abi_v3_11_310basic_jsonINSK_11ordered_mapESA_NSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEEEblmdSaNSK_14adl_serializerESA_IhSaIhEEvEEEE (llama-server + 0x9fa13)
                #4  0x000000000043e50c _ZZ4mainENKUl21server_task_cmpl_typeRN8nlohmann16json_abi_v3_11_310basic_jsonINS1_11ordered_mapESt6vectorNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEEEblmdSaNS1_14adl_serializerES4_IhSaIhEEvEERN7httplib8ResponseEE_clES_SF_SI_ (llama-server + 0x3f50c)
                #5  0x000000000043e702 _ZNSt17_Function_handlerIFvRKN7httplib7RequestERNS0_8ResponseEEZ4mainEUlS3_S5_E10_E9_M_invokeERKSt9_Any_dataS3_S5_ (llama-server + 0x3f702)
                #6  0x0000000000446911 _ZNK7httplib6Server16dispatch_requestERNS_7RequestERNS_8ResponseERKSt6vectorISt4pairISt10unique_ptrINS_6detail11MatcherBaseESt14default_deleteIS9_EESt8functionIFvRKS1_S4_EEESaISI_EE.isra.0 (llama-server + 0x47911)
                #7  0x00000000004b164e _ZN7httplib6Server15process_requestERNS_6StreamEbRbRKSt8functionIFvRNS_7RequestEEE (llama-server + 0xb264e)
                #8  0x00000000004b1e9e _ZN7httplib6detail26process_server_socket_coreIZNS0_21process_server_socketIZNS_6Server24process_and_close_socketEiEUlRNS_6StreamEbRbE_EEbRKSt6atomicIiEimlllllT_EUlbS6_E_EEbSB_imlSC_ (llama-server + 0xb2e9e)
                #9  0x00000000004b2178 _ZNSt17_Function_handlerIFvvEZN7httplib6Server15listen_internalEvEUlvE0_E9_M_invokeERKSt9_Any_data (llama-server + 0xb3178)
                #10 0x00000000004529bc _ZN7httplib10ThreadPool6workerclEv (llama-server + 0x539bc)
                #11 0x00007ffff34e86d3 execute_native_thread_routine (libstdc++.so.6 + 0xe86d3)
                #12 0x00007ffff3299a42 start_thread (libc.so.6 + 0x90a42)
                #13 0x00007ffff331905c __clone3 (libc.so.6 + 0x11005c)

                Stack trace of thread 1067633:
                #0  0x00007ffff32960ce __futex_abstimed_wait_common (libc.so.6 + 0x8d0ce)
                #1  0x00007ffff3298c20 pthread_cond_wait@@GLIBC_2.3.2 (libc.so.6 + 0x8fc20)
                #2  0x0000000000452a46 _ZN7httplib10ThreadPool6workerclEv (llama-server + 0x53a46)
                #3  0x00007ffff34e86d3 execute_native_thread_routine (libstdc++.so.6 + 0xe86d3)
                #4  0x00007ffff3299a42 start_thread (libc.so.6 + 0x90a42)
                #5  0x00007ffff331905c __clone3 (libc.so.6 + 0x11005c)

                Stack trace of thread 1067655:
                #0  0x00007ffff32960ce __futex_abstimed_wait_common (libc.so.6 + 0x8d0ce)
                #1  0x00007ffff3298c20 pthread_cond_wait@@GLIBC_2.3.2 (libc.so.6 + 0x8fc20)
                #2  0x0000000000452a46 _ZN7httplib10ThreadPool6workerclEv (llama-server + 0x53a46)
                #3  0x00007ffff34e86d3 execute_native_thread_routine (libstdc++.so.6 + 0xe86d3)
                #4  0x00007ffff3299a42 start_thread (libc.so.6 + 0x90a42)
                #5  0x00007ffff331905c __clone3 (libc.so.6 + 0x11005c)

                Stack trace of thread 1067641:
                #0  0x00007ffff32960ce __futex_abstimed_wait_common (libc.so.6 + 0x8d0ce)
                #1  0x00007ffff3298c20 pthread_cond_wait@@GLIBC_2.3.2 (libc.so.6 + 0x8fc20)
                #2  0x000000000048683b _ZN15server_response4recvERKSt13unordered_setIiSt4hashIiESt8equal_toIiESaIiEE (llama-server + 0x8783b)
                #3  0x000000000049ea13 _ZN14server_context20receive_cmpl_resultsERKSt13unordered_setIiSt4hashIiESt8equal_toIiESaIiEERKSt8functionIFvRSt6vectorI18server_task_resultSaISB_EEEERKS9_IFvN8nlohmann16json_abi_v3_11_310basic_jsonINSK_11ordered_mapESA_NSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEEEblmdSaNSK_14adl_serializerESA_IhSaIhEEvEEEE (llama-server + 0x9fa13)
                #4  0x000000000043e50c _ZZ4mainENKUl21server_task_cmpl_typeRN8nlohmann16json_abi_v3_11_310basic_jsonINS1_11ordered_mapESt6vectorNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEEEblmdSaNS1_14adl_serializerES4_IhSaIhEEvEERN7httplib8ResponseEE_clES_SF_SI_ (llama-server + 0x3f50c)
                #5  0x000000000043e702 _ZNSt17_Function_handlerIFvRKN7httplib7RequestERNS0_8ResponseEEZ4mainEUlS3_S5_E10_E9_M_invokeERKSt9_Any_dataS3_S5_ (llama-server + 0x3f702)
                #6  0x0000000000446911 _ZNK7httplib6Server16dispatch_requestERNS_7RequestERNS_8ResponseERKSt6vectorISt4pairISt10unique_ptrINS_6detail11MatcherBaseESt14default_deleteIS9_EESt8functionIFvRKS1_S4_EEESaISI_EE.isra.0 (llama-server + 0x47911)
                #7  0x00000000004b164e _ZN7httplib6Server15process_requestERNS_6StreamEbRbRKSt8functionIFvRNS_7RequestEEE (llama-server + 0xb264e)
                #8  0x00000000004b1e9e _ZN7httplib6detail26process_server_socket_coreIZNS0_21process_server_socketIZNS_6Server24process_and_close_socketEiEUlRNS_6StreamEbRbE_EEbRKSt6atomicIiEimlllllT_EUlbS6_E_EEbSB_imlSC_ (llama-server + 0xb2e9e)
                #9  0x00000000004b2178 _ZNSt17_Function_handlerIFvvEZN7httplib6Server15listen_internalEvEUlvE0_E9_M_invokeERKSt9_Any_data (llama-server + 0xb3178)
                #10 0x00000000004529bc _ZN7httplib10ThreadPool6workerclEv (llama-server + 0x539bc)
                #11 0x00007ffff34e86d3 execute_native_thread_routine (libstdc++.so.6 + 0xe86d3)
                #12 0x00007ffff3299a42 start_thread (libc.so.6 + 0x90a42)
                #13 0x00007ffff331905c __clone3 (libc.so.6 + 0x11005c)

                Stack trace of thread 1067651:
                #0  0x00007ffff32960ce __futex_abstimed_wait_common (libc.so.6 + 0x8d0ce)
                #1  0x00007ffff3298c20 pthread_cond_wait@@GLIBC_2.3.2 (libc.so.6 + 0x8fc20)
                #2  0x0000000000452a46 _ZN7httplib10ThreadPool6workerclEv (llama-server + 0x53a46)
                #3  0x00007ffff34e86d3 execute_native_thread_routine (libstdc++.so.6 + 0xe86d3)
                #4  0x00007ffff3299a42 start_thread (libc.so.6 + 0x90a42)
                #5  0x00007ffff331905c __clone3 (libc.so.6 + 0x11005c)

                Stack trace of thread 1067660:
                #0  0x00007ffff32960ce __futex_abstimed_wait_common (libc.so.6 + 0x8d0ce)
                #1  0x00007ffff3298c20 pthread_cond_wait@@GLIBC_2.3.2 (libc.so.6 + 0x8fc20)
                #2  0x0000000000452a46 _ZN7httplib10ThreadPool6workerclEv (llama-server + 0x53a46)
                #3  0x00007ffff34e86d3 execute_native_thread_routine (libstdc++.so.6 + 0xe86d3)
                #4  0x00007ffff3299a42 start_thread (libc.so.6 + 0x90a42)
                #5  0x00007ffff331905c __clone3 (libc.so.6 + 0x11005c)

                Stack trace of thread 1067642:
                #0  0x00007ffff32960ce __futex_abstimed_wait_common (libc.so.6 + 0x8d0ce)
                #1  0x00007ffff3298c20 pthread_cond_wait@@GLIBC_2.3.2 (libc.so.6 + 0x8fc20)
                #2  0x0000000000452a46 _ZN7httplib10ThreadPool6workerclEv (llama-server + 0x53a46)
                #3  0x00007ffff34e86d3 execute_native_thread_routine (libstdc++.so.6 + 0xe86d3)
                #4  0x00007ffff3299a42 start_thread (libc.so.6 + 0x90a42)
                #5  0x00007ffff331905c __clone3 (libc.so.6 + 0x11005c)

                Stack trace of thread 1067634:
                #0  0x00007ffff32960ce __futex_abstimed_wait_common (libc.so.6 + 0x8d0ce)
                #1  0x00007ffff3298c20 pthread_cond_wait@@GLIBC_2.3.2 (libc.so.6 + 0x8fc20)
                #2  0x0000000000452a46 _ZN7httplib10ThreadPool6workerclEv (llama-server + 0x53a46)
                #3  0x00007ffff34e86d3 execute_native_thread_routine (libstdc++.so.6 + 0xe86d3)
                #4  0x00007ffff3299a42 start_thread (libc.so.6 + 0x90a42)
                #5  0x00007ffff331905c __clone3 (libc.so.6 + 0x11005c)

                Stack trace of thread 1067632:
                #0  0x00007ffff331b01f accept (libc.so.6 + 0x11201f)
                #1  0x000000000042d0e4 _ZNSt6thread11_State_implINS_8_InvokerISt5tupleIJZ4mainEUlvE1_EEEEE6_M_runEv (llama-server + 0x2e0e4)
                #2  0x00007ffff34e86d3 execute_native_thread_routine (libstdc++.so.6 + 0xe86d3)
                #3  0x00007ffff3299a42 start_thread (libc.so.6 + 0x90a42)
                #4  0x00007ffff331905c __clone3 (libc.so.6 + 0x11005c)

                Stack trace of thread 1067650:
                #0  0x00007ffff32960ce __futex_abstimed_wait_common (libc.so.6 + 0x8d0ce)
                #1  0x00007ffff3298c20 pthread_cond_wait@@GLIBC_2.3.2 (libc.so.6 + 0x8fc20)
                #2  0x0000000000452a46 _ZN7httplib10ThreadPool6workerclEv (llama-server + 0x53a46)
                #3  0x00007ffff34e86d3 execute_native_thread_routine (libstdc++.so.6 + 0xe86d3)
                #4  0x00007ffff3299a42 start_thread (libc.so.6 + 0x90a42)
                #5  0x00007ffff331905c __clone3 (libc.so.6 + 0x11005c)

                Stack trace of thread 1067665:
                #0  0x00007ffff330ad1f __poll (libc.so.6 + 0x101d1f)
                #1  0x00007fffcc254e3f n/a (libcuda.so.1 + 0x254e3f)
                #2  0x00007fffcc327fbf n/a (libcuda.so.1 + 0x327fbf)
                #3  0x00007fffcc251113 n/a (libcuda.so.1 + 0x251113)
                #4  0x00007ffff3299a42 start_thread (libc.so.6 + 0x90a42)
                #5  0x00007ffff331905c __clone3 (libc.so.6 + 0x11005c)

                Stack trace of thread 1067657:
                #0  0x00007ffff32960ce __futex_abstimed_wait_common (libc.so.6 + 0x8d0ce)
                #1  0x00007ffff3298c20 pthread_cond_wait@@GLIBC_2.3.2 (libc.so.6 + 0x8fc20)
                #2  0x0000000000452a46 _ZN7httplib10ThreadPool6workerclEv (llama-server + 0x53a46)
                #3  0x00007ffff34e86d3 execute_native_thread_routine (libstdc++.so.6 + 0xe86d3)
                #4  0x00007ffff3299a42 start_thread (libc.so.6 + 0x90a42)
                #5  0x00007ffff331905c __clone3 (libc.so.6 + 0x11005c)

                Stack trace of thread 1067664:
                #0  0x00007ffff32960ce __futex_abstimed_wait_common (libc.so.6 + 0x8d0ce)
                #1  0x00007ffff3298f45 pthread_cond_timedwait@@GLIBC_2.3.2 (libc.so.6 + 0x8ff45)
                #2  0x00007fffcc1aebca n/a (libcuda.so.1 + 0x1aebca)
                #3  0x00007fffcc251113 n/a (libcuda.so.1 + 0x251113)
                #4  0x00007ffff3299a42 start_thread (libc.so.6 + 0x90a42)
                #5  0x00007ffff331905c __clone3 (libc.so.6 + 0x11005c)

                Stack trace of thread 1067658:
                #0  0x00007ffff32960ce __futex_abstimed_wait_common (libc.so.6 + 0x8d0ce)
                #1  0x00007ffff3298c20 pthread_cond_wait@@GLIBC_2.3.2 (libc.so.6 + 0x8fc20)
                #2  0x0000000000452a46 _ZN7httplib10ThreadPool6workerclEv (llama-server + 0x53a46)
                #3  0x00007ffff34e86d3 execute_native_thread_routine (libstdc++.so.6 + 0xe86d3)
                #4  0x00007ffff3299a42 start_thread (libc.so.6 + 0x90a42)
                #5  0x00007ffff331905c __clone3 (libc.so.6 + 0x11005c)

                Stack trace of thread 1067661:
                #0  0x00007ffff32960ce __futex_abstimed_wait_common (libc.so.6 + 0x8d0ce)
                #1  0x00007ffff3298c20 pthread_cond_wait@@GLIBC_2.3.2 (libc.so.6 + 0x8fc20)
                #2  0x0000000000452a46 _ZN7httplib10ThreadPool6workerclEv (llama-server + 0x53a46)
                #3  0x00007ffff34e86d3 execute_native_thread_routine (libstdc++.so.6 + 0xe86d3)
                #4  0x00007ffff3299a42 start_thread (libc.so.6 + 0x90a42)
                #5  0x00007ffff331905c __clone3 (libc.so.6 + 0x11005c)

                Stack trace of thread 1067652:
                #0  0x00007ffff32960ce __futex_abstimed_wait_common (libc.so.6 + 0x8d0ce)
                #1  0x00007ffff3298c20 pthread_cond_wait@@GLIBC_2.3.2 (libc.so.6 + 0x8fc20)
                #2  0x0000000000452a46 _ZN7httplib10ThreadPool6workerclEv (llama-server + 0x53a46)
                #3  0x00007ffff34e86d3 execute_native_thread_routine (libstdc++.so.6 + 0xe86d3)
                #4  0x00007ffff3299a42 start_thread (libc.so.6 + 0x90a42)
                #5  0x00007ffff331905c __clone3 (libc.so.6 + 0x11005c)

                Stack trace of thread 1067653:
                #0  0x00007ffff32960ce __futex_abstimed_wait_common (libc.so.6 + 0x8d0ce)
                #1  0x00007ffff3298c20 pthread_cond_wait@@GLIBC_2.3.2 (libc.so.6 + 0x8fc20)
                #2  0x0000000000452a46 _ZN7httplib10ThreadPool6workerclEv (llama-server + 0x53a46)
                #3  0x00007ffff34e86d3 execute_native_thread_routine (libstdc++.so.6 + 0xe86d3)
                #4  0x00007ffff3299a42 start_thread (libc.so.6 + 0x90a42)
                #5  0x00007ffff331905c __clone3 (libc.so.6 + 0x11005c)

                Stack trace of thread 1067656:
                #0  0x00007ffff32960ce __futex_abstimed_wait_common (libc.so.6 + 0x8d0ce)
                #1  0x00007ffff3298c20 pthread_cond_wait@@GLIBC_2.3.2 (libc.so.6 + 0x8fc20)
                #2  0x0000000000452a46 _ZN7httplib10ThreadPool6workerclEv (llama-server + 0x53a46)
                #3  0x00007ffff34e86d3 execute_native_thread_routine (libstdc++.so.6 + 0xe86d3)
                #4  0x00007ffff3299a42 start_thread (libc.so.6 + 0x90a42)
                #5  0x00007ffff331905c __clone3 (libc.so.6 + 0x11005c)

                Stack trace of thread 1067662:
                #0  0x00007ffff32960ce __futex_abstimed_wait_common (libc.so.6 + 0x8d0ce)
                #1  0x00007ffff3298c20 pthread_cond_wait@@GLIBC_2.3.2 (libc.so.6 + 0x8fc20)
                #2  0x0000000000452a46 _ZN7httplib10ThreadPool6workerclEv (llama-server + 0x53a46)
                #3  0x00007ffff34e86d3 execute_native_thread_routine (libstdc++.so.6 + 0xe86d3)
                #4  0x00007ffff3299a42 start_thread (libc.so.6 + 0x90a42)
                #5  0x00007ffff331905c __clone3 (libc.so.6 + 0x11005c)

                Stack trace of thread 1067659:
                #0  0x00007ffff32960ce __futex_abstimed_wait_common (libc.so.6 + 0x8d0ce)
                #1  0x00007ffff3298c20 pthread_cond_wait@@GLIBC_2.3.2 (libc.so.6 + 0x8fc20)
                #2  0x0000000000452a46 _ZN7httplib10ThreadPool6workerclEv (llama-server + 0x53a46)
                #3  0x00007ffff34e86d3 execute_native_thread_routine (libstdc++.so.6 + 0xe86d3)
                #4  0x00007ffff3299a42 start_thread (libc.so.6 + 0x90a42)
                #5  0x00007ffff331905c __clone3 (libc.so.6 + 0x11005c)

                Stack trace of thread 1067663:
                #0  0x00007ffff32960ce __futex_abstimed_wait_common (libc.so.6 + 0x8d0ce)
                #1  0x00007ffff3298c20 pthread_cond_wait@@GLIBC_2.3.2 (libc.so.6 + 0x8fc20)
                #2  0x0000000000452a46 _ZN7httplib10ThreadPool6workerclEv (llama-server + 0x53a46)
                #3  0x00007ffff34e86d3 execute_native_thread_routine (libstdc++.so.6 + 0xe86d3)
                #4  0x00007ffff3299a42 start_thread (libc.so.6 + 0x90a42)
                #5  0x00007ffff331905c __clone3 (libc.so.6 + 0x11005c)
                ELF object binary architecture: AMD x86-64

Reading symbols from /nix/store/xsjknx60if36j5d8kl393yb23hhf76ic-llama-cpp-3933/bin/llama-server...

warning: Loadable section ".dynstr" outside of ELF segments
  in /nix/store/xsjknx60if36j5d8kl393yb23hhf76ic-llama-cpp-3933/bin/llama-server
Reading symbols from /nix/store/jvyl2rg6mff5c6z3477sbip03w86rwjw-llama-cpp-3933-debug/lib/debug/.build-id/db/86f367231952a378ab1268136a29fb91d5a98b.debug...

warning: Can't open file /dev/zero (deleted) during file-backed mapping note processing
[New LWP 1067626]
<... snipped text...>
[New LWP 1067663]
[Thread debugging using libthread_db enabled]
Using host libthread_db library "/nix/store/3dyw8dzj9ab4m8hv5dpyx7zii8d0w6fi-glibc-2.39-52/lib/libthread_db.so.1".
Core was generated by `llama-server -c 102400 -ngl 100 -m Replete-LLM-V2.5-Qwen-14b-IQ3_M.gguf --chat-'.
Program terminated with signal SIGABRT, Aborted.
#0  __pthread_kill_implementation (threadid=<optimized out>, signo=signo@entry=6, no_tid=no_tid@entry=0)
    at pthread_kill.c:44
44        return INTERNAL_SYSCALL_ERROR_P (ret) ? INTERNAL_SYSCALL_ERRNO (ret) : 0;
[Current thread is 1 (Thread 0x7ffff3787000 (LWP 1067626))]
(gdb) bt
#0  __pthread_kill_implementation (threadid=<optimized out>, signo=signo@entry=6, no_tid=no_tid@entry=0)
    at pthread_kill.c:44
#1  0x00007ffff329b843 in __pthread_kill_internal (signo=6, threadid=<optimized out>) at pthread_kill.c:78
#2  0x00007ffff3249516 in __GI_raise (sig=sig@entry=6) at ../sysdeps/posix/raise.c:26
#3  0x00007ffff3231935 in __GI_abort () at abort.c:79
#4  0x00007ffff381c7c5 in ggml_abort (file=0x7ffff3a3e7e0 "/build/source/ggml/src/ggml-cuda.cu", line=70,
    fmt=0x7ffff3a32d51 "CUDA error") at /build/source/ggml/src/ggml.c:305
#5  0x00007ffff38ea863 in ggml_cuda_error (stmt=stmt@entry=0x7ffff3a3f1c0 "cudaStreamSynchronize(cuda_ctx->stream())",
    func=func@entry=0x7ffff3a32e36 "ggml_backend_cuda_synchronize",
    file=file@entry=0x7ffff3a3e7e0 "/build/source/ggml/src/ggml-cuda.cu", line=line@entry=2446,
    msg=0x7ffff2e8db00 "an illegal memory access was encountered") at /build/source/ggml/src/ggml-cuda.cu:70
#6  0x00007ffff38eb80a in ggml_backend_cuda_synchronize (backend=<optimized out>)
    at /build/source/ggml/src/ggml-cuda.cu:2446
#7  0x00007ffff38759e6 in ggml_backend_sched_synchronize (sched=sched@entry=0x127e630)
    at /build/source/ggml/src/ggml-backend.cpp:2349
#8  0x00007ffff3877873 in ggml_backend_sched_reserve (sched=0x127e630, measure_graph=<optimized out>)
    at /build/source/ggml/src/ggml-backend.cpp:2307
#9  0x00007ffff7e90076 in llama_kv_cache_update_internal (lctx=...) at /build/source/src/llama.cpp:17891
#10 0x00007ffff7e90c25 in llama_kv_cache_update (ctx=<optimized out>) at /build/source/src/llama.cpp:20123
#11 0x00007ffff7e96c53 in llama_decode_internal (batch_all=..., lctx=...) at /build/source/src/llama.cpp:17248
#12 llama_decode (ctx=0x1269150, batch=...) at /build/source/src/llama.cpp:21200
#13 0x000000000049fd82 in server_context::update_slots (this=<optimized out>)
    at /build/source/examples/server/server.cpp:2292
#14 0x0000000000487e99 in std::function<void()>::operator() (this=0x7fffffffb1a8)
    at /nix/store/6mmwy4jcnqnhms3i56r1hbdn656akg1d-gcc-13.3.0/include/c++/13.3.0/bits/std_function.h:591
#15 server_queue::start_loop (this=this@entry=0x7fffffffb088) at /build/source/examples/server/server.cpp:504
#16 0x000000000042644e in main (argc=<optimized out>, argv=<optimized out>)
    at /build/source/examples/server/server.cpp:3402
(gdb) set substitute-path /build/source /home/sliedes/proj/llama.cpp
(gdb) fra 4
#4  0x00007ffff381c7c5 in ggml_abort (file=0x7ffff3a3e7e0 "/build/source/ggml/src/ggml-cuda.cu", line=70,
    fmt=0x7ffff3a32d51 "CUDA error") at /build/source/ggml/src/ggml.c:305
305     abort();
(gdb) q
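
As context for the backtrace: CUDA kernel launches are asynchronous, and device-side faults are "sticky", so an illegal memory access inside some earlier kernel is typically reported only by the next synchronizing API call. The cudaStreamSynchronize() in frame #6 above is therefore probably just where the error surfaces, not where the bad access happens. A standalone sketch of the effect (deliberately buggy toy code, not llama.cpp code):

// nvcc sticky_error.cu -o sticky_error
#include <cstdio>
#include <cuda_runtime.h>

__global__ void oob_write(int *p) {
    p[1 << 20] = 42;  // deliberate out-of-bounds write, far past the allocation
}

int main() {
    int *d = nullptr;
    cudaMalloc(&d, sizeof(int));
    oob_write<<<1, 1>>>(d);                        // launch returns immediately
    cudaError_t launch = cudaGetLastError();       // usually still cudaSuccess here
    cudaError_t sync   = cudaStreamSynchronize(0); // the fault is reported here
    printf("launch: %s\nsync:   %s\n",
           cudaGetErrorString(launch), cudaGetErrorString(sync));
    return 0;
}

Running the server under compute-sanitizer, or with CUDA_LAUNCH_BLOCKING=1 set, might help localize the kernel that actually faults.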

Name and Version

In reality, this is b3933 (f010b77a) on NixOS; the build scripts report version 0, presumably because the Nix build runs from a source tarball without git metadata:


ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
ggml_cuda_init: found 1 CUDA devices:
  Device 0: NVIDIA GeForce RTX 4090, compute capability 8.9, VMM: yes
register_backend: registered backend CUDA (1 devices)
register_device: registered device CUDA0 (NVIDIA GeForce RTX 4090)
register_backend: registered backend CPU (1 devices)
register_device: registered device CPU (AMD Ryzen Threadripper PRO 5955WX 16-Cores)
version: 0 (unknown)
built with gcc (GCC) 13.3.0 for x86_64-unknown-linux-gnu

What operating system are you seeing the problem on?

Linux

Relevant log output

$ llama-server -c 102400 -ngl 100 -m Replete-LLM-V2.5-Qwen-14b-IQ3_M.gguf --chat-template chatml --check-tensors -ctk q8_0 -ctv q8_0 -fa --parallel 10
ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
ggml_cuda_init: found 1 CUDA devices:
  Device 0: NVIDIA GeForce RTX 4090, compute capability 8.9, VMM: yes
register_backend: registered backend CUDA (1 devices)
register_device: registered device CUDA0 (NVIDIA GeForce RTX 4090)
register_backend: registered backend CPU (1 devices)
register_device: registered device CPU (AMD Ryzen Threadripper PRO 5955WX 16-Cores)
build: 0 (unknown) with gcc (GCC) 13.3.0 for x86_64-unknown-linux-gnu (debug)
system info: n_threads = 16, n_threads_batch = 16, total_threads = 32

system_info: n_threads = 16 (n_threads_batch = 16) / 32 | AVX = 1 | AVX_VNNI = 0 | AVX2 = 1 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 1 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 1 | FP16_VA = 0 | RISCV_VECT = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 |

main: HTTP server is listening, hostname: 127.0.0.1, port: 8080, http threads: 31
main: loading model
llama_load_model_from_file: using device CUDA0 (NVIDIA GeForce RTX 4090) - 18924 MiB free
llama_model_loader: loaded meta data with 34 key-value pairs and 579 tensors from Replete-LLM-V2.5-Qwen-14b-IQ3_M.gguf (version GGUF V3 (latest))
llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
llama_model_loader: - kv   0:                       general.architecture str              = qwen2
llama_model_loader: - kv   1:                               general.type str              = model
llama_model_loader: - kv   2:                               general.name str              = Replete LLM V2.5 Qwen 14b
llama_model_loader: - kv   3:                           general.basename str              = Replete-LLM-V2.5-Qwen
llama_model_loader: - kv   4:                         general.size_label str              = 14B
llama_model_loader: - kv   5:                            general.license str              = apache-2.0
llama_model_loader: - kv   6:                   general.base_model.count u32              = 1
llama_model_loader: - kv   7:                  general.base_model.0.name str              = Qwen2.5 14B Instruct
llama_model_loader: - kv   8:          general.base_model.0.organization str              = Qwen
llama_model_loader: - kv   9:              general.base_model.0.repo_url str              = https://huggingface.co/Qwen/Qwen2.5-1...
llama_model_loader: - kv  10:                          qwen2.block_count u32              = 48
llama_model_loader: - kv  11:                       qwen2.context_length u32              = 32768
llama_model_loader: - kv  12:                     qwen2.embedding_length u32              = 5120
llama_model_loader: - kv  13:                  qwen2.feed_forward_length u32              = 13824
llama_model_loader: - kv  14:                 qwen2.attention.head_count u32              = 40
llama_model_loader: - kv  15:              qwen2.attention.head_count_kv u32              = 8
llama_model_loader: - kv  16:                       qwen2.rope.freq_base f32              = 1000000.000000
llama_model_loader: - kv  17:     qwen2.attention.layer_norm_rms_epsilon f32              = 0.000001
llama_model_loader: - kv  18:                          general.file_type u32              = 27
llama_model_loader: - kv  19:                       tokenizer.ggml.model str              = gpt2
llama_model_loader: - kv  20:                         tokenizer.ggml.pre str              = qwen2
llama_model_loader: - kv  21:                      tokenizer.ggml.tokens arr[str,152064]  = ["!", "\"", "#", "$", "%", "&", "'", ...
llama_model_loader: - kv  22:                  tokenizer.ggml.token_type arr[i32,152064]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
llama_model_loader: - kv  23:                      tokenizer.ggml.merges arr[str,151387]  = ["Ġ Ġ", "ĠĠ ĠĠ", "i n", "Ġ t",...
llama_model_loader: - kv  24:                tokenizer.ggml.eos_token_id u32              = 151645
llama_model_loader: - kv  25:            tokenizer.ggml.padding_token_id u32              = 151643
llama_model_loader: - kv  26:                tokenizer.ggml.bos_token_id u32              = 151643
llama_model_loader: - kv  27:               tokenizer.ggml.add_bos_token bool             = false
llama_model_loader: - kv  28:                    tokenizer.chat_template str              = {%- if tools %}\n    {{- '<|im_start|>...
llama_model_loader: - kv  29:               general.quantization_version u32              = 2
llama_model_loader: - kv  30:                      quantize.imatrix.file str              = /models_out/Replete-LLM-V2.5-Qwen-14b...
llama_model_loader: - kv  31:                   quantize.imatrix.dataset str              = /training_dir/calibration_datav3.txt
llama_model_loader: - kv  32:             quantize.imatrix.entries_count i32              = 336
llama_model_loader: - kv  33:              quantize.imatrix.chunks_count i32              = 128
llama_model_loader: - type  f32:  241 tensors
llama_model_loader: - type q4_K:  102 tensors
llama_model_loader: - type q6_K:    1 tensors
llama_model_loader: - type iq3_s:  235 tensors
llm_load_vocab: control token: 151660 '<|fim_middle|>' is not marked as EOG
llm_load_vocab: control token: 151659 '<|fim_prefix|>' is not marked as EOG
llm_load_vocab: control token: 151653 '<|vision_end|>' is not marked as EOG
llm_load_vocab: control token: 151648 '<|box_start|>' is not marked as EOG
llm_load_vocab: control token: 151646 '<|object_ref_start|>' is not marked as EOG
llm_load_vocab: control token: 151649 '<|box_end|>' is not marked as EOG
llm_load_vocab: control token: 151655 '<|image_pad|>' is not marked as EOG
llm_load_vocab: control token: 151651 '<|quad_end|>' is not marked as EOG
llm_load_vocab: control token: 151647 '<|object_ref_end|>' is not marked as EOG
llm_load_vocab: control token: 151652 '<|vision_start|>' is not marked as EOG
llm_load_vocab: control token: 151654 '<|vision_pad|>' is not marked as EOG
llm_load_vocab: control token: 151656 '<|video_pad|>' is not marked as EOG
llm_load_vocab: control token: 151644 '<|im_start|>' is not marked as EOG
llm_load_vocab: control token: 151661 '<|fim_suffix|>' is not marked as EOG
llm_load_vocab: control token: 151650 '<|quad_start|>' is not marked as EOG
llm_load_vocab: special tokens cache size = 22
llm_load_vocab: token to piece cache size = 0.9310 MB
llm_load_print_meta: format           = GGUF V3 (latest)
llm_load_print_meta: arch             = qwen2
llm_load_print_meta: vocab type       = BPE
llm_load_print_meta: n_vocab          = 152064
llm_load_print_meta: n_merges         = 151387
llm_load_print_meta: vocab_only       = 0
llm_load_print_meta: n_ctx_train      = 32768
llm_load_print_meta: n_embd           = 5120
llm_load_print_meta: n_layer          = 48
llm_load_print_meta: n_head           = 40
llm_load_print_meta: n_head_kv        = 8
llm_load_print_meta: n_rot            = 128
llm_load_print_meta: n_swa            = 0
llm_load_print_meta: n_embd_head_k    = 128
llm_load_print_meta: n_embd_head_v    = 128
llm_load_print_meta: n_gqa            = 5
llm_load_print_meta: n_embd_k_gqa     = 1024
llm_load_print_meta: n_embd_v_gqa     = 1024
llm_load_print_meta: f_norm_eps       = 0.0e+00
llm_load_print_meta: f_norm_rms_eps   = 1.0e-06
llm_load_print_meta: f_clamp_kqv      = 0.0e+00
llm_load_print_meta: f_max_alibi_bias = 0.0e+00
llm_load_print_meta: f_logit_scale    = 0.0e+00
llm_load_print_meta: n_ff             = 13824
llm_load_print_meta: n_expert         = 0
llm_load_print_meta: n_expert_used    = 0
llm_load_print_meta: causal attn      = 1
llm_load_print_meta: pooling type     = 0
llm_load_print_meta: rope type        = 2
llm_load_print_meta: rope scaling     = linear
llm_load_print_meta: freq_base_train  = 1000000.0
llm_load_print_meta: freq_scale_train = 1
llm_load_print_meta: n_ctx_orig_yarn  = 32768
llm_load_print_meta: rope_finetuned   = unknown
llm_load_print_meta: ssm_d_conv       = 0
llm_load_print_meta: ssm_d_inner      = 0
llm_load_print_meta: ssm_d_state      = 0
llm_load_print_meta: ssm_dt_rank      = 0
llm_load_print_meta: ssm_dt_b_c_rms   = 0
llm_load_print_meta: model type       = ?B
llm_load_print_meta: model ftype      = IQ3_S mix - 3.66 bpw
llm_load_print_meta: model params     = 14.77 B
llm_load_print_meta: model size       = 6.44 GiB (3.74 BPW)
llm_load_print_meta: general.name     = Replete LLM V2.5 Qwen 14b
llm_load_print_meta: BOS token        = 151643 '<|endoftext|>'
llm_load_print_meta: EOS token        = 151645 '<|im_end|>'
llm_load_print_meta: EOT token        = 151645 '<|im_end|>'
llm_load_print_meta: PAD token        = 151643 '<|endoftext|>'
llm_load_print_meta: LF token         = 148848 'ÄĬ'
llm_load_print_meta: FIM PRE token    = 151659 '<|fim_prefix|>'
llm_load_print_meta: FIM SUF token    = 151661 '<|fim_suffix|>'
llm_load_print_meta: FIM MID token    = 151660 '<|fim_middle|>'
llm_load_print_meta: FIM PAD token    = 151662 '<|fim_pad|>'
llm_load_print_meta: FIM REP token    = 151663 '<|repo_name|>'
llm_load_print_meta: FIM SEP token    = 151664 '<|file_sep|>'
llm_load_print_meta: EOG token        = 151643 '<|endoftext|>'
llm_load_print_meta: EOG token        = 151645 '<|im_end|>'
llm_load_print_meta: EOG token        = 151662 '<|fim_pad|>'
llm_load_print_meta: EOG token        = 151663 '<|repo_name|>'
llm_load_print_meta: EOG token        = 151664 '<|file_sep|>'
llm_load_print_meta: max token length = 256
llm_load_tensors: ggml ctx size =    0.51 MiB
llm_load_tensors: offloading 48 repeating layers to GPU
llm_load_tensors: offloading non-repeating layers to GPU
llm_load_tensors: offloaded 49/49 layers to GPU
llm_load_tensors:        CPU buffer size =   319.04 MiB
llm_load_tensors:      CUDA0 buffer size =  6271.39 MiB
........................................................................................
llama_new_context_with_model: n_ctx      = 102400
llama_new_context_with_model: n_batch    = 2048
llama_new_context_with_model: n_ubatch   = 512
llama_new_context_with_model: flash_attn = 1
llama_new_context_with_model: freq_base  = 1000000.0
llama_new_context_with_model: freq_scale = 1
llama_kv_cache_init:      CUDA0 KV buffer size = 10200.00 MiB
llama_new_context_with_model: KV self size  = 10200.00 MiB, K (q8_0): 5100.00 MiB, V (q8_0): 5100.00 MiB
llama_new_context_with_model:  CUDA_Host  output buffer size =     6.38 MiB
llama_new_context_with_model:      CUDA0 compute buffer size =   340.00 MiB
llama_new_context_with_model:  CUDA_Host compute buffer size =   210.01 MiB
llama_new_context_with_model: graph nodes  = 1495
llama_new_context_with_model: graph splits = 2
common_init_from_params: warming up the model with an empty run - please wait ... (--no-warmup to disable)
srv          init: initializing slots, n_slots = 10
slot         init: id  0 | task -1 | new slot n_ctx_slot = 10240
slot         init: id  1 | task -1 | new slot n_ctx_slot = 10240
slot         init: id  2 | task -1 | new slot n_ctx_slot = 10240
slot         init: id  3 | task -1 | new slot n_ctx_slot = 10240
slot         init: id  4 | task -1 | new slot n_ctx_slot = 10240
slot         init: id  5 | task -1 | new slot n_ctx_slot = 10240
slot         init: id  6 | task -1 | new slot n_ctx_slot = 10240
slot         init: id  7 | task -1 | new slot n_ctx_slot = 10240
slot         init: id  8 | task -1 | new slot n_ctx_slot = 10240
slot         init: id  9 | task -1 | new slot n_ctx_slot = 10240
main: model loaded
main: chat template, built_in: 0, chat_example: '<|im_start|>system
You are a helpful assistant<|im_end|>
<|im_start|>user
Hello<|im_end|>
<|im_start|>assistant
Hi there<|im_end|>
<|im_start|>user
How are you?<|im_end|>
<|im_start|>assistant
'
main: server is listening on 127.0.0.1:8080 - starting the main loop
srv  update_slots: all slots are idle
request: GET /props 127.0.0.1 200
request: POST /tokenize 127.0.0.1 200
slot launch_slot_: id  0 | task 0 | processing task
slot update_slots: id  0 | task 0 | tokenizing prompt, len = 1
slot update_slots: id  0 | task 0 | prompt tokenized, n_ctx_slot = 10240, n_keep = 0, n_prompt_tokens = 8594
slot update_slots: id  0 | task 0 | kv cache rm [0, end)
slot update_slots: id  0 | task 0 | prompt processing progress, n_past = 2048, n_tokens = 2048, progress = 0.238306
slot launch_slot_: id  1 | task 2 | processing task
slot launch_slot_: id  2 | task 3 | processing task
slot launch_slot_: id  3 | task 4 | processing task
slot launch_slot_: id  4 | task 5 | processing task
slot launch_slot_: id  5 | task 6 | processing task
slot launch_slot_: id  6 | task 7 | processing task
slot launch_slot_: id  7 | task 8 | processing task
slot launch_slot_: id  8 | task 9 | processing task
slot launch_slot_: id  9 | task 10 | processing task
slot update_slots: id  0 | task 0 | kv cache rm [2048, end)
slot update_slots: id  0 | task 0 | prompt processing progress, n_past = 4096, n_tokens = 2048, progress = 0.476612
slot update_slots: id  0 | task 0 | kv cache rm [4096, end)
slot update_slots: id  0 | task 0 | prompt processing progress, n_past = 6144, n_tokens = 2048, progress = 0.714917
slot update_slots: id  0 | task 0 | kv cache rm [6144, end)
slot update_slots: id  0 | task 0 | prompt processing progress, n_past = 8192, n_tokens = 2048, progress = 0.953223
slot update_slots: id  0 | task 0 | kv cache rm [8192, end)
slot update_slots: id  0 | task 0 | prompt processing progress, n_past = 8594, n_tokens = 402, progress = 1.000000
slot update_slots: id  0 | task 0 | prompt done, n_past = 8594, n_tokens = 402
slot update_slots: id  1 | task 2 | tokenizing prompt, len = 1
slot update_slots: id  1 | task 2 | prompt tokenized, n_ctx_slot = 10240, n_keep = 0, n_prompt_tokens = 8585
slot update_slots: id  1 | task 2 | kv cache rm [0, end)
slot update_slots: id  1 | task 2 | prompt processing progress, n_past = 1646, n_tokens = 2048, progress = 0.191730
slot update_slots: id  1 | task 2 | kv cache rm [1646, end)
slot update_slots: id  1 | task 2 | prompt processing progress, n_past = 3693, n_tokens = 2048, progress = 0.430169
slot update_slots: id  1 | task 2 | kv cache rm [3693, end)
slot update_slots: id  1 | task 2 | prompt processing progress, n_past = 5740, n_tokens = 2048, progress = 0.668608
slot update_slots: id  1 | task 2 | kv cache rm [5740, end)
slot update_slots: id  1 | task 2 | prompt processing progress, n_past = 7787, n_tokens = 2048, progress = 0.907047
slot update_slots: id  1 | task 2 | kv cache rm [7787, end)
slot update_slots: id  1 | task 2 | prompt processing progress, n_past = 8585, n_tokens = 799, progress = 1.000000
slot update_slots: id  1 | task 2 | prompt done, n_past = 8585, n_tokens = 799
slot update_slots: id  2 | task 3 | tokenizing prompt, len = 1
slot update_slots: id  2 | task 3 | prompt tokenized, n_ctx_slot = 10240, n_keep = 0, n_prompt_tokens = 8562
slot update_slots: id  2 | task 3 | kv cache rm [0, end)
slot update_slots: id  2 | task 3 | prompt processing progress, n_past = 1249, n_tokens = 2048, progress = 0.145877
slot update_slots: id  2 | task 3 | kv cache rm [1249, end)
slot update_slots: id  2 | task 3 | prompt processing progress, n_past = 3295, n_tokens = 2048, progress = 0.384840
slot update_slots: id  2 | task 3 | kv cache rm [3295, end)
slot update_slots: id  2 | task 3 | prompt processing progress, n_past = 5341, n_tokens = 2048, progress = 0.623803
slot update_slots: id  2 | task 3 | kv cache rm [5341, end)
slot update_slots: id  2 | task 3 | prompt processing progress, n_past = 7387, n_tokens = 2048, progress = 0.862766
slot update_slots: id  2 | task 3 | kv cache rm [7387, end)
slot update_slots: id  2 | task 3 | prompt processing progress, n_past = 8562, n_tokens = 1177, progress = 1.000000
slot update_slots: id  2 | task 3 | prompt done, n_past = 8562, n_tokens = 1177
slot update_slots: id  3 | task 4 | tokenizing prompt, len = 1
slot update_slots: id  3 | task 4 | prompt tokenized, n_ctx_slot = 10240, n_keep = 0, n_prompt_tokens = 8457
slot update_slots: id  3 | task 4 | kv cache rm [0, end)
slot update_slots: id  3 | task 4 | prompt processing progress, n_past = 871, n_tokens = 2048, progress = 0.102992
slot update_slots: id  3 | task 4 | kv cache rm [871, end)
slot update_slots: id  3 | task 4 | prompt processing progress, n_past = 2916, n_tokens = 2048, progress = 0.344803
slot update_slots: id  3 | task 4 | kv cache rm [2916, end)
slot update_slots: id  3 | task 4 | prompt processing progress, n_past = 4961, n_tokens = 2048, progress = 0.586615
slot update_slots: id  3 | task 4 | kv cache rm [4961, end)
slot update_slots: id  3 | task 4 | prompt processing progress, n_past = 7006, n_tokens = 2048, progress = 0.828426
slot update_slots: id  3 | task 4 | kv cache rm [7006, end)
slot update_slots: id  3 | task 4 | prompt processing progress, n_past = 8457, n_tokens = 1454, progress = 1.000000
slot update_slots: id  3 | task 4 | prompt done, n_past = 8457, n_tokens = 1454
slot update_slots: id  4 | task 5 | tokenizing prompt, len = 1
slot update_slots: id  4 | task 5 | prompt tokenized, n_ctx_slot = 10240, n_keep = 0, n_prompt_tokens = 9007
slot update_slots: id  4 | task 5 | kv cache rm [0, end)
slot update_slots: id  4 | task 5 | prompt processing progress, n_past = 594, n_tokens = 2048, progress = 0.065949
slot update_slots: id  4 | task 5 | kv cache rm [594, end)
slot update_slots: id  4 | task 5 | prompt processing progress, n_past = 2638, n_tokens = 2048, progress = 0.292883
slot update_slots: id  4 | task 5 | kv cache rm [2638, end)
slot update_slots: id  4 | task 5 | prompt processing progress, n_past = 4682, n_tokens = 2048, progress = 0.519818
slot update_slots: id  4 | task 5 | kv cache rm [4682, end)
slot update_slots: id  4 | task 5 | prompt processing progress, n_past = 6726, n_tokens = 2048, progress = 0.746753
slot update_slots: id  4 | task 5 | kv cache rm [6726, end)
slot update_slots: id  4 | task 5 | prompt processing progress, n_past = 8770, n_tokens = 2048, progress = 0.973687
slot update_slots: id  4 | task 5 | kv cache rm [8770, end)
slot update_slots: id  4 | task 5 | prompt processing progress, n_past = 9007, n_tokens = 241, progress = 1.000000
slot update_slots: id  4 | task 5 | prompt done, n_past = 9007, n_tokens = 241
slot update_slots: id  5 | task 6 | tokenizing prompt, len = 1
slot update_slots: id  5 | task 6 | prompt tokenized, n_ctx_slot = 10240, n_keep = 0, n_prompt_tokens = 8853
slot update_slots: id  5 | task 6 | kv cache rm [0, end)
slot update_slots: id  5 | task 6 | prompt processing progress, n_past = 1807, n_tokens = 2048, progress = 0.204112
slot update_slots: id  5 | task 6 | kv cache rm [1807, end)
slot update_slots: id  5 | task 6 | prompt processing progress, n_past = 3850, n_tokens = 2048, progress = 0.434881
slot update_slots: id  5 | task 6 | kv cache rm [3850, end)
slot update_slots: id  5 | task 6 | prompt processing progress, n_past = 5893, n_tokens = 2048, progress = 0.665650
slot update_slots: id  5 | task 6 | kv cache rm [5893, end)
slot update_slots: id  5 | task 6 | prompt processing progress, n_past = 7936, n_tokens = 2048, progress = 0.896419
slot update_slots: id  5 | task 6 | kv cache rm [7936, end)
slot update_slots: id  5 | task 6 | prompt processing progress, n_past = 8853, n_tokens = 922, progress = 1.000000
slot update_slots: id  5 | task 6 | prompt done, n_past = 8853, n_tokens = 922
slot update_slots: id  6 | task 7 | tokenizing prompt, len = 1
slot update_slots: id  6 | task 7 | prompt tokenized, n_ctx_slot = 10240, n_keep = 0, n_prompt_tokens = 9213
slot update_slots: id  6 | task 7 | kv cache rm [0, end)
slot update_slots: id  6 | task 7 | prompt processing progress, n_past = 1126, n_tokens = 2048, progress = 0.122219
slot update_slots: id  6 | task 7 | kv cache rm [1126, end)
slot update_slots: id  6 | task 7 | prompt processing progress, n_past = 3168, n_tokens = 2048, progress = 0.343862
slot update_slots: id  6 | task 7 | kv cache rm [3168, end)
slot update_slots: id  6 | task 7 | prompt processing progress, n_past = 5210, n_tokens = 2048, progress = 0.565505
slot update_slots: id  6 | task 7 | kv cache rm [5210, end)
slot update_slots: id  6 | task 7 | prompt processing progress, n_past = 7252, n_tokens = 2048, progress = 0.787149
slot update_slots: id  6 | task 7 | kv cache rm [7252, end)
slot update_slots: id  6 | task 7 | prompt processing progress, n_past = 9213, n_tokens = 1967, progress = 1.000000
slot update_slots: id  6 | task 7 | prompt done, n_past = 9213, n_tokens = 1967
slot update_slots: id  7 | task 8 | tokenizing prompt, len = 1
slot update_slots: id  7 | task 8 | prompt tokenized, n_ctx_slot = 10240, n_keep = 0, n_prompt_tokens = 9446
slot update_slots: id  7 | task 8 | kv cache rm [0, end)
slot update_slots: id  7 | task 8 | prompt processing progress, n_past = 81, n_tokens = 2048, progress = 0.008575
slot update_slots: id  7 | task 8 | kv cache rm [81, end)
slot update_slots: id  7 | task 8 | prompt processing progress, n_past = 2122, n_tokens = 2048, progress = 0.224645
slot update_slots: id  7 | task 8 | kv cache rm [2122, end)
slot update_slots: id  7 | task 8 | prompt processing progress, n_past = 4163, n_tokens = 2048, progress = 0.440716
slot update_slots: id  7 | task 8 | kv cache rm [4163, end)
slot update_slots: id  7 | task 8 | prompt processing progress, n_past = 6204, n_tokens = 2048, progress = 0.656786
slot update_slots: id  7 | task 8 | kv cache rm [6204, end)
slot update_slots: id  7 | task 8 | prompt processing progress, n_past = 8245, n_tokens = 2048, progress = 0.872856
slot update_slots: id  7 | task 8 | kv cache rm [8245, end)
slot update_slots: id  7 | task 8 | prompt processing progress, n_past = 9446, n_tokens = 1208, progress = 1.000000
slot update_slots: id  7 | task 8 | prompt done, n_past = 9446, n_tokens = 1208
slot update_slots: id  8 | task 9 | tokenizing prompt, len = 1
slot update_slots: id  8 | task 9 | prompt tokenized, n_ctx_slot = 10240, n_keep = 0, n_prompt_tokens = 8661
slot update_slots: id  8 | task 9 | kv cache rm [0, end)
slot update_slots: id  8 | task 9 | prompt processing progress, n_past = 840, n_tokens = 2048, progress = 0.096986
slot update_slots: id  8 | task 9 | kv cache rm [840, end)
slot update_slots: id  8 | task 9 | prompt processing progress, n_past = 2880, n_tokens = 2048, progress = 0.332525
slot update_slots: id  8 | task 9 | kv cache rm [2880, end)
slot update_slots: id  8 | task 9 | prompt processing progress, n_past = 4920, n_tokens = 2048, progress = 0.568064
slot update_slots: id  8 | task 9 | kv cache rm [4920, end)
slot update_slots: id  8 | task 9 | prompt processing progress, n_past = 6960, n_tokens = 2048, progress = 0.803602
slot update_slots: id  8 | task 9 | kv cache rm [6960, end)
slot update_slots: id  8 | task 9 | prompt processing progress, n_past = 8661, n_tokens = 1709, progress = 1.000000
slot update_slots: id  8 | task 9 | prompt done, n_past = 8661, n_tokens = 1709
slot update_slots: id  9 | task 10 | tokenizing prompt, len = 1
slot update_slots: id  9 | task 10 | prompt tokenized, n_ctx_slot = 10240, n_keep = 0, n_prompt_tokens = 8390
slot update_slots: id  9 | task 10 | kv cache rm [0, end)
slot update_slots: id  9 | task 10 | prompt processing progress, n_past = 339, n_tokens = 2048, progress = 0.040405
slot update_slots: id  9 | task 10 | kv cache rm [339, end)
slot update_slots: id  9 | task 10 | prompt processing progress, n_past = 2378, n_tokens = 2048, progress = 0.283433
slot update_slots: id  9 | task 10 | kv cache rm [2378, end)
slot update_slots: id  9 | task 10 | prompt processing progress, n_past = 4417, n_tokens = 2048, progress = 0.526460
slot update_slots: id  9 | task 10 | kv cache rm [4417, end)
slot update_slots: id  9 | task 10 | prompt processing progress, n_past = 6456, n_tokens = 2048, progress = 0.769488
slot update_slots: id  9 | task 10 | kv cache rm [6456, end)
slot update_slots: id  9 | task 10 | prompt processing progress, n_past = 8390, n_tokens = 1943, progress = 1.000000
slot update_slots: id  9 | task 10 | prompt done, n_past = 8390, n_tokens = 1943
slot      release: id  8 | task 9 | stop processing: n_past = 8904, truncated = 0
slot print_timing: id  8 | task 9 |
prompt eval time =   14863.10 ms /  8661 tokens (    1.72 ms per token,   582.72 tokens per second)
       eval time =   37648.51 ms /   244 tokens (  154.30 ms per token,     6.48 tokens per second)
      total time =   52511.61 ms /  8905 tokens
request: POST /completion 127.0.0.1 200
slot      release: id  2 | task 3 | stop processing: n_past = 9072, truncated = 0
slot print_timing: id  2 | task 3 |
prompt eval time =    5774.81 ms /  8562 tokens (    0.67 ms per token,  1482.65 tokens per second)
       eval time =  120757.56 ms /   511 tokens (  236.32 ms per token,     4.23 tokens per second)
      total time =  126532.36 ms /  9073 tokens
request: POST /completion 127.0.0.1 200
slot      release: id  6 | task 7 | stop processing: n_past = 9708, truncated = 0
slot print_timing: id  6 | task 7 |
prompt eval time =   11687.75 ms /  9213 tokens (    1.27 ms per token,   788.26 tokens per second)
       eval time =   88451.60 ms /   496 tokens (  178.33 ms per token,     5.61 tokens per second)
      total time =  100139.35 ms /  9709 tokens
request: POST /completion 127.0.0.1 200
slot      release: id  9 | task 10 | stop processing: n_past = 8899, truncated = 0
slot print_timing: id  9 | task 10 |
prompt eval time =   16603.01 ms /  8390 tokens (    1.98 ms per token,   505.33 tokens per second)
       eval time =   52316.64 ms /   510 tokens (  102.58 ms per token,     9.75 tokens per second)
      total time =   68919.64 ms /  8900 tokens
request: POST /completion 127.0.0.1 200
slot      release: id  4 | task 5 | stop processing: n_past = 9582, truncated = 0
slot print_timing: id  4 | task 5 |
prompt eval time =   10398.17 ms /  9007 tokens (    1.15 ms per token,   866.21 tokens per second)
       eval time =  113429.50 ms /   576 tokens (  196.93 ms per token,     5.08 tokens per second)
      total time =  123827.67 ms /  9583 tokens
request: POST /completion 127.0.0.1 200
slot      release: id  0 | task 0 | stop processing: n_past = 9206, truncated = 0
slot print_timing: id  0 | task 0 |
prompt eval time =    3030.07 ms /  8594 tokens (    0.35 ms per token,  2836.24 tokens per second)
       eval time =  138377.86 ms /   613 tokens (  225.74 ms per token,     4.43 tokens per second)
      total time =  141407.93 ms /  9207 tokens
request: POST /completion 127.0.0.1 200
slot      release: id  3 | task 4 | stop processing: n_past = 9216, truncated = 0
slot print_timing: id  3 | task 4 |
prompt eval time =    7145.12 ms /  8457 tokens (    0.84 ms per token,  1183.61 tokens per second)
       eval time =  139217.79 ms /   760 tokens (  183.18 ms per token,     5.46 tokens per second)
      total time =  146362.91 ms /  9217 tokens
request: POST /completion 127.0.0.1 200
slot update_slots: id  7 | task 8 | slot context shift, n_keep = 0, n_left = 10239, n_discard = 5119
/build/source/ggml/src/ggml-cuda.cu:70: CUDA error
CUDA error: an illegal memory access was encountered
  current device: 0, in function ggml_backend_cuda_synchronize at /build/source/ggml/src/ggml-cuda.cu:2446
  cudaStreamSynchronize(cuda_ctx->stream())
slaren commented 18 hours ago

Can you check if this change (move the call to ggml_backend_sched_synchronize up) fixes it?

diff --git a/ggml/src/ggml-backend.cpp b/ggml/src/ggml-backend.cpp
index a3bc79a4..cc13fc78 100644
--- a/ggml/src/ggml-backend.cpp
+++ b/ggml/src/ggml-backend.cpp
@@ -2299,12 +2299,13 @@ bool ggml_backend_sched_reserve(ggml_backend_sched_t sched, struct ggml_cgraph *

     ggml_backend_sched_split_graph(sched, measure_graph);

+    ggml_backend_sched_synchronize(sched);
+
     if (!ggml_gallocr_reserve_n(sched->galloc, &sched->graph, sched->node_backend_ids, sched->leaf_backend_ids)) {
         return false;
     }

     ggml_backend_sched_reset(sched);
-    ggml_backend_sched_synchronize(sched);

     return true;
 }
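If you want to test it: assuming the diff above is saved as sync-before-reserve.patch (the file name and build flags here are only an example), something like this should apply it and rebuild the server:

$ git apply sync-before-reserve.patch
$ cmake -B build -DGGML_CUDA=ON
$ cmake --build build --target llama-server -j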
sliedes commented 17 hours ago

Yes, this seems to fix it; I at least couldn't get it to crash in a few tests. Thanks :)

I also figured out the <optimized out> mystery; it was indeed NixOS shenanigans: fortification flags that require -O2 were being added to the build. In the future, I will be able to give you better backtraces!
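
For reference, a sketch of the kind of test I ran (the prompt, client count, and timing below are placeholders; the essential part is dropping the connections before the responses complete):

$ for i in $(seq 10); do
    curl -s -N http://127.0.0.1:8080/completion \
         -d '{"prompt": "<long prompt here>", "n_predict": 512, "stream": true}' \
         > /dev/null &
  done
$ sleep 5            # give prompt processing time to start
$ kill $(jobs -p)    # terminate all clients mid-computation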

sliedes commented 11 hours ago

It clearly made it less common, but I did see a crash with essentially the same backtrace after a couple of "failed to find free space in the KV cache" messages and a "slot context shift" (and without any client disconnecting spuriously). Should I open a new bug report?

This is b9399 with the patch from https://github.com/ggerganov/llama.cpp/issues/9928#issuecomment-2419929042 .

llama-server output:

$ llama-server -c 102400 -ngl 100 -m Replete-LLM-V2.5-Qwen-14b-IQ3_M.gguf --chat-template chatml --check-tensors -ctk q8_0 -ctv q8_0 -fa --parallel 10
ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
ggml_cuda_init: found 1 CUDA devices:
  Device 0: NVIDIA GeForce RTX 4090, compute capability 8.9, VMM: yes
register_backend: registered backend CUDA (1 devices)
register_device: registered device CUDA0 (NVIDIA GeForce RTX 4090)
register_backend: registered backend CPU (1 devices)
register_device: registered device CPU (AMD Ryzen Threadripper PRO 5955WX 16-Cores)
build: 0 (unknown) with gcc (GCC) 13.3.0 for x86_64-unknown-linux-gnu (debug)
system info: n_threads = 16, n_threads_batch = 16, total_threads = 32

system_info: n_threads = 16 (n_threads_batch = 16) / 32 | AVX = 1 | AVX_VNNI = 0 | AVX2 = 1 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 1 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 1 | FP16_VA = 0 | RISCV_VECT = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 |

main: HTTP server is listening, hostname: 127.0.0.1, port: 8080, http threads: 31
main: loading model
llama_load_model_from_file: using device CUDA0 (NVIDIA GeForce RTX 4090) - 18688 MiB free
llama_model_loader: loaded meta data with 34 key-value pairs and 579 tensors from Replete-LLM-V2.5-Qwen-14b-IQ3_M.gguf (version GGUF V3 (latest))
llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
llama_model_loader: - kv   0:                       general.architecture str              = qwen2
llama_model_loader: - kv   1:                               general.type str              = model
llama_model_loader: - kv   2:                               general.name str              = Replete LLM V2.5 Qwen 14b
llama_model_loader: - kv   3:                           general.basename str              = Replete-LLM-V2.5-Qwen
llama_model_loader: - kv   4:                         general.size_label str              = 14B
llama_model_loader: - kv   5:                            general.license str              = apache-2.0
llama_model_loader: - kv   6:                   general.base_model.count u32              = 1
llama_model_loader: - kv   7:                  general.base_model.0.name str              = Qwen2.5 14B Instruct
llama_model_loader: - kv   8:          general.base_model.0.organization str              = Qwen
llama_model_loader: - kv   9:              general.base_model.0.repo_url str              = https://huggingface.co/Qwen/Qwen2.5-1...
llama_model_loader: - kv  10:                          qwen2.block_count u32              = 48
llama_model_loader: - kv  11:                       qwen2.context_length u32              = 32768
llama_model_loader: - kv  12:                     qwen2.embedding_length u32              = 5120
llama_model_loader: - kv  13:                  qwen2.feed_forward_length u32              = 13824
llama_model_loader: - kv  14:                 qwen2.attention.head_count u32              = 40
llama_model_loader: - kv  15:              qwen2.attention.head_count_kv u32              = 8
llama_model_loader: - kv  16:                       qwen2.rope.freq_base f32              = 1000000.000000
llama_model_loader: - kv  17:     qwen2.attention.layer_norm_rms_epsilon f32              = 0.000001
llama_model_loader: - kv  18:                          general.file_type u32              = 27
llama_model_loader: - kv  19:                       tokenizer.ggml.model str              = gpt2
llama_model_loader: - kv  20:                         tokenizer.ggml.pre str              = qwen2
llama_model_loader: - kv  21:                      tokenizer.ggml.tokens arr[str,152064]  = ["!", "\"", "#", "$", "%", "&", "'", ...
llama_model_loader: - kv  22:                  tokenizer.ggml.token_type arr[i32,152064]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
llama_model_loader: - kv  23:                      tokenizer.ggml.merges arr[str,151387]  = ["Ġ Ġ", "ĠĠ ĠĠ", "i n", "Ġ t",...
llama_model_loader: - kv  24:                tokenizer.ggml.eos_token_id u32              = 151645
llama_model_loader: - kv  25:            tokenizer.ggml.padding_token_id u32              = 151643
llama_model_loader: - kv  26:                tokenizer.ggml.bos_token_id u32              = 151643
llama_model_loader: - kv  27:               tokenizer.ggml.add_bos_token bool             = false
llama_model_loader: - kv  28:                    tokenizer.chat_template str              = {%- if tools %}\n    {{- '<|im_start|>...
llama_model_loader: - kv  29:               general.quantization_version u32              = 2
llama_model_loader: - kv  30:                      quantize.imatrix.file str              = /models_out/Replete-LLM-V2.5-Qwen-14b...
llama_model_loader: - kv  31:                   quantize.imatrix.dataset str              = /training_dir/calibration_datav3.txt
llama_model_loader: - kv  32:             quantize.imatrix.entries_count i32              = 336
llama_model_loader: - kv  33:              quantize.imatrix.chunks_count i32              = 128
llama_model_loader: - type  f32:  241 tensors
llama_model_loader: - type q4_K:  102 tensors
llama_model_loader: - type q6_K:    1 tensors
llama_model_loader: - type iq3_s:  235 tensors
llm_load_vocab: control token: 151660 '<|fim_middle|>' is not marked as EOG
llm_load_vocab: control token: 151659 '<|fim_prefix|>' is not marked as EOG
llm_load_vocab: control token: 151653 '<|vision_end|>' is not marked as EOG
llm_load_vocab: control token: 151648 '<|box_start|>' is not marked as EOG
llm_load_vocab: control token: 151646 '<|object_ref_start|>' is not marked as EOG
llm_load_vocab: control token: 151649 '<|box_end|>' is not marked as EOG
llm_load_vocab: control token: 151655 '<|image_pad|>' is not marked as EOG
llm_load_vocab: control token: 151651 '<|quad_end|>' is not marked as EOG
llm_load_vocab: control token: 151647 '<|object_ref_end|>' is not marked as EOG
llm_load_vocab: control token: 151652 '<|vision_start|>' is not marked as EOG
llm_load_vocab: control token: 151654 '<|vision_pad|>' is not marked as EOG
llm_load_vocab: control token: 151656 '<|video_pad|>' is not marked as EOG
llm_load_vocab: control token: 151644 '<|im_start|>' is not marked as EOG
llm_load_vocab: control token: 151661 '<|fim_suffix|>' is not marked as EOG
llm_load_vocab: control token: 151650 '<|quad_start|>' is not marked as EOG
llm_load_vocab: special tokens cache size = 22
llm_load_vocab: token to piece cache size = 0.9310 MB
llm_load_print_meta: format           = GGUF V3 (latest)
llm_load_print_meta: arch             = qwen2
llm_load_print_meta: vocab type       = BPE
llm_load_print_meta: n_vocab          = 152064
llm_load_print_meta: n_merges         = 151387
llm_load_print_meta: vocab_only       = 0
llm_load_print_meta: n_ctx_train      = 32768
llm_load_print_meta: n_embd           = 5120
llm_load_print_meta: n_layer          = 48
llm_load_print_meta: n_head           = 40
llm_load_print_meta: n_head_kv        = 8
llm_load_print_meta: n_rot            = 128
llm_load_print_meta: n_swa            = 0
llm_load_print_meta: n_embd_head_k    = 128
llm_load_print_meta: n_embd_head_v    = 128
llm_load_print_meta: n_gqa            = 5
llm_load_print_meta: n_embd_k_gqa     = 1024
llm_load_print_meta: n_embd_v_gqa     = 1024
llm_load_print_meta: f_norm_eps       = 0.0e+00
llm_load_print_meta: f_norm_rms_eps   = 1.0e-06
llm_load_print_meta: f_clamp_kqv      = 0.0e+00
llm_load_print_meta: f_max_alibi_bias = 0.0e+00
llm_load_print_meta: f_logit_scale    = 0.0e+00
llm_load_print_meta: n_ff             = 13824
llm_load_print_meta: n_expert         = 0
llm_load_print_meta: n_expert_used    = 0
llm_load_print_meta: causal attn      = 1
llm_load_print_meta: pooling type     = 0
llm_load_print_meta: rope type        = 2
llm_load_print_meta: rope scaling     = linear
llm_load_print_meta: freq_base_train  = 1000000.0
llm_load_print_meta: freq_scale_train = 1
llm_load_print_meta: n_ctx_orig_yarn  = 32768
llm_load_print_meta: rope_finetuned   = unknown
llm_load_print_meta: ssm_d_conv       = 0
llm_load_print_meta: ssm_d_inner      = 0
llm_load_print_meta: ssm_d_state      = 0
llm_load_print_meta: ssm_dt_rank      = 0
llm_load_print_meta: ssm_dt_b_c_rms   = 0
llm_load_print_meta: model type       = ?B
llm_load_print_meta: model ftype      = IQ3_S mix - 3.66 bpw
llm_load_print_meta: model params     = 14.77 B
llm_load_print_meta: model size       = 6.44 GiB (3.74 BPW)
llm_load_print_meta: general.name     = Replete LLM V2.5 Qwen 14b
llm_load_print_meta: BOS token        = 151643 '<|endoftext|>'
llm_load_print_meta: EOS token        = 151645 '<|im_end|>'
llm_load_print_meta: EOT token        = 151645 '<|im_end|>'
llm_load_print_meta: PAD token        = 151643 '<|endoftext|>'
llm_load_print_meta: LF token         = 148848 'ÄĬ'
llm_load_print_meta: FIM PRE token    = 151659 '<|fim_prefix|>'
llm_load_print_meta: FIM SUF token    = 151661 '<|fim_suffix|>'
llm_load_print_meta: FIM MID token    = 151660 '<|fim_middle|>'
llm_load_print_meta: FIM PAD token    = 151662 '<|fim_pad|>'
llm_load_print_meta: FIM REP token    = 151663 '<|repo_name|>'
llm_load_print_meta: FIM SEP token    = 151664 '<|file_sep|>'
llm_load_print_meta: EOG token        = 151643 '<|endoftext|>'
llm_load_print_meta: EOG token        = 151645 '<|im_end|>'
llm_load_print_meta: EOG token        = 151662 '<|fim_pad|>'
llm_load_print_meta: EOG token        = 151663 '<|repo_name|>'
llm_load_print_meta: EOG token        = 151664 '<|file_sep|>'
llm_load_print_meta: max token length = 256
llm_load_tensors: ggml ctx size =    0.51 MiB
llm_load_tensors: offloading 48 repeating layers to GPU
llm_load_tensors: offloading non-repeating layers to GPU
llm_load_tensors: offloaded 49/49 layers to GPU
llm_load_tensors:        CPU buffer size =   319.04 MiB
llm_load_tensors:      CUDA0 buffer size =  6271.39 MiB
........................................................................................
llama_new_context_with_model: n_ctx      = 102400
llama_new_context_with_model: n_batch    = 2048
llama_new_context_with_model: n_ubatch   = 512
llama_new_context_with_model: flash_attn = 1
llama_new_context_with_model: freq_base  = 1000000.0
llama_new_context_with_model: freq_scale = 1
llama_kv_cache_init:      CUDA0 KV buffer size = 10200.00 MiB
llama_new_context_with_model: KV self size  = 10200.00 MiB, K (q8_0): 5100.00 MiB, V (q8_0): 5100.00 MiB
llama_new_context_with_model:  CUDA_Host  output buffer size =     6.38 MiB
llama_new_context_with_model:      CUDA0 compute buffer size =   340.00 MiB
llama_new_context_with_model:  CUDA_Host compute buffer size =   210.01 MiB
llama_new_context_with_model: graph nodes  = 1495
llama_new_context_with_model: graph splits = 2
common_init_from_params: warming up the model with an empty run - please wait ... (--no-warmup to disable)
srv          init: initializing slots, n_slots = 10
slot         init: id  0 | task -1 | new slot n_ctx_slot = 10240
slot         init: id  1 | task -1 | new slot n_ctx_slot = 10240
slot         init: id  2 | task -1 | new slot n_ctx_slot = 10240
slot         init: id  3 | task -1 | new slot n_ctx_slot = 10240
slot         init: id  4 | task -1 | new slot n_ctx_slot = 10240
slot         init: id  5 | task -1 | new slot n_ctx_slot = 10240
slot         init: id  6 | task -1 | new slot n_ctx_slot = 10240
slot         init: id  7 | task -1 | new slot n_ctx_slot = 10240
slot         init: id  8 | task -1 | new slot n_ctx_slot = 10240
slot         init: id  9 | task -1 | new slot n_ctx_slot = 10240
main: model loaded
main: chat template, built_in: 0, chat_example: '<|im_start|>system
You are a helpful assistant<|im_end|>
<|im_start|>user
Hello<|im_end|>
<|im_start|>assistant
Hi there<|im_end|>
<|im_start|>user
How are you?<|im_end|>
<|im_start|>assistant
'
main: server is listening on 127.0.0.1:8080 - starting the main loop
srv  update_slots: all slots are idle
request: GET /props 127.0.0.1 200
request: POST /tokenize 127.0.0.1 200
slot launch_slot_: id  0 | task 0 | processing task
slot launch_slot_: id  1 | task 1 | processing task
slot launch_slot_: id  2 | task 2 | processing task
slot launch_slot_: id  3 | task 3 | processing task
slot launch_slot_: id  4 | task 4 | processing task
slot launch_slot_: id  5 | task 5 | processing task
slot launch_slot_: id  6 | task 6 | processing task
slot update_slots: id  0 | task 0 | tokenizing prompt, len = 1
slot update_slots: id  0 | task 0 | prompt tokenized, n_ctx_slot = 10240, n_keep = 0, n_prompt_tokens = 8594
slot update_slots: id  0 | task 0 | kv cache rm [0, end)
slot update_slots: id  0 | task 0 | prompt processing progress, n_past = 2048, n_tokens = 2048, progress = 0.238306
slot launch_slot_: id  7 | task 8 | processing task
slot launch_slot_: id  8 | task 9 | processing task
slot launch_slot_: id  9 | task 10 | processing task
slot update_slots: id  0 | task 0 | kv cache rm [2048, end)
slot update_slots: id  0 | task 0 | prompt processing progress, n_past = 4096, n_tokens = 2048, progress = 0.476612
slot update_slots: id  0 | task 0 | kv cache rm [4096, end)
slot update_slots: id  0 | task 0 | prompt processing progress, n_past = 6144, n_tokens = 2048, progress = 0.714917
slot update_slots: id  0 | task 0 | kv cache rm [6144, end)
slot update_slots: id  0 | task 0 | prompt processing progress, n_past = 8192, n_tokens = 2048, progress = 0.953223
slot update_slots: id  0 | task 0 | kv cache rm [8192, end)
slot update_slots: id  0 | task 0 | prompt processing progress, n_past = 8594, n_tokens = 402, progress = 1.000000
slot update_slots: id  0 | task 0 | prompt done, n_past = 8594, n_tokens = 402
slot update_slots: id  1 | task 1 | tokenizing prompt, len = 1
slot update_slots: id  1 | task 1 | prompt tokenized, n_ctx_slot = 10240, n_keep = 0, n_prompt_tokens = 8585
slot update_slots: id  1 | task 1 | kv cache rm [0, end)
slot update_slots: id  1 | task 1 | prompt processing progress, n_past = 1646, n_tokens = 2048, progress = 0.191730
slot update_slots: id  1 | task 1 | kv cache rm [1646, end)
slot update_slots: id  1 | task 1 | prompt processing progress, n_past = 3693, n_tokens = 2048, progress = 0.430169
slot update_slots: id  1 | task 1 | kv cache rm [3693, end)
slot update_slots: id  1 | task 1 | prompt processing progress, n_past = 5740, n_tokens = 2048, progress = 0.668608
slot update_slots: id  1 | task 1 | kv cache rm [5740, end)
slot update_slots: id  1 | task 1 | prompt processing progress, n_past = 7787, n_tokens = 2048, progress = 0.907047
slot update_slots: id  1 | task 1 | kv cache rm [7787, end)
slot update_slots: id  1 | task 1 | prompt processing progress, n_past = 8585, n_tokens = 799, progress = 1.000000
slot update_slots: id  1 | task 1 | prompt done, n_past = 8585, n_tokens = 799
slot update_slots: id  2 | task 2 | tokenizing prompt, len = 1
slot update_slots: id  2 | task 2 | prompt tokenized, n_ctx_slot = 10240, n_keep = 0, n_prompt_tokens = 8457
slot update_slots: id  2 | task 2 | kv cache rm [0, end)
slot update_slots: id  2 | task 2 | prompt processing progress, n_past = 1249, n_tokens = 2048, progress = 0.147688
slot update_slots: id  2 | task 2 | kv cache rm [1249, end)
slot update_slots: id  2 | task 2 | prompt processing progress, n_past = 3295, n_tokens = 2048, progress = 0.389618
slot update_slots: id  2 | task 2 | kv cache rm [3295, end)
slot update_slots: id  2 | task 2 | prompt processing progress, n_past = 5341, n_tokens = 2048, progress = 0.631548
slot update_slots: id  2 | task 2 | kv cache rm [5341, end)
slot update_slots: id  2 | task 2 | prompt processing progress, n_past = 7387, n_tokens = 2048, progress = 0.873478
slot update_slots: id  2 | task 2 | kv cache rm [7387, end)
slot update_slots: id  2 | task 2 | prompt processing progress, n_past = 8457, n_tokens = 1072, progress = 1.000000
slot update_slots: id  2 | task 2 | prompt done, n_past = 8457, n_tokens = 1072
slot update_slots: id  3 | task 3 | tokenizing prompt, len = 1
slot update_slots: id  3 | task 3 | prompt tokenized, n_ctx_slot = 10240, n_keep = 0, n_prompt_tokens = 8562
slot update_slots: id  3 | task 3 | kv cache rm [0, end)
slot update_slots: id  3 | task 3 | prompt processing progress, n_past = 976, n_tokens = 2048, progress = 0.113992
slot update_slots: id  3 | task 3 | kv cache rm [976, end)
slot update_slots: id  3 | task 3 | prompt processing progress, n_past = 3021, n_tokens = 2048, progress = 0.352838
slot update_slots: id  3 | task 3 | kv cache rm [3021, end)
slot update_slots: id  3 | task 3 | prompt processing progress, n_past = 5066, n_tokens = 2048, progress = 0.591684
slot update_slots: id  3 | task 3 | kv cache rm [5066, end)
slot update_slots: id  3 | task 3 | prompt processing progress, n_past = 7111, n_tokens = 2048, progress = 0.830530
slot update_slots: id  3 | task 3 | kv cache rm [7111, end)
slot update_slots: id  3 | task 3 | prompt processing progress, n_past = 8562, n_tokens = 1454, progress = 1.000000
slot update_slots: id  3 | task 3 | prompt done, n_past = 8562, n_tokens = 1454
slot update_slots: id  4 | task 4 | tokenizing prompt, len = 1
slot update_slots: id  4 | task 4 | prompt tokenized, n_ctx_slot = 10240, n_keep = 0, n_prompt_tokens = 8661
slot update_slots: id  4 | task 4 | kv cache rm [0, end)
slot update_slots: id  4 | task 4 | prompt processing progress, n_past = 594, n_tokens = 2048, progress = 0.068583
slot update_slots: id  4 | task 4 | kv cache rm [594, end)
slot update_slots: id  4 | task 4 | prompt processing progress, n_past = 2638, n_tokens = 2048, progress = 0.304584
slot update_slots: id  4 | task 4 | kv cache rm [2638, end)
slot update_slots: id  4 | task 4 | prompt processing progress, n_past = 4682, n_tokens = 2048, progress = 0.540584
slot update_slots: id  4 | task 4 | kv cache rm [4682, end)
slot update_slots: id  4 | task 4 | prompt processing progress, n_past = 6726, n_tokens = 2048, progress = 0.776585
slot update_slots: id  4 | task 4 | kv cache rm [6726, end)
slot update_slots: id  4 | task 4 | prompt processing progress, n_past = 8661, n_tokens = 1939, progress = 1.000000
slot update_slots: id  4 | task 4 | prompt done, n_past = 8661, n_tokens = 1939
slot update_slots: id  5 | task 5 | tokenizing prompt, len = 1
slot update_slots: id  5 | task 5 | prompt tokenized, n_ctx_slot = 10240, n_keep = 0, n_prompt_tokens = 9007
slot update_slots: id  5 | task 5 | kv cache rm [0, end)
slot update_slots: id  5 | task 5 | prompt processing progress, n_past = 109, n_tokens = 2048, progress = 0.012102
slot update_slots: id  5 | task 5 | kv cache rm [109, end)
slot update_slots: id  5 | task 5 | prompt processing progress, n_past = 2152, n_tokens = 2048, progress = 0.238925
slot update_slots: id  5 | task 5 | kv cache rm [2152, end)
slot update_slots: id  5 | task 5 | prompt processing progress, n_past = 4195, n_tokens = 2048, progress = 0.465749
slot update_slots: id  5 | task 5 | kv cache rm [4195, end)
slot update_slots: id  5 | task 5 | prompt processing progress, n_past = 6238, n_tokens = 2048, progress = 0.692572
slot update_slots: id  5 | task 5 | kv cache rm [6238, end)
slot update_slots: id  5 | task 5 | prompt processing progress, n_past = 8281, n_tokens = 2048, progress = 0.919396
slot update_slots: id  5 | task 5 | kv cache rm [8281, end)
slot update_slots: id  5 | task 5 | prompt processing progress, n_past = 9007, n_tokens = 731, progress = 1.000000
slot update_slots: id  5 | task 5 | prompt done, n_past = 9007, n_tokens = 731
slot update_slots: id  6 | task 6 | tokenizing prompt, len = 1
slot update_slots: id  6 | task 6 | prompt tokenized, n_ctx_slot = 10240, n_keep = 0, n_prompt_tokens = 8853
slot update_slots: id  6 | task 6 | kv cache rm [0, end)
slot update_slots: id  6 | task 6 | prompt processing progress, n_past = 1317, n_tokens = 2048, progress = 0.148763
slot update_slots: id  6 | task 6 | kv cache rm [1317, end)
slot update_slots: id  6 | task 6 | prompt processing progress, n_past = 3359, n_tokens = 2048, progress = 0.379419
slot update_slots: id  6 | task 6 | kv cache rm [3359, end)
slot update_slots: id  6 | task 6 | prompt processing progress, n_past = 5401, n_tokens = 2048, progress = 0.610076
slot update_slots: id  6 | task 6 | kv cache rm [5401, end)
slot update_slots: id  6 | task 6 | prompt processing progress, n_past = 7443, n_tokens = 2048, progress = 0.840732
slot update_slots: id  6 | task 6 | kv cache rm [7443, end)
slot update_slots: id  6 | task 6 | prompt processing progress, n_past = 8853, n_tokens = 1416, progress = 1.000000
slot update_slots: id  6 | task 6 | prompt done, n_past = 8853, n_tokens = 1416
slot update_slots: id  7 | task 8 | tokenizing prompt, len = 1
slot update_slots: id  7 | task 8 | prompt tokenized, n_ctx_slot = 10240, n_keep = 0, n_prompt_tokens = 8390
slot update_slots: id  7 | task 8 | kv cache rm [0, end)
slot update_slots: id  7 | task 8 | prompt processing progress, n_past = 632, n_tokens = 2048, progress = 0.075328
slot update_slots: id  7 | task 8 | kv cache rm [632, end)
slot update_slots: id  7 | task 8 | prompt processing progress, n_past = 2673, n_tokens = 2048, progress = 0.318594
slot update_slots: id  7 | task 8 | kv cache rm [2673, end)
slot update_slots: id  7 | task 8 | prompt processing progress, n_past = 4714, n_tokens = 2048, progress = 0.561859
slot update_slots: id  7 | task 8 | kv cache rm [4714, end)
slot update_slots: id  7 | task 8 | prompt processing progress, n_past = 6755, n_tokens = 2048, progress = 0.805125
slot update_slots: id  7 | task 8 | kv cache rm [6755, end)
slot update_slots: id  7 | task 8 | prompt processing progress, n_past = 8390, n_tokens = 1642, progress = 1.000000
slot update_slots: id  7 | task 8 | prompt done, n_past = 8390, n_tokens = 1642
slot update_slots: id  8 | task 9 | tokenizing prompt, len = 1
slot update_slots: id  8 | task 9 | prompt tokenized, n_ctx_slot = 10240, n_keep = 0, n_prompt_tokens = 9446
slot update_slots: id  8 | task 9 | kv cache rm [0, end)
slot update_slots: id  8 | task 9 | prompt processing progress, n_past = 406, n_tokens = 2048, progress = 0.042981
slot update_slots: id  8 | task 9 | kv cache rm [406, end)
slot update_slots: id  8 | task 9 | prompt processing progress, n_past = 2446, n_tokens = 2048, progress = 0.258946
slot update_slots: id  8 | task 9 | kv cache rm [2446, end)
slot update_slots: id  8 | task 9 | prompt processing progress, n_past = 4486, n_tokens = 2048, progress = 0.474910
slot update_slots: id  8 | task 9 | kv cache rm [4486, end)
slot update_slots: id  8 | task 9 | prompt processing progress, n_past = 6526, n_tokens = 2048, progress = 0.690874
slot update_slots: id  8 | task 9 | kv cache rm [6526, end)
slot update_slots: id  8 | task 9 | prompt processing progress, n_past = 8566, n_tokens = 2048, progress = 0.906839
slot update_slots: id  8 | task 9 | kv cache rm [8566, end)
slot update_slots: id  8 | task 9 | prompt processing progress, n_past = 9446, n_tokens = 888, progress = 1.000000
slot update_slots: id  8 | task 9 | prompt done, n_past = 9446, n_tokens = 888
slot update_slots: id  9 | task 10 | tokenizing prompt, len = 1
slot update_slots: id  9 | task 10 | prompt tokenized, n_ctx_slot = 10240, n_keep = 0, n_prompt_tokens = 9213
slot update_slots: id  9 | task 10 | kv cache rm [0, end)
slot update_slots: id  9 | task 10 | prompt processing progress, n_past = 1160, n_tokens = 2048, progress = 0.125909
slot update_slots: id  9 | task 10 | kv cache rm [1160, end)
slot update_slots: id  9 | task 10 | prompt processing progress, n_past = 3199, n_tokens = 2048, progress = 0.347227
slot update_slots: id  9 | task 10 | kv cache rm [3199, end)
slot update_slots: id  9 | task 10 | prompt processing progress, n_past = 5238, n_tokens = 2048, progress = 0.568544
slot update_slots: id  9 | task 10 | kv cache rm [5238, end)
slot update_slots: id  9 | task 10 | prompt processing progress, n_past = 7277, n_tokens = 2048, progress = 0.789862
slot update_slots: id  9 | task 10 | kv cache rm [7277, end)
slot update_slots: id  9 | task 10 | prompt processing progress, n_past = 9213, n_tokens = 1945, progress = 1.000000
slot update_slots: id  9 | task 10 | prompt done, n_past = 9213, n_tokens = 1945
slot      release: id  7 | task 8 | stop processing: n_past = 8725, truncated = 0
slot print_timing: id  7 | task 8 |
prompt eval time =   35711.84 ms /  8390 tokens (    4.26 ms per token,   234.94 tokens per second)
       eval time =  117466.70 ms /   336 tokens (  349.60 ms per token,     2.86 tokens per second)
      total time =  153178.54 ms /  8726 tokens
request: POST /completion 127.0.0.1 200
slot launch_slot_: id  7 | task 380 | processing task
slot update_slots: id  7 | task 380 | tokenizing prompt, len = 1
slot update_slots: id  7 | task 380 | prompt tokenized, n_ctx_slot = 10240, n_keep = 0, n_prompt_tokens = 9022
slot update_slots: id  7 | task 380 | kv cache rm [0, end)
slot update_slots: id  7 | task 380 | prompt processing progress, n_past = 2039, n_tokens = 2048, progress = 0.226003
slot update_slots: id  7 | task 380 | kv cache rm [2039, end)
slot update_slots: id  7 | task 380 | prompt processing progress, n_past = 4078, n_tokens = 2048, progress = 0.452006
slot update_slots: id  7 | task 380 | kv cache rm [4078, end)
slot update_slots: id  7 | task 380 | prompt processing progress, n_past = 6117, n_tokens = 2048, progress = 0.678009
slot update_slots: id  7 | task 380 | kv cache rm [6117, end)
slot update_slots: id  7 | task 380 | prompt processing progress, n_past = 8156, n_tokens = 2048, progress = 0.904012
slot update_slots: id  7 | task 380 | kv cache rm [8156, end)
slot update_slots: id  7 | task 380 | prompt processing progress, n_past = 9022, n_tokens = 875, progress = 1.000000
slot update_slots: id  7 | task 380 | prompt done, n_past = 9022, n_tokens = 875
slot      release: id  3 | task 3 | stop processing: n_past = 8929, truncated = 0
slot print_timing: id  3 | task 3 |
prompt eval time =   20520.58 ms /  8562 tokens (    2.40 ms per token,   417.24 tokens per second)
       eval time =  268091.24 ms /   368 tokens (  728.51 ms per token,     1.37 tokens per second)
      total time =  288611.81 ms /  8930 tokens
request: POST /completion 127.0.0.1 200
slot launch_slot_: id  3 | task 396 | processing task
slot update_slots: id  3 | task 396 | tokenizing prompt, len = 1
slot update_slots: id  3 | task 396 | prompt tokenized, n_ctx_slot = 10240, n_keep = 0, n_prompt_tokens = 8835
slot update_slots: id  3 | task 396 | kv cache rm [0, end)
slot update_slots: id  3 | task 396 | prompt processing progress, n_past = 2039, n_tokens = 2048, progress = 0.230787
slot update_slots: id  3 | task 396 | kv cache rm [2039, end)
slot update_slots: id  3 | task 396 | prompt processing progress, n_past = 4078, n_tokens = 2048, progress = 0.461573
slot update_slots: id  3 | task 396 | kv cache rm [4078, end)
slot update_slots: id  3 | task 396 | prompt processing progress, n_past = 6117, n_tokens = 2048, progress = 0.692360
slot update_slots: id  3 | task 396 | kv cache rm [6117, end)
slot update_slots: id  3 | task 396 | prompt processing progress, n_past = 8156, n_tokens = 2048, progress = 0.923147
slot update_slots: id  3 | task 396 | kv cache rm [8156, end)
slot update_slots: id  3 | task 396 | prompt processing progress, n_past = 8835, n_tokens = 688, progress = 1.000000
slot update_slots: id  3 | task 396 | prompt done, n_past = 8835, n_tokens = 688
slot      release: id  5 | task 5 | stop processing: n_past = 9506, truncated = 0
slot print_timing: id  5 | task 5 |
prompt eval time =   32731.03 ms /  9007 tokens (    3.63 ms per token,   275.18 tokens per second)
       eval time =  289810.29 ms /   500 tokens (  579.62 ms per token,     1.73 tokens per second)
      total time =  322541.32 ms /  9507 tokens
request: POST /completion 127.0.0.1 200
slot launch_slot_: id  5 | task 538 | processing task
slot update_slots: id  5 | task 538 | tokenizing prompt, len = 1
slot update_slots: id  5 | task 538 | prompt tokenized, n_ctx_slot = 10240, n_keep = 0, n_prompt_tokens = 8856
slot update_slots: id  5 | task 538 | kv cache rm [0, end)
slot update_slots: id  5 | task 538 | prompt processing progress, n_past = 2039, n_tokens = 2048, progress = 0.230239
slot update_slots: id  5 | task 538 | kv cache rm [2039, end)
slot update_slots: id  5 | task 538 | prompt processing progress, n_past = 4078, n_tokens = 2048, progress = 0.460479
slot update_slots: id  5 | task 538 | kv cache rm [4078, end)
slot update_slots: id  5 | task 538 | prompt processing progress, n_past = 6117, n_tokens = 2048, progress = 0.690718
slot update_slots: id  5 | task 538 | kv cache rm [6117, end)
slot update_slots: id  5 | task 538 | prompt processing progress, n_past = 8156, n_tokens = 2048, progress = 0.920958
slot update_slots: id  5 | task 538 | kv cache rm [8156, end)
slot update_slots: id  5 | task 538 | prompt processing progress, n_past = 8856, n_tokens = 709, progress = 1.000000
slot update_slots: id  5 | task 538 | prompt done, n_past = 8856, n_tokens = 709
slot      release: id  0 | task 0 | stop processing: n_past = 9143, truncated = 0
slot print_timing: id  0 | task 0 |
prompt eval time =    3965.86 ms /  8594 tokens (    0.46 ms per token,  2167.00 tokens per second)
       eval time =  428741.08 ms /   550 tokens (  779.53 ms per token,     1.28 tokens per second)
      total time =  432706.94 ms /  9144 tokens
request: POST /completion 127.0.0.1 200
slot launch_slot_: id  0 | task 568 | processing task
slot update_slots: id  0 | task 568 | tokenizing prompt, len = 1
slot update_slots: id  0 | task 568 | prompt tokenized, n_ctx_slot = 10240, n_keep = 0, n_prompt_tokens = 8886
slot update_slots: id  0 | task 568 | kv cache rm [0, end)
slot update_slots: id  0 | task 568 | prompt processing progress, n_past = 2039, n_tokens = 2048, progress = 0.229462
slot update_slots: id  0 | task 568 | kv cache rm [2039, end)
slot update_slots: id  0 | task 568 | prompt processing progress, n_past = 4078, n_tokens = 2048, progress = 0.458924
slot update_slots: id  0 | task 568 | kv cache rm [4078, end)
slot update_slots: id  0 | task 568 | prompt processing progress, n_past = 6117, n_tokens = 2048, progress = 0.688386
slot update_slots: id  0 | task 568 | kv cache rm [6117, end)
slot update_slots: id  0 | task 568 | prompt processing progress, n_past = 8156, n_tokens = 2048, progress = 0.917848
slot update_slots: id  0 | task 568 | kv cache rm [8156, end)
slot update_slots: id  0 | task 568 | prompt processing progress, n_past = 8886, n_tokens = 739, progress = 1.000000
slot update_slots: id  0 | task 568 | prompt done, n_past = 8886, n_tokens = 739
slot      release: id  1 | task 1 | stop processing: n_past = 9194, truncated = 0
slot print_timing: id  1 | task 1 |
prompt eval time =    8207.36 ms /  8585 tokens (    0.96 ms per token,  1046.01 tokens per second)
       eval time =  479258.13 ms /   610 tokens (  785.67 ms per token,     1.27 tokens per second)
      total time =  487465.50 ms /  9195 tokens
request: POST /completion 127.0.0.1 200
slot launch_slot_: id  1 | task 633 | processing task
slot update_slots: id  1 | task 633 | tokenizing prompt, len = 1
slot update_slots: id  1 | task 633 | prompt tokenized, n_ctx_slot = 10240, n_keep = 0, n_prompt_tokens = 8639
slot update_slots: id  1 | task 633 | kv cache rm [0, end)
slot update_slots: id  1 | task 633 | prompt processing progress, n_past = 2039, n_tokens = 2048, progress = 0.236023
slot update_slots: id  1 | task 633 | kv cache rm [2039, end)
slot update_slots: id  1 | task 633 | prompt processing progress, n_past = 4078, n_tokens = 2048, progress = 0.472045
slot update_slots: id  1 | task 633 | kv cache rm [4078, end)
slot update_slots: id  1 | task 633 | prompt processing progress, n_past = 6117, n_tokens = 2048, progress = 0.708068
slot update_slots: id  1 | task 633 | kv cache rm [6117, end)
slot update_slots: id  1 | task 633 | prompt processing progress, n_past = 8156, n_tokens = 2048, progress = 0.944091
slot update_slots: id  1 | task 633 | kv cache rm [8156, end)
slot update_slots: id  1 | task 633 | prompt processing progress, n_past = 8639, n_tokens = 492, progress = 1.000000
slot update_slots: id  1 | task 633 | prompt done, n_past = 8639, n_tokens = 492
slot      release: id  9 | task 10 | stop processing: n_past = 9834, truncated = 0
slot print_timing: id  9 | task 10 |
prompt eval time =   39605.11 ms /  9213 tokens (    4.30 ms per token,   232.62 tokens per second)
       eval time =  332654.16 ms /   622 tokens (  534.81 ms per token,     1.87 tokens per second)
      total time =  372259.28 ms /  9835 tokens
request: POST /completion 127.0.0.1 200
slot launch_slot_: id  9 | task 680 | processing task
slot update_slots: id  9 | task 680 | tokenizing prompt, len = 1
slot update_slots: id  9 | task 680 | prompt tokenized, n_ctx_slot = 10240, n_keep = 0, n_prompt_tokens = 8213
slot update_slots: id  9 | task 680 | kv cache rm [0, end)
slot update_slots: id  9 | task 680 | prompt processing progress, n_past = 2039, n_tokens = 2048, progress = 0.248265
slot update_slots: id  9 | task 680 | kv cache rm [2039, end)
slot update_slots: id  9 | task 680 | prompt processing progress, n_past = 4078, n_tokens = 2048, progress = 0.496530
slot update_slots: id  9 | task 680 | kv cache rm [4078, end)
slot update_slots: id  9 | task 680 | prompt processing progress, n_past = 6117, n_tokens = 2048, progress = 0.744795
slot update_slots: id  9 | task 680 | kv cache rm [6117, end)
slot update_slots: id  9 | task 680 | prompt processing progress, n_past = 8156, n_tokens = 2048, progress = 0.993060
slot update_slots: id  9 | task 680 | kv cache rm [8156, end)
slot update_slots: id  9 | task 680 | prompt processing progress, n_past = 8213, n_tokens = 66, progress = 1.000000
slot update_slots: id  9 | task 680 | prompt done, n_past = 8213, n_tokens = 66
slot      release: id  7 | task 380 | stop processing: n_past = 9435, truncated = 0
slot print_timing: id  7 | task 380 |
prompt eval time =   43106.06 ms /  9022 tokens (    4.78 ms per token,   209.30 tokens per second)
       eval time =  304459.27 ms /   414 tokens (  735.41 ms per token,     1.36 tokens per second)
      total time =  347565.33 ms /  9436 tokens
request: POST /completion 127.0.0.1 200
slot launch_slot_: id  7 | task 805 | processing task
slot update_slots: id  7 | task 805 | tokenizing prompt, len = 1
slot update_slots: id  7 | task 805 | prompt tokenized, n_ctx_slot = 10240, n_keep = 0, n_prompt_tokens = 8092
slot update_slots: id  7 | task 805 | kv cache rm [0, end)
slot update_slots: id  7 | task 805 | prompt processing progress, n_past = 2039, n_tokens = 2048, progress = 0.251977
slot update_slots: id  7 | task 805 | kv cache rm [2039, end)
slot update_slots: id  7 | task 805 | prompt processing progress, n_past = 4078, n_tokens = 2048, progress = 0.503955
slot update_slots: id  7 | task 805 | kv cache rm [4078, end)
slot update_slots: id  7 | task 805 | prompt processing progress, n_past = 6117, n_tokens = 2048, progress = 0.755932
slot update_slots: id  7 | task 805 | kv cache rm [6117, end)
slot update_slots: id  7 | task 805 | prompt processing progress, n_past = 8092, n_tokens = 1984, progress = 1.000000
slot update_slots: id  7 | task 805 | prompt done, n_past = 8092, n_tokens = 1984
slot      release: id  6 | task 6 | stop processing: n_past = 9620, truncated = 0
slot print_timing: id  6 | task 6 |
prompt eval time =   30677.54 ms /  8853 tokens (    3.47 ms per token,   288.58 tokens per second)
       eval time =  534281.44 ms /   768 tokens (  695.68 ms per token,     1.44 tokens per second)
      total time =  564958.98 ms /  9621 tokens
request: POST /completion 127.0.0.1 200
slot      release: id  7 | task 805 | stop processing: n_past = 8097, truncated = 0
slot print_timing: id  7 | task 805 |
prompt eval time =   39857.22 ms /  8092 tokens (    4.93 ms per token,   203.02 tokens per second)
       eval time =     749.29 ms /     6 tokens (  124.88 ms per token,     8.01 tokens per second)
      total time =   40606.51 ms /  8098 tokens
request: POST /completion 127.0.0.1 200
slot launch_slot_: id  6 | task 815 | processing task
slot update_slots: id  6 | task 815 | tokenizing prompt, len = 1
slot update_slots: id  6 | task 815 | prompt tokenized, n_ctx_slot = 10240, n_keep = 0, n_prompt_tokens = 8638
slot update_slots: id  6 | task 815 | kv cache rm [0, end)
slot update_slots: id  6 | task 815 | prompt processing progress, n_past = 2040, n_tokens = 2048, progress = 0.236166
slot launch_slot_: id  7 | task 817 | processing task
slot update_slots: id  6 | task 815 | kv cache rm [2040, end)
slot update_slots: id  6 | task 815 | prompt processing progress, n_past = 4080, n_tokens = 2048, progress = 0.472332
slot update_slots: id  6 | task 815 | kv cache rm [4080, end)
slot update_slots: id  6 | task 815 | prompt processing progress, n_past = 6120, n_tokens = 2048, progress = 0.708497
slot update_slots: id  6 | task 815 | kv cache rm [6120, end)
slot update_slots: id  6 | task 815 | prompt processing progress, n_past = 8160, n_tokens = 2048, progress = 0.944663
slot update_slots: id  6 | task 815 | kv cache rm [8160, end)
slot update_slots: id  6 | task 815 | prompt processing progress, n_past = 8638, n_tokens = 486, progress = 1.000000
slot update_slots: id  6 | task 815 | prompt done, n_past = 8638, n_tokens = 486
slot update_slots: id  7 | task 817 | tokenizing prompt, len = 1
slot update_slots: id  7 | task 817 | prompt tokenized, n_ctx_slot = 10240, n_keep = 0, n_prompt_tokens = 8983
slot update_slots: id  7 | task 817 | kv cache rm [0, end)
slot update_slots: id  7 | task 817 | prompt processing progress, n_past = 1562, n_tokens = 2048, progress = 0.173884
slot update_slots: id  7 | task 817 | kv cache rm [1562, end)
slot update_slots: id  7 | task 817 | prompt processing progress, n_past = 3601, n_tokens = 2048, progress = 0.400868
slot update_slots: id  7 | task 817 | kv cache rm [3601, end)
slot update_slots: id  7 | task 817 | prompt processing progress, n_past = 5640, n_tokens = 2048, progress = 0.627853
slot update_slots: id  7 | task 817 | kv cache rm [5640, end)
slot update_slots: id  7 | task 817 | prompt processing progress, n_past = 7679, n_tokens = 2048, progress = 0.854837
slot update_slots: id  7 | task 817 | kv cache rm [7679, end)
slot update_slots: id  7 | task 817 | prompt processing progress, n_past = 8983, n_tokens = 1313, progress = 1.000000
slot update_slots: id  7 | task 817 | prompt done, n_past = 8983, n_tokens = 1313
slot      release: id  0 | task 568 | stop processing: n_past = 9146, truncated = 0
slot print_timing: id  0 | task 568 |
prompt eval time =   48338.12 ms /  8886 tokens (    5.44 ms per token,   183.83 tokens per second)
       eval time =  266315.21 ms /   261 tokens ( 1020.36 ms per token,     0.98 tokens per second)
      total time =  314653.33 ms /  9147 tokens
request: POST /completion 127.0.0.1 200
slot launch_slot_: id  0 | task 840 | processing task
slot update_slots: id  0 | task 840 | tokenizing prompt, len = 1
slot update_slots: id  0 | task 840 | prompt tokenized, n_ctx_slot = 10240, n_keep = 0, n_prompt_tokens = 9200
slot update_slots: id  0 | task 840 | kv cache rm [0, end)
slot update_slots: id  0 | task 840 | prompt processing progress, n_past = 2039, n_tokens = 2048, progress = 0.221630
slot update_slots: id  0 | task 840 | kv cache rm [2039, end)
slot update_slots: id  0 | task 840 | prompt processing progress, n_past = 4078, n_tokens = 2048, progress = 0.443261
slot update_slots: id  0 | task 840 | kv cache rm [4078, end)
slot update_slots: id  0 | task 840 | prompt processing progress, n_past = 6117, n_tokens = 2048, progress = 0.664891
slot      release: id  2 | task 2 | stop processing: n_past = 9268, truncated = 0
slot print_timing: id  2 | task 2 |
prompt eval time =   11974.73 ms /  8457 tokens (    1.42 ms per token,   706.24 tokens per second)
       eval time =  760456.20 ms /   812 tokens (  936.52 ms per token,     1.07 tokens per second)
      total time =  772430.93 ms /  9269 tokens
request: POST /completion 127.0.0.1 200
slot update_slots: id  0 | task 840 | kv cache rm [6117, end)
slot update_slots: id  0 | task 840 | prompt processing progress, n_past = 8157, n_tokens = 2048, progress = 0.886630
srv  update_slots: failed to find free space in the KV cache, retrying with smaller batch size - try increasing it via the context size or enable defragmentation, i = -1024, n_batch = 1024, ret = 1
srv  update_slots: failed to find free space in the KV cache, retrying with smaller batch size - try increasing it via the context size or enable defragmentation, i = -512, n_batch = 512, ret = 1
srv  update_slots: failed to find free space in the KV cache, retrying with smaller batch size - try increasing it via the context size or enable defragmentation, i = -256, n_batch = 256, ret = 1
slot launch_slot_: id  2 | task 845 | processing task
slot update_slots: id  0 | task 840 | kv cache rm [8157, end)
slot update_slots: id  0 | task 840 | prompt processing progress, n_past = 9200, n_tokens = 1051, progress = 1.000000
slot update_slots: id  0 | task 840 | prompt done, n_past = 9200, n_tokens = 1051
slot update_slots: id  2 | task 845 | tokenizing prompt, len = 1
slot update_slots: id  2 | task 845 | prompt tokenized, n_ctx_slot = 10240, n_keep = 0, n_prompt_tokens = 9283
slot update_slots: id  2 | task 845 | kv cache rm [0, end)
slot update_slots: id  2 | task 845 | prompt processing progress, n_past = 997, n_tokens = 2048, progress = 0.107401
slot update_slots: id  2 | task 845 | kv cache rm [997, end)
slot update_slots: id  2 | task 845 | prompt processing progress, n_past = 3036, n_tokens = 2048, progress = 0.327049
slot update_slots: id  2 | task 845 | kv cache rm [3036, end)
slot update_slots: id  2 | task 845 | prompt processing progress, n_past = 5075, n_tokens = 2048, progress = 0.546698
slot update_slots: id  2 | task 845 | kv cache rm [5075, end)
slot update_slots: id  2 | task 845 | prompt processing progress, n_past = 7114, n_tokens = 2048, progress = 0.766347
slot update_slots: id  2 | task 845 | kv cache rm [7114, end)
slot update_slots: id  2 | task 845 | prompt processing progress, n_past = 9153, n_tokens = 2048, progress = 0.985996
srv  update_slots: failed to find free space in the KV cache, retrying with smaller batch size - try increasing it via the context size or enable defragmentation, i = -1024, n_batch = 1024, ret = 1
srv  update_slots: failed to find free space in the KV cache, retrying with smaller batch size - try increasing it via the context size or enable defragmentation, i = -512, n_batch = 512, ret = 1
srv  update_slots: failed to find free space in the KV cache, retrying with smaller batch size - try increasing it via the context size or enable defragmentation, i = -256, n_batch = 256, ret = 1
slot update_slots: id  2 | task 845 | kv cache rm [9153, end)
slot update_slots: id  2 | task 845 | prompt processing progress, n_past = 9283, n_tokens = 139, progress = 1.000000
slot update_slots: id  2 | task 845 | prompt done, n_past = 9283, n_tokens = 139
slot update_slots: id  8 | task 9 | slot context shift, n_keep = 0, n_left = 10239, n_discard = 5119
/build/source/ggml/src/ggml-cuda.cu:70: CUDA error
CUDA error: an illegal memory access was encountered
  current device: 0, in function ggml_backend_cuda_synchronize at /build/source/ggml/src/ggml-cuda.cu:2446
  cudaStreamSynchronize(cuda_ctx->stream())
Aborted

Backtrace:

Program terminated with signal SIGABRT, Aborted.
#0  __pthread_kill_implementation (threadid=<optimized out>, signo=signo@entry=6, no_tid=no_tid@entry=0) at pthread_kill.c:44
44        return INTERNAL_SYSCALL_ERROR_P (ret) ? INTERNAL_SYSCALL_ERRNO (ret) : 0;
[Current thread is 1 (Thread 0x7ffff7e4d000 (LWP 1118265))]
(gdb) bt
#0  __pthread_kill_implementation (threadid=<optimized out>, signo=signo@entry=6, no_tid=no_tid@entry=0) at pthread_kill.c:44
#1  0x00007ffff2a9b843 in __pthread_kill_internal (signo=6, threadid=<optimized out>) at pthread_kill.c:78
#2  0x00007ffff2a49516 in __GI_raise (sig=sig@entry=6) at ../sysdeps/posix/raise.c:26
#3  0x00007ffff2a31935 in __GI_abort () at abort.c:79
#4  0x00007ffff3041695 in ggml_abort (file=0x7ffff353dab8 "/build/source/ggml/src/ggml-cuda.cu", line=70, fmt=0x7ffff353daad "CUDA error") at /build/source/ggml/src/ggml.c:305
#5  0x00007ffff317e066 in ggml_cuda_error (stmt=0x7ffff353fd60 "cudaStreamSynchronize(cuda_ctx->stream())", func=0x7ffff353fd41 "ggml_backend_cuda_synchronize",
    file=0x7ffff353dab8 "/build/source/ggml/src/ggml-cuda.cu", line=2446, msg=0x7ffff268db00 "an illegal memory access was encountered") at /build/source/ggml/src/ggml-cuda.cu:70
#6  0x00007ffff3187ee7 in ggml_backend_cuda_synchronize (backend=0x16d6010) at /build/source/ggml/src/ggml-cuda.cu:2446
#7  0x00007ffff3096018 in ggml_backend_synchronize (backend=0x16d6010) at /build/source/ggml/src/ggml-backend.cpp:287
#8  0x00007ffff309c6db in ggml_backend_sched_synchronize (sched=0x14028d0) at /build/source/ggml/src/ggml-backend.cpp:2350
#9  0x00007ffff309c4f5 in ggml_backend_sched_reserve (sched=0x14028d0, measure_graph=0x16d60e0) at /build/source/ggml/src/ggml-backend.cpp:2302
#10 0x00007ffff7a872ee in llama_kv_cache_update_internal (lctx=...) at /build/source/src/llama.cpp:17891
#11 0x00007ffff7a9256b in llama_kv_cache_update (ctx=0x13ed3f0) at /build/source/src/llama.cpp:20123
#12 0x00007ffff7a850b0 in llama_decode_internal (lctx=..., batch_all=...) at /build/source/src/llama.cpp:17248
#13 0x00007ffff7a93f4d in llama_decode (ctx=0x13ed3f0, batch=...) at /build/source/src/llama.cpp:21200
#14 0x00000000004cccdd in server_context::update_slots (this=0x7fffffff9e40) at /build/source/examples/server/server.cpp:2292
#15 0x00000000005754f7 in std::__invoke_impl<void, void (server_context::*&)(), server_context*&> (
    __f=@0x4f1f610: (void (server_context::*)(struct server_context * const)) 0x4c9a02 <server_context::update_slots()>, __t=@0x4f1f620: 0x7fffffff9e40)
    at /nix/store/6mmwy4jcnqnhms3i56r1hbdn656akg1d-gcc-13.3.0/include/c++/13.3.0/bits/invoke.h:74
#16 0x0000000000568d19 in std::__invoke<void (server_context::*&)(), server_context*&> (
    __fn=@0x4f1f610: (void (server_context::*)(struct server_context * const)) 0x4c9a02 <server_context::update_slots()>)
    at /nix/store/6mmwy4jcnqnhms3i56r1hbdn656akg1d-gcc-13.3.0/include/c++/13.3.0/bits/invoke.h:96
#17 0x00000000005589ef in std::_Bind<void (server_context::*(server_context*))()>::__call<void, , 0ul>(std::tuple<>&&, std::_Index_tuple<0ul>) (this=0x4f1f610, __args=...)
    at /nix/store/6mmwy4jcnqnhms3i56r1hbdn656akg1d-gcc-13.3.0/include/c++/13.3.0/functional:506
#18 0x000000000054b721 in std::_Bind<void (server_context::*(server_context*))()>::operator()<, void>() (this=0x4f1f610)
    at /nix/store/6mmwy4jcnqnhms3i56r1hbdn656akg1d-gcc-13.3.0/include/c++/13.3.0/functional:591
#19 0x00000000005389ca in std::__invoke_impl<void, std::_Bind<void (server_context::*(server_context*))()>&>(std::__invoke_other, std::_Bind<void (server_context::*(server_context*))()>&) (__f=...) at /nix/store/6mmwy4jcnqnhms3i56r1hbdn656akg1d-gcc-13.3.0/include/c++/13.3.0/bits/invoke.h:61
#20 0x000000000052602e in std::__invoke_r<void, std::_Bind<void (server_context::*(server_context*))()>&>(std::_Bind<void (server_context::*(server_context*))()>&) (__fn=...)
    at /nix/store/6mmwy4jcnqnhms3i56r1hbdn656akg1d-gcc-13.3.0/include/c++/13.3.0/bits/invoke.h:111
#21 0x000000000050a917 in std::_Function_handler<void (), std::_Bind<void (server_context::*(server_context*))()> >::_M_invoke(std::_Any_data const&) (__functor=...)
    at /nix/store/6mmwy4jcnqnhms3i56r1hbdn656akg1d-gcc-13.3.0/include/c++/13.3.0/bits/std_function.h:290
#22 0x00000000004d1896 in std::function<void()>::operator() (this=0x7fffffffb1a8)
    at /nix/store/6mmwy4jcnqnhms3i56r1hbdn656akg1d-gcc-13.3.0/include/c++/13.3.0/bits/std_function.h:591
#23 0x00000000004b4ffd in server_queue::start_loop (this=0x7fffffffb088) at /build/source/examples/server/server.cpp:504
#24 0x000000000048c3c5 in main (argc=17, argv=0x7fffffffb428) at /build/source/examples/server/server.cpp:3402
slaren commented 10 hours ago

Can you try running this under compute-sanitizer? It is part of the CUDA toolkit, and it would show which kernel causes the invalid memory access. It may be caused by the KV shift with quantized cache.
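
For example, with the same server arguments as before (memcheck is the default tool, so --tool memcheck is optional):

$ compute-sanitizer --tool memcheck llama-server -c 102400 -ngl 100 \
      -m Replete-LLM-V2.5-Qwen-14b-IQ3_M.gguf --chat-template chatml \
      --check-tensors -ctk q8_0 -ctv q8_0 -fa --parallel 10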

sliedes commented 10 hours ago

> It clearly made it less common, but I did see a crash with essentially the same backtrace after a couple of "failed to find free space in the KV cache" messages and a "slot context shift" (and without any client disconnecting spuriously). Should I open a new bug report?

The "failed to find free space" and "slot context shift" seem to be red herrings; I managed to reproduce this with only 4 connections instead of 10 to the server, and without it outputting either of those messages.

I'll try compute-sanitizer next, but that will have to wait until tomorrow :)