ggerganov / llama.cpp

LLM inference in C/C++
MIT License

Mac M1 series bug #5236

Closed zyxcambridge closed 9 months ago

zyxcambridge commented 9 months ago

```
llm_load_print_meta: n_embd_head_k = 128
llm_load_print_meta: n_embd_head_v = 128
llm_load_print_meta: n_gqa = 7
llm_load_print_meta: n_embd_k_gqa = 1024
llm_load_print_meta: n_embd_v_gqa = 1024
llm_load_print_meta: f_norm_eps = 0.0e+00
llm_load_print_meta: f_norm_rms_eps = 1.0e-05
llm_load_print_meta: f_clamp_kqv = 0.0e+00
llm_load_print_meta: f_max_alibi_bias = 0.0e+00
llm_load_print_meta: n_ff = 20480
llm_load_print_meta: n_expert = 0
llm_load_print_meta: n_expert_used = 0
llm_load_print_meta: rope scaling = linear
llm_load_print_meta: freq_base_train = 5000000.0
llm_load_print_meta: freq_scale_train = 1
llm_load_print_meta: n_yarn_orig_ctx = 4096
llm_load_print_meta: rope_finetuned = unknown
llm_load_print_meta: model type = 30B
llm_load_print_meta: model ftype = Q8_0
llm_load_print_meta: model params = 34.39 B
llm_load_print_meta: model size = 34.03 GiB (8.50 BPW)
llm_load_print_meta: general.name = LLaMA v2
llm_load_print_meta: BOS token = 1 '<|startoftext|>'
llm_load_print_meta: EOS token = 2 '<|endoftext|>'
llm_load_print_meta: UNK token = 0 ''
llm_load_print_meta: PAD token = 0 ''
llm_load_print_meta: LF token = 315 '<0x0A>'
llm_load_tensors: ggml ctx size = 0.41 MiB
ggml_backend_metal_buffer_from_ptr: error: failed to allocate buffer, size = 0.00 MiB
llama_model_load: error loading model: failed to allocate buffer
llama_load_model_from_file: failed to load model
llama_init_from_gpt_params: error: failed to load model '/Users/zhangyixin/Desktop/llama.cpp/TheBloke/Yi-34B-Chat-GGUF/yi-34b-chat.Q8_0.gguf'
main: error: unable to load model
(base) zhangyixin@zhangyixin llama.cpp % hist
zsh: command not found: hist
(base) zhangyixin@zhangyixin llama.cpp % history
 1010  ls
 1011  code .
 1012  meta-llama/Llama-2-13b-hf
 1013  huggingface-cli download --token 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 --resume-download --local-dir-use-symlinks False meta-llama/Llama-2-13b-hf
 1014  ikawrakow/qwen-14b-chat-gguf
 1015  huggingface-cli download --token YOUR_TOKEN --resume-download --local-dir-use-symlinks False ikawrakow/qwen-14b-chat-gguf --include "qwen-14b-chat-q5-0.gguf" --local-dir ikawrakow/qwen-14b-chat-gguf\n
 1016  cd ../llama.cpp
 1017  huggingface-cli download --token YOUR_TOKEN --resume-download --local-dir-use-symlinks False ikawrakow/qwen-14b-chat-gguf --include "qwen-14b-chat-q5-0.gguf" --local-dir ikawrakow/qwen-14b-chat-gguf\n
 1018  huggingface-cli download --token YOUR_TOKEN --resume-download --local-dir-use-symlinks False TheBloke/Yi-34B-Chat-GGUF --include "yi-34b-chat.Q8_0.gguf" --local-dir TheBloke/Yi-34B-Chat-GGUF\n
 1019  huggingface-cli download --token YOUR_TOKEN --resume-download --local-dir-use-symlinks False TheBloke/Yi-34B-Chat-GGUF --include "yi-34b-chat.Q8_0.gguf" --local-dir TheBloke/Yi-34B-Chat-GGUF\n
 1020  huggingface-cli download --token YOUR_TOKEN --resume-download --local-dir-use-symlinks False TheBloke/Yi-34B-Chat-GGUF --include "yi-34b-chat.Q8_0.gguf" --local-dir TheBloke/Yi-34B-Chat-GGUF\n
 1021  huggingface-cli download --token YOUR_TOKEN --resume-download --local-dir-use-symlinks False TheBloke/Yi-34B-Chat-GGUF --include "yi-34b-chat.Q8_0.gguf" --local-dir TheBloke/Yi-34B-Chat-GGUF\n
 1022  chmod +x /Users/zhangyixin/Desktop/llama.cpp/TheBloke/Yi-34B-Chat-GGUF/yi-34b-chat.Q8_0.gguf
 1023  /Users/zhangyixin/Desktop/llama.cpp/TheBloke/Yi-34B-Chat-GGUF/yi-34b-chat.Q8_0.gguf
 1024  ./main --frequency-penalty 0.5 --frequency-penalty 0.5 --top-k 5 --top-p 0.9 -m /Users/zhangyixin/Desktop/llama.cpp/TheBloke/Yi-34B-Chat-GGUF/yi-34b-chat.Q8_0.gguf -p "Building a website can be done in 10 simple steps:\nStep 1:" -n 400 -e\n
 1025  hist
```
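The interesting line in every trace in this thread is `ggml_backend_metal_buffer_from_ptr: error: failed to allocate buffer, size = 0.00 MiB`: the Metal backend is asked for a zero-byte buffer, which points at the data being mapped rather than at running out of memory. A quick way to separate the two is to keep everything on the CPU; this is a sketch, assuming the `-ngl/--n-gpu-layers` flag and the `LLAMA_NO_METAL` Makefile switch present in builds of this era:

```sh
# Untested sketch: force a CPU-only run to isolate the Metal backend.
# If the model loads with -ngl 0, the bug is in the Metal buffer path;
# if it still fails, suspect the GGUF file itself.
./main -ngl 0 -m TheBloke/Yi-34B-Chat-GGUF/yi-34b-chat.Q8_0.gguf \
       -p "Building a website can be done in 10 simple steps:\nStep 1:" -n 64

# Or rebuild without Metal entirely (Makefile option on Apple silicon):
make clean && LLAMA_NO_METAL=1 make -j
```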


```
llama_model_loader: - kv  12: tokenizer.ggml.tokens arr[str,32000] = ["", "", "", "<0x00>", "<...
llama_model_loader: - kv  13: tokenizer.ggml.scores arr[f32,32000] = [0.000000, 0.000000, 0.000000, 0.0000...
llama_model_loader: - kv  14: tokenizer.ggml.token_type arr[i32,32000] = [2, 3, 3, 6, 6, 6, 6, 6, 6, 6, 6, 6, ...
llama_model_loader: - kv  15: tokenizer.ggml.bos_token_id u32 = 1
llama_model_loader: - kv  16: tokenizer.ggml.eos_token_id u32 = 2
llama_model_loader: - kv  17: tokenizer.ggml.unknown_token_id u32 = 0
llama_model_loader: - kv  18: general.quantization_version u32 = 2
llama_model_loader: - type  f32:   65 tensors
llama_model_loader: - type q8_0:  226 tensors
llm_load_vocab: special tokens definition check successful ( 259/32000 ).
llm_load_print_meta: format = GGUF V2
llm_load_print_meta: arch = llama
llm_load_print_meta: vocab type = SPM
llm_load_print_meta: n_vocab = 32000
llm_load_print_meta: n_merges = 0
llm_load_print_meta: n_ctx_train = 4096
llm_load_print_meta: n_embd = 4096
llm_load_print_meta: n_head = 32
llm_load_print_meta: n_head_kv = 32
llm_load_print_meta: n_layer = 32
llm_load_print_meta: n_rot = 128
llm_load_print_meta: n_embd_head_k = 128
llm_load_print_meta: n_embd_head_v = 128
llm_load_print_meta: n_gqa = 1
llm_load_print_meta: n_embd_k_gqa = 4096
llm_load_print_meta: n_embd_v_gqa = 4096
llm_load_print_meta: f_norm_eps = 0.0e+00
llm_load_print_meta: f_norm_rms_eps = 1.0e-06
llm_load_print_meta: f_clamp_kqv = 0.0e+00
llm_load_print_meta: f_max_alibi_bias = 0.0e+00
llm_load_print_meta: n_ff = 11008
llm_load_print_meta: n_expert = 0
llm_load_print_meta: n_expert_used = 0
llm_load_print_meta: rope scaling = linear
llm_load_print_meta: freq_base_train = 10000.0
llm_load_print_meta: freq_scale_train = 1
llm_load_print_meta: n_yarn_orig_ctx = 4096
llm_load_print_meta: rope_finetuned = unknown
llm_load_print_meta: model type = 7B
llm_load_print_meta: model ftype = Q8_0
llm_load_print_meta: model params = 6.74 B
llm_load_print_meta: model size = 6.67 GiB (8.50 BPW)
llm_load_print_meta: general.name = LLaMA v2
llm_load_print_meta: BOS token = 1 ''
llm_load_print_meta: EOS token = 2 ''
llm_load_print_meta: UNK token = 0 ''
llm_load_print_meta: LF token = 13 '<0x0A>'
llm_load_tensors: ggml ctx size = 0.22 MiB
ggml_backend_metal_buffer_from_ptr: error: failed to allocate buffer, size = 0.00 MiB
llama_model_load: error loading model: failed to allocate buffer
llama_load_model_from_file: failed to load model
llama_init_from_gpt_params: error: failed to load model 'TheBloke/Llama-2-7B-Chat-GGUF/llama-2-7b-chat.Q8_0.gguf'
{"timestamp":1706616717,"level":"ERROR","function":"load_model","line":374,"message":"unable to load model","model":"TheBloke/Llama-2-7B-Chat-GGUF/llama-2-7b-chat.Q8_0.gguf"}
libc++abi: terminating
zsh: abort      ./server --ctx-size 2048 --host 0.0.0.0 --n-gpu-layers 64 --model
(base) zhangyixin@zhangyixin llama.cpp % ./server --ctx-size 2048 --host 0.0.0.0 --n-gpu-layers 64 --model TheBloke/Llama-2-7B-Chat-GGUF/llama-2-7b-chat.Q8_0.gguf
{"timestamp":1706616751,"level":"INFO","function":"main","line":2419,"message":"build info","build":1992,"commit":"b2b2bf98"}
{"timestamp":1706616751,"level":"INFO","function":"main","line":2426,"message":"system info","n_threads":8,"n_threads_batch":-1,"total_threads":10,"system_info":"AVX = 0 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | FMA = 0 | NEON = 1 | ARM_FMA = 1 | F16C = 0 | FP16_VA = 1 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 0 | SSSE3 = 0 | VSX = 0 | "}
```

llama server listening at http://0.0.0.0:8080

{"timestamp":1706616751,"level":"INFO","function":"main","line":2525,"message":"HTTP server listening","port":"8080","hostname":"0.0.0.0"} llama_model_loader: loaded meta data with 19 key-value pairs and 291 tensors from TheBloke/Llama-2-7B-Chat-GGUF/llama-2-7b-chat.Q8_0.gguf (version GGUF V2) llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output. llama_model_loader: - kv 0: general.architecture str = llama llama_model_loader: - kv 1: general.name str = LLaMA v2 llama_model_loader: - kv 2: llama.context_length u32 = 4096 llama_model_loader: - kv 3: llama.embedding_length u32 = 4096 llama_model_loader: - kv 4: llama.block_count u32 = 32 llama_model_loader: - kv 5: llama.feed_forward_length u32 = 11008 llama_model_loader: - kv 6: llama.rope.dimension_count u32 = 128 llama_model_loader: - kv 7: llama.attention.head_count u32 = 32 llama_model_loader: - kv 8: llama.attention.head_count_kv u32 = 32 llama_model_loader: - kv 9: llama.attention.layer_norm_rms_epsilon f32 = 0.000001 llama_model_loader: - kv 10: general.file_type u32 = 7 llama_model_loader: - kv 11: tokenizer.ggml.model str = llama llama_model_loader: - kv 12: tokenizer.ggml.tokens arr[str,32000] = ["", "", "", "<0x00>", "<... llama_model_loader: - kv 13: tokenizer.ggml.scores arr[f32,32000] = [0.000000, 0.000000, 0.000000, 0.0000... llama_model_loader: - kv 14: tokenizer.ggml.token_type arr[i32,32000] = [2, 3, 3, 6, 6, 6, 6, 6, 6, 6, 6, 6, ... llama_model_loader: - kv 15: tokenizer.ggml.bos_token_id u32 = 1 llama_model_loader: - kv 16: tokenizer.ggml.eos_token_id u32 = 2 llama_model_loader: - kv 17: tokenizer.ggml.unknown_token_id u32 = 0 llama_model_loader: - kv 18: general.quantization_version u32 = 2 llama_model_loader: - type f32: 65 tensors llama_model_loader: - type q8_0: 226 tensors llm_load_vocab: special tokens definition check successful ( 259/32000 ). 
```
[... identical llm_load_print_meta output as above ...]
llm_load_tensors: ggml ctx size = 0.22 MiB
ggml_backend_metal_buffer_from_ptr: error: failed to allocate buffer, size = 0.00 MiB
llama_model_load: error loading model: failed to allocate buffer
llama_load_model_from_file: failed to load model
llama_init_from_gpt_params: error: failed to load model 'TheBloke/Llama-2-7B-Chat-GGUF/llama-2-7b-chat.Q8_0.gguf'
{"timestamp":1706616751,"level":"ERROR","function":"load_model","line":374,"message":"unable to load model","model":"TheBloke/Llama-2-7B-Chat-GGUF/llama-2-7b-chat.Q8_0.gguf"}
libc++abi: terminating
zsh: abort      ./server --ctx-size 2048 --host 0.0.0.0 --n-gpu-layers 64 --model
(base) zhangyixin@zhangyixin llama.cpp % ./server --ctx-size 2048 --host 0.0.0.0 --n-gpu-layers 64 --model TheBloke/Llama-2-7B-Chat-GGUF/llama-2-7b-chat.Q8_0.gguf
{"timestamp":1706686090,"level":"INFO","function":"main","line":2419,"message":"build info","build":1992,"commit":"b2b2bf98"}
{"timestamp":1706686090,"level":"INFO","function":"main","line":2426,"message":"system info","n_threads":8,"n_threads_batch":-1,"total_threads":10,"system_info":"AVX = 0 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | FMA = 0 | NEON = 1 | ARM_FMA = 1 | F16C = 0 | FP16_VA = 1 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 0 | SSSE3 = 0 | VSX = 0 | "}
```

llama server listening at http://0.0.0.0:8080

{"timestamp":1706686090,"level":"INFO","function":"main","line":2525,"message":"HTTP server listening","port":"8080","hostname":"0.0.0.0"} llama_model_loader: loaded meta data with 19 key-value pairs and 291 tensors from TheBloke/Llama-2-7B-Chat-GGUF/llama-2-7b-chat.Q8_0.gguf (version GGUF V2) llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output. llama_model_loader: - kv 0: general.architecture str = llama llama_model_loader: - kv 1: general.name str = LLaMA v2 llama_model_loader: - kv 2: llama.context_length u32 = 4096 llama_model_loader: - kv 3: llama.embedding_length u32 = 4096 llama_model_loader: - kv 4: llama.block_count u32 = 32 llama_model_loader: - kv 5: llama.feed_forward_length u32 = 11008 llama_model_loader: - kv 6: llama.rope.dimension_count u32 = 128 llama_model_loader: - kv 7: llama.attention.head_count u32 = 32 llama_model_loader: - kv 8: llama.attention.head_count_kv u32 = 32 llama_model_loader: - kv 9: llama.attention.layer_norm_rms_epsilon f32 = 0.000001 llama_model_loader: - kv 10: general.file_type u32 = 7 llama_model_loader: - kv 11: tokenizer.ggml.model str = llama llama_model_loader: - kv 12: tokenizer.ggml.tokens arr[str,32000] = ["", "", "", "<0x00>", "<... llama_model_loader: - kv 13: tokenizer.ggml.scores arr[f32,32000] = [0.000000, 0.000000, 0.000000, 0.0000... llama_model_loader: - kv 14: tokenizer.ggml.token_type arr[i32,32000] = [2, 3, 3, 6, 6, 6, 6, 6, 6, 6, 6, 6, ... llama_model_loader: - kv 15: tokenizer.ggml.bos_token_id u32 = 1 llama_model_loader: - kv 16: tokenizer.ggml.eos_token_id u32 = 2 llama_model_loader: - kv 17: tokenizer.ggml.unknown_token_id u32 = 0 llama_model_loader: - kv 18: general.quantization_version u32 = 2 llama_model_loader: - type f32: 65 tensors llama_model_loader: - type q8_0: 226 tensors llm_load_vocab: special tokens definition check successful ( 259/32000 ). 
llm_load_print_meta: format = GGUF V2 llm_load_print_meta: arch = llama llm_load_print_meta: vocab type = SPM llm_load_print_meta: n_vocab = 32000 llm_load_print_meta: n_merges = 0 llm_load_print_meta: n_ctx_train = 4096 llm_load_print_meta: n_embd = 4096 llm_load_print_meta: n_head = 32 llm_load_print_meta: n_head_kv = 32 llm_load_print_meta: n_layer = 32 llm_load_print_meta: n_rot = 128 llm_load_print_meta: n_embd_head_k = 128 llm_load_print_meta: n_embd_head_v = 128 llm_load_print_meta: n_gqa = 1 llm_load_print_meta: n_embd_k_gqa = 4096 llm_load_print_meta: n_embd_v_gqa = 4096 llm_load_print_meta: f_norm_eps = 0.0e+00 llm_load_print_meta: f_norm_rms_eps = 1.0e-06 llm_load_print_meta: f_clamp_kqv = 0.0e+00 llm_load_print_meta: f_max_alibi_bias = 0.0e+00 llm_load_print_meta: n_ff = 11008 llm_load_print_meta: n_expert = 0 llm_load_print_meta: n_expert_used = 0 llm_load_print_meta: rope scaling = linear llm_load_print_meta: freq_base_train = 10000.0 llm_load_print_meta: freq_scale_train = 1 llm_load_print_meta: n_yarn_orig_ctx = 4096 llm_load_print_meta: rope_finetuned = unknown llm_load_print_meta: model type = 7B llm_load_print_meta: model ftype = Q8_0 llm_load_print_meta: model params = 6.74 B llm_load_print_meta: model size = 6.67 GiB (8.50 BPW) llm_load_print_meta: general.name = LLaMA v2 llm_load_print_meta: BOS token = 1 '' llm_load_print_meta: EOS token = 2 '' llm_load_print_meta: UNK token = 0 '' llm_load_print_meta: LF token = 13 '<0x0A>' llm_load_tensors: ggml ctx size = 0.22 MiB ggml_backend_metal_buffer_from_ptr: error: failed to allocate buffer, size = 0.00 MiB llama_model_load: error loading model: failed to allocate buffer llama_load_model_from_file: failed to load model llama_init_from_gpt_params: error: failed to load model 'TheBloke/Llama-2-7B-Chat-GGUF/llama-2-7b-chat.Q8_0.gguf' {"timestamp":1706686090,"level":"ERROR","function":"load_model","line":374,"message":"unable to load model","model":"TheBloke/Llama-2-7B-Chat-GGUF/llama-2-7b-chat.Q8_0.gguf"} libc++abi: terminating zsh: abort ./server --ctx-size 2048 --host 0.0.0.0 --n-gpu-layers 64 --model (base) zhangyixin@zhangyixin llama.cpp % open . (base) zhangyixin@zhangyixin llama.cpp % find . -name "*.gguf"

```
./TheBloke/Llama-2-7B-Chat-GGUF/llama-2-7b-chat.Q8_0.gguf
./yi-chat-6B-GGUF/yi-chat-6b.Q2_K.gguf
./models/ggml-vocab-mpt.gguf
./models/ggml-vocab-refact.gguf
./models/ggml-vocab-baichuan.gguf
./models/ggml-vocab-aquila.gguf
./models/ggml-vocab-stablelm-3b-4e1t.gguf
./models/ggml-vocab-starcoder.gguf
./models/ggml-vocab-gpt2.gguf
./models/ggml-vocab-llama.gguf
./models/ggml-vocab-falcon.gguf
./models/ggml-vocab-gpt-neox.gguf
./Orion-14B-Chat.gguf
(base) zhangyixin@zhangyixin llama.cpp %
```
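Given the `No space left on device` error further down in this thread, it is worth ruling out truncated downloads before blaming the loader: a GGUF cut short by a full disk can still pass the metadata stage and then fail once tensor data is mapped. A minimal check, assuming the sizes and SHA-256 values published on each model's Hugging Face "Files" page:

```sh
# Sketch: sanity-check a downloaded GGUF before debugging the loader.
# The expected byte size and sha256 come from the model repo's file listing;
# any mismatch means the file is partial or corrupt.
ls -lh TheBloke/Llama-2-7B-Chat-GGUF/llama-2-7b-chat.Q8_0.gguf
shasum -a 256 TheBloke/Llama-2-7B-Chat-GGUF/llama-2-7b-chat.Q8_0.gguf
```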


```
[... make build log: the same arm64-apple-darwin20.0.0-clang++ invocation, with identical flags, repeats once per example target ...]
arm64-apple-darwin20.0.0-clang++ -I. -Icommon -D_XOPEN_SOURCE=600 -D_DARWIN_C_SOURCE -DNDEBUG -DGGML_USE_ACCELERATE -DACCELERATE_NEW_LAPACK -DACCELERATE_LAPACK_ILP64 -DGGML_USE_METAL -D_FORTIFY_SOURCE=2 -isystem /Users/zhangyixin/anaconda3/include -std=c++11 -fPIC -O3 -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wmissing-declarations -Wmissing-noreturn -pthread -ftree-vectorize -fPIC -fPIE -fstack-protector-strong -O2 -pipe -stdlib=libc++ -fvisibility-inlines-hidden -fmessage-length=0 -isystem /Users/zhangyixin/anaconda3/include -Wunreachable-code-break -Wunreachable-code-return -Wmissing-prototypes -Wextra-semi examples/main/main.cpp ggml.o llama.o common.o sampling.o grammar-parser.o build-info.o console.o ggml-metal.o ggml-alloc.o ggml-backend.o ggml-quants.o -o main -framework Accelerate -framework Foundation -framework Metal -framework MetalKit -Wl,-pie -Wl,-headerpad_max_install_names -Wl,-dead_strip_dylibs -Wl,-rpath,/Users/zhangyixin/anaconda3/lib -L/Users/zhangyixin/anaconda3/lib
[... identical commands build: gguf, benchmark-matmult, export-lora, quantize, quantize-stats, perplexity, imatrix, embedding, train-text-from-scratch, convert-llama2c-to-ggml, simple, batched, batched-bench, save-load-state, server, llama-bench, libllava.a, llava-cli, baby-llama, beam-search, speculative, infill, tokenize, parallel, finetune, lookahead, lookup, passkey ...]
```

==== Run ./main -h for help. ====

```
Log start
main: build = 1992 (b2b2bf98)
main: built with clang version 14.0.6 for arm64-apple-darwin20.0.0
main: seed  = 1706610115
llama_model_load: error loading model: failed to open models/llama-13b-v2/ggml-model-q4_0.gguf: No such file or directory
llama_load_model_from_file: failed to load model
llama_init_from_gpt_params: error: failed to load model 'models/llama-13b-v2/ggml-model-q4_0.gguf'
main: error: unable to load model
(base) zhangyixin@zhangyixin llama.cpp % make -j && ./main -m aa.gguf -p "Building a website can be done in 10 simple steps:\nStep 1:" -n 400 -e
I llama.cpp build info:
I UNAME_S:   Darwin
I UNAME_P:   arm
I UNAME_M:   arm64
I CFLAGS:    -I. -Icommon -D_XOPEN_SOURCE=600 -D_DARWIN_C_SOURCE -DNDEBUG -DGGML_USE_ACCELERATE -DACCELERATE_NEW_LAPACK -DACCELERATE_LAPACK_ILP64 -DGGML_USE_METAL -D_FORTIFY_SOURCE=2 -isystem /Users/zhangyixin/anaconda3/include -std=c11 -fPIC -O3 -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wshadow -Wstrict-prototypes -Wpointer-arith -Wmissing-prototypes -Werror=implicit-int -Werror=implicit-function-declaration -pthread -Wunreachable-code-break -Wunreachable-code-return -Wdouble-promotion -ftree-vectorize -fPIC -fPIE -fstack-protector-strong -O2 -pipe -isystem /Users/zhangyixin/anaconda3/include
I CXXFLAGS:  -I. -Icommon -D_XOPEN_SOURCE=600 -D_DARWIN_C_SOURCE -DNDEBUG -DGGML_USE_ACCELERATE -DACCELERATE_NEW_LAPACK -DACCELERATE_LAPACK_ILP64 -DGGML_USE_METAL -D_FORTIFY_SOURCE=2 -isystem /Users/zhangyixin/anaconda3/include -std=c++11 -fPIC -O3 -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wmissing-declarations -Wmissing-noreturn -pthread -ftree-vectorize -fPIC -fPIE -fstack-protector-strong -O2 -pipe -stdlib=libc++ -fvisibility-inlines-hidden -fmessage-length=0 -isystem /Users/zhangyixin/anaconda3/include -Wunreachable-code-break -Wunreachable-code-return -Wmissing-prototypes -Wextra-semi
I NVCCFLAGS:
I LDFLAGS:   -framework Accelerate -framework Foundation -framework Metal -framework MetalKit -Wl,-pie -Wl,-headerpad_max_install_names -Wl,-dead_strip_dylibs -Wl,-rpath,/Users/zhangyixin/anaconda3/lib -L/Users/zhangyixin/anaconda3/lib
I CC:        clang version 14.0.6
I CXX:       clang version 14.0.6
```

```
make: Nothing to be done for `default'.
Log start
main: build = 1992 (b2b2bf98)
main: built with clang version 14.0.6 for arm64-apple-darwin20.0.0
main: seed  = 1706623354
llama_model_loader: loaded meta data with 21 key-value pairs and 444 tensors from aa.gguf (version GGUF V3 (latest))
llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
llama_model_loader: - kv   0: general.architecture str = orion
llama_model_loader: - kv   1: general.file_type u32 = 1
llama_model_loader: - kv   2: general.name str = Orion-14B-Chat
llama_model_loader: - kv   3: orion.tensor_data_layout str = Meta AI original pth
llama_model_loader: - kv   4: orion.context_length u32 = 4096
llama_model_loader: - kv   5: orion.embedding_length u32 = 5120
llama_model_loader: - kv   6: orion.block_count u32 = 40
llama_model_loader: - kv   7: orion.feed_forward_length u32 = 15360
llama_model_loader: - kv   8: orion.attention.head_count u32 = 40
llama_model_loader: - kv   9: orion.attention.head_count_kv u32 = 40
llama_model_loader: - kv  10: orion.attention.layer_norm_epsilon f32 = 0.000010
llama_model_loader: - kv  11: tokenizer.ggml.model str = llama
llama_model_loader: - kv  12: tokenizer.ggml.tokens arr[str,84608] = ["", "", "", " ", "▁▁...
llama_model_loader: - kv  13: tokenizer.ggml.scores arr[f32,84608] = [0.000000, 0.000000, 0.000000, 0.0000...
llama_model_loader: - kv  14: tokenizer.ggml.token_type arr[i32,84608] = [2, 3, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
llama_model_loader: - kv  15: tokenizer.ggml.bos_token_id u32 = 1
llama_model_loader: - kv  16: tokenizer.ggml.eos_token_id u32 = 2
llama_model_loader: - kv  17: tokenizer.ggml.padding_token_id u32 = 0
llama_model_loader: - kv  18: tokenizer.ggml.add_bos_token bool = false
llama_model_loader: - kv  19: tokenizer.ggml.add_eos_token bool = false
llama_model_loader: - kv  20: tokenizer.chat_template str = {% for message in messages %}{% if lo...
llama_model_loader: - type  f32:  162 tensors
llama_model_loader: - type  f16:  282 tensors
llm_load_vocab: mismatch in special tokens definition ( 423/84608 vs 4/84608 ).
llm_load_print_meta: format = GGUF V3 (latest)
llm_load_print_meta: arch = orion
llm_load_print_meta: vocab type = SPM
llm_load_print_meta: n_vocab = 84608
llm_load_print_meta: n_merges = 0
llm_load_print_meta: n_ctx_train = 4096
llm_load_print_meta: n_embd = 5120
llm_load_print_meta: n_head = 40
llm_load_print_meta: n_head_kv = 40
llm_load_print_meta: n_layer = 40
llm_load_print_meta: n_rot = 128
llm_load_print_meta: n_embd_head_k = 128
llm_load_print_meta: n_embd_head_v = 128
llm_load_print_meta: n_gqa = 1
llm_load_print_meta: n_embd_k_gqa = 5120
llm_load_print_meta: n_embd_v_gqa = 5120
llm_load_print_meta: f_norm_eps = 1.0e-05
llm_load_print_meta: f_norm_rms_eps = 0.0e+00
llm_load_print_meta: f_clamp_kqv = 0.0e+00
llm_load_print_meta: f_max_alibi_bias = 0.0e+00
llm_load_print_meta: n_ff = 15360
llm_load_print_meta: n_expert = 0
llm_load_print_meta: n_expert_used = 0
llm_load_print_meta: rope scaling = linear
llm_load_print_meta: freq_base_train = 10000.0
llm_load_print_meta: freq_scale_train = 1
llm_load_print_meta: n_yarn_orig_ctx = 4096
llm_load_print_meta: rope_finetuned = unknown
llm_load_print_meta: model type = 14B
llm_load_print_meta: model ftype = F16
llm_load_print_meta: model params = 14.50 B
llm_load_print_meta: model size = 27.01 GiB (16.00 BPW)
llm_load_print_meta: general.name = Orion-14B-Chat
llm_load_print_meta: BOS token = 1 ''
llm_load_print_meta: EOS token = 2 ''
llm_load_print_meta: UNK token = 0 ''
llm_load_print_meta: PAD token = 0 ''
llm_load_print_meta: LF token = 64 '<0x0A>'
llm_load_tensors: ggml ctx size = 0.34 MiB
ggml_backend_metal_buffer_from_ptr: error: failed to allocate buffer, size = 0.00 MiB
llama_model_load: error loading model: failed to allocate buffer
llama_load_model_from_file: failed to load model
llama_init_from_gpt_params: error: failed to load model 'aa.gguf'
main: error: unable to load model
(base) zhangyixin@zhangyixin llama.cpp % ls
CMakeLists.txt       benchmark-matmult              convert-persimmon-to-gguf.py  ggml-backend.h      ggml-quants.h     llama.cpp         perplexity        speculative
LICENSE              build-info.o                   convert.py                    ggml-backend.o      ggml-quants.o     llama.h           pocs              spm-headers
Makefile             build.zig                      docs                          ggml-cuda.cu        ggml.c            llama.log         prompts           tests
Orion-14B-Chat.gguf  chat.gguf                      embedding                     ggml-cuda.h         ggml.h            llama.o           q8dot             tokenize
Package.swift        ci                             examples                      ggml-impl.h         ggml.o            llava-cli         quantize          train-text-from-scratch
README.md            cmake                          export-lora                   ggml-metal.h        gguf              lookahead         quantize-stats    train.o
SHA256SUMS           codecov.yml                    finetune                      ggml-metal.m        gguf-py           lookup            rag.gguf          tst_openai_api.py
TheBloke             common                         flake.lock                    ggml-metal.metal    grammar-parser.o  main              requirements      unicode.h
aa.gguf              common.o                       flake.nix                     ggml-metal.o        grammars          main.log          requirements.txt  vdot
awq-py               console.o                      ggml-alloc.c                  ggml-mpi.c          imatrix           media             sampling.o        yi-chat-6B-GGUF
baby-llama           convert-hf-to-gguf.py          ggml-alloc.h                  ggml-mpi.h          infill            models            save-load-state
batched              convert-llama-ggml-to-gguf.py  ggml-alloc.o                  ggml-opencl.cpp     jartine           mypy.ini          scripts
batched-bench        convert-llama2c-to-ggml        ggml-backend-impl.h           ggml-opencl.h       libllava.a        parallel          server
beam-search          convert-lora-to-ggml.py        ggml-backend.c                ggml-quants.c       llama-bench       passkey           simple
(base) zhangyixin@zhangyixin llama.cpp %
```
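Note that `aa.gguf` here is an unquantized F16 Orion-14B-Chat at 27.01 GiB, which on most M1 machines exceeds what Metal will hand out as a single buffer regardless of any loader bug. A sketch of the sanity check, plus a hypothetical re-quantization to something that fits (the output filename is illustrative):

```sh
# Check how much unified memory this Mac actually has; Metal caps GPU
# buffer allocations well below the total.
sysctl hw.memsize

# A 4-bit quantization of a 14B model is roughly 8-9 GiB and a far more
# realistic fit than 27 GiB of F16 (usage: ./quantize <in> <out> <type>).
./quantize aa.gguf orion-14b-chat.Q4_K_M.gguf Q4_K_M
```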

zyxcambridge commented 9 months ago

```
llm_load_print_meta: n_ff = 14336
llm_load_print_meta: n_expert = 0
llm_load_print_meta: n_expert_used = 0
llm_load_print_meta: rope scaling = linear
llm_load_print_meta: freq_base_train = 10000.0
llm_load_print_meta: freq_scale_train = 1
llm_load_print_meta: n_yarn_orig_ctx = 32768
llm_load_print_meta: rope_finetuned = unknown
llm_load_print_meta: model type = 7B
llm_load_print_meta: model ftype = Q4_0
llm_load_print_meta: model params = 7.24 B
llm_load_print_meta: model size = 3.83 GiB (4.54 BPW)
llm_load_print_meta: general.name = mistralai_mistral-7b-v0.1
llm_load_print_meta: BOS token = 1 ''
llm_load_print_meta: EOS token = 2 ''
llm_load_print_meta: UNK token = 0 ''
llm_load_print_meta: LF token = 13 '<0x0A>'
llm_load_tensors: ggml ctx size = 0.22 MiB
ggml_backend_metal_buffer_from_ptr: error: failed to allocate buffer, size = 0.00 MiB
llama_model_load: error loading model: failed to allocate buffer
llama_load_model_from_file: failed to load model
llama_init_from_gpt_params: error: failed to load model '../mistral-7b-v0.1.Q4_0.gguf'
{"timestamp":1706691513,"level":"ERROR","function":"load_model","line":375,"message":"unable to load model","model":"../mistral-7b-v0.1.Q4_0.gguf"}
libc++abi: terminating
./server-llm.sh: line 389: 42877 Abort trap: 6           ./server -m "../$wfile" --host 0.0.0.0 --port "$port" -c $n_kv -np "$n_parallel" $args
(base) zhangyixin@zhangyixin scripts % history
 1063  sudo chmod +x /Users/zhangyixin/Desktop/llama.cpp/jartine/mistral-7b.llamafile/mistral-7b-instruct-v0.1-Q4_K_M.llamafile
 1064  /Users/zhangyixin/Desktop/llama.cpp/jartine/mistral-7b.llamafile/mistral-7b-instruct-v0.1-Q4_K_M.llamafile -ngl 999
 1065  df -h
 1066  /Users/zhangyixin/Desktop/llama.cpp/jartine/mistral-7b.llamafile/mistral-7b-instruct-v0.1-Q4_K_M.llamafile -ngl 999
 1067  /Users/zhangyixin/Desktop/llama.cpp/jartine/mistral-7b.llamafile/mistral-7b-instruct-v0.1-Q4_K_M.llamafile -ngl 999
 1068  ls
 1069  ./main --frequency-penalty 0.5 --frequency-penalty 0.5 --top-k 5 --top-p 0.9 -m chat.gguf -p "Building a website can be done in 10 simple steps:\nStep 1:" -n 400 -e
 1070  ./main --frequency-penalty 0.5 --frequency-penalty 0.5 --top-k 5 --top-p 0.9 -m chat.gguf -p "Building a website can be done in 10 simple steps:\nStep 1:" -n 400 -e
 1071  ./scripts/server-llm.sh
 1072  export https_proxy=http://127.0.0.1:7890 http_proxy=http://127.0.0.1:7890 all_proxy=socks5://127.0.0.1:7890
 1073  ./scripts/server-llm.sh
 1074  find . -name mistral-7b-v0.1.Q2_K.gguf
 1075  cd scripts
 1076  ./server-llm.sh
 1077  ./server-llm.sh
 1078  ./server-llm.sh
```
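If the same file loads with offloading disabled, that narrows the failure to the Metal buffer path rather than to the GGUF itself. A sketch of the comparison run (the port number is arbitrary; `--n-gpu-layers` is the same flag the thread's earlier server runs used with a value of 64):

```sh
# Re-run the failing server command with all layers kept on the CPU.
# Success here implicates the Metal allocation; failure implicates the file.
./server -m ../mistral-7b-v0.1.Q4_0.gguf --host 0.0.0.0 --port 8888 \
         -c 2048 --n-gpu-layers 0
```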


```
models/65B/ggml-model-f16.bin X
models/65B/ggml-model-q4_0.bin X
models/65B/ggml-model-q4_1.bin X
models/65B/ggml-model-q5_0.bin X
models/65B/ggml-model-q5_1.bin X
models/65B/params.json X
models/tokenizer.model X
(base) zhangyixin@zhangyixin llama.cpp % ./main \
    -m https://huggingface.co/TheBloke/Mixtral-8x7B-Instruct-v0.1-GGUF/resolve/main/mixtral-8x7b-instruct-v0.1.Q2_K.gguf \
    -p "Hello world"
Log start
main: build = 1992 (b2b2bf98)
main: built with clang version 14.0.6 for arm64-apple-darwin20.0.0
main: seed  = 1706690140
llama_model_load: error loading model: failed to open https://huggingface.co/TheBloke/Mixtral-8x7B-Instruct-v0.1-GGUF/resolve/main/mixtral-8x7b-instruct-v0.1.Q2_K.gguf: No such file or directory
llama_load_model_from_file: failed to load model
llama_init_from_gpt_params: error: failed to load model 'https://huggingface.co/TheBloke/Mixtral-8x7B-Instruct-v0.1-GGUF/resolve/main/mixtral-8x7b-instruct-v0.1.Q2_K.gguf'
main: error: unable to load model
(base) zhangyixin@zhangyixin llama.cpp % ./server-llm.sh
zsh: no such file or directory: ./server-llm.sh
(base) zhangyixin@zhangyixin llama.cpp % find -name ./server-llm.sh
find: illegal option -- n
usage: find [-H | -L | -P] [-EXdsx] [-f path] path ... [expression]
       find [-H | -L | -P] [-EXdsx] -f path [path ...] [expression]
(base) zhangyixin@zhangyixin llama.cpp % find . -name server-llm.sh
./scripts/server-llm.sh
(base) zhangyixin@zhangyixin llama.cpp % ./scripts/server-llm.sh
zsh: permission denied: ./scripts/server-llm.sh
(base) zhangyixin@zhangyixin llama.cpp % sudo chmod +x ./scripts/*
Password:
(base) zhangyixin@zhangyixin llama.cpp % ./scripts/server-llm.sh
```
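The `failed to open https://...: No such file or directory` above is expected: `./main` in this build only takes local file paths, so the URL is treated literally as a filename. The file has to be fetched first, for example:

```sh
# Download the GGUF locally (curl -L follows Hugging Face's redirect to the CDN),
# then point -m at the resulting local path.
curl -L -o mixtral-8x7b-instruct-v0.1.Q2_K.gguf \
  https://huggingface.co/TheBloke/Mixtral-8x7B-Instruct-v0.1-GGUF/resolve/main/mixtral-8x7b-instruct-v0.1.Q2_K.gguf
./main -m mixtral-8x7b-instruct-v0.1.Q2_K.gguf -p "Hello world"
```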

[I] This is a helper script for deploying llama.cpp's server on this machine.

Based on the options that follow, the script might download a model file
from the internet, which can be a few GBs in size. The script will also
build the latest llama.cpp source code from GitHub, which can be unstable.

Upon success, an HTTP server will be started and it will serve the selected
model using llama.cpp for demonstration purposes.

Please note:

- All new data will be stored in the current folder
- The server will be listening on all network interfaces
- The server will run with default settings which are not always optimal
- Do not judge the quality of a model based on the results from this script
- Do not use this script to benchmark llama.cpp
- Do not use this script in production
- This script is only for demonstration purposes

If you don't know what you are doing, please press Ctrl-C to abort now

Press Enter to continue ...

[+] No repo provided from the command line
    Please select a number from the list below or enter an URL:

 0) https://huggingface.co/TheBloke/Llama-2-7B-GGUF
 1) https://huggingface.co/TheBloke/Llama-2-13B-GGUF
 2) https://huggingface.co/TheBloke/Llama-2-70B-GGUF
 3) https://huggingface.co/TheBloke/CodeLlama-7B-GGUF
 4) https://huggingface.co/TheBloke/CodeLlama-13B-GGUF
 5) https://huggingface.co/TheBloke/CodeLlama-34B-GGUF
 6) https://huggingface.co/TheBloke/Mistral-7B-v0.1-GGUF
 7) https://huggingface.co/TheBloke/zephyr-7B-beta-GGUF
 8) https://huggingface.co/TheBloke/OpenHermes-2-Mistral-7B-GGUF
 9) https://huggingface.co/TheBloke/CausalLM-7B-GGUF

Or choose one from: https://huggingface.co/models?sort=trending&search=gguf

[+] Select repo: 0
[+] Checking for GGUF model files in https://huggingface.co/TheBloke/Llama-2-7B-GGUF

[+] Model files:

[+] Select weight type:
[-] Invalid weight type:

[+] Select weight type: ^C (base) zhangyixin@zhangyixin llama.cpp %
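The "[+] Model files:" list above printed empty, which is why every weight-type answer was rejected as invalid: the script apparently could not fetch the repo's file listing. Given the `export https_proxy=...` earlier in this session, one hedged first check is whether HTTPS traffic actually gets through that proxy (the proxy address is the one exported above):

# expect an HTTP status line such as "HTTP/2 200"
curl -x http://127.0.0.1:7890 -sSI https://huggingface.co | head -n 1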


OSError: [Errno 28] No space left on device
(base) zhangyixin@zhangyixin llama.cpp % huggingface-cli download --token YOUR_TOKEN --resume-download --local-dir-use-symlinks False TheBloke/Yi-34B-Chat-GGUF --include "yi-34b-chat.Q8_0.gguf" --local-dir TheBloke/Yi-34B-Chat-GGUF
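yi-34b-chat.Q8_0.gguf is a 36.5 GB download (see the transfer log below), so after an `Errno 28` it is worth confirming free space and pruning the Hugging Face cache before retrying; `scan-cache` and `delete-cache` are standard huggingface-cli subcommands:

df -h .                        # free space on the current volume
huggingface-cli scan-cache     # list cached repos and their sizes
huggingface-cli delete-cache   # interactively drop old blobs (e.g. stale .incomplete files)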

Consider using hf_transfer for faster downloads. This solution comes with some limitations. See https://huggingface.co/docs/huggingface_hub/hf_transfer for more details.
Fetching 1 files:   0%| | 0/1 [00:00<?, ?it/s]
downloading https://hf-mirror.com/TheBloke/Yi-34B-Chat-GGUF/resolve/ee827f906e733381ab65c650dcfdd2caf6de8762/yi-34b-chat.Q8_0.gguf to /Users/zhangyixin/.cache/huggingface/hub/models--TheBloke--Yi-34B-Chat-GGUF/blobs/163011ba9907f85336681293420c12a3c7ab3e531618552be2340bcb298eba1e.incomplete
Error while downloading from https://cdn-lfs-us-1.hf-mirror.com/repos/3c/42/3c429a32fd39f4234ac446273af0c092c59000a64cb19950a9c561a44e41c043/163011ba9907f85336681293420c12a3c7ab3e531618552be2340bcb298eba1e: HTTPSConnectionPool(host='cdn-lfs-us-1.hf-mirror.com', port=443): Read timed out. Trying to resume download...
yi-34b-chat.Q8_0.gguf: 100%|██████████| 36.5G/36.5G [14:28<00:00, 18.5MB/s]
yi-34b-chat.Q8_0.gguf:  56%|█████▌    | 20.5G/36.5G [26:52<1:06:24, 4.03MB/s]
yi-34b-chat.Q8_0.gguf:  38%|███▊      | 14.0G/36.5G [27:37<41:14:06, 152kB/s]
Fetching 1 files: 100%|██████████| 1/1 [27:40<00:00, 1660.03s/it]
/Users/zhangyixin/Desktop/llama.cpp/TheBloke/Yi-34B-Chat-GGUF
(base) zhangyixin@zhangyixin llama.cpp % huggingface-cli download --token YOUR_TOKEN --resume-download --local-dir-use-symlinks False TheBloke/Yi-34B-Chat-GGUF --include "yi-34b-chat.Q8_0.gguf" --local-dir TheBloke/Yi-34B-Chat-GGUF
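After a transfer that timed out and resumed several times, it is worth confirming the file arrived intact. The expected SHA-256 is visible in the blob path of the download log above (163011ba…eba1e):

shasum -a 256 TheBloke/Yi-34B-Chat-GGUF/yi-34b-chat.Q8_0.gguf
# should print 163011ba9907f85336681293420c12a3c7ab3e531618552be2340bcb298eba1e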

(base) zhangyixin@zhangyixin llama.cpp % chmod +x /Users/zhangyixin/Desktop/llama.cpp/TheBloke/Yi-34B-Chat-GGUF/yi-34b-chat.Q8_0.gguf
(base) zhangyixin@zhangyixin llama.cpp % /Users/zhangyixin/Desktop/llama.cpp/TheBloke/Yi-34B-Chat-GGUF/yi-34b-chat.Q8_0.gguf
zsh: exec format error: /Users/zhangyixin/Desktop/llama.cpp/TheBloke/Yi-34B-Chat-GGUF/yi-34b-chat.Q8_0.gguf
(base) zhangyixin@zhangyixin llama.cpp % ./main --frequency-penalty 0.5 --frequency-penalty 0.5 --top-k 5 --top-p 0.9 -m /Users/zhangyixin/Desktop/llama.cpp/TheBloke/Yi-34B-Chat-GGUF/yi-34b-chat.Q8_0.gguf -p "Building a website can be done in 10 simple steps:\nStep 1:" -n 400 -e

Log start
main: build = 1992 (b2b2bf98)
main: built with clang version 14.0.6 for arm64-apple-darwin20.0.0
main: seed = 1706691529
llama_model_loader: loaded meta data with 23 key-value pairs and 543 tensors from /Users/zhangyixin/Desktop/llama.cpp/TheBloke/Yi-34B-Chat-GGUF/yi-34b-chat.Q8_0.gguf (version GGUF V3 (latest))
llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
llama_model_loader: - kv 0: general.architecture str = llama
llama_model_loader: - kv 1: general.name str = LLaMA v2
llama_model_loader: - kv 2: llama.context_length u32 = 4096
llama_model_loader: - kv 3: llama.embedding_length u32 = 7168
llama_model_loader: - kv 4: llama.block_count u32 = 60
llama_model_loader: - kv 5: llama.feed_forward_length u32 = 20480
llama_model_loader: - kv 6: llama.rope.dimension_count u32 = 128
llama_model_loader: - kv 7: llama.attention.head_count u32 = 56
llama_model_loader: - kv 8: llama.attention.head_count_kv u32 = 8
llama_model_loader: - kv 9: llama.attention.layer_norm_rms_epsilon f32 = 0.000010
llama_model_loader: - kv 10: llama.rope.freq_base f32 = 5000000.000000
llama_model_loader: - kv 11: general.file_type u32 = 7
llama_model_loader: - kv 12: tokenizer.ggml.model str = llama
llama_model_loader: - kv 13: tokenizer.ggml.tokens arr[str,64000] = ["", "<|startoftext|>", "<|endof...
llama_model_loader: - kv 14: tokenizer.ggml.scores arr[f32,64000] = [0.000000, 0.000000, 0.000000, 0.0000...
llama_model_loader: - kv 15: tokenizer.ggml.token_type arr[i32,64000] = [2, 3, 3, 3, 3, 3, 1, 1, 1, 3, 3, 3, ...
llama_model_loader: - kv 16: tokenizer.ggml.bos_token_id u32 = 1
llama_model_loader: - kv 17: tokenizer.ggml.eos_token_id u32 = 2
llama_model_loader: - kv 18: tokenizer.ggml.padding_token_id u32 = 0
llama_model_loader: - kv 19: tokenizer.ggml.add_bos_token bool = false
llama_model_loader: - kv 20: tokenizer.ggml.add_eos_token bool = false
llama_model_loader: - kv 21: tokenizer.chat_template str = {% if not add_generation_prompt is de...
llama_model_loader: - kv 22: general.quantization_version u32 = 2
llama_model_loader: - type f32: 121 tensors
llama_model_loader: - type q8_0: 422 tensors
llm_load_vocab: mismatch in special tokens definition ( 498/64000 vs 267/64000 ).
llm_load_print_meta: model type = 30B
llm_load_print_meta: model ftype = Q8_0
llm_load_print_meta: model params = 34.39 B
llm_load_print_meta: model size = 34.03 GiB (8.50 BPW)
llm_load_tensors: ggml ctx size = 0.41 MiB
ggml_backend_metal_buffer_from_ptr: error: failed to allocate buffer, size = 0.00 MiB
llama_model_load: error loading model: failed to allocate buffer
llama_load_model_from_file: failed to load model
llama_init_from_gpt_params: error: failed to load model '/Users/zhangyixin/Desktop/llama.cpp/TheBloke/Yi-34B-Chat-GGUF/yi-34b-chat.Q8_0.gguf'
main: error: unable to load model
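Two distinct problems appear in this run. The `zsh: exec format error` is expected behaviour: a .gguf file is a model-weights container, not an executable, so `chmod +x` and running it directly can never work; it has to be handed to `./main -m`, as in the last command. A quick way to see this (output indicative; `file` has no GGUF magic entry on stock macOS and typically just reports `data`):

file /Users/zhangyixin/Desktop/llama.cpp/TheBloke/Yi-34B-Chat-GGUF/yi-34b-chat.Q8_0.gguf
# => ...: data   (model weights, not a Mach-O executable)

The second problem is the same zero-size Metal buffer failure as before. Note also that a Q8_0 34B model needs roughly 34 GiB for the weights alone, more than most M1-class machines can map for the GPU even when Metal is healthy.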


llama_model_loader: - kv 12: tokenizer.ggml.tokens arr[str,32000] = ["", "", "", "<0x00>", "<...
llama_model_loader: - kv 13: tokenizer.ggml.scores arr[f32,32000] = [0.000000, 0.000000, 0.000000, 0.0000...
llama_model_loader: - kv 14: tokenizer.ggml.token_type arr[i32,32000] = [2, 3, 3, 6, 6, 6, 6, 6, 6, 6, 6, 6, ...
llama_model_loader: - kv 15: tokenizer.ggml.bos_token_id u32 = 1
llama_model_loader: - kv 16: tokenizer.ggml.eos_token_id u32 = 2
llama_model_loader: - kv 17: tokenizer.ggml.unknown_token_id u32 = 0
llama_model_loader: - kv 18: general.quantization_version u32 = 2
llama_model_loader: - type f32: 65 tensors
llama_model_loader: - type q8_0: 226 tensors
llm_load_vocab: special tokens definition check successful ( 259/32000 ).
llm_load_print_meta: format = GGUF V2
llm_load_print_meta: arch = llama
llm_load_print_meta: vocab type = SPM
llm_load_print_meta: n_vocab = 32000
llm_load_print_meta: n_merges = 0
llm_load_print_meta: n_ctx_train = 4096
llm_load_print_meta: n_embd = 4096
llm_load_print_meta: n_head = 32
llm_load_print_meta: n_head_kv = 32
llm_load_print_meta: n_layer = 32
llm_load_print_meta: n_rot = 128
llm_load_print_meta: n_embd_head_k = 128
llm_load_print_meta: n_embd_head_v = 128
llm_load_print_meta: n_gqa = 1
llm_load_print_meta: n_embd_k_gqa = 4096
llm_load_print_meta: n_embd_v_gqa = 4096
llm_load_print_meta: f_norm_eps = 0.0e+00
llm_load_print_meta: f_norm_rms_eps = 1.0e-06
llm_load_print_meta: f_clamp_kqv = 0.0e+00
llm_load_print_meta: f_max_alibi_bias = 0.0e+00
llm_load_print_meta: n_ff = 11008
llm_load_print_meta: n_expert = 0
llm_load_print_meta: n_expert_used = 0
llm_load_print_meta: rope scaling = linear
llm_load_print_meta: freq_base_train = 10000.0
llm_load_print_meta: freq_scale_train = 1
llm_load_print_meta: n_yarn_orig_ctx = 4096
llm_load_print_meta: rope_finetuned = unknown
llm_load_print_meta: model type = 7B
llm_load_print_meta: model ftype = Q8_0
llm_load_print_meta: model params = 6.74 B
llm_load_print_meta: model size = 6.67 GiB (8.50 BPW)
llm_load_print_meta: general.name = LLaMA v2
llm_load_print_meta: BOS token = 1 ''
llm_load_print_meta: EOS token = 2 ''
llm_load_print_meta: UNK token = 0 ''
llm_load_print_meta: LF token = 13 '<0x0A>'
llm_load_tensors: ggml ctx size = 0.22 MiB
ggml_backend_metal_buffer_from_ptr: error: failed to allocate buffer, size = 0.00 MiB
llama_model_load: error loading model: failed to allocate buffer
llama_load_model_from_file: failed to load model
llama_init_from_gpt_params: error: failed to load model 'TheBloke/Llama-2-7B-Chat-GGUF/llama-2-7b-chat.Q8_0.gguf'
{"timestamp":1706616717,"level":"ERROR","function":"load_model","line":374,"message":"unable to load model","model":"TheBloke/Llama-2-7B-Chat-GGUF/llama-2-7b-chat.Q8_0.gguf"}
libc++abi: terminating
zsh: abort ./server --ctx-size 2048 --host 0.0.0.0 --n-gpu-layers 64 --model
(base) zhangyixin@zhangyixin llama.cpp % ./server --ctx-size 2048 --host 0.0.0.0 --n-gpu-layers 64 --model TheBloke/Llama-2-7B-Chat-GGUF/llama-2-7b-chat.Q8_0.gguf
{"timestamp":1706616751,"level":"INFO","function":"main","line":2419,"message":"build info","build":1992,"commit":"b2b2bf98"}
{"timestamp":1706616751,"level":"INFO","function":"main","line":2426,"message":"system info","n_threads":8,"n_threads_batch":-1,"total_threads":10,"system_info":"AVX = 0 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | FMA = 0 | NEON = 1 | ARM_FMA = 1 | F16C = 0 | FP16_VA = 1 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 0 | SSSE3 = 0 | VSX = 0 | "}

llama server listening at http://0.0.0.0:8080

{"timestamp":1706616751,"level":"INFO","function":"main","line":2525,"message":"HTTP server listening","port":"8080","hostname":"0.0.0.0"} llama_model_loader: loaded meta data with 19 key-value pairs and 291 tensors from TheBloke/Llama-2-7B-Chat-GGUF/llama-2-7b-chat.Q8_0.gguf (version GGUF V2) llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output. llama_model_loader: - kv 0: general.architecture str = llama llama_model_loader: - kv 1: general.name str = LLaMA v2 llama_model_loader: - kv 2: llama.context_length u32 = 4096 llama_model_loader: - kv 3: llama.embedding_length u32 = 4096 llama_model_loader: - kv 4: llama.block_count u32 = 32 llama_model_loader: - kv 5: llama.feed_forward_length u32 = 11008 llama_model_loader: - kv 6: llama.rope.dimension_count u32 = 128 llama_model_loader: - kv 7: llama.attention.head_count u32 = 32 llama_model_loader: - kv 8: llama.attention.head_count_kv u32 = 32 llama_model_loader: - kv 9: llama.attention.layer_norm_rms_epsilon f32 = 0.000001 llama_model_loader: - kv 10: general.file_type u32 = 7 llama_model_loader: - kv 11: tokenizer.ggml.model str = llama llama_model_loader: - kv 12: tokenizer.ggml.tokens arr[str,32000] = ["", "", "", "<0x00>", "<... llama_model_loader: - kv 13: tokenizer.ggml.scores arr[f32,32000] = [0.000000, 0.000000, 0.000000, 0.0000... llama_model_loader: - kv 14: tokenizer.ggml.token_type arr[i32,32000] = [2, 3, 3, 6, 6, 6, 6, 6, 6, 6, 6, 6, ... llama_model_loader: - kv 15: tokenizer.ggml.bos_token_id u32 = 1 llama_model_loader: - kv 16: tokenizer.ggml.eos_token_id u32 = 2 llama_model_loader: - kv 17: tokenizer.ggml.unknown_token_id u32 = 0 llama_model_loader: - kv 18: general.quantization_version u32 = 2 llama_model_loader: - type f32: 65 tensors llama_model_loader: - type q8_0: 226 tensors llm_load_vocab: special tokens definition check successful ( 259/32000 ). 
llm_load_tensors: ggml ctx size = 0.22 MiB
ggml_backend_metal_buffer_from_ptr: error: failed to allocate buffer, size = 0.00 MiB
llama_model_load: error loading model: failed to allocate buffer
llama_load_model_from_file: failed to load model
llama_init_from_gpt_params: error: failed to load model 'TheBloke/Llama-2-7B-Chat-GGUF/llama-2-7b-chat.Q8_0.gguf'
{"timestamp":1706616751,"level":"ERROR","function":"load_model","line":374,"message":"unable to load model","model":"TheBloke/Llama-2-7B-Chat-GGUF/llama-2-7b-chat.Q8_0.gguf"}
libc++abi: terminating
zsh: abort ./server --ctx-size 2048 --host 0.0.0.0 --n-gpu-layers 64 --model
(base) zhangyixin@zhangyixin llama.cpp % ./server --ctx-size 2048 --host 0.0.0.0 --n-gpu-layers 64 --model TheBloke/Llama-2-7B-Chat-GGUF/llama-2-7b-chat.Q8_0.gguf
{"timestamp":1706686090,"level":"INFO","function":"main","line":2419,"message":"build info","build":1992,"commit":"b2b2bf98"}
{"timestamp":1706686090,"level":"INFO","function":"main","line":2426,"message":"system info","n_threads":8,"n_threads_batch":-1,"total_threads":10,"system_info":"AVX = 0 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | FMA = 0 | NEON = 1 | ARM_FMA = 1 | F16C = 0 | FP16_VA = 1 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 0 | SSSE3 = 0 | VSX = 0 | "}

llama server listening at http://0.0.0.0:8080

{"timestamp":1706686090,"level":"INFO","function":"main","line":2525,"message":"HTTP server listening","port":"8080","hostname":"0.0.0.0"} llama_model_loader: loaded meta data with 19 key-value pairs and 291 tensors from TheBloke/Llama-2-7B-Chat-GGUF/llama-2-7b-chat.Q8_0.gguf (version GGUF V2) llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output. llama_model_loader: - kv 0: general.architecture str = llama llama_model_loader: - kv 1: general.name str = LLaMA v2 llama_model_loader: - kv 2: llama.context_length u32 = 4096 llama_model_loader: - kv 3: llama.embedding_length u32 = 4096 llama_model_loader: - kv 4: llama.block_count u32 = 32 llama_model_loader: - kv 5: llama.feed_forward_length u32 = 11008 llama_model_loader: - kv 6: llama.rope.dimension_count u32 = 128 llama_model_loader: - kv 7: llama.attention.head_count u32 = 32 llama_model_loader: - kv 8: llama.attention.head_count_kv u32 = 32 llama_model_loader: - kv 9: llama.attention.layer_norm_rms_epsilon f32 = 0.000001 llama_model_loader: - kv 10: general.file_type u32 = 7 llama_model_loader: - kv 11: tokenizer.ggml.model str = llama llama_model_loader: - kv 12: tokenizer.ggml.tokens arr[str,32000] = ["", "", "", "<0x00>", "<... llama_model_loader: - kv 13: tokenizer.ggml.scores arr[f32,32000] = [0.000000, 0.000000, 0.000000, 0.0000... llama_model_loader: - kv 14: tokenizer.ggml.token_type arr[i32,32000] = [2, 3, 3, 6, 6, 6, 6, 6, 6, 6, 6, 6, ... llama_model_loader: - kv 15: tokenizer.ggml.bos_token_id u32 = 1 llama_model_loader: - kv 16: tokenizer.ggml.eos_token_id u32 = 2 llama_model_loader: - kv 17: tokenizer.ggml.unknown_token_id u32 = 0 llama_model_loader: - kv 18: general.quantization_version u32 = 2 llama_model_loader: - type f32: 65 tensors llama_model_loader: - type q8_0: 226 tensors llm_load_vocab: special tokens definition check successful ( 259/32000 ). 
llm_load_print_meta: format = GGUF V2 llm_load_print_meta: arch = llama llm_load_print_meta: vocab type = SPM llm_load_print_meta: n_vocab = 32000 llm_load_print_meta: n_merges = 0 llm_load_print_meta: n_ctx_train = 4096 llm_load_print_meta: n_embd = 4096 llm_load_print_meta: n_head = 32 llm_load_print_meta: n_head_kv = 32 llm_load_print_meta: n_layer = 32 llm_load_print_meta: n_rot = 128 llm_load_print_meta: n_embd_head_k = 128 llm_load_print_meta: n_embd_head_v = 128 llm_load_print_meta: n_gqa = 1 llm_load_print_meta: n_embd_k_gqa = 4096 llm_load_print_meta: n_embd_v_gqa = 4096 llm_load_print_meta: f_norm_eps = 0.0e+00 llm_load_print_meta: f_norm_rms_eps = 1.0e-06 llm_load_print_meta: f_clamp_kqv = 0.0e+00 llm_load_print_meta: f_max_alibi_bias = 0.0e+00 llm_load_print_meta: n_ff = 11008 llm_load_print_meta: n_expert = 0 llm_load_print_meta: n_expert_used = 0 llm_load_print_meta: rope scaling = linear llm_load_print_meta: freq_base_train = 10000.0 llm_load_print_meta: freq_scale_train = 1 llm_load_print_meta: n_yarn_orig_ctx = 4096 llm_load_print_meta: rope_finetuned = unknown llm_load_print_meta: model type = 7B llm_load_print_meta: model ftype = Q8_0 llm_load_print_meta: model params = 6.74 B llm_load_print_meta: model size = 6.67 GiB (8.50 BPW) llm_load_print_meta: general.name = LLaMA v2 llm_load_print_meta: BOS token = 1 '' llm_load_print_meta: EOS token = 2 '' llm_load_print_meta: UNK token = 0 '' llm_load_print_meta: LF token = 13 '<0x0A>' llm_load_tensors: ggml ctx size = 0.22 MiB ggml_backend_metal_buffer_from_ptr: error: failed to allocate buffer, size = 0.00 MiB llama_model_load: error loading model: failed to allocate buffer llama_load_model_from_file: failed to load model llama_init_from_gpt_params: error: failed to load model 'TheBloke/Llama-2-7B-Chat-GGUF/llama-2-7b-chat.Q8_0.gguf' {"timestamp":1706686090,"level":"ERROR","function":"load_model","line":374,"message":"unable to load model","model":"TheBloke/Llama-2-7B-Chat-GGUF/llama-2-7b-chat.Q8_0.gguf"} libc++abi: terminating zsh: abort ./server --ctx-size 2048 --host 0.0.0.0 --n-gpu-layers 64 --model (base) zhangyixin@zhangyixin llama.cpp % open . (base) zhangyixin@zhangyixin llama.cpp % find . -name "*.gguf"

./TheBloke/Llama-2-7B-Chat-GGUF/llama-2-7b-chat.Q8_0.gguf
./yi-chat-6B-GGUF/yi-chat-6b.Q2_K.gguf
./models/ggml-vocab-mpt.gguf
./models/ggml-vocab-refact.gguf
./models/ggml-vocab-baichuan.gguf
./models/ggml-vocab-aquila.gguf
./models/ggml-vocab-stablelm-3b-4e1t.gguf
./models/ggml-vocab-starcoder.gguf
./models/ggml-vocab-gpt2.gguf
./models/ggml-vocab-llama.gguf
./models/ggml-vocab-falcon.gguf
./models/ggml-vocab-gpt-neox.gguf
./Orion-14B-Chat.gguf
(base) zhangyixin@zhangyixin llama.cpp %
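Even this 6.67 GiB Q8_0 7B model fails with the same zero-size Metal buffer, so a genuine out-of-memory condition looks unlikely. Two quick checks, using only standard macOS tooling and the server flags already shown above: confirm the machine's unified memory, and rerun with offloading disabled to see whether the CPU path works:

sysctl hw.memsize   # total unified memory, in bytes
./server --ctx-size 2048 --host 0.0.0.0 --n-gpu-layers 0 --model TheBloke/Llama-2-7B-Chat-GGUF/llama-2-7b-chat.Q8_0.gguf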


in/anaconda3/include -Wunreachable-code-break -Wunreachable-code-return -Wmissing-prototypes -Wextra-semi examples/gguf/gguf.cpp ggml.o ggml-metal.o ggml-alloc.o ggml-backend.o ggml-quants.o -o gguf -framework Accelerate -framework Foundation -framework Metal -framework MetalKit -Wl,-pie -Wl,-headerpad_max_install_names -Wl,-dead_strip_dylibs -Wl,-rpath,/Users/zhangyixin/anaconda3/lib -L/Users/zhangyixin/anaconda3/lib arm64-apple-darwin20.0.0-clang++ -I. -Icommon -D_XOPEN_SOURCE=600 -D_DARWIN_C_SOURCE -DNDEBUG -DGGML_USE_ACCELERATE -DACCELERATE_NEW_LAPACK -DACCELERATE_LAPACK_ILP64 -DGGML_USE_METAL -D_FORTIFY_SOURCE=2 -isystem /Users/zhangyixin/anaconda3/include -std=c++11 -fPIC -O3 -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wmissing-declarations -Wmissing-noreturn -pthread -ftree-vectorize -fPIC -fPIE -fstack-protector-strong -O2 -pipe -stdlib=libc++ -fvisibility-inlines-hidden -fmessage-length=0 -isystem /Users/zhangyixin/anaconda3/include -Wunreachable-code-break -Wunreachable-code-return -Wmissing-prototypes -Wextra-semi examples/benchmark/benchmark-matmult.cpp build-info.o ggml.o ggml-metal.o ggml-alloc.o ggml-backend.o ggml-quants.o -o benchmark-matmult -framework Accelerate -framework Foundation -framework Metal -framework MetalKit -Wl,-pie -Wl,-headerpad_max_install_names -Wl,-dead_strip_dylibs -Wl,-rpath,/Users/zhangyixin/anaconda3/lib -L/Users/zhangyixin/anaconda3/lib arm64-apple-darwin20.0.0-clang++ -I. -Icommon -D_XOPEN_SOURCE=600 -D_DARWIN_C_SOURCE -DNDEBUG -DGGML_USE_ACCELERATE -DACCELERATE_NEW_LAPACK -DACCELERATE_LAPACK_ILP64 -DGGML_USE_METAL -D_FORTIFY_SOURCE=2 -isystem /Users/zhangyixin/anaconda3/include -std=c++11 -fPIC -O3 -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wmissing-declarations -Wmissing-noreturn -pthread -ftree-vectorize -fPIC -fPIE -fstack-protector-strong -O2 -pipe -stdlib=libc++ -fvisibility-inlines-hidden -fmessage-length=0 -isystem /Users/zhangyixin/anaconda3/include -Wunreachable-code-break -Wunreachable-code-return -Wmissing-prototypes -Wextra-semi examples/export-lora/export-lora.cpp ggml.o ggml-metal.o ggml-alloc.o ggml-backend.o ggml-quants.o -o export-lora -framework Accelerate -framework Foundation -framework Metal -framework MetalKit -Wl,-pie -Wl,-headerpad_max_install_names -Wl,-dead_strip_dylibs -Wl,-rpath,/Users/zhangyixin/anaconda3/lib -L/Users/zhangyixin/anaconda3/lib arm64-apple-darwin20.0.0-clang++ -I. -Icommon -D_XOPEN_SOURCE=600 -D_DARWIN_C_SOURCE -DNDEBUG -DGGML_USE_ACCELERATE -DACCELERATE_NEW_LAPACK -DACCELERATE_LAPACK_ILP64 -DGGML_USE_METAL -D_FORTIFY_SOURCE=2 -isystem /Users/zhangyixin/anaconda3/include -std=c++11 -fPIC -O3 -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wmissing-declarations -Wmissing-noreturn -pthread -ftree-vectorize -fPIC -fPIE -fstack-protector-strong -O2 -pipe -stdlib=libc++ -fvisibility-inlines-hidden -fmessage-length=0 -isystem /Users/zhangyixin/anaconda3/include -Wunreachable-code-break -Wunreachable-code-return -Wmissing-prototypes -Wextra-semi examples/main/main.cpp ggml.o llama.o common.o sampling.o grammar-parser.o build-info.o console.o ggml-metal.o ggml-alloc.o ggml-backend.o ggml-quants.o -o main -framework Accelerate -framework Foundation -framework Metal -framework MetalKit -Wl,-pie -Wl,-headerpad_max_install_names -Wl,-dead_strip_dylibs -Wl,-rpath,/Users/zhangyixin/anaconda3/lib -L/Users/zhangyixin/anaconda3/lib arm64-apple-darwin20.0.0-clang++ -I. 
-Icommon -D_XOPEN_SOURCE=600 -D_DARWIN_C_SOURCE -DNDEBUG -DGGML_USE_ACCELERATE -DACCELERATE_NEW_LAPACK -DACCELERATE_LAPACK_ILP64 -DGGML_USE_METAL -D_FORTIFY_SOURCE=2 -isystem /Users/zhangyixin/anaconda3/include -std=c++11 -fPIC -O3 -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wmissing-declarations -Wmissing-noreturn -pthread -ftree-vectorize -fPIC -fPIE -fstack-protector-strong -O2 -pipe -stdlib=libc++ -fvisibility-inlines-hidden -fmessage-length=0 -isystem /Users/zhangyixin/anaconda3/include -Wunreachable-code-break -Wunreachable-code-return -Wmissing-prototypes -Wextra-semi examples/quantize/quantize.cpp build-info.o ggml.o llama.o ggml-metal.o ggml-alloc.o ggml-backend.o ggml-quants.o -o quantize -framework Accelerate -framework Foundation -framework Metal -framework MetalKit -Wl,-pie -Wl,-headerpad_max_install_names -Wl,-dead_strip_dylibs -Wl,-rpath,/Users/zhangyixin/anaconda3/lib -L/Users/zhangyixin/anaconda3/lib arm64-apple-darwin20.0.0-clang++ -I. -Icommon -D_XOPEN_SOURCE=600 -D_DARWIN_C_SOURCE -DNDEBUG -DGGML_USE_ACCELERATE -DACCELERATE_NEW_LAPACK -DACCELERATE_LAPACK_ILP64 -DGGML_USE_METAL -D_FORTIFY_SOURCE=2 -isystem /Users/zhangyixin/anaconda3/include -std=c++11 -fPIC -O3 -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wmissing-declarations -Wmissing-noreturn -pthread -ftree-vectorize -fPIC -fPIE -fstack-protector-strong -O2 -pipe -stdlib=libc++ -fvisibility-inlines-hidden -fmessage-length=0 -isystem /Users/zhangyixin/anaconda3/include -Wunreachable-code-break -Wunreachable-code-return -Wmissing-prototypes -Wextra-semi examples/quantize-stats/quantize-stats.cpp build-info.o ggml.o llama.o ggml-metal.o ggml-alloc.o ggml-backend.o ggml-quants.o -o quantize-stats -framework Accelerate -framework Foundation -framework Metal -framework MetalKit -Wl,-pie -Wl,-headerpad_max_install_names -Wl,-dead_strip_dylibs -Wl,-rpath,/Users/zhangyixin/anaconda3/lib -L/Users/zhangyixin/anaconda3/lib arm64-apple-darwin20.0.0-clang++ -I. -Icommon -D_XOPEN_SOURCE=600 -D_DARWIN_C_SOURCE -DNDEBUG -DGGML_USE_ACCELERATE -DACCELERATE_NEW_LAPACK -DACCELERATE_LAPACK_ILP64 -DGGML_USE_METAL -D_FORTIFY_SOURCE=2 -isystem /Users/zhangyixin/anaconda3/include -std=c++11 -fPIC -O3 -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wmissing-declarations -Wmissing-noreturn -pthread -ftree-vectorize -fPIC -fPIE -fstack-protector-strong -O2 -pipe -stdlib=libc++ -fvisibility-inlines-hidden -fmessage-length=0 -isystem /Users/zhangyixin/anaconda3/include -Wunreachable-code-break -Wunreachable-code-return -Wmissing-prototypes -Wextra-semi examples/perplexity/perplexity.cpp ggml.o llama.o common.o sampling.o grammar-parser.o build-info.o ggml-metal.o ggml-alloc.o ggml-backend.o ggml-quants.o -o perplexity -framework Accelerate -framework Foundation -framework Metal -framework MetalKit -Wl,-pie -Wl,-headerpad_max_install_names -Wl,-dead_strip_dylibs -Wl,-rpath,/Users/zhangyixin/anaconda3/lib -L/Users/zhangyixin/anaconda3/lib arm64-apple-darwin20.0.0-clang++ -I. 
-Icommon -D_XOPEN_SOURCE=600 -D_DARWIN_C_SOURCE -DNDEBUG -DGGML_USE_ACCELERATE -DACCELERATE_NEW_LAPACK -DACCELERATE_LAPACK_ILP64 -DGGML_USE_METAL -D_FORTIFY_SOURCE=2 -isystem /Users/zhangyixin/anaconda3/include -std=c++11 -fPIC -O3 -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wmissing-declarations -Wmissing-noreturn -pthread -ftree-vectorize -fPIC -fPIE -fstack-protector-strong -O2 -pipe -stdlib=libc++ -fvisibility-inlines-hidden -fmessage-length=0 -isystem /Users/zhangyixin/anaconda3/include -Wunreachable-code-break -Wunreachable-code-return -Wmissing-prototypes -Wextra-semi examples/imatrix/imatrix.cpp ggml.o llama.o common.o sampling.o grammar-parser.o build-info.o ggml-metal.o ggml-alloc.o ggml-backend.o ggml-quants.o -o imatrix -framework Accelerate -framework Foundation -framework Metal -framework MetalKit -Wl,-pie -Wl,-headerpad_max_install_names -Wl,-dead_strip_dylibs -Wl,-rpath,/Users/zhangyixin/anaconda3/lib -L/Users/zhangyixin/anaconda3/lib arm64-apple-d
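The build transcript above is the most telling clue: every target is compiled and linked with Anaconda's arm64-apple-darwin20.0.0-clang++ against /Users/zhangyixin/anaconda3/lib, and the earlier `main: built with clang version 14.0.6 for arm64-apple-darwin20.0.0` banner confirms the binaries came from that toolchain rather than Apple's. A conda cross-toolchain targeting an old darwin20 SDK is a plausible cause of a Metal backend that cannot allocate any buffer. A hedged fix, assuming the stock Makefile of this era (command-line CC/CXX assignments override the Makefile's defaults): rebuild outside the conda environment with the system clang.

conda deactivate
make clean
make CC=/usr/bin/clang CXX=/usr/bin/clang++ -j
# then retry a model that previously failed
./main -m TheBloke/Llama-2-7B-Chat-GGUF/llama-2-7b-chat.Q8_0.gguf -p "Hello" -n 16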

zyxcambridge commented 9 months ago

ggml_backend_metal_buffer_from_ptr: error: failed to allocate buffer, size = 0.00 MiB
llama_init_from_gpt_params: error: failed to load model 'TheBloke/Llama-2-7B-Chat-GGUF/llama-2-7b-chat.Q8_0.gguf'