determo13 opened 1 year ago
Here is the example I run. It works fine without CUDA, but it's slow as f....
const { LLama } = require("../@llama-node/llama-cpp.linux-x64-gnu.node");
const path = require("path");

const run = async () => {
  const llama = await LLama.load(
    {
      modelPath: path.resolve(process.cwd(), "ggml-vic7b-uncensored-q5_1.bin"),
      enableLogging: true,
      nCtx: 1024,
      nParts: -1,
      seed: 0,
      f16Kv: false,
      logitsAll: false,
      vocabOnly: false,
      useMlock: false,
      embedding: false,
      useMmap: false,
      nGpuLayers: 8,
    },
    true
  );

  const prompt = `Who is the president of the United States?`;

  const params = {
    nThreads: 4,
    nTokPredict: 2048,
    topK: 40,
    topP: 0.1,
    temp: 0.2,
    repeatPenalty: 1,
    prompt,
  };

  const abort = llama.inference(params, (data) => {
    console.log(data.data && data.data.token ? data.data.token : "");
  });
};

run();
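A minimal variation on the inference callback above, in case it helps: instead of logging each token, collect them into one string. This only relies on the data.data.token field already used above; how llama-node signals end-of-stream varies by version, so it is not handled here.

// Sketch only: same llama and params as in run() above, but accumulate the
// streamed tokens into a single string instead of logging them one by one.
// End-of-stream detection is version-specific in llama-node and is omitted.
let response = "";
llama.inference(params, (data) => {
  if (data.data && data.data.token) {
    response += data.data.token;
  }
});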
Can you run it on llama.cpp directly, without llama-node?
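For example, with a cuBLAS build of llama.cpp you could try its main binary on the same model, something along these lines (the exact flag names depend on the version you have checked out):

# Sketch only: run the same model with GPU offload directly through llama.cpp's
# main binary, bypassing llama-node. Flag names may differ between versions.
./main -m ./ggml-vic7b-uncensored-q5_1.bin \
  -p "Who is the president of the United States?" \
  -n 128 -t 4 --n-gpu-layers 8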
~/llama-node/packages/llama-cpp$ node example/mycode.ts
llama.cpp: loading model from /llama-node/packages/llama-cpp/ggml-vic7b-uncensored-q5_1.bin
llama_model_load_internal: format = ggjt v2 (latest)
llama_model_load_internal: n_vocab = 32001
llama_model_load_internal: n_ctx = 1024
llama_model_load_internal: n_embd = 4096
llama_model_load_internal: n_mult = 256
llama_model_load_internal: n_head = 32
llama_model_load_internal: n_layer = 32
llama_model_load_internal: n_rot = 128
llama_model_load_internal: ftype = 9 (mostly Q5_1)
llama_model_load_internal: n_ff = 11008
llama_model_load_internal: n_parts = 1
llama_model_load_internal: model size = 7B
llama_model_load_internal: ggml ctx size = 72.75 KB
llama_model_load_internal: mem required = 6612.59 MB (+ 2052.00 MB per state)
llama_model_load_internal: [cublas] offloading 8 layers to GPU
llama_model_load_internal: [cublas] total VRAM used: 1158 MB
llama_init_from_file: kv self size = 1024.00 MB
[Fri, 26 May 2023 09:45:06 +0000 - INFO - llama_node_cpp::context] - AVX = 1 | AVX2 = 1 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | FMA = 1 | NEON = 0 | ARM_FMA = 0 | F16C = 1 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | VSX = 0 |
[Fri, 26 May 2023 09:45:06 +0000 - INFO - llama_node_cpp::llama] - tokenized_stop_prompt: None
CUDA error 209 at /llama-node/packages/llama-cpp/llama-sys/llama.cpp/ggml-cuda.cu:693: no kernel image is available for execution on the device
I've got a Tesla K80 card running on Ubuntu. Any advice on what to do and where to look would be appreciated.