ayttop opened this issue 1 month ago
The Community Contributions section includes a CPU version, and the following is a llama_cpp implementation shared from the WeChat group:
```python
import ctypes
import multiprocessing
import os

import llama_cpp
import torch

llama_cpp.llama_backend_init(numa=False)  # newer llama-cpp-python builds take no argument

N_THREADS = multiprocessing.cpu_count()
# Raw string instead of the original bytes literal with unescaped backslashes;
# the low-level API expects a bytes path, so encode it.
MODEL_PATH = os.environ.get(
    "MODEL",
    r"C:\ocr\GOT-OCR2.0-main\GOT-OCR2.0-main\llama-b3804-bin-win-avx-x64\None-619M-123-F16.gguf",
).encode("utf-8")

prompt = b"\n\n### Instruction:\nWhat is the capital of France?\n\n### Response:\n"

lparams = llama_cpp.llama_model_default_params()
cparams = llama_cpp.llama_context_default_params()
# NOTE: 151643 is Qwen's <|endoftext|> token id, not a context length; it looks
# pasted by mistake. Put the model's real context size here.
cparams.n_ctx = 151643
cparams.n_threads = N_THREADS  # computed but never wired up in the original

model = llama_cpp.llama_load_model_from_file(MODEL_PATH, lparams)
ctx = llama_cpp.llama_new_context_with_model(model, cparams)

n_past = 0
prompt = b" " + prompt

# Tokenization is bypassed: instead of calling llama_tokenize on the text
# prompt, the prompt token ids are loaded from tensor.pt.
embd_inp = torch.load("tensor.pt").to(torch.int32).squeeze().cpu().tolist()

n_ctx = llama_cpp.llama_n_ctx(ctx)
n_predict = 20
n_predict = min(n_predict, n_ctx - len(embd_inp))

input_consumed = 0
input_noecho = False
remaining_tokens = n_predict
embd = []
last_n_size = 64
last_n_tokens_data = [0] * last_n_size
n_batch = 24
last_n_repeat = 64
repeat_penalty = 1.0
frequency_penalty = 0.0
presence_penalty = 0.0

while remaining_tokens > 0:
    if len(embd) > 0:
        # Missing from the original paste: pending tokens must be evaluated
        # before llama_get_logits() has anything to return. llama_eval matches
        # the (older) sampling API used below; newer llama.cpp builds use
        # llama_decode with a llama_batch instead.
        llama_cpp.llama_eval(
            ctx, (llama_cpp.llama_token * len(embd))(*embd), len(embd), n_past
        )
    n_past += len(embd)
    embd = []
    if len(embd_inp) <= input_consumed:
        # The whole prompt has been consumed: sample the next token.
        logits = llama_cpp.llama_get_logits(ctx)
        n_vocab = llama_cpp.llama_n_vocab(model)
        _arr = (llama_cpp.llama_token_data * n_vocab)(
            *[
                llama_cpp.llama_token_data(token_id, logits[token_id], 0.0)
                for token_id in range(n_vocab)
            ]
        )
        candidates_p = ctypes.pointer(
            llama_cpp.llama_token_data_array(_arr, len(_arr), False)
        )
        _arr = (llama_cpp.llama_token * len(last_n_tokens_data))(*last_n_tokens_data)
        llama_cpp.llama_sample_repetition_penalties(
            ctx,
            candidates_p,
            _arr,
            last_n_repeat,
            repeat_penalty,
            frequency_penalty,
            presence_penalty,
        )
        llama_cpp.llama_sample_top_k(ctx, candidates_p, 40, 1)
        llama_cpp.llama_sample_top_p(ctx, candidates_p, 0.8, 1)
        # llama_cpp.llama_sample_temperature(ctx, candidates_p, 0.2)
        new_id = llama_cpp.llama_sample_token(ctx, candidates_p)
        last_n_tokens_data = last_n_tokens_data[1:] + [new_id]
        embd.append(new_id)
        input_noecho = False
        remaining_tokens -= 1
    else:
        # Still feeding the prompt, up to n_batch tokens per iteration.
        while len(embd_inp) > input_consumed:
            embd.append(embd_inp[input_consumed])
            last_n_tokens_data = last_n_tokens_data[1:] + [embd_inp[input_consumed]]
            input_consumed += 1
            if len(embd) >= n_batch:
                break
    if not input_noecho:
        for token in embd:
            size = 32
            buffer = (ctypes.c_char * size)()
            n = llama_cpp.llama_token_to_piece(
                model, llama_cpp.llama_token(token), buffer, size, 0, False
            )
            assert n <= size
            print(buffer[:n].decode("utf-8"), end="", flush=True)
    # llama_token_eos takes the model, not the context, in this API version.
    if len(embd) > 0 and embd[-1] == llama_cpp.llama_token_eos(model):
        break

print()
llama_cpp.llama_print_timings(ctx)
llama_cpp.llama_free(ctx)
llama_cpp.llama_free_model(model)
```
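The script assumes a `tensor.pt` file holding the prompt's token ids. A minimal sketch of how such a file might be produced with the GOT tokenizer; the `ucaslcl/GOT-OCR2_0` checkpoint id is the public one and the prompt mirrors the snippet above, but both are assumptions, not from the original post:

```python
# Hedged sketch: producing a tensor.pt of prompt token ids for the script
# above. Model id and prompt are illustrative, not from the post.
import torch
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("ucaslcl/GOT-OCR2_0", trust_remote_code=True)
prompt = "\n\n### Instruction:\nWhat is the capital of France?\n\n### Response:\n"
ids = tokenizer(prompt, return_tensors="pt").input_ids  # shape (1, n_tokens)
torch.save(ids, "tensor.pt")  # later loaded and squeezed to a flat id list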
How do I convert the GOT-OCR2.0 model to GGUF format?
Are the cpp weights open? Please help open-source them!
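For reference, a hedged sketch of the generic llama.cpp conversion flow. `convert_hf_to_gguf.py` ships with llama.cpp and would only cover the Qwen2-style language-model weights, not GOT's vision encoder, and whether it recognizes this architecture out of the box is untested here; paths are placeholders:

```python
# Hedged sketch: drive llama.cpp's stock HF->GGUF converter. Only the
# language-model half is handled; GOT's vision encoder is not covered.
import subprocess

subprocess.run(
    [
        "python", "convert_hf_to_gguf.py",  # script shipped with llama.cpp
        "/path/to/GOT-OCR2_0",              # local Hugging Face checkpoint dir
        "--outfile", "got-ocr2-f16.gguf",
        "--outtype", "f16",
    ],
    check=True,  # raise if the converter rejects the architecture
)
```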
@ayttop @1694439208 has implemented llama_cpp inference: https://github.com/1694439208/GOT-OCR-Inference
thank you
@ayttop how's the accuracy in comparison to the CUDA version?
Is anyone able to run this with the CPU? I am facing installation issues, please help.
How do I run GOT-OCR2.0 on CPU only?
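A minimal sketch of a CPU-only run through Hugging Face transformers, assuming a checkpoint whose custom `chat()` helper (from the official README) has no hardcoded `.cuda()` calls, as in the community CPU version; the model id and image path are illustrative:

```python
# Hedged sketch: CPU-only inference via transformers. Assumes GOT's custom
# chat() helper runs with all weights kept on CPU in float32.
import torch
from transformers import AutoModel, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("ucaslcl/GOT-OCR2_0", trust_remote_code=True)
model = AutoModel.from_pretrained(
    "ucaslcl/GOT-OCR2_0",
    trust_remote_code=True,
    low_cpu_mem_usage=True,
    device_map="cpu",            # keep everything on CPU
    torch_dtype=torch.float32,   # CPUs generally lack fast fp16/bf16 kernels
    use_safetensors=True,
    pad_token_id=tokenizer.eos_token_id,
)
model = model.eval()

# ocr_type="ocr" is the plain-text mode from the official README
result = model.chat(tokenizer, "image.png", ocr_type="ocr")
print(result)
```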