ayttop opened this issue 1 month ago
The Community Contributions section includes a CPU version, and the following is a llama_cpp implementation shared from the WeChat group:
```python
import ctypes
import multiprocessing
import os

import llama_cpp
import torch

llama_cpp.llama_backend_init(numa=False)  # newer llama-cpp-python builds take no argument

N_THREADS = multiprocessing.cpu_count()
# Raw string instead of the original bytes literal with unescaped backslashes;
# the low-level API expects a bytes path, so encode it.
MODEL_PATH = os.environ.get(
    "MODEL",
    r"C:\ocr\GOT-OCR2.0-main\GOT-OCR2.0-main\llama-b3804-bin-win-avx-x64\None-619M-123-F16.gguf",
).encode("utf-8")

prompt = b"\n\n### Instruction:\nWhat is the capital of France?\n\n### Response:\n"

lparams = llama_cpp.llama_model_default_params()
cparams = llama_cpp.llama_context_default_params()
# NOTE: 151643 is Qwen's <|endoftext|> token id, not a context length; it looks
# pasted by mistake. Put the model's real context size here.
cparams.n_ctx = 151643
cparams.n_threads = N_THREADS  # computed but never wired up in the original

model = llama_cpp.llama_load_model_from_file(MODEL_PATH, lparams)
ctx = llama_cpp.llama_new_context_with_model(model, cparams)

n_past = 0
prompt = b" " + prompt

# Tokenization is bypassed: instead of calling llama_tokenize on the text
# prompt, the prompt token ids are loaded from tensor.pt.
embd_inp = torch.load("tensor.pt").to(torch.int32).squeeze().cpu().tolist()

n_ctx = llama_cpp.llama_n_ctx(ctx)
n_predict = 20
n_predict = min(n_predict, n_ctx - len(embd_inp))

input_consumed = 0
input_noecho = False
remaining_tokens = n_predict
embd = []
last_n_size = 64
last_n_tokens_data = [0] * last_n_size
n_batch = 24
last_n_repeat = 64
repeat_penalty = 1.0
frequency_penalty = 0.0
presence_penalty = 0.0

while remaining_tokens > 0:
    if len(embd) > 0:
        # Missing from the original paste: pending tokens must be evaluated
        # before llama_get_logits() has anything to return. llama_eval matches
        # the (older) sampling API used below; newer llama.cpp builds use
        # llama_decode with a llama_batch instead.
        llama_cpp.llama_eval(
            ctx, (llama_cpp.llama_token * len(embd))(*embd), len(embd), n_past
        )
    n_past += len(embd)
    embd = []
    if len(embd_inp) <= input_consumed:
        # The whole prompt has been consumed: sample the next token.
        logits = llama_cpp.llama_get_logits(ctx)
        n_vocab = llama_cpp.llama_n_vocab(model)
        _arr = (llama_cpp.llama_token_data * n_vocab)(
            *[
                llama_cpp.llama_token_data(token_id, logits[token_id], 0.0)
                for token_id in range(n_vocab)
            ]
        )
        candidates_p = ctypes.pointer(
            llama_cpp.llama_token_data_array(_arr, len(_arr), False)
        )
        _arr = (llama_cpp.llama_token * len(last_n_tokens_data))(*last_n_tokens_data)
        llama_cpp.llama_sample_repetition_penalties(
            ctx,
            candidates_p,
            _arr,
            last_n_repeat,
            repeat_penalty,
            frequency_penalty,
            presence_penalty,
        )
        llama_cpp.llama_sample_top_k(ctx, candidates_p, 40, 1)
        llama_cpp.llama_sample_top_p(ctx, candidates_p, 0.8, 1)
        # llama_cpp.llama_sample_temperature(ctx, candidates_p, 0.2)
        new_id = llama_cpp.llama_sample_token(ctx, candidates_p)
        last_n_tokens_data = last_n_tokens_data[1:] + [new_id]
        embd.append(new_id)
        input_noecho = False
        remaining_tokens -= 1
    else:
        # Still feeding the prompt, up to n_batch tokens per iteration.
        while len(embd_inp) > input_consumed:
            embd.append(embd_inp[input_consumed])
            last_n_tokens_data = last_n_tokens_data[1:] + [embd_inp[input_consumed]]
            input_consumed += 1
            if len(embd) >= n_batch:
                break
    if not input_noecho:
        for token in embd:
            size = 32
            buffer = (ctypes.c_char * size)()
            n = llama_cpp.llama_token_to_piece(
                model, llama_cpp.llama_token(token), buffer, size, 0, False
            )
            assert n <= size
            print(buffer[:n].decode("utf-8"), end="", flush=True)
    # llama_token_eos takes the model, not the context, in this API version.
    if len(embd) > 0 and embd[-1] == llama_cpp.llama_token_eos(model):
        break

print()
llama_cpp.llama_print_timings(ctx)
llama_cpp.llama_free(ctx)
llama_cpp.llama_free_model(model)
```
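The script assumes a `tensor.pt` file holding the prompt's token ids. A minimal sketch of how such a file might be produced with the GOT tokenizer; the `ucaslcl/GOT-OCR2_0` checkpoint id is the public one and the prompt mirrors the snippet above, but both are assumptions, not from the original post:

```python
# Hedged sketch: producing a tensor.pt of prompt token ids for the script
# above. Model id and prompt are illustrative, not from the post.
import torch
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("ucaslcl/GOT-OCR2_0", trust_remote_code=True)
prompt = "\n\n### Instruction:\nWhat is the capital of France?\n\n### Response:\n"
ids = tokenizer(prompt, return_tensors="pt").input_ids  # shape (1, n_tokens)
torch.save(ids, "tensor.pt")  # later loaded and squeezed to a flat id list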
How do I convert the GOT-OCR2.0 model to GGUF format?
Are the cpp weights open? Please help open-source them!
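For reference, a hedged sketch of the generic llama.cpp conversion flow. `convert_hf_to_gguf.py` ships with llama.cpp and would only cover the Qwen2-style language-model weights, not GOT's vision encoder, and whether it recognizes this architecture out of the box is untested here; paths are placeholders:

```python
# Hedged sketch: drive llama.cpp's stock HF->GGUF converter. Only the
# language-model half is handled; GOT's vision encoder is not covered.
import subprocess

subprocess.run(
    [
        "python", "convert_hf_to_gguf.py",  # script shipped with llama.cpp
        "/path/to/GOT-OCR2_0",              # local Hugging Face checkpoint dir
        "--outfile", "got-ocr2-f16.gguf",
        "--outtype", "f16",
    ],
    check=True,  # raise if the converter rejects the architecture
)
```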
@ayttop @1694439208 has implemented llama_cpp inference: https://github.com/1694439208/GOT-OCR-Inference
thank you
@ayttop how's the accuracy in comparison to the CUDA version?
Is anyone able to run this with the CPU? I am facing installation issues, please help.
How do I run GOT-OCR2.0 on CPU only?
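A minimal sketch of a CPU-only run through Hugging Face transformers, assuming a checkpoint whose custom `chat()` helper (from the official README) has no hardcoded `.cuda()` calls, as in the community CPU version; the model id and image path are illustrative:

```python
# Hedged sketch: CPU-only inference via transformers. Assumes GOT's custom
# chat() helper runs with all weights kept on CPU in float32.
import torch
from transformers import AutoModel, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("ucaslcl/GOT-OCR2_0", trust_remote_code=True)
model = AutoModel.from_pretrained(
    "ucaslcl/GOT-OCR2_0",
    trust_remote_code=True,
    low_cpu_mem_usage=True,
    device_map="cpu",            # keep everything on CPU
    torch_dtype=torch.float32,   # CPUs generally lack fast fp16/bf16 kernels
    use_safetensors=True,
    pad_token_id=tokenizer.eos_token_id,
)
model = model.eval()

# ocr_type="ocr" is the plain-text mode from the official README
result = model.chat(tokenizer, "image.png", ocr_type="ocr")
print(result)
```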