abetlen / llama-cpp-python

Python bindings for llama.cpp
https://llama-cpp-python.readthedocs.io
MIT License

ValueError: could not broadcast input array from shape (32000,) into shape (0,) in line 480 #657

Open RealUnrealGameDev opened 1 year ago

RealUnrealGameDev commented 1 year ago

Expected Behavior

It should interact with the user in a chat loop indefinitely.

Current Behavior

After a few questions and answers, the program throws this error:

Traceback (most recent call last):
  File "app.py", line 160, in <module>
    init()
  File "app.py", line 43, in init
    process_user_input(input_txt)
  File "app.py", line 64, in process_user_input
    repeat_penalty=REPEAT_PENALTY,
  File "app.py", line 104, in m_generate
    m_eval(model, tokens, True)
  File "app.py", line 157, in m_eval
    __eval()
  File "app.py", line 150, in __eval
    model.eval(batch)
  File "D:\Work\AI Freind\Venv\lib\site-packages\llama_cpp\llama.py", line 480, in eval
    )[:] = llama_cpp.llama_get_logits(self.ctx)[: rows * cols]
ValueError: could not broadcast input array from shape (32000,) into shape (0,)

Environment and Context

I am using llama-persona for the chat system, running on a GTX 1660 Ti, 16 GB RAM, an i7-9750H CPU, and Windows 11.

Failure Information (for bugs)

The error is raised at line 480 of llama.py, and it only appears after a few messages.
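
For context, the assignment at line 480 copies the logits for the newly evaluated batch into a buffer that was pre-allocated for n_ctx positions, sliced at the current position; once that position has reached the end of the buffer, the target slice is empty and NumPy raises exactly this error. A minimal NumPy illustration of the shape behaviour (not the library's actual code):

import numpy as np

n_ctx = 4
input_ids = np.zeros(n_ctx, dtype=np.intc)   # buffer sized to the context window
n_tokens = 4                                 # the window is already full
batch = np.array([1], dtype=np.intc)         # one more token to evaluate
input_ids[n_tokens : n_tokens + len(batch)] = batch
# ValueError: could not broadcast input array from shape (1,) into shape (0,)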

Steps to Reproduce


  1. git clone https://github.com/ngxson/llama-persona.git
  2. pip install -r requirements.txt
  3. Download llama.cpp compatible model and place it in the project root directory.
  4. Copy config.example.py to config.py
  5. Copy my config code into your config file
  6. Copy my app.py code into your app.py file
  7. run app.py with python app.py
  8. Go back and forth with a few messages and the error should appear

Failure Logs

This is my app.py/main.py file:

import llama_cpp
import sys
import re
import signal
import imp
from alive_progress import alive_bar

try:
    imp.find_module("config")
    from config import *
except ImportError:
    print("Cannot find config.py")
    exit(1)

model = llama_cpp.Llama(
    model_path=MODEL_PATH,
    seed=SEED,
    n_threads=N_THREADS,
    last_n_tokens_size=N_LAST_TOKENS,
    n_ctx=N_CTX,
)

TOKEN_BOS = model.token_bos()
TOKEN_EOS = model.token_eos()

PROMPT_INIT = f""" {PERSONA_DESC}

Pretend that you are {PERSONA_NAME}. Below is an instruction that describes a task. Write a response that appropriately completes the request.""".encode()

is_received_stop_signal = False  # TODO: catching SIGINT signal

def init():
    global state_after_init_prompt
    print("")
    m_eval(model, m_tokenize(model, PROMPT_INIT, True), False, "Starting up...")

    try:
        while True:
            print("\n> ", end="", flush=True)
            input_txt = input("You: ")
            process_user_input(input_txt)
    except KeyboardInterrupt:
        pass

def process_user_input(text):
    global state_after_init_prompt, is_received_stop_signal
    is_received_stop_signal = False

    # generate response
    response_bytes = b""
    response_txt = ""
    input_tokens = m_tokenize(
        model, (f"\n\n### Instruction:\n\n{text}\n\n### Response:\n\n").encode()
    )
    for token in m_generate(
        model,
        input_tokens,
        top_k=TOP_K,
        top_p=TOP_P,
        temp=TEMP,
        repeat_penalty=REPEAT_PENALTY,
    ):
        if token == TOKEN_EOS:
            break
        should_stop = False
        response_added_bytes = model.detokenize([token])
        response_bytes += response_added_bytes
        response_txt = response_bytes.decode("utf-8", errors="ignore")
        if "###" in response_txt:
            response_txt = re.sub(r"\s+###", "", response_txt)
            sys.stdout.write("\033[K")  # Clear to the end of line
            print(response_txt.split("\n")[-1], end="", flush=True)
            should_stop = True
        print(response_added_bytes.decode(errors="ignore"), end="", flush=True)
        if should_stop:
            break

    # build context for next message
    input_ins_truncated = " ".join(text.split(" ")[:N_TOKENS_KEEP_INS])
    print("1")
    input_res_truncated = " ".join(response_txt.split(" ")[:N_TOKENS_KEEP_RES])
    print("2")
    input_history = f"\n\n### Instruction:\n\n{input_ins_truncated}\n\n### Response:\n\n{input_res_truncated}"
    print("3")
    history_tokens = m_tokenize(model, input_history.encode())
    print("4")
    print("\n\n", end="", flush=True)
    print("5")
    m_eval(model, history_tokens, False, "Build context...")
    print("6")

def m_generate(model: llama_cpp.Llama, tokens, top_k, top_p, temp, repeat_penalty):
    """Generate without self.reset()"""
    global is_received_stop_signal
    is_received_stop_signal = False
    try:
        while True:
            if is_received_stop_signal:
                yield TOKEN_EOS
            m_eval(model, tokens, True)
            token = model.sample(
                top_k=top_k,
                top_p=top_p,
                temp=temp,
                repeat_penalty=repeat_penalty,
            )
            tokens_or_none = yield token
            tokens = [token]
            if tokens_or_none is not None:
                tokens.extend(tokens_or_none)
    except KeyboardInterrupt:
        pass

def m_tokenize(model: llama_cpp.Llama, text: bytes, add_bos=False):
    assert model.ctx is not None
    n_ctx = llama_cpp.llama_n_ctx(model.ctx)
    tokens = (llama_cpp.llama_token * int(n_ctx))()
    n_tokens = llama_cpp.llama_tokenize(
        model.ctx,
        text,
        tokens,
        n_ctx,
        llama_cpp.c_bool(add_bos),
    )
    if int(n_tokens) < 0:
        raise RuntimeError(f'Failed to tokenize: text="{text}" n_tokens={n_tokens}')
    return list(tokens[:n_tokens])

def m_eval(model: llama_cpp.Llama, tokens, stop_on_signal=False, show_progress=False):
    global is_received_stop_signal

    def chunks(lst, n):
        return [lst[i : i + n] for i in range(0, len(lst), n)]

    batches = chunks(tokens, N_BATCH)

    def __eval(bar=None):
        global is_received_stop_signal
        for i, batch in enumerate(batches):
            if stop_on_signal and is_received_stop_signal:
                is_received_stop_signal = False
                return
            else:
                model.eval(batch)
                bar(len(batch)) if bar is not None else None

    if show_progress:
        with alive_bar(len(tokens), theme="classic", title=show_progress) as bar:
            __eval(bar)
    else:
        __eval()

init()

This is my config.py file:

import random

MODEL_PATH = "D:\\Work\\AI Freind\\Components\\Character AI\\llama-persona\\Models\\13b_Roleplay_(3min).bin"
N_THREADS = 12
TOP_K = 80
TOP_P = 1
TEMP = 0.4
REPEAT_PENALTY = 1.1
N_BATCH = 20
N_CTX = 2048 * 2  # 10000000 2048
N_LAST_TOKENS = 48
SEED = random.randint(1, 10000000000000000)

# persona; ideally in one paragraph (about 200-300 words)
PERSONA_NAME = "DanTDM"
PERSONA_DESC = "Your name is DanTDM"

# number of tokens to be kept for context history
N_TOKENS_KEEP_INS = 10000000
N_TOKENS_KEEP_RES = 20000000
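
With N_TOKENS_KEEP_INS and N_TOKENS_KEEP_RES this large, the history is never really truncated, and the model state accumulates tokens on every turn without ever being reset, so the total evaluated tokens can exceed N_CTX after a few messages. A possible guard, as a minimal sketch meant to sit inside app.py above (it reuses m_eval, m_tokenize, PROMPT_INIT and N_CTX from that file, and assumes a llama-cpp-python version that tracks the evaluated-token count in Llama.n_tokens and offers Llama.reset(); older builds track this count differently):

def m_eval_guarded(model: llama_cpp.Llama, tokens, stop_on_signal=False, show_progress=False):
    # Hypothetical wrapper: if evaluating these tokens would overflow the
    # context window, reset the model state and replay the persona prompt
    # before continuing. getattr() keeps this runnable on builds without
    # the n_tokens attribute (where the guard simply never fires).
    if getattr(model, "n_tokens", 0) + len(tokens) > N_CTX:
        model.reset()
        m_eval(model, m_tokenize(model, PROMPT_INIT, True), False, "Rebuilding context...")
    m_eval(model, tokens, stop_on_signal, show_progress)

Every m_eval call in process_user_input and m_generate would then go through this wrapper instead.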
abtExp commented 11 months ago

I'm having similar issues when using it with LangChain.

alex4321 commented 11 months ago

I got similar issues when the number of tokens (prompt + generated) exceeded n_ctx (an argument of the Llama constructor):

from llama_cpp import Llama

model = Llama(
    model_path="../../model/saiga_mistral_7b_gguf/model-q8_0.gguf",
    n_gpu_layers=-1,
    # n_ctx left at its default, which is what the generation loop overflows
)
tokens = model.tokenize(prompt.encode("utf-8"))  # prompt is the chat prompt text
count = 0
for token in model.generate(tokens, top_k=40, top_p=0.95, temp=1.0, repeat_penalty=1.1):
    count += 1
    print(model.detokenize([token]))

So I found that I was getting it while generating the 513th token.

A few debug prints traced it back to the same place.

So if the error is the same for you, you can limit the generation size.
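
For example, a sketch of capping generation at the remaining context budget (it assumes the current llama-cpp-python API: Llama.n_ctx(), Llama.tokenize(), Llama.token_eos(); the model path and prompt are placeholders):

from llama_cpp import Llama

model = Llama(model_path="model-q8_0.gguf", n_ctx=2048, n_gpu_layers=-1)

prompt_tokens = model.tokenize(b"Write a short story about a robot.")
# leave room so prompt + generated tokens never exceed n_ctx
budget = model.n_ctx() - len(prompt_tokens) - 1

count = 0
for token in model.generate(prompt_tokens, top_k=40, top_p=0.95, temp=1.0, repeat_penalty=1.1):
    if token == model.token_eos() or count >= budget:
        break
    print(model.detokenize([token]).decode("utf-8", errors="ignore"), end="", flush=True)
    count += 1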

@abtExp

Not sure about @RealUnrealGameDev's code.

giulioco commented 10 months ago

Getting a similar issue when using from langchain.embeddings import LlamaCppEmbeddings:

File langchain/embeddings/llamacpp.py", line 113, in <listcomp>
    embeddings = [self.client.embed(text) for text in texts]
                  ^^^^^^^^^^^^^^^^^^^^^^^
File llama_cpp/llama.py", line 899, in embed
    return list(map(float, self.create_embedding(input)["data"][0]["embedding"]))
                           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File llama_cpp/llama.py", line 863, in create_embedding
    self.eval(tokens)
File llama_cpp/llama.py", line 543, in eval
    self.input_ids[self.n_tokens : self.n_tokens + n_tokens] = batch
    ~~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
ValueError: could not broadcast input array from shape (8,) into shape (0,)
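
The embedding path hits the same limit when a text is longer than the context window. A sketch of two workarounds (assuming LlamaCppEmbeddings forwards n_ctx to the underlying Llama, which it does in recent LangChain versions; the model path is a placeholder):

from langchain.embeddings import LlamaCppEmbeddings

# Workaround 1: enlarge the context window so long inputs fit.
embeddings = LlamaCppEmbeddings(model_path="model.gguf", n_ctx=4096)

# Workaround 2: split long documents before embedding so every chunk
# stays well under n_ctx tokens.
texts = ["first chunk of the document", "second chunk of the document"]
vectors = embeddings.embed_documents(texts)
query_vector = embeddings.embed_query("a single short query")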