from pyllamacpp.model import Model

model = Model('/home/devel/Downloads/chatllama-ggml-q4_0.bin')

# Earlier callback-based call, kept for reference:
# model.generate("从前,", n_predict=64, new_text_callback=new_text_callback, n_threads=4, verbose=True)

# The prompt "从前," means "Once upon a time," in Chinese.
for token in model.generate("从前,", n_predict=512):
    # Manual UTF-8 decoding no longer seems necessary, kept for reference:
    # try:
    #     tok = token.decode('utf-8')
    # except UnicodeDecodeError:
    #     tok = token.decode('utf-8', 'replace')
    print(token, end='', flush=True)
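For what it's worth, if a build ever yields raw bytes instead of str (which is what the commented-out try/except was guarding against), an incremental decoder is a cleaner way to handle multi-byte UTF-8 characters (e.g. Chinese) that get split across token boundaries. This is just a sketch, not part of the pyllamacpp API:

# Sketch: an incremental decoder buffers incomplete UTF-8 sequences
# across chunks instead of raising UnicodeDecodeError mid-character.
import codecs

decoder = codecs.getincrementaldecoder('utf-8')(errors='replace')
for chunk in model.generate("从前,", n_predict=512):
    if isinstance(chunk, bytes):  # only needed if tokens arrive as bytes
        chunk = decoder.decode(chunk)  # holds back partial sequences
    print(chunk, end='', flush=True)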
Tested using the model https://huggingface.co/P01son/ChatLLaMA-zh-7B-int4 from issue #61.
[Test 1 and Test 2: screenshots of the generated output]
The responses feel much faster; I guess it's because you use generators to return the tokens.
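As a toy illustration of why streaming via a generator reduces perceived latency (generate_stream below is hypothetical, not a pyllamacpp function): the caller receives each token as soon as it is produced instead of waiting for the whole completion.

import time
from typing import Iterator

def generate_stream(tokens: list, delay: float = 0.05) -> Iterator[str]:
    # Toy stand-in for model.generate: yield each token as soon as it
    # is "produced", so the caller never waits for the full completion.
    for tok in tokens:
        time.sleep(delay)  # simulate per-token inference latency
        yield tok

# The first token appears after one delay, not after len(tokens) delays.
for tok in generate_stream(["Once", " upon", " a", " time..."]):
    print(tok, end='', flush=True)
print()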