Open simonw opened 4 months ago
Step one:
diff --git a/pyproject.toml b/pyproject.toml
index e541b67..c893fdd 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -11,9 +11,9 @@ classifiers = [
dependencies = [
"llm",
"requests-mock",
- "replicate>=0.9.0",
+ "replicate>=0.25.2",
]
-requires-python = ">3.7"
+requires-python = ">3.8"
[project.optional-dependencies]
test = ["pytest"]
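As a quick sanity check, something like this confirms the installed client meets the new floor (a minimal sketch, assuming the package is already installed):

# Check the installed replicate version against the ">=0.25.2" floor above.
from importlib.metadata import version

print(version("replicate"))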
It supports streaming now:
import replicate
# https://replicate.com/meta/llama-2-70b-chat
model_version = "meta/llama-2-70b-chat:02e509c789964a7ea8736978a43525956ef40397be9033abf9fd2badfe68c9e3"
for event in replicate.stream(
    model_version,
    input={
        "prompt": "Please write a haiku about llamas.",
    },
):
    print(str(event), end="")
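The streamed pieces can also be collected into a single string if needed - a minimal sketch using the same model and input as above:

chunks = []
for event in replicate.stream(
    model_version,
    input={"prompt": "Please write a haiku about llamas."},
):
    # Each event stringifies to the next chunk of generated text.
    chunks.append(str(event))
full_response = "".join(chunks)
print(full_response)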
It prefers you to use a REPLICATE_API_TOKEN environment variable, but you can still create a Client() and pass api_token= to its constructor.
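Both options side by side (a minimal sketch; the token values are placeholders):

import os
import replicate

# Option 1: set the environment variable; the module-level helpers like
# replicate.stream() pick it up automatically.
os.environ["REPLICATE_API_TOKEN"] = "r8_..."  # placeholder

# Option 2: create a Client explicitly and pass the token to its constructor.
client = replicate.Client(api_token="r8_...")  # placeholder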
For Llama 3, https://replicate.com/meta/meta-llama-3-70b-instruct?input=python suggests this:
# The meta/meta-llama-3-70b-instruct model can stream output as it's running.
for event in replicate.stream(
    "meta/meta-llama-3-70b-instruct",
    input={
        "top_k": 50,
        "top_p": 0.9,
        "prompt": "Work through this problem step by step:\n\nQ: Sarah has 7 llamas. Her friend gives her 3 more trucks of llamas. Each truck has 5 llamas. How many llamas does Sarah have in total?",
        "max_tokens": 512,
        "min_tokens": 0,
        "temperature": 0.6,
        "prompt_template": "<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\nYou are a helpful assistant<|eot_id|><|start_header_id|>user<|end_header_id|>\n\n{prompt}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n",
        "presence_penalty": 1.15,
        "frequency_penalty": 0.2
    },
):
    print(str(event), end="")
Note the need for a prompt_template of "<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\nYou are a helpful assistant<|eot_id|><|start_header_id|>user<|end_header_id|>\n\n{prompt}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n".
Those docs also say:
Prompt template. The string {prompt} will be substituted for the input prompt. If you want to generate dialog output, use this template as a starting point and construct the prompt string manually, leaving prompt_template={prompt}.
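So my reading is that the substitution amounts to a straight string replacement, roughly like this (an illustration of my understanding, not code from Replicate):

prompt_template = (
    "<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\n"
    "You are a helpful assistant<|eot_id|>"
    "<|start_header_id|>user<|end_header_id|>\n\n"
    "{prompt}<|eot_id|>"
    "<|start_header_id|>assistant<|end_header_id|>\n\n"
)
prompt = "Please write a haiku about llamas."
# The literal "{prompt}" placeholder is replaced with the input prompt.
final_prompt = prompt_template.replace("{prompt}", prompt)
print(final_prompt)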
https://github.com/meta-llama/llama3?tab=readme-ov-file#instruction-tuned-models says:
The fine-tuned models were trained for dialogue applications. To get the expected features and performance for them, a specific formatting defined in ChatFormat needs to be followed: The prompt begins with a <|begin_of_text|> special token, after which one or more messages follow. Each message starts with the <|start_header_id|> tag, the role system, user or assistant, and the <|end_header_id|> tag. After a double newline \n\n the contents of the message follow. The end of each message is marked by the <|eot_id|> token.
Useful example code from https://github.com/meta-llama/llama3/blob/87d55e86f7170e31c6f46b8f521ed65242109938/llama/tokenizer.py#L202-L229
class ChatFormat:
    def __init__(self, tokenizer: Tokenizer):
        self.tokenizer = tokenizer

    def encode_header(self, message: Message) -> List[int]:
        tokens = []
        tokens.append(self.tokenizer.special_tokens["<|start_header_id|>"])
        tokens.extend(self.tokenizer.encode(message["role"], bos=False, eos=False))
        tokens.append(self.tokenizer.special_tokens["<|end_header_id|>"])
        tokens.extend(self.tokenizer.encode("\n\n", bos=False, eos=False))
        return tokens

    def encode_message(self, message: Message) -> List[int]:
        tokens = self.encode_header(message)
        tokens.extend(
            self.tokenizer.encode(message["content"].strip(), bos=False, eos=False)
        )
        tokens.append(self.tokenizer.special_tokens["<|eot_id|>"])
        return tokens

    def encode_dialog_prompt(self, dialog: Dialog) -> List[int]:
        tokens = []
        tokens.append(self.tokenizer.special_tokens["<|begin_of_text|>"])
        for message in dialog:
            tokens.extend(self.encode_message(message))
        # Add the start of an assistant message for the model to complete.
        tokens.extend(self.encode_header({"role": "assistant", "content": ""}))
        return tokens
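Following the docs' suggestion above - construct the prompt string manually and leave prompt_template as "{prompt}" - here's a string-level sketch of the same format (a hypothetical helper, not part of the plugin yet):

# Hypothetical string-level equivalent of ChatFormat.encode_dialog_prompt,
# for use with prompt_template="{prompt}" so Replicate adds no further templating.
def format_dialog(messages):
    # messages: list of {"role": ..., "content": ...} dicts
    parts = ["<|begin_of_text|>"]
    for message in messages:
        parts.append(f"<|start_header_id|>{message['role']}<|end_header_id|>\n\n")
        parts.append(message["content"].strip())
        parts.append("<|eot_id|>")
    # Start of an assistant message for the model to complete.
    parts.append("<|start_header_id|>assistant<|end_header_id|>\n\n")
    return "".join(parts)

dialog = [
    {"role": "system", "content": "You are a helpful assistant"},
    {"role": "user", "content": "Please write a haiku about llamas."},
]
prompt_string = format_dialog(dialog)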
The challenge in implementing this is figuring out what to do about those prompt templates.
I think I need model card support - I need to give users a very quick way to add a model that includes the configured prompt template.
I'm going to delay implementing this until I've figured out the model card mechanism for LLM itself.
Confirmed that this works:
client = replicate.Client(api_token='r8_...')
for event in client.stream(
    "meta/meta-llama-3-70b-instruct",
    input={
        "top_k": 50,
        "top_p": 0.9,
        "prompt": "Work through this problem step by step:\n\nQ: Sarah has 7 llamas. Her friend gives her 3 more trucks of llamas. Each truck has 5 llamas. How many llamas does Sarah have in total?",
        "max_tokens": 512,
        "min_tokens": 0,
        "temperature": 0.6,
        "prompt_template": "<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\nYou are a helpful assistant<|eot_id|><|start_header_id|>user<|end_header_id|>\n\n{prompt}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n",
        "presence_penalty": 1.15,
        "frequency_penalty": 0.2
    },
):
    print(str(event), end="")
The trickiest bit of the prompt template will be getting conversation mode to work.
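One way it could work (a sketch of an assumption, not implemented yet): rebuild the full Llama 3 dialog from the prior prompt/response pairs on every turn and send it with prompt_template="{prompt}":

# Sketch: assemble a conversation's history into a single Llama 3 prompt string.
# history is assumed to be a list of (user_prompt, assistant_response) pairs.
def build_conversation_prompt(system, history, new_prompt):
    parts = [
        "<|begin_of_text|>",
        f"<|start_header_id|>system<|end_header_id|>\n\n{system}<|eot_id|>",
    ]
    for user_prompt, assistant_response in history:
        parts.append(f"<|start_header_id|>user<|end_header_id|>\n\n{user_prompt}<|eot_id|>")
        parts.append(f"<|start_header_id|>assistant<|end_header_id|>\n\n{assistant_response}<|eot_id|>")
    parts.append(f"<|start_header_id|>user<|end_header_id|>\n\n{new_prompt}<|eot_id|>")
    # Leave an open assistant header for the model to complete.
    parts.append("<|start_header_id|>assistant<|end_header_id|>\n\n")
    return "".join(parts)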
Originally posted by @simonw in https://github.com/simonw/llm-replicate/issues/24#issuecomment-2068518475