#!/usr/bin/env python3
import tiktoken
import sys
# Load the tokenizer that tiktoken associates with this model name.
m = tiktoken.encoding_for_model("gpt-3.5-turbo-0613")

# For every command-line argument, print the argument followed by the text of
# each token it encodes to. Each token id is decoded on its own so the token
# boundaries are visible in the output.
for v in sys.argv[1:]:
    print(v, [m.decode([x]) for x in m.encode(v)])
extern crate fancy_regex;
use fancy_regex::Regex;
/// Runs the split pattern over each sample string and prints every match
/// (debug-quoted, space-separated), one output line per sample.
fn main() {
    // The pattern is a fixed literal, so failing to compile it is a programming error.
    let re = Regex::new(r"(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}{1,3}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+")
        .expect("hard-coded pattern should compile");
    let hays = [
        "llo",
    ];
    for hay in &hays {
        for mat in re.find_iter(hay) {
            // `find_iter` yields a `Result` per match; surface a meaningful
            // message instead of an empty one if matching fails at runtime.
            let mat = mat.expect("regex matching failed at runtime");
            print!("{:?} ", mat.as_str());
        }
        // Terminate the line of matches for this sample.
        println!();
    }
}
The following program:
outputs:
Why? The regex for cl100k_base (https://github.com/openai/tiktoken/blob/5d970c1100d3210b42497203d6b5c1e30cfda6cb/tiktoken_ext/openai_public.py#L63) matches
llo
as a single match. The following Rust program:
Outputs a single token:
Why are
llo
two tokens and not one?