dqbd / tiktoken

JS port and JS/WASM bindings for openai/tiktoken
MIT License
704 stars 53 forks source link

Add decode_with_offsets function (or PR guidance request) #94

Open Sam1320 opened 6 months ago

Sam1320 commented 6 months ago

It would be great to have 1:1 parity with the Python implementation.

Adding this function would be a step in that direction.

I already created my own implementation:

// This is an implementation of the token decoding functions that the Python tiktoken library provides but this port is missing.
// The code is based on the Python implementation in the tiktoken library.

/**
 * Decodes every token to its raw bytes.
 *
 * @param tokens - Iterable of token ids.
 * @param enc - Encoder object exposing `decode_single_token_bytes(token)`.
 * @returns {Array} One entry per token, in input order, holding whatever
 *   `enc.decode_single_token_bytes` returned for that token.
 */
function decode_token_bytes(tokens, enc) {
    // Array.from (rather than tokens.map) so a typed array of token ids still
    // yields a plain array of per-token byte chunks.
    return Array.from(tokens, (tokenId) => enc.decode_single_token_bytes(tokenId));
}

/**
 * Decodes a list of tokens into a string and a list of offsets.
 *
 * Each offset is the index into the returned text at which the corresponding
 * token starts. If UTF-8 character boundaries do not line up with token
 * boundaries, the offset is the index of the first character that contains
 * bytes from the token.
 *
 * Offsets are expressed in UTF-16 code units, i.e. they can be used directly
 * with `text[i]` / `text.slice(i)`. For text without characters outside the
 * Basic Multilingual Plane this is the same as counting code points (which is
 * what the Python implementation this was ported from does).
 *
 * @param tokens - Iterable of token ids.
 * @param enc - Encoder object exposing `decode_single_token_bytes(token)`.
 * @returns {[string, number[]]} The decoded text and one start offset per token.
 * @throws {TypeError} If the concatenated token bytes are not valid UTF-8;
 *   this behavior may change in the future to be more permissive.
 *
 * @example
 * // token ids taken from the Python docstring this function was ported from
 * decode_with_offsets([31373, 995], enc);
 * // => ['hello world', [0, 5]]
 */
function decode_with_offsets(tokens, enc) {
    const tokenBytes = decode_token_bytes(tokens, enc);

    // UTF-8 continuation bytes have the bit pattern 10xxxxxx (0x80..0xBF);
    // every other byte starts a new character.
    const isContinuation = (byte) => byte >= 0x80 && byte < 0xC0;

    let textLen = 0;        // length of the decoded text so far, in UTF-16 code units
    let lastCharStart = 0;  // index at which the most recently started character begins
    let byteLen = 0;        // total number of bytes across all tokens
    const offsets = [];
    for (const token of tokenBytes) {
        // A token that opens with a continuation byte belongs to the character
        // started by a previous token, so it is reported at that character's index.
        offsets.push(isContinuation(token[0]) ? lastCharStart : textLen);
        for (const byte of token) {
            if (!isContinuation(byte)) {
                lastCharStart = textLen;
                // Lead bytes >= 0xF0 start a 4-byte sequence, which a JS string
                // stores as a surrogate pair (two code units).
                textLen += byte >= 0xF0 ? 2 : 1;
            }
        }
        byteLen += token.length;
    }

    // Join the per-token chunks without relying on Node's Buffer so the
    // function also works in browser/edge runtimes.
    const allBytes = new Uint8Array(byteLen);
    let cursor = 0;
    for (const token of tokenBytes) {
        allBytes.set(token, cursor);
        cursor += token.length;
    }

    // TODO: assess correctness for errors="ignore" and errors="replace"
    // fatal: throw on malformed input instead of substituting U+FFFD.
    // ignoreBOM: keep a leading BOM in the text so offsets stay aligned with it.
    const text = new TextDecoder('utf-8', { fatal: true, ignoreBOM: true }).decode(allBytes);
    return [text, offsets];
}

export { decode_with_offsets }

which I then use like this:

import { encoding_for_model } from "tiktoken";
import { decode_with_offsets } from "./modules/TiktokenExtra.js";

// Encode a sample string (it contains a non-ASCII "é") and decode it back
// together with the start offset of every token.
const enc = encoding_for_model("gpt-4");
const test_string = "lorém ipsum dolor sit amet, consectetur adipiscing elit sed do eiusmod tempor incididunt ut labore et dolore magna aliqua.";
const tokens = enc.encode(test_string);
const [decoded_text, offsets] = decode_with_offsets(tokens, enc);
// The WASM-backed encoder is not reclaimed by the JS garbage collector;
// release it explicitly once it is no longer needed.
enc.free();

I'm happy to create a PR to add this to the library. I just need some guidance on where exactly this logic should go.