dqbd / tiktoken

JS port and JS/WASM bindings for openai/tiktoken
MIT License

Support fine-tuned models #76

Open italojs opened 8 months ago

italojs commented 8 months ago

When I use my fine-tuned model I get this error: Failed to calculate number of tokens, falling back to approximate count Error: Unknown model. I took a look at the code and the getEncodingNameForModel function expects the exact model name, but a simple "contains" check would solve the problem, since OpenAI includes the base model's name in fine-tuned model names.

Can I implement this fix?

e.g.:

function getEncodingNameForModel(model) {
  // Order matters with a "contains" check: mappings with longer, more specific
  // names must come before short generic keywords such as "ada" or "davinci",
  // otherwise e.g. "text-embedding-ada-002" would match the r50k_base "ada" entry.
  const modelMappings = [
    { keywords: ["gpt2"], encoding: "gpt2" },
    { keywords: ["code-cushman-001", "code-cushman-002", "code-davinci-001", "code-davinci-002", "cushman-codex", "davinci-codex", "text-davinci-002", "text-davinci-003"], encoding: "p50k_base" },
    { keywords: ["code-davinci-edit-001", "text-davinci-edit-001"], encoding: "p50k_edit" },
    { keywords: ["gpt-3.5-turbo-16k-0613", "gpt-3.5-turbo-16k", "gpt-3.5-turbo-0613", "gpt-3.5-turbo-0301", "gpt-3.5-turbo", "gpt-4-32k-0613", "gpt-4-32k-0314", "gpt-4-32k", "gpt-4-0613", "gpt-4-0314", "gpt-4", "text-embedding-ada-002"], encoding: "cl100k_base" },
    { keywords: ["ada", "babbage", "code-search-ada-code-001", "code-search-babbage-code-001", "curie", "davinci", "text-ada-001", "text-babbage-001", "text-curie-001", "text-davinci-001", "text-search-ada-doc-001", "text-search-babbage-doc-001", "text-search-curie-doc-001", "text-search-davinci-doc-001", "text-similarity-ada-001", "text-similarity-babbage-001", "text-similarity-curie-001", "text-similarity-davinci-001"], encoding: "r50k_base" },
  ];

  // Return the encoding of the first mapping whose keyword appears in the model name.
  for (const mapping of modelMappings) {
    if (mapping.keywords.some((keyword) => model.includes(keyword))) {
      return mapping.encoding;
    }
  }

  throw new Error("Unknown model");
}
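
As a rough sketch of how this would behave, assuming a fine-tuned model name in OpenAI's ft: prefix format (the org and job id below are made up, not from this issue):

// Hypothetical fine-tuned model name containing the base model "gpt-3.5-turbo-0613".
const fineTunedModel = "ft:gpt-3.5-turbo-0613:my-org::abc123";

// The contains check matches the base model's keyword and returns its encoding.
console.log(getEncodingNameForModel(fineTunedModel)); // "cl100k_base"

// Names that contain no known base model still throw, so the existing
// "falling back to approximate count" behavior is preserved for callers.
try {
  getEncodingNameForModel("some-custom-model");
} catch (err) {
  console.log(err.message); // "Unknown model"
}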
italojs commented 8 months ago

@dqbd