namespace Microsoft.ML.Tokenizers
{
    public partial class WordPieceTokenizer : Tokenizer
    {
        public static WordPieceTokenizer Create(
            string vocabFilePath,
            PreTokenizer? preTokenizer = null,
            Normalizer? normalizer = null,
            IReadOnlyDictionary<string, int>? specialTokens = null,
            string unknownToken = "[UNK]",
            string continuingSubwordPrefix = DefaultContinuingSubwordPrefix,
            int maxInputCharsPerWord = DefaultMaxInputCharsPerWord);

        public static WordPieceTokenizer Create(
            Stream vocabStream,
            PreTokenizer? preTokenizer = null,
            Normalizer? normalizer = null,
            IReadOnlyDictionary<string, int>? specialTokens = null,
            string unknownToken = "[UNK]",
            string continuingSubwordPrefix = DefaultContinuingSubwordPrefix,
            int maxInputCharsPerWord = DefaultMaxInputCharsPerWord);

        public static Task<WordPieceTokenizer> CreateAsync(
            Stream vocabStream,
            PreTokenizer? preTokenizer = null,
            Normalizer? normalizer = null,
            IReadOnlyDictionary<string, int>? specialTokens = null,
            string unknownToken = "[UNK]",
            string continuingSubwordPrefix = DefaultContinuingSubwordPrefix,
            int maxInputCharsPerWord = DefaultMaxInputCharsPerWord,
            CancellationToken cancellationToken = default);

        public string UnknownToken { get; }
        public int UnknownTokenId { get; }
        public string ContinuingSubwordPrefix { get; }
        public int MaxInputCharsPerWord { get; }
        public IReadOnlyDictionary<string, int>? SpecialTokens { get; }

        public string Decode(IEnumerable<int> ids, bool skipSpecialTokens);
        public OperationStatus Decode(
            IEnumerable<int> ids,
            Span<char> destination,
            bool skipSpecialTokens,
            out int idsConsumed,
            out int charsWritten);
    }
}
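For illustration, a minimal usage sketch of this surface. The vocab file name is a placeholder, and the EncodeToIds call is assumed to come from the Tokenizer base abstraction rather than from this listing:

using System.Collections.Generic;
using Microsoft.ML.Tokenizers;

// Hypothetical BERT-style vocab file: one WordPiece token per line.
WordPieceTokenizer tokenizer = WordPieceTokenizer.Create("vocab.txt");

// EncodeToIds is inherited from the Tokenizer base class (assumed here).
IReadOnlyList<int> ids = tokenizer.EncodeToIds("hello world");

// Round-trip back to text, dropping special tokens such as [UNK].
string roundTrip = tokenizer.Decode(ids, skipSpecialTokens: true);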
Feedback:

- sep, pad, cls: spell out these abbreviations.
- Rename tokenizeChineseChars to something more accurate, such as tokenizeCjkChars.
- Is stripAccents correct? Discussion seems to indicate that we might strip more. Also, replace strip with remove.
- The do prefix:
  - doLowerCase -> convertToLowerCase
  - doBasicTokenization -> applyBasicTokenization
- BuildInputsWithSpecialTokens / GetSpecialTokensMask / CreateTokenTypeIdsFromSequences:
  - tokenIds0 -> tokenIds
  - tokenIds1 -> additionalTokenIds
  - buffer -> destination
  - written -> valuesWritten
namespace Microsoft.ML.Tokenizers;

public sealed partial class BertTokenizer : WordPieceTokenizer
{
    public static BertTokenizer Create(
        string vocabFilePath,
        bool doLowerCase = true,
        bool doBasicTokenization = true,
        bool splitOnSpecialTokens = true,
        string unknownToken = "[UNK]",
        string sepToken = "[SEP]",
        string padToken = "[PAD]",
        string clsToken = "[CLS]",
        string maskToken = "[MASK]",
        bool tokenizeChineseChars = true,
        bool stripAccents = false);

    public static BertTokenizer Create(
        Stream vocabStream,
        bool doLowerCase = true,
        bool doBasicTokenization = true,
        bool splitOnSpecialTokens = true,
        string unknownToken = "[UNK]",
        string sepToken = "[SEP]",
        string padToken = "[PAD]",
        string clsToken = "[CLS]",
        string maskToken = "[MASK]",
        bool tokenizeChineseChars = true,
        bool stripAccents = false);

    public static Task<BertTokenizer> CreateAsync(
        Stream vocabStream,
        bool doLowerCase = true,
        bool doBasicTokenization = true,
        bool splitOnSpecialTokens = true,
        string unknownToken = "[UNK]",
        string sepToken = "[SEP]",
        string padToken = "[PAD]",
        string clsToken = "[CLS]",
        string maskToken = "[MASK]",
        bool tokenizeChineseChars = true,
        bool stripAccents = false);

    public bool DoLowerCase { get; }
    public bool DoBasicTokenization { get; }
    public bool SplitOnSpecialTokens { get; }
    public string SepToken { get; }
    public int SepTokenId { get; }
    public string PadToken { get; }
    public int PadTokenId { get; }
    public string ClsToken { get; }
    public int ClsTokenId { get; }
    public string MaskToken { get; }
    public int MaskTokenId { get; }
    public bool TokenizeChineseChars { get; }
    public bool StripAccents { get; }

    public IReadOnlyList<int> EncodeToIds(
        string text,
        bool addSpecialTokens,
        bool considerPreTokenization = true,
        bool considerNormalization = true);

    public IReadOnlyList<int> EncodeToIds(
        ReadOnlySpan<char> text,
        bool addSpecialTokens,
        bool considerPreTokenization = true,
        bool considerNormalization = true);

    public IReadOnlyList<int> EncodeToIds(
        string text,
        int maxTokenCount,
        bool addSpecialTokens,
        out string? normalizedText,
        out int charsConsumed,
        bool considerPreTokenization = true,
        bool considerNormalization = true);

    public IReadOnlyList<int> EncodeToIds(
        ReadOnlySpan<char> text,
        int maxTokenCount,
        bool addSpecialTokens,
        out string? normalizedText,
        out int charsConsumed,
        bool considerPreTokenization = true,
        bool considerNormalization = true);

    public IReadOnlyList<int> BuildInputsWithSpecialTokens(
        IEnumerable<int> tokenIds0,
        IEnumerable<int>? tokenIds1 = null);

    public OperationStatus BuildInputsWithSpecialTokens(
        IEnumerable<int> tokenIds0,
        Span<int> buffer,
        out int written,
        IEnumerable<int>? tokenIds1 = null);

    public IReadOnlyList<int> GetSpecialTokensMask(
        IEnumerable<int> tokenIds0,
        IEnumerable<int>? tokenIds1 = null,
        bool alreadyHasSpecialTokens = false);

    public OperationStatus GetSpecialTokensMask(
        IEnumerable<int> tokenIds0,
        Span<int> buffer,
        out int written,
        IEnumerable<int>? tokenIds1 = null,
        bool alreadyHasSpecialTokens = false);

    public IReadOnlyList<int> CreateTokenTypeIdsFromSequences(
        IEnumerable<int> tokenIds0,
        IEnumerable<int>? tokenIds1 = null);

    public OperationStatus CreateTokenTypeIdsFromSequences(
        IEnumerable<int> tokenIds0,
        Span<int> buffer,
        out int written,
        IEnumerable<int>? tokenIds1 = null);
}
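To make the pair-encoding helpers concrete, a sketch of composing them for a two-sentence input; the vocab file and sentences are illustrative, and passing ids without special tokens mirrors the convention of the equivalent Hugging Face helpers:

using System.Collections.Generic;
using Microsoft.ML.Tokenizers;

BertTokenizer bert = BertTokenizer.Create("vocab.txt");

// Encode each sentence without special tokens first.
IReadOnlyList<int> first = bert.EncodeToIds("The cat sat.", addSpecialTokens: false);
IReadOnlyList<int> second = bert.EncodeToIds("It was tired.", addSpecialTokens: false);

// [CLS] first [SEP] second [SEP], plus the matching segment ids (0s then 1s).
IReadOnlyList<int> inputIds = bert.BuildInputsWithSpecialTokens(first, second);
IReadOnlyList<int> typeIds = bert.CreateTokenTypeIdsFromSequences(first, second);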
PreTokenizer:

- Rename the factory method suffix: CreateWhiteSpaceOrPunctuation -> CreateWordOrPunctuation
- specialTokensEncoder -> specialTokens
namespace Microsoft.ML.Tokenizers;

public partial class PreTokenizer
{
    public static PreTokenizer CreateWordOrPunctuation(IReadOnlyDictionary<string, int>? specialTokens = null);
    public static PreTokenizer CreateWordOrNonWord(IReadOnlyDictionary<string, int>? specialTokens = null);
    public static PreTokenizer CreateWhitespace(IReadOnlyDictionary<string, int>? specialTokens = null);
}
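A sketch of wiring one of these factories into the WordPiece Create overload above; the vocab file name is a placeholder:

using Microsoft.ML.Tokenizers;

// Split input into word or punctuation runs before WordPiece sees it.
PreTokenizer preTokenizer = PreTokenizer.CreateWordOrPunctuation();

WordPieceTokenizer tokenizer = WordPieceTokenizer.Create(
    "vocab.txt",
    preTokenizer: preTokenizer);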
From @tarekgh:

- TokenizeChineseChars: We are going to cover some of the CJK ranges but not all of them, so I suggest the name IndividuallyTokenizeCjk. The docs will list the exact ranges we use.
- stripAccents: I discovered we need to remove only the non-spacing marks, so I suggest naming it removeNonSpacingMarks.
- doLowerCase, doBasicTokenization: I suggest lowerCaseBeforeTokenization and applyBasicTokenization. I am fine with applyLowerCasing if Jeremy is fine with it.
- Using options: I am seeing that polymorphism is not a bad idea after all with options. I am suggesting the following:
Incorporating both our feedback above and the options proposal, I believe the totality of the API looks as below. The proposal omits the overridden properties and methods that are defined in the abstraction we reviewed before.
namespace Microsoft.ML.Tokenizers;

public class WordPieceOptions
{
    public WordPieceOptions();

    public PreTokenizer? PreTokenizer { get; set; }
    public Normalizer? Normalizer { get; set; }
    public IReadOnlyDictionary<string, int>? SpecialTokens { get; set; }
    public string UnknownToken { get; set; } = "[UNK]";
    public string ContinuingSubwordPrefix { get; set; } = DefaultContinuingSubwordPrefix;
    public int MaxInputCharsPerWord { get; set; } = DefaultMaxInputCharsPerWord;
}

public sealed class BertOptions : WordPieceOptions
{
    public BertOptions();

    public bool LowerCaseBeforeTokenization { get; set; } = true;
    public bool ApplyBasicTokenization { get; set; } = true;
    public bool SplitOnSpecialTokens { get; set; } = true;
    public string SeparatorToken { get; set; } = "[SEP]";
    public string PaddingToken { get; set; } = "[PAD]";
    public string ClassificationToken { get; set; } = "[CLS]";
    public string MaskingToken { get; set; } = "[MASK]";
    public bool IndividuallyTokenizeCjk { get; set; } = true;
    public bool RemoveNonSpacingMarks { get; set; }
}
public partial class WordPieceTokenizer : Tokenizer
{
    public static WordPieceTokenizer Create(string vocabFilePath, WordPieceOptions? options = null);
    public static WordPieceTokenizer Create(Stream vocabStream, WordPieceOptions? options = null);
    public static Task<WordPieceTokenizer> CreateAsync(Stream vocabStream, WordPieceOptions? options = null, CancellationToken cancellationToken = default);

    public string UnknownToken { get; }
    public int UnknownTokenId { get; }
    public string ContinuingSubwordPrefix { get; }
    public int MaxInputCharsPerWord { get; }
    public IReadOnlyDictionary<string, int>? SpecialTokens { get; }

    public string Decode(IEnumerable<int> ids, bool skipSpecialTokens);
    public OperationStatus Decode(
        IEnumerable<int> ids,
        Span<char> destination,
        bool skipSpecialTokens,
        out int idsConsumed,
        out int charsWritten);
}
public sealed partial class BertTokenizer : WordPieceTokenizer
{
    public static BertTokenizer Create(string vocabFilePath, BertOptions? options = null);
    public static BertTokenizer Create(Stream vocabStream, BertOptions? options = null);
    public static Task<BertTokenizer> CreateAsync(Stream vocabStream, BertOptions? options = null, CancellationToken cancellationToken = default);

    public bool LowerCaseBeforeTokenization { get; }
    public bool ApplyBasicTokenization { get; }
    public bool SplitOnSpecialTokens { get; }
    public string SeparatorToken { get; }
    public int SeparatorTokenId { get; }
    public string PaddingToken { get; }
    public int PaddingTokenId { get; }
    public string ClassificationToken { get; }
    public int ClassificationTokenId { get; }
    public string MaskingToken { get; }
    public int MaskingTokenId { get; }
    public bool IndividuallyTokenizeCjk { get; }
    public bool RemoveNonSpacingMarks { get; }

    public IReadOnlyList<int> EncodeToIds(
        string text,
        bool addSpecialTokens,
        bool considerPreTokenization = true,
        bool considerNormalization = true);

    public IReadOnlyList<int> EncodeToIds(
        ReadOnlySpan<char> text,
        bool addSpecialTokens,
        bool considerPreTokenization = true,
        bool considerNormalization = true);

    public IReadOnlyList<int> EncodeToIds(
        string text,
        int maxTokenCount,
        bool addSpecialTokens,
        out string? normalizedText,
        out int charsConsumed,
        bool considerPreTokenization = true,
        bool considerNormalization = true);

    public IReadOnlyList<int> EncodeToIds(
        ReadOnlySpan<char> text,
        int maxTokenCount,
        bool addSpecialTokens,
        out string? normalizedText,
        out int charsConsumed,
        bool considerPreTokenization = true,
        bool considerNormalization = true);

    public IReadOnlyList<int> BuildInputsWithSpecialTokens(
        IEnumerable<int> tokenIds,
        IEnumerable<int>? additionalTokenIds = null);

    public OperationStatus BuildInputsWithSpecialTokens(
        IEnumerable<int> tokenIds,
        Span<int> destination,
        out int valuesWritten,
        IEnumerable<int>? additionalTokenIds = null);

    public IReadOnlyList<int> GetSpecialTokensMask(
        IEnumerable<int> tokenIds,
        IEnumerable<int>? additionalTokenIds = null,
        bool alreadyHasSpecialTokens = false);

    public OperationStatus GetSpecialTokensMask(
        IEnumerable<int> tokenIds,
        Span<int> destination,
        out int valuesWritten,
        IEnumerable<int>? additionalTokenIds = null,
        bool alreadyHasSpecialTokens = false);

    public IReadOnlyList<int> CreateTokenTypeIdsFromSequences(
        IEnumerable<int> tokenIds,
        IEnumerable<int>? additionalTokenIds = null);

    public OperationStatus CreateTokenTypeIdsFromSequences(
        IEnumerable<int> tokenIds,
        Span<int> destination,
        out int valuesWritten,
        IEnumerable<int>? additionalTokenIds = null);
}
public partial class PreTokenizer
{
    public static PreTokenizer CreateWordOrPunctuation(IReadOnlyDictionary<string, int>? specialTokens = null);
    public static PreTokenizer CreateWordOrNonWord(IReadOnlyDictionary<string, int>? specialTokens = null);
    public static PreTokenizer CreateWhitespace(IReadOnlyDictionary<string, int>? specialTokens = null);
}
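A sketch of the options-based creation path this shape enables; the vocab file name, input string, and chosen option values are illustrative:

using System.Collections.Generic;
using Microsoft.ML.Tokenizers;

// Non-default settings travel in the options bag instead of a long parameter list.
var options = new BertOptions
{
    LowerCaseBeforeTokenization = false,
    IndividuallyTokenizeCjk = true,
    RemoveNonSpacingMarks = true
};

BertTokenizer bert = BertTokenizer.Create("vocab.txt", options);
IReadOnlyList<int> ids = bert.EncodeToIds("Héllo Wörld", addSpecialTokens: true);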