huggingface / tokenizers

💥 Fast State-of-the-Art Tokenizers optimized for Research and Production
https://huggingface.co/docs/tokenizers
Apache License 2.0

bert vocab #761

Closed hongjianyuan closed 2 years ago

hongjianyuan commented 3 years ago

We used 500 txt files (containing both Chinese and English), trained with WordPieceTrainer, and set vocab_size to 30522, but the saved JSON vocabulary has 32430 entries.

```python
import os

from tokenizers import Tokenizer, normalizers
from tokenizers.models import WordPiece
from tokenizers.normalizers import NFD, Lowercase, StripAccents
from tokenizers.pre_tokenizers import Whitespace
from tokenizers.processors import TemplateProcessing
from tokenizers.trainers import WordPieceTrainer

bert_tokenizer = Tokenizer(WordPiece(unk_token="[UNK]", max_input_chars_per_word=3))
bert_tokenizer.normalizer = normalizers.Sequence([NFD(), Lowercase(), StripAccents()])
bert_tokenizer.pre_tokenizer = Whitespace()

# Collect the training files under ./data/ and keep the first 500
data_path = "./data/"
files_all = []
for root, dirs, filenames in os.walk(data_path):
    for filename in filenames:
        files_all.append(os.path.join(root, filename))

print(len(files_all))
files_split = files_all[:500]

bert_tokenizer.post_processor = TemplateProcessing(
    single="[CLS] $A [SEP]",
    pair="[CLS] $A [SEP] $B:1 [SEP]:1",
    special_tokens=[("[CLS]", 1), ("[SEP]", 2)],
)

trainer = WordPieceTrainer(
    vocab_size=30522,
    special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"],
)
bert_tokenizer.train(files_split, trainer)

bert_tokenizer.save("bert-wiki_zh_raw_500.json")
```

Narsil commented 2 years ago

The trainer is still keeping every single character at that point.

We currently don't support ignoring some base characters in tokenizers (which would be useful for Mandarin), so the minimum vocab size will always be the number of distinct characters in your dataset.

This is known. PRs are welcome if you want to tackle this.
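For example, a quick way to see why the 30522 target gets exceeded is to count the distinct characters that survive the normalizer and pre-tokenizer. This is only a sketch; it reuses the `files_split` list from the snippet above and mirrors its normalization settings:

```python
from collections import Counter

from tokenizers import normalizers
from tokenizers.normalizers import NFD, Lowercase, StripAccents
from tokenizers.pre_tokenizers import Whitespace

# Same normalization / pre-tokenization as in the training setup above
normalizer = normalizers.Sequence([NFD(), Lowercase(), StripAccents()])
pre_tokenizer = Whitespace()

chars = Counter()
for path in files_split:  # `files_split` comes from the training snippet above
    with open(path, encoding="utf-8") as f:
        for line in f:
            normalized = normalizer.normalize_str(line)
            for word, _offsets in pre_tokenizer.pre_tokenize_str(normalized):
                chars.update(word)

# Every distinct character gets its own vocab entry (plus the special tokens and
# WordPiece continuation forms), so the saved vocab cannot be smaller than this.
print(len(chars))
```

With a mixed Chinese/English corpus, this count alone can exceed the requested `vocab_size`, which is why the saved JSON ends up with more entries than 30522.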

As a workaround, could you use ByteLevel (pre_tokenizer + decoder) to get a byte-level tokenizer? It should probably work closer to what you want.
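A minimal sketch of that byte-level setup, assuming a BPE model with BpeTrainer (the usual pairing for ByteLevel) and reusing the `files_split` list from the snippet above; the file name is only illustrative:

```python
from tokenizers import Tokenizer, decoders, pre_tokenizers
from tokenizers.models import BPE
from tokenizers.trainers import BpeTrainer

byte_tokenizer = Tokenizer(BPE(unk_token="[UNK]"))
# ByteLevel maps every input byte to one of 256 printable symbols, so the base
# alphabet stays at 256 regardless of how many distinct Chinese characters appear.
byte_tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel(add_prefix_space=False)
byte_tokenizer.decoder = decoders.ByteLevel()

trainer = BpeTrainer(
    vocab_size=30522,
    special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"],
    initial_alphabet=pre_tokenizers.ByteLevel.alphabet(),
)
byte_tokenizer.train(files_split, trainer)  # `files_split` from the snippet above
byte_tokenizer.save("bert-wiki_zh_raw_500_bytelevel.json")
```

Because the byte-level alphabet is fixed at 256 symbols, the trainer can actually respect the requested vocab_size instead of being forced above it by the character inventory of the corpus.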

Cheers, Nicolas