mcxiaoxiao / annotated-transformer-Chinese

Transformer论文Attention is All You Need的代码中文注释实现,翻译自harvardnlp/annotated-transformer
https://nlp.seas.harvard.edu/
MIT License
16 stars 2 forks source link

UnicodeDecodeError: 'utf-8' codec can't decode byte 0x80 in position 37: invalid start byte #1

Open hackhaye opened 3 months ago

hackhaye commented 3 months ago

请问你是怎么解决下面这段代码
“def build_vocabulary(spacy_de, spacy_en):
    def tokenize_de(text):
        return tokenize(text, spacy_de)

def tokenize_en(text):
    return tokenize(text, spacy_en)

print("Building German Vocabulary ...")

train, val, test = datasets.Multi30k(language_pair=("de", "en"))

train = datasets.Multi30k(root='.data', split='train', language_pair=('de', 'en'))
val = datasets.Multi30k(root='.data', split='valid', language_pair=('de', 'en'))
test = datasets.Multi30k(root='.data', split='test', language_pair=('de', 'en'))
vocab_src = build_vocab_from_iterator(
    yield_tokens(train + val + test, tokenize_de, index=0),
    min_freq=2,
    specials=["<s>", "</s>", "<blank>", "<unk>"],
)

print("Building English Vocabulary ...")

train, val, test = datasets.Multi30k(language_pair=("de", "en"))

train = datasets.Multi30k(root='.data', split='train', language_pair=('de', 'en'))
val = datasets.Multi30k(root='.data', split='valid', language_pair=('de', 'en'))
test = datasets.Multi30k(root='.data', split='test', language_pair=('de', 'en'))
vocab_tgt = build_vocab_from_iterator(
    yield_tokens(train + val + test, tokenize_en, index=1),
    min_freq=2,
    specials=["<s>", "</s>", "<blank>", "<unk>"],
)

vocab_src.set_default_index(vocab_src["<unk>"])
vocab_tgt.set_default_index(vocab_tgt["<unk>"])

return vocab_src, vocab_tgt

def load_vocab(spacy_de, spacy_en):
    """Return (vocab_src, vocab_tgt), building and caching them on first use.

    On the first run the vocabularies are built from the Multi30k corpus via
    build_vocabulary() and saved to "vocab.pt"; on later runs they are loaded
    from that file, skipping the expensive rebuild.

    Args:
        spacy_de: loaded spaCy German pipeline, passed through to the builder.
        spacy_en: loaded spaCy English pipeline, passed through to the builder.

    Returns:
        Tuple of (source vocabulary, target vocabulary).
    """
    if not exists("vocab.pt"):
        vocab_src, vocab_tgt = build_vocabulary(spacy_de, spacy_en)
        torch.save((vocab_src, vocab_tgt), "vocab.pt")
    else:
        vocab_src, vocab_tgt = torch.load("vocab.pt")
    print("Finished.\nVocabulary sizes:")
    print(len(vocab_src))
    print(len(vocab_tgt))
    return vocab_src, vocab_tgt

if is_interactive_notebook():
    # Global variables used later in the script (the paste dropped the
    # leading "#" on this comment and the indentation under the if).
    spacy_de, spacy_en = show_example(load_tokenizers)
    vocab_src, vocab_tgt = show_example(load_vocab, args=[spacy_de, spacy_en])”
时引入的 UnicodeDecodeError: 'utf-8' codec can't decode byte 0x80 in position 37: invalid start byte 问题的?
mcxiaoxiao commented 3 months ago

哎非常抱歉 我也卡在这一步😨。如果dalao能解决的话希望可以提PR