Open hackhaye opened 3 months ago
请问你是怎么解决“def build_vocabulary(spacy_de, spacy_en): def tokenize_de(text): return tokenize(text, spacy_de)
def tokenize_en(text): return tokenize(text, spacy_en) print("Building German Vocabulary ...")
train = datasets.Multi30k(root='.data', split='train', language_pair=('de', 'en')) val = datasets.Multi30k(root='.data', split='valid', language_pair=('de', 'en')) test = datasets.Multi30k(root='.data', split='test', language_pair=('de', 'en')) vocab_src = build_vocab_from_iterator( yield_tokens(train + val + test, tokenize_de, index=0), min_freq=2, specials=["<s>", "</s>", "<blank>", "<unk>"], ) print("Building English Vocabulary ...")
train = datasets.Multi30k(root='.data', split='train', language_pair=('de', 'en')) val = datasets.Multi30k(root='.data', split='valid', language_pair=('de', 'en')) test = datasets.Multi30k(root='.data', split='test', language_pair=('de', 'en')) vocab_tgt = build_vocab_from_iterator( yield_tokens(train + val + test, tokenize_en, index=1), min_freq=2, specials=["<s>", "</s>", "<blank>", "<unk>"], ) vocab_src.set_default_index(vocab_src["<unk>"]) vocab_tgt.set_default_index(vocab_tgt["<unk>"]) return vocab_src, vocab_tgt
def load_vocab(spacy_de, spacy_en): if not exists("vocab.pt"): vocab_src, vocab_tgt = build_vocabulary(spacy_de, spacy_en) torch.save((vocab_src, vocab_tgt), "vocab.pt") else: vocab_src, vocab_tgt = torch.load("vocab.pt") print("Finished.\nVocabulary sizes:") print(len(vocab_src)) print(len(vocab_tgt)) return vocab_src, vocab_tgt
if is_interactive_notebook():
spacy_de, spacy_en = show_example(load_tokenizers) vocab_src, vocab_tgt = show_example(load_vocab, args=[spacy_de, spacy_en])” 引入的UnicodeDecodeError: 'utf-8' codec can't decode byte 0x80 in position 37: invalid start byte问题的?
哎，非常抱歉，我也卡在这一步😨。如果 dalao 能解决的话，希望可以提个 PR。
请问你是怎么解决“def build_vocabulary(spacy_de, spacy_en): def tokenize_de(text): return tokenize(text, spacy_de)
train, val, test = datasets.Multi30k(language_pair=("de", "en"))
train, val, test = datasets.Multi30k(language_pair=("de", "en"))
def load_vocab(spacy_de, spacy_en): if not exists("vocab.pt"): vocab_src, vocab_tgt = build_vocabulary(spacy_de, spacy_en) torch.save((vocab_src, vocab_tgt), "vocab.pt") else: vocab_src, vocab_tgt = torch.load("vocab.pt") print("Finished.\nVocabulary sizes:") print(len(vocab_src)) print(len(vocab_tgt)) return vocab_src, vocab_tgt
if is_interactive_notebook():
global variables used later in the script