Base dictionary update - GitHub Issues

Unknown 태그 제거용 코드

# Filter the training corpus into '../data/train_clean.txt'.
# A sentence is kept only when it parses without raising and every inner
# (word, tag) pair passes dictionary.check. Kept sentences are written as
# tab-separated "word<TAB>morph" lines followed by one blank line.
train_data = WordMorphemePairs('../data/train.txt', num_sents=-1)
with open('../data/train_clean.txt', 'w', encoding='utf-8') as f:
    for words_text, morphs_text in train_data:
        try:
            # Parse the sentence into (surface form, first tag) pairs.
            parsed = text_to_words(words_text, morphs_text)
            pairs = [(item.word, item.tag0) for item in parsed]

            # The first and last pairs are excluded from the check
            # (presumably sentence-boundary markers -- TODO confirm).
            if any(not dictionary.check(surface, tag)
                   for surface, tag in pairs[1:-1]):
                continue

            # Both texts are split on a double space and paired positionally.
            for token, analysis in zip(words_text.split('  '),
                                       morphs_text.split('  ')):
                f.write('{}\t{}\n'.format(token, analysis))
            f.write('\n')
        except Exception:
            # Any error while handling a sentence skips that sentence.
            continue

lovit / lattice_based_tagger

Base dictionary update #8