Open lovit opened 5 years ago
Unknown 태그 제거용 코드
# Filter the training corpus: copy only the sentences whose every
# (word, tag) pair passes ``dictionary.check`` into a cleaned file.
# Sentences that fail to parse or fail the check are skipped.
train_data = WordMorphemePairs('../data/train.txt', num_sents=-1)

with open('../data/train_clean.txt', 'w', encoding='utf-8') as f:
    for words_text, morphs_text in train_data:
        # Parsing and validation are best-effort: any failure here simply
        # drops the sentence. The exception types raised by the project
        # helpers are not visible from this script, hence the broad catch.
        try:
            words = text_to_words(words_text, morphs_text)
            wordtags = [(word.word, word.tag0) for word in words]
            # The first and last entries are excluded from the check
            # (presumably sentence boundary markers -- TODO confirm).
            # all() stops at the first failing pair, like the original break.
            has_error = not all(
                dictionary.check(word, tag) for word, tag in wordtags[1:-1]
            )
        except Exception:
            continue

        if has_error:
            continue

        # Writing happens outside the try block on purpose: an I/O error
        # must surface instead of being silently treated as a bad sentence.
        surface_forms = words_text.split(' ')
        morphemes = morphs_text.split(' ')
        # One "word<TAB>morph" line per pair, then a blank line as the
        # sentence separator. Emitted with a single write so a sentence is
        # never left half-written.
        sentence_block = ''.join(
            '{}\t{}\n'.format(surface, morpheme)
            for surface, morpheme in zip(surface_forms, morphemes)
        )
        f.write(sentence_block + '\n')