Open lixiangnlp opened 9 years ago
照 jieba 的做法实现一个:
import re
# Raw strings keep the regex escapes (\s, \.) away from Python's own
# string-escape processing, which warns about unknown escapes such as "\s".

# A CRLF pair or any single whitespace character.
re_skip_internal = re.compile(r"(\r\n|\s)")
# A run of ASCII digits and/or dots, e.g. "3.14".
re_num = re.compile(r"[\.0-9]+")
# A run of ASCII letters and/or digits, e.g. "abc123".
re_eng = re.compile(r"[a-zA-Z0-9]+")
class POSSimpleTagger:
    """Dictionary-lookup part-of-speech tagger for already-segmented words.

    The tagger loads a word -> tag table from a dictionary file and tags
    each incoming word independently; no segmentation is performed.
    """

    def __init__(self, dictfile):
        """Load the word -> tag table from *dictfile*.

        The file is read as UTF-8. Every non-blank line must consist of
        exactly three fields separated by single spaces: the word, a middle
        field that is ignored (presumably a frequency, as in jieba's
        dict.txt -- verify against your dictionary), and the tag.

        Args:
            dictfile: Path of the dictionary file.

        Raises:
            ValueError: If a line is not valid UTF-8 or does not split into
                exactly three space-separated fields.
        """
        self.word_tag_tab = {}
        with open(dictfile, "rb") as f:
            for lineno, line in enumerate(f, 1):
                try:
                    line = line.strip().decode("utf-8")
                    if not line:
                        continue
                    word, _, tag = line.split(" ")
                except ValueError:
                    # UnicodeDecodeError is a ValueError subclass, so this
                    # covers both bad encoding and a wrong field count.
                    # If decoding failed, `line` is still the raw bytes.
                    raise ValueError(
                        'invalid POS dictionary entry in %s at Line %s: %s'
                        % (dictfile, lineno, line))
                self.word_tag_tab[word] = tag

    def tagpos(self, words, HMM=True):
        """Yield a ``(word, tag)`` pair for every word in *words*.

        Rules are tried in order; each regex is anchored at the start of
        the word only, so a matching prefix is enough:

        1. starts with whitespace            -> ``'x'``
        2. starts with a digit or a dot      -> ``'m'``
        3. starts with an ASCII letter       -> ``'eng'``
        4. otherwise the dictionary tag, or ``'x'`` for unknown words.

        Args:
            words: Iterable of already-segmented words (text strings).
            HMM: Accepted for call compatibility; not used by this tagger.

        Yields:
            Tuples of ``(word, tag)`` in input order.
        """
        for w in words:
            if re_skip_internal.match(w):
                yield (w, 'x')
            elif re_num.match(w):
                yield (w, 'm')
            elif re_eng.match(w):
                yield (w, 'eng')
            else:
                yield (w, self.word_tag_tab.get(w, 'x'))
注意:传入的词必须是文本字符串,即 Python 2 中用 unicode,Python 3 中用 str。
能否对已经分词的文本单独进行词性标注呢?