NTMC-Community / MatchZoo

Facilitating the design, comparison and sharing of deep text matching models.

Using MatchZoo to process Chinese text raises TypeError: expected string or bytes-like object; unable to pinpoint where the problem occurs #814

Closed Apollo2Mars closed 4 years ago

Apollo2Mars commented 4 years ago

```
Processing text_left with chain_transform of Tokenize => Lowercase => PuncRemoval:   3%|▎ | 62441/2345889 [00:06<03:44, 10189.40it/s]

---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
<ipython-input> in <module>
     14 preprocessor = mz.preprocessors.BasicPreprocessor()
     15 # preprocessor = mz.preprocessors.DSSMPreprocessor()
---> 16 preprocessor.fit(train_raw, verbose=5)  ## init preprocessor inner state.
     17 train_processed = preprocessor.transform(train_raw, verbose=5)
     18 test_processed = preprocessor.transform(test_raw, verbose=5)

~/anaconda3/envs/tf_base/lib/python3.6/site-packages/matchzoo/preprocessors/basic_preprocessor.py in fit(self, data_pack, verbose)
     93         """
     94         data_pack = data_pack.apply_on_text(chain_transform(self._units),
---> 95                                             verbose=verbose)
     96         fitted_filter_unit = build_unit_from_data_pack(self._filter_unit,
     97                                                        data_pack,

~/anaconda3/envs/tf_base/lib/python3.6/site-packages/matchzoo/data_pack/data_pack.py in wrapper(self, inplace, *args, **kwargs)
    247             target = self.copy()
    248 
--> 249         func(target, *args, **kwargs)
    250 
    251         if not inplace:

~/anaconda3/envs/tf_base/lib/python3.6/site-packages/matchzoo/data_pack/data_pack.py in apply_on_text(self, func, mode, rename, verbose)
    379         """
    380         if mode == 'both':
--> 381             self._apply_on_text_both(func, rename, verbose=verbose)
    382         elif mode == 'left':
    383             self._apply_on_text_left(func, rename, verbose=verbose)

~/anaconda3/envs/tf_base/lib/python3.6/site-packages/matchzoo/data_pack/data_pack.py in _apply_on_text_both(self, func, rename, verbose)
    406     def _apply_on_text_both(self, func, rename, verbose=1):
    407         left_name, right_name = rename or ('text_left', 'text_right')
--> 408         self._apply_on_text_left(func, rename=left_name, verbose=verbose)
    409         self._apply_on_text_right(func, rename=right_name, verbose=verbose)
    410 

~/anaconda3/envs/tf_base/lib/python3.6/site-packages/matchzoo/data_pack/data_pack.py in _apply_on_text_left(self, func, rename, verbose)
    400         if verbose:
    401             tqdm.pandas(desc="Processing " + name + " with " + func.__name__)
--> 402             self._left[name] = self._left['text_left'].progress_apply(func)
    403         else:
    404             self._left[name] = self._left['text_left'].apply(func)

~/anaconda3/envs/tf_base/lib/python3.6/site-packages/tqdm/std.py in inner(df, func, *args, **kwargs)
    751                 # Apply the provided function (in **kwargs)
    752                 # on the df using our wrapper (which provides bar updating)
--> 753                 result = getattr(df, df_function)(wrapper, **kwargs)
    754 
    755                 # Close bar and return pandas calculation result

~/anaconda3/envs/tf_base/lib/python3.6/site-packages/pandas/core/series.py in apply(self, func, convert_dtype, args, **kwds)
   4043             else:
   4044                 values = self.astype(object).values
-> 4045                 mapped = lib.map_infer(values, f, convert=convert_dtype)
   4046 
   4047             if len(mapped) and isinstance(mapped[0], Series):

pandas/_libs/lib.pyx in pandas._libs.lib.map_infer()

~/anaconda3/envs/tf_base/lib/python3.6/site-packages/tqdm/std.py in wrapper(*args, **kwargs)
    747                 # take a fast or slow code path; so stop when t.total==t.n
    748                 t.update(n=1 if not t.total or t.n < t.total else 0)
--> 749                 return func(*args, **kwargs)
    750 
    751             # Apply the provided function (in **kwargs)

~/anaconda3/envs/tf_base/lib/python3.6/site-packages/matchzoo/preprocessors/chain_transform.py in wrapper(arg)
     17         """Wrapper function of transformations composition."""
     18         for unit in units:
---> 19             arg = unit.transform(arg)
     20         return arg
     21 

~/anaconda3/envs/tf_base/lib/python3.6/site-packages/matchzoo/preprocessors/units/tokenize.py in transform(self, input_)
     15         :return tokens: tokenized tokens as a list.
     16         """
---> 17         return nltk.word_tokenize(input_)

~/anaconda3/envs/tf_base/lib/python3.6/site-packages/nltk/tokenize/__init__.py in word_tokenize(text, language, preserve_line)
    142     :type preserve_line: bool
    143     """
--> 144     sentences = [text] if preserve_line else sent_tokenize(text, language)
    145     return [
    146         token for sent in sentences for token in _treebank_word_tokenizer.tokenize(sent)

~/anaconda3/envs/tf_base/lib/python3.6/site-packages/nltk/tokenize/__init__.py in sent_tokenize(text, language)
    104     """
    105     tokenizer = load('tokenizers/punkt/{0}.pickle'.format(language))
--> 106     return tokenizer.tokenize(text)
    107 
    108 

~/anaconda3/envs/tf_base/lib/python3.6/site-packages/nltk/tokenize/punkt.py in tokenize(self, text, realign_boundaries)
   1275         Given a text, returns a list of the sentences in that text.
   1276         """
-> 1277         return list(self.sentences_from_text(text, realign_boundaries))
   1278 
   1279     def debug_decisions(self, text):

~/anaconda3/envs/tf_base/lib/python3.6/site-packages/nltk/tokenize/punkt.py in sentences_from_text(self, text, realign_boundaries)
   1329         follows the period.
   1330         """
-> 1331         return [text[s:e] for s, e in self.span_tokenize(text, realign_boundaries)]
   1332 
   1333     def _slices_from_text(self, text):

~/anaconda3/envs/tf_base/lib/python3.6/site-packages/nltk/tokenize/punkt.py in <listcomp>(.0)
   1329         follows the period.
   1330         """
-> 1331         return [text[s:e] for s, e in self.span_tokenize(text, realign_boundaries)]
   1332 
   1333     def _slices_from_text(self, text):

~/anaconda3/envs/tf_base/lib/python3.6/site-packages/nltk/tokenize/punkt.py in span_tokenize(self, text, realign_boundaries)
   1319         if realign_boundaries:
   1320             slices = self._realign_boundaries(text, slices)
-> 1321         for sl in slices:
   1322             yield (sl.start, sl.stop)
   1323 

~/anaconda3/envs/tf_base/lib/python3.6/site-packages/nltk/tokenize/punkt.py in _realign_boundaries(self, text, slices)
   1360         """
   1361         realign = 0
-> 1362         for sl1, sl2 in _pair_iter(slices):
   1363             sl1 = slice(sl1.start + realign, sl1.stop)
   1364             if not sl2:

~/anaconda3/envs/tf_base/lib/python3.6/site-packages/nltk/tokenize/punkt.py in _pair_iter(it)
    316     it = iter(it)
    317     try:
--> 318         prev = next(it)
    319     except StopIteration:
    320         return

~/anaconda3/envs/tf_base/lib/python3.6/site-packages/nltk/tokenize/punkt.py in _slices_from_text(self, text)
   1333     def _slices_from_text(self, text):
   1334         last_break = 0
-> 1335         for match in self._lang_vars.period_context_re().finditer(text):
   1336             context = match.group() + match.group('after_tok')
   1337             if self.text_contains_sentbreak(context):

TypeError: expected string or bytes-like object
```
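For reference, the failure is reproducible outside MatchZoo: NLTK's tokenizer is regex-based and only accepts `str` (or `bytes`) input. A minimal sketch, assuming NLTK and its punkt model are installed:

```python
import nltk

nltk.download('punkt')  # sentence tokenizer model used by word_tokenize

print(nltk.word_tokenize("hello world"))  # ['hello', 'world']

# Any non-string cell in text_left/text_right reproduces the error,
# e.g. a NaN read from an empty row (a float), or a pre-segmented
# token list:
nltk.word_tokenize(float("nan"))
# TypeError: expected string or bytes-like object
```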

matthew-z commented 4 years ago

Apparently, it is not a bug.

The text_left and text_right fields should be strings. If you have already segmented them with jieba, then the text fields have become lists, which the tokenizer cannot handle.
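One workaround, if the columns were pre-segmented with jieba, is to join the token lists back into whitespace-separated strings before building the DataPack. A minimal sketch (the DataFrame and its contents are illustrative, not from the original report):

```python
import jieba
import pandas as pd

df = pd.DataFrame({
    'text_left': ['今天天气不错'],
    'text_right': ['今天是晴天'],
})

# Segment with jieba but keep each cell a single string, so that
# MatchZoo's Tokenize unit still receives str input:
for col in ('text_left', 'text_right'):
    df[col] = df[col].apply(lambda s: ' '.join(jieba.cut(s)))
```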

Apollo2Mars commented 4 years ago

The problem turned out to be that my processed data contained empty rows; after removing the empty rows, everything works correctly.
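For readers hitting the same error: pandas reads empty cells in as NaN (a float), which is exactly what trips word_tokenize. A minimal sketch of a guard (the column names follow MatchZoo's convention; `df` is whatever DataFrame feeds your DataPack):

```python
import pandas as pd

def drop_blank_rows(df: pd.DataFrame) -> pd.DataFrame:
    """Remove rows whose text fields are NaN or whitespace-only."""
    df = df.dropna(subset=['text_left', 'text_right'])
    mask = (df['text_left'].str.strip().astype(bool)
            & df['text_right'].str.strip().astype(bool))
    return df[mask]
```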