NIHOPA / NLPre

Python library for Natural Language Preprocessing (NLPre)
190 stars 34 forks source link

Error in separated parenthesis #84

Closed thoppe closed 7 years ago

thoppe commented 7 years ago

This is a large traceback, but it's unfortunately all we get when running in parallel. It looks like the input to the function is truncated a bit too, so it's hard to tell what's going on.

/usr/local/lib/python2.7/dist-packages/joblib/parallel.py in __call__(self=<joblib.parallel.BatchedCalls object>)
     67     def __init__(self, iterator_slice):
     68         self.items = list(iterator_slice)
     69         self._size = len(self.items)
     70 
     71     def __call__(self):
---> 72         return [func(*args, **kwargs) for func, args, kwargs in self.items]
        func = <function dispatcher>
        args = ({'_filename': 'data_import/RPG_2012.csv', '_ref': '50223', 'text': 'Dynamic Dopamine Images: A New View of the Neuro...vement will be greater in men than women. [[[ ]]]'},)
        kwargs = {'target_column': 'text'}
        self.items = [(<function dispatcher>, ({'_filename': 'data_import/RPG_2012.csv', '_ref': '50223', 'text': 'Dynamic Dopamine Images: A New View of the Neuro...vement will be greater in men than women. [[[ ]]]'},), {'target_column': 'text'})]
     73 
     74     def __len__(self):
     75         return self._size
     76 

...........................................................................
/home/hoppeta/test/word2vec_pipeline/word2vec_pipeline/parse.py in dispatcher(row={'_filename': 'data_import/RPG_2012.csv', '_ref': '50223', 'text': 'Dynamic Dopamine Images: A New View of the Neuro...vement will be greater in men than women. [[[ ]]]'}, target_column='text')
     16 
     17 def dispatcher(row, target_column):
     18     text = row[target_column] if target_column in row else None
     19 
     20     for f in parser_functions:
---> 21         text = unicode(f(text))
        text = u'Dynamic Dopamine Images : A New View of the Ne...will be greater in men than women .\n[ [ [ ] ] ]'
        f = <nlpre.separated_parenthesis.separated_parenthesis object>
     22 
     23     row[target_column] = text
     24     return row
     25 

...........................................................................
/usr/local/lib/python2.7/dist-packages/nlpre/separated_parenthesis.py in __call__(self=<nlpre.separated_parenthesis.separated_parenthesis object>, text=[u'Based on findings with other stimuli , we hypo... involvement will be greater in men than women .', u'a .', u'b .'])
     81 
     82                 text = ' '.join(tokens)
     83                 doc_out.append(text)
     84             else:
     85 
---> 86                 text = self.paren_pop(tokens)
        text = [u'Based on findings with other stimuli , we hypo... involvement will be greater in men than women .', u'a .', u'b .']
        self.paren_pop = <bound method separated_parenthesis.paren_pop of...arated_parenthesis.separated_parenthesis object>>
        tokens = ([([([([], {})], {})], {})], {})
     87                 doc_out.extend(text)
     88 
     89         return '\n'.join(doc_out)
     90 

...........................................................................
/usr/local/lib/python2.7/dist-packages/nlpre/separated_parenthesis.py in paren_pop(self=<nlpre.separated_parenthesis.separated_parenthesis object>, parsed_tokens=[[[[]]]])
     99         # must convert the ParseResult to a list, otherwise adding it to a list
    100         # causes weird results.
    101         if isinstance(parsed_tokens, pypar.ParseResults):
    102             parsed_tokens = parsed_tokens.asList()
    103 
--> 104         content = self.paren_pop_helper(parsed_tokens)
        content = undefined
        self.paren_pop_helper = <bound method separated_parenthesis.paren_pop_he...arated_parenthesis.separated_parenthesis object>>
        parsed_tokens = [[[[]]]]
    105         return content
    106 
    107     def paren_pop_helper(self, tokens):
    108         '''

...........................................................................
/usr/local/lib/python2.7/dist-packages/nlpre/separated_parenthesis.py in paren_pop_helper(self=<nlpre.separated_parenthesis.separated_parenthesis object>, tokens=[[[]]])
    134             reorged_tokens = []
    135 
    136             # Iterate through all parenthetical content, recursing on them
    137             # This allows content in nested parenthesis to be captured
    138             for tokes in token_parens:
--> 139                 sents = self.paren_pop_helper(tokes)
        sents = undefined
        self.paren_pop_helper = <bound method separated_parenthesis.paren_pop_he...arated_parenthesis.separated_parenthesis object>>
        tokes = [[]]
    140                 self.logger.info('Expanded parenthetical content: %s' % sents)
    141                 reorged_tokens.extend(sents)
    142 
    143             # Bundles outer sentence with inner parenthetical content

...........................................................................
/usr/local/lib/python2.7/dist-packages/nlpre/separated_parenthesis.py in paren_pop_helper(self=<nlpre.separated_parenthesis.separated_parenthesis object>, tokens=[])
    124         new_tokens = []
    125         token_words = [x for x in tokens if isinstance(x, six.string_types)]
    126 
    127         # If tokens don't include parenthetical content, return as string
    128         if len(token_words) == len(tokens):
--> 129             if token_words[-1] not in ['.', '!', '?']:
        token_words = []
    130                 token_words.append('.')
    131             return [' '.join(token_words)]
    132         else:
    133             token_parens = [x for x in tokens if isinstance(x, list)]

IndexError: list index out of range
___________________________________________________________________________

Fatal error: local() encountered an error (return code 1) while executing 'python word2vec_pipeline parse'
thoppe commented 7 years ago

Trying to narrow it down, a much smaller single-threaded traceback is:

  File "/usr/local/lib/python2.7/dist-packages/nlpre/separated_parenthesis.py", line 92, in __call__
    text = self.paren_pop(tokens)
  File "/usr/local/lib/python2.7/dist-packages/nlpre/separated_parenthesis.py", line 110, in paren_pop
    content = self.paren_pop_helper(parsed_tokens)
  File "/usr/local/lib/python2.7/dist-packages/nlpre/separated_parenthesis.py", line 146, in paren_pop_helper
    sents = self.paren_pop_helper(tokens)
  File "/usr/local/lib/python2.7/dist-packages/nlpre/separated_parenthesis.py", line 136, in paren_pop_helper
    if token_words[-1] not in ['.', '!', '?']:
IndexError: list index out of range
thoppe commented 7 years ago

Found it. Here is an MWE (minimal working example) of the error. It's pathological, but it came up in real-world data (and absolutely shouldn't crash the program!)

import nlpre
doc = '''[[[ ]]]'''
nlpre.separated_parenthesis()(doc)