Closed amcgregor closed 6 years ago
Simple test case:
# encoding: cinje

: def tmpl
	A sample of Unicode™ text.
Without the ™ symbol, all is well. With it, a substantial traceback is produced:
/Users/amcgregor/Projects/cegid/cinje/cinje/encoding.pyc in cinje_decode(input, errors, final)
     18 def cinje_decode(input, errors='strict', final=True):
     19     if not final: return '', 0
---> 20     output = transform(bytes(input).decode('utf8', errors))
     21     return output, len(input)
     22

/Users/amcgregor/Projects/cegid/cinje/cinje/encoding.pyc in transform(input)
     12 def transform(input):
     13     #__import__('pudb').set_trace()
---> 14     translator = Context(input)
     15     return '\n'.join(str(i) for i in translator.stream)
     16

/Users/amcgregor/Projects/cegid/cinje/cinje/util.pyc in __init__(self, input)
    449
    450     def __init__(self, input):
--> 451         self.input = Lines(input.decode('utf8') if isinstance(input, bytes) else input)
    452         self.scope = 0
    453         self.flag = set()

/Users/amcgregor/Projects/cegid/cinje/cinje/util.pyc in __init__(self, input, Line)
    397
    398         else:
--> 399             self.source = list(self.Line(i + 1, j) for i, j in enumerate(input.split('\n')))
    400         self.buffer = deque(self.source)
    401

/Users/amcgregor/Projects/cegid/cinje/cinje/util.pyc in <genexpr>((i, j))
    397
    398         else:
--> 399             self.source = list(self.Line(i + 1, j) for i, j in enumerate(input.split('\n')))
    400         self.buffer = deque(self.source)
    401

/Users/amcgregor/Projects/cegid/cinje/cinje/util.pyc in __init__(self, number, line, scope, kind)
    318     def __init__(self, number, line, scope=None, kind=None):
    319         if isinstance(line, str):
--> 320             line = line.decode('utf-8')
    321
    322         self.number = number

/Users/amcgregor/Projects/cegid/rita/.venv/lib/python2.7/encodings/utf_8.pyc in decode(input, errors)
     14
     15 def decode(input, errors='strict'):
---> 16     return codecs.utf_8_decode(input, errors, True)
     17
     18 class IncrementalEncoder(codecs.IncrementalEncoder):

UnicodeEncodeError: 'ascii' codec can't encode character u'\u2122' in position 19: ordinal not in range(128)
Looks like a double decoding, at util.py:451 and again at util.py:320 (the `isinstance` check is incorrect if the normalized `str` has been imported from `compat`).
isinstance
str
compat
Simple test case:
Without the ™ symbol, all is well. With it, a substantial traceback is produced:
Looks like a double decoding: util.py:451, util.py:320 (incorrect `isinstance` check if normalized `str` has been imported from `compat`).