maxpmaxp / pdfreader

Python API for PDF documents
MIT License
113 stars 26 forks source link

Cmap/Decoder issue #29

Closed maxpmaxp closed 4 years ago

maxpmaxp commented 4 years ago
TRACEBACK: Traceback (most recent call last):
 File "/usr/local/scrapers/PostaliSpider/.venv/src/pdfreader/pdfreader/codecs/decoder.py", line 125, in decode_hexstring
   ch = self.cmap.bf_ranges[code]
 File "/usr/local/scrapers/PostaliSpider/.venv/src/pdfreader/pdfreader/types/cmap.py", line 323, in __getitem__
   raise KeyError(item)
KeyError: '4578'
 File "/usr/local/scrapers/PostaliSpider/.venv/src/pdfreader/pdfreader/viewer/pdfviewer.py", line 82, in render
   self.notify(obj)
 File "/usr/local/scrapers/PostaliSpider/.venv/src/pdfreader/pdfreader/viewer/pdfviewer.py", line 73, in notify
   handler(obj)
 File "/usr/local/scrapers/PostaliSpider/.venv/src/pdfreader/pdfreader/viewer/simple.py", line 161, in _decode_prop_contents
   props['Contents'] = self.decode_string(contents)
 File "/usr/local/scrapers/PostaliSpider/.venv/src/pdfreader/pdfreader/viewer/simple.py", line 100, in decode_string
   s = self.decoder.decode_string(s)
 File "/usr/local/scrapers/PostaliSpider/.venv/src/pdfreader/pdfreader/codecs/decoder.py", line 147, in decode_string
   return self.decode_hexstring(s_hex)
 File "/usr/local/scrapers/PostaliSpider/.venv/src/pdfreader/pdfreader/codecs/decoder.py", line 134, in decode_hexstring
   ch = self._encoding_decoder.decode_hexstring(HexString(code[:2]))
 File "/usr/local/scrapers/PostaliSpider/.venv/src/pdfreader/pdfreader/codecs/decoder.py", line 153, in decode_hexstring
   return self.decode_string(s.to_bytes())
 File "/usr/local/scrapers/PostaliSpider/.venv/src/pdfreader/pdfreader/codecs/decoder.py", line 171, in decode_string
   raise TypeError("Unexpected type. Probably a bug: {} type of {}".format(self.encoding, type(self.encoding)))
TypeError: Unexpected type. Probably a bug: {'Differences': [0, 'EX000000', 54, 'EX150000', 64, 'EX032000', 75, 'EX046000', 77, 'EX040000', 'EX043000', 80, 'EX038000', 91, 'EX036000', 93, 'EX041000', 96, 'EX045000', 'EX047000', 107, 'EX044000', 'EX037000', 122, 'EX058000', 126, 'EX061000', 129, 'EX097000', 'EX098000', 'EX099000', 'EX100000', 'EX101000', 'EX102000', 'EX103000', 'EX104000', 'EX105000', 146, 'EX107000', 'EX108000', 'EX109000', 'EX110000', 'EX111000', 'EX112000', 153, 'EX114000', 162, 'EX115000', 'EX116000', 'EX117000', 'EX118000', 'EX119000', 'EX120000', 'EX121000', 175, 'EX174000', 193, 'EX065000', 'EX066000', 'EX067000', 'EX068000', 'EX069000', 'EX070000', 'EX071000', 'EX072000', 'EX073000', 210, 'EX075000', 'EX076000', 'EX077000', 'EX078000', 'EX079000', 'EX080000', 217, 'EX082000', 226, 'EX083000', 'EX084000', 'EX085000', 'EX086000', 'EX087000', 'EX088000', 'EX089000', 240, 'EX048000', 'EX049000', 'EX050000', 'EX051000', 'EX052000', 'EX053000', 'EX054000', 'EX055000', 'EX056000', 'EX057000']} type of <class 'pdfreader.types.objects.DictBasedObject'>

response-0-7.pdf

maxpmaxp commented 4 years ago

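The cause is that the Encoding object is given without a /Type key, so the decoder receives it as a generic DictBasedObject (the type reported in the TypeError) instead of a recognized encoding: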

{'Differences': [0, 'EX000000', 54, 'EX150000', 64, 'EX032000', 75, 'EX046000', 77, 'EX040000', 
'EX043000', 80, 'EX038000', 91, 'EX036000', 93, 'EX041000', 96, 'EX045000', 'EX047000', 107, 
'EX044000', 'EX037000', 122, 'EX058000', 126, 'EX061000', 129, 'EX097000', 'EX098000', 
'EX099000', 'EX100000', 'EX101000', 'EX102000', 'EX103000', 'EX104000', 'EX105000', 146, 
'EX107000', 'EX108000', 'EX109000', 'EX110000', 'EX111000', 'EX112000', 153, 'EX114000', 162, 
'EX115000', 'EX116000', 'EX117000', 'EX118000', 'EX119000', 'EX120000', 'EX121000', 175, 
'EX174000', 193, 'EX065000', 'EX066000', 'EX067000', 'EX068000', 'EX069000', 'EX070000', 
'EX071000', 'EX072000', 'EX073000', 210, 'EX075000', 'EX076000', 'EX077000', 'EX078000', 
'EX079000', 'EX080000', 217, 'EX082000', 226, 'EX083000', 'EX084000', 'EX085000', 'EX086000', 
'EX087000', 'EX088000', 'EX089000', 240, 'EX048000', 'EX049000', 'EX050000', 'EX051000', 
'EX052000', 'EX053000', 'EX054000', 'EX055000', 'EX056000', 'EX057000']}