Open russkel opened 4 years ago
So I attempted to do what I talked about. I noticed that the FlateCodec class doesn't exist in the version on PyPI, so I installed from this git repo instead, and now I get an unknown xref error:
PdfReadError: Unknown xref type: 83
So I needed to use strict=False, but now I get this error when calling getFields():
PdfReadWarning: Object 131072 0 not defined. [pdf.py:2229]
---------------------------------------------------------------------------
PdfReadError Traceback (most recent call last)
<ipython-input-12-cd312879d9b2> in <module>
----> 1 pdf.getFields()
~/.local/lib/python3.8/site-packages/pypdf/pdf.py in getFields(self, tree, retval, fileobj)
1667 if retval is None:
1668 retval = {}
-> 1669 catalog = self._trailer["/Root"]
1670
1671 # Get the AcroForm tree
~/.local/lib/python3.8/site-packages/pypdf/generic.py in __getitem__(self, key)
579
580 def __getitem__(self, key):
--> 581 return dict.__getitem__(self, key).getObject()
582
583 def getXmpMetadata(self):
~/.local/lib/python3.8/site-packages/pypdf/generic.py in getObject(self)
199
200 def getObject(self):
--> 201 return self.pdf.getObject(self).getObject()
202
203 def __repr__(self):
~/.local/lib/python3.8/site-packages/pypdf/pdf.py in getObject(self, ref)
2220 return self._cachedObjects[(ref.generation, ref.idnum)]
2221 if ref.idnum in self._xrefStm:
-> 2222 retval = self._getObjectByRef(ref, self.R_XSTREAM)
2223 elif (
2224 ref.generation in self._xrefTable
~/.local/lib/python3.8/site-packages/pypdf/pdf.py in _getObjectByRef(self, ref, source)
2042 )
2043 elif this_type == 2:
-> 2044 return self._getCompressedObjectFromXRefStream(ref)
2045 else:
2046 # «Any other value shall be interpreted as a reference to the
~/.local/lib/python3.8/site-packages/pypdf/pdf.py in _getCompressedObjectFromXRefStream(self, ref)
2106
2107 # Object streams always have a generation number of 0
-> 2108 objStm = IndirectObject(objStmId, 0, self).getObject()
2109
2110 if objStm["/Type"] != "/ObjStm":
~/.local/lib/python3.8/site-packages/pypdf/generic.py in getObject(self)
199
200 def getObject(self):
--> 201 return self.pdf.getObject(self).getObject()
202
203 def __repr__(self):
~/.local/lib/python3.8/site-packages/pypdf/pdf.py in getObject(self, ref)
2231 PdfReadWarning,
2232 )
-> 2233 raise PdfReadError(
2234 "Could not find object (%d, %d)" % (ref.idnum, ref.generation)
2235 )
PdfReadError: Could not find object (131072, 0)
https://github.com/claird/PyPDF4/blob/9c60d9df3a56edd32226c9e76695018f997fafe6/pypdf/generic.py#L960-L963
Hi, sorry but I am not familiar with PDF formats whatsoever. Is there a reason this isn't supported yet? Can we just take some bytes, reapply the compression/encoding and be done, or am I missing something?