claird / PyPDF4

A utility to read and write PDFs with Python
obsolete-https://pythonhosted.org/PyPDF2/
Other
328 stars 61 forks source link

Instance of `PyPDF4.PdfFileWriter` alters instance of `PyPDF4.PdfFileReader` #21

Closed DeliciousHair closed 5 years ago

DeliciousHair commented 5 years ago

Just so this doesn't seem like a totally esoteric complaint, what I need to do is be able to make a PNG image of each page in the source document for processing as an image file. I am also doing things via the image / text data directly, so this acts as a per-page reference point.

I have used this pdf in the below, the choice being completely random and arbitrary, arxiv.org simply has a lot of PDF documents.

The following fails miserably:

import io
import popplerqt5
import PyPDF4

from PyQt5.QtCore import QByteArray

def dump_pages(pdf_path, dpi=110):  # save one PNG ('p_NNN.png') per page of the PDF at pdf_path
    with open(pdf_path, 'rb') as fp:
        pdf = PyPDF4.PdfFileReader(fp)
        npages = pdf.numPages
        # The single reader above is reused by every iteration; each page gets a fresh buffer and writer.
        for pnum in range(npages):
            print('processing page {}'.format(pnum))
            x = io.BytesIO()
            writer = PyPDF4.PdfFileWriter()
            writer.addPage(pdf.getPage(pnum))
            writer.write(x)
            # Rewind so the read() below returns everything the writer put into the buffer.
            x.seek(0)
            z = QByteArray(x.read())
            # The in-memory document holds only the page added above, hence doc.page(0).
            doc = popplerqt5.Poppler.Document.loadFromData(z)
            doc.setRenderHint(popplerqt5.Poppler.Document.TextAntialiasing)
            page = doc.page(0)
            img = page.renderToImage(dpi, dpi)
            img.save('p_{0:03d}.png'.format(pnum))

I know, there are easier ways to do this but the above captures the mystery problem nicely; I have added a print(data.pdf) statement to PdfFileWriter._sweepIndirectReferences in order to generate the output below:

In [6]: dump_pages('sample.pdf')
processing page 0
<PyPDF4.pdf.PdfFileReader object at 0x7f3c5960fe80>
    ....
<PyPDF4.pdf.PdfFileReader object at 0x7f3c5960fe80>
processing page 1
<PyPDF4.pdf.PdfFileReader object at 0x7f3c5960fe80>
    ...
<PyPDF4.pdf.PdfFileReader object at 0x7f3c5960fe80>
<PyPDF4.pdf.PdfFileWriter object at 0x7f3c5960ff98>
---------------------------------------------------------------------------
AttributeError                            Traceback (most recent call last)
<ipython-input-6-04f45808fc5a> in <module>()
----> 1 dump_pages('sample.pdf')

<ipython-input-4-62d1bd9958a9> in dump_pages(pdf_path, dpi)
     15             writer = PyPDF4.PdfFileWriter()
     16             writer.addPage(pdf.getPage(pnum))
---> 17             writer.write(x)
     18 
     19             x.seek(0)

/usr/local/lib/python3.6/dist-packages/PyPDF4/pdf.py in write(self, stream)
    480         self.stack = []
    481         if debug: print(("ERM:", externalReferenceMap, "root:", self._root))
--> 482         self._sweepIndirectReferences(externalReferenceMap, self._root)
    483         del self.stack
    484 

/usr/local/lib/python3.6/dist-packages/PyPDF4/pdf.py in _sweepIndirectReferences(self, externMap, data)
    570                     self.stack.append(data.idnum)
    571                     realdata = self.getObject(data)
--> 572                     self._sweepIndirectReferences(externMap, realdata)
    573                     return data
    574             else:

/usr/local/lib/python3.6/dist-packages/PyPDF4/pdf.py in _sweepIndirectReferences(self, externMap, data)
    546             for key, value in list(data.items()):
    547                 origvalue = value
--> 548                 value = self._sweepIndirectReferences(externMap, value)
    549                 if isinstance(value, StreamObject):
    550                     # a dictionary value is a stream.  streams must be indirect

/usr/local/lib/python3.6/dist-packages/PyPDF4/pdf.py in _sweepIndirectReferences(self, externMap, data)
    570                     self.stack.append(data.idnum)
    571                     realdata = self.getObject(data)
--> 572                     self._sweepIndirectReferences(externMap, realdata)
    573                     return data
    574             else:

/usr/local/lib/python3.6/dist-packages/PyPDF4/pdf.py in _sweepIndirectReferences(self, externMap, data)
    546             for key, value in list(data.items()):
    547                 origvalue = value
--> 548                 value = self._sweepIndirectReferences(externMap, value)
    549                 if isinstance(value, StreamObject):
    550                     # a dictionary value is a stream.  streams must be indirect

/usr/local/lib/python3.6/dist-packages/PyPDF4/pdf.py in _sweepIndirectReferences(self, externMap, data)
    555         elif isinstance(data, ArrayObject):
    556             for i in range(len(data)):
--> 557                 value = self._sweepIndirectReferences(externMap, data[i])
    558                 if isinstance(value, StreamObject):
    559                     # an array value is a stream.  streams must be indirect

/usr/local/lib/python3.6/dist-packages/PyPDF4/pdf.py in _sweepIndirectReferences(self, externMap, data)
    570                     self.stack.append(data.idnum)
    571                     realdata = self.getObject(data)
--> 572                     self._sweepIndirectReferences(externMap, realdata)
    573                     return data
    574             else:

/usr/local/lib/python3.6/dist-packages/PyPDF4/pdf.py in _sweepIndirectReferences(self, externMap, data)
    546             for key, value in list(data.items()):
    547                 origvalue = value
--> 548                 value = self._sweepIndirectReferences(externMap, value)
    549                 if isinstance(value, StreamObject):
    550                     # a dictionary value is a stream.  streams must be indirect

/usr/local/lib/python3.6/dist-packages/PyPDF4/pdf.py in _sweepIndirectReferences(self, externMap, data)
    588                             externMap[data.pdf][data.generation] = {}
    589                         externMap[data.pdf][data.generation][data.idnum] = newobj_ido
--> 590                         newobj = self._sweepIndirectReferences(externMap, newobj)
    591                         self._objects[idnum-1] = newobj
    592                         return newobj_ido

/usr/local/lib/python3.6/dist-packages/PyPDF4/pdf.py in _sweepIndirectReferences(self, externMap, data)
    546             for key, value in list(data.items()):
    547                 origvalue = value
--> 548                 value = self._sweepIndirectReferences(externMap, value)
    549                 if isinstance(value, StreamObject):
    550                     # a dictionary value is a stream.  streams must be indirect

/usr/local/lib/python3.6/dist-packages/PyPDF4/pdf.py in _sweepIndirectReferences(self, externMap, data)
    546             for key, value in list(data.items()):
    547                 origvalue = value
--> 548                 value = self._sweepIndirectReferences(externMap, value)
    549                 if isinstance(value, StreamObject):
    550                     # a dictionary value is a stream.  streams must be indirect

/usr/local/lib/python3.6/dist-packages/PyPDF4/pdf.py in _sweepIndirectReferences(self, externMap, data)
    588                             externMap[data.pdf][data.generation] = {}
    589                         externMap[data.pdf][data.generation][data.idnum] = newobj_ido
--> 590                         newobj = self._sweepIndirectReferences(externMap, newobj)
    591                         self._objects[idnum-1] = newobj
    592                         return newobj_ido

/usr/local/lib/python3.6/dist-packages/PyPDF4/pdf.py in _sweepIndirectReferences(self, externMap, data)
    546             for key, value in list(data.items()):
    547                 origvalue = value
--> 548                 value = self._sweepIndirectReferences(externMap, value)
    549                 if isinstance(value, StreamObject):
    550                     # a dictionary value is a stream.  streams must be indirect

/usr/local/lib/python3.6/dist-packages/PyPDF4/pdf.py in _sweepIndirectReferences(self, externMap, data)
    574             else:
    575                 print(data.pdf)
--> 576                 if data.pdf.stream.closed:
    577                     raise ValueError("I/O operation on closed file: {}".format(data.pdf.stream.name))
    578                 newobj = externMap.get(data.pdf, {}).get(data.generation, {}).get(data.idnum, None)

AttributeError: 'PdfFileWriter' object has no attribute 'stream'

See what happened there? After using the PyPDF4.PdfFileWriter once, something is actually altered in the PyPDF4.PdfFileReader object (which, one would think, is a separate thing from the writer and not subject to quiet re-assignment by the writer) and the simple sanity check on line 576 causes things to fail every time. If one wants to use the writer again, one needs to start over and re-read the PDF in order to do this. Which is generally fine if the document is on the order of 10's of pages long, but a real time-sink if they are on the order of 100's and up, and many thousands of them.

The following ugly hack to PdfFileWriter._sweepIndirectReferences manages to sidestep the problem, but this is hardly a fix:

            else:
                try:
                    if data.pdf.stream.closed:
                        raise ValueError("I/O operation on closed file: {}".format(data.pdf.stream.name))
                except AttributeError:
                    pass

which works:

In [5]: dump_pages('sample.pdf')
processing page 0
processing page 1
processing page 2
processing page 3
processing page 4
processing page 5
acsor commented 5 years ago

Unfortunately I'm having troubles installing python-poppler-qt5, maybe you went through the same issue in the past?

pip3.5 install python-poppler-qt5
Collecting python-poppler-qt5
  Using cached https://files.pythonhosted.org/packages/af/e7/aa451d4ca0910472c4442d8aa6ef44300852926d85ad033b029a22157027/python-poppler-qt5-0.24.2.tar.gz
    Complete output from command python setup.py egg_info:
    Traceback (most recent call last):
      File "<string>", line 1, in <module>
      File "/tmp/pip-install-eeu5xqgc/python-poppler-qt5/setup.py", line 42, in <module>
        import sipdistutils
    ImportError: No module named 'sipdistutils'
acsor commented 5 years ago

By the way, did you glance at the scripts/pdf-image-extractor.py file? Something, but not all apparently, might be working there.

DeliciousHair commented 5 years ago

Yeah, poppler is a pain. But it works so well for page --> image that I'm loath to abandon it altogether. I have no idea how I managed to get it onto OSX, but I did manage it in the past so it can be done. On ubuntu though it's easy:

$ sudo apt-get install python3-poppler-qt5

Job done, that also drags in PyQt5 as well.

I have not had a look at scripts/pdf-image-extractor.py however; the reason being that in this application I needed to extract an image of the rendered page and I'm assuming that pulls images from the PDF tree? I should look though I suppose :-)

Thanks!

acsor commented 5 years ago

Luckily Debian's got your script dependency in the repos (I run under sid, someone else's availability may vary):

# apt-cache search poppler-qt5
libpoppler-qt5-1 - PDF rendering library (Qt 5 based shared library)
libpoppler-qt5-dev - PDF rendering library -- development files (Qt 5 interface)
python3-poppler-qt5 - Python binding to Poppler-Qt5 C++ library (Python 3)
acsor commented 5 years ago

Eh eh @DeliciousHair, you've come at about the right time. Only a few days ago I introduced a change to PdfFileReader/Writer that now enabled me to presumably fix this issue painlessly. The updated script is below, but you should fork from newnone:master until @claird agrees to merge PR #14:

from os.path import abspath, dirname, join, pardir
import io
import popplerqt5

from sys import argv, path
from PyQt5.QtCore import QByteArray

# Absolute path of the directory one level above this script's own directory,
# appended to the module search path.
PROJECT_ROOT = abspath(join(dirname(__file__), pardir))
path.append(PROJECT_ROOT)

from pypdf import PdfFileReader, PdfFileWriter

def dump_pages(pdf_path, dpi=110):
    """Copy each page of ``pdf_path`` into an in-memory PDF, then render one PNG.

    NOTE: the Poppler rendering code sits after the ``for`` loop rather than
    inside it, so only the buffer written by the final iteration is rendered
    and a single ``p_NNN.png`` (numbered with the last ``pnum``) is saved.
    """
    with open(pdf_path, 'rb') as fp:
        pdf = PdfFileReader(fp)
        npages = pdf.numPages
        # Initial buffer; rebound to a fresh BytesIO on every loop iteration.
        x = io.BytesIO()

        for pnum in range(npages):
            print('processing page {}'.format(pnum))
            x = io.BytesIO()
            writer = PdfFileWriter()
            writer.addPage(pdf.getPage(pnum))
            writer.write(x)

        # From here on ``x`` and ``pnum`` hold the values left by the last iteration.
        x.seek(0)
        z = QByteArray(x.read())

        doc = popplerqt5.Poppler.Document.loadFromData(z)
        doc.setRenderHint(popplerqt5.Poppler.Document.TextAntialiasing)
        page = doc.page(0)
        img = page.renderToImage(dpi, dpi)
        img.save('p_{0:03d}.png'.format(pnum))

# Script entry point: the first command-line argument is used as the PDF path.
if __name__ == "__main__":
    dump_pages(argv[1])
$ python3 ./Issue\ 21/main.py tests/fixture_data/GeoBase_NHNC1_Data_Model_UML_EN.pdf 
processing page 0
processing page 1
processing page 2
processing page 3
processing page 4
processing page 5
processing page 6
processing page 7
processing page 8
processing page 9
processing page 10
processing page 11
processing page 12
processing page 13
processing page 14
processing page 15
processing page 16
processing page 17
processing page 18

I have no way to tell if the number of images extracted is all that there is unfortunately.

acsor commented 5 years ago

To be effective, the latest change should rely on a stream attribute of PdfFileWriter, which doesn't exist at the moment.

Since we are in the midst of PR #14, may I move the stream parameter of PdfFileWriter.write() to PdfFileWriter.__init__() @claird? That will bring about some code changes, in the codebase and in users' code, but I have faith that it will result in an overall improvement.

Since the project doesn't seem very active at the moment, I'll wait about three days for your go-ahead on this essentially small modification. After that, I'll go ahead and implement it anyway; if you don't like it, you will always be able to revert the changes.

acsor commented 5 years ago

The updated script code is as follows now:

from os.path import abspath, dirname, join, pardir
import io
import popplerqt5

from sys import argv, path
from PyQt5.QtCore import QByteArray

# Absolute path of the directory one level above this script's own directory,
# appended to the module search path.
PROJECT_ROOT = abspath(join(dirname(__file__), pardir))
path.append(PROJECT_ROOT)

from pypdf import PdfFileReader, PdfFileWriter

def dump_pages(pdf_path, dpi=110):
    """Copy each page of ``pdf_path`` into an in-memory PDF, then render one PNG.

    NOTE: the Poppler rendering code sits after the ``for`` loop rather than
    inside it, so only the buffer written by the final iteration is rendered
    and a single ``p_NNN.png`` (numbered with the last ``pnum``) is saved.
    """
    with open(pdf_path, 'rb') as fp:
        pdf = PdfFileReader(fp)
        npages = pdf.numPages
        # Initial buffer; rebound to a fresh BytesIO on every loop iteration.
        x = io.BytesIO()

        for pnum in range(npages):
            print('processing page {}'.format(pnum))
            x = io.BytesIO()
            # Here the output stream is handed to the constructor and write() takes no argument.
            writer = PdfFileWriter(x)
            writer.addPage(pdf.getPage(pnum))
            writer.write()

        # From here on ``x`` and ``pnum`` hold the values left by the last iteration.
        x.seek(0)
        z = QByteArray(x.read())

        doc = popplerqt5.Poppler.Document.loadFromData(z)
        doc.setRenderHint(popplerqt5.Poppler.Document.TextAntialiasing)
        page = doc.page(0)
        img = page.renderToImage(dpi, dpi)
        img.save('p_{0:03d}.png'.format(pnum))

# Script entry point: the first command-line argument is used as the PDF path.
if __name__ == "__main__":
    dump_pages(argv[1])

Do you have it working as expected? Had I the time for it, I'd like to take this as an occasion to implement further tests cases.

DeliciousHair commented 5 years ago

Heyo!

I'd suggest a bit of an edit:

import os
import io
import popplerqt5

from sys import argv, path
from PyQt5.QtCore import QByteArray

# Absolute path of the directory one level above this script's own directory,
# appended to the module search path (presumably so the ``pypdf`` import that
# follows resolves against the repository checkout).
# ``dirname`` must be reached through ``os.path``: only ``import os`` is in
# scope here, so a bare ``dirname`` would raise NameError.
PROJECT_ROOT = os.path.abspath(
    os.path.join(os.path.dirname(__file__), os.path.pardir)
)
path.append(PROJECT_ROOT)

from pypdf import PdfFileReader, PdfFileWriter

def dump_pages(pdf_path, dpi=60):
    """Render every page of a PDF to its own PNG file.

    Each page is copied into a fresh single-page PDF held in memory, loaded
    with Poppler and rendered.  Page ``N`` of ``<name>.pdf`` is saved as
    ``<name>__p_NNN.png`` (``NNN`` zero-padded to three digits), i.e. next to
    the source file.

    :param pdf_path: path of the PDF file to read.
    :param dpi: value passed for both arguments of ``renderToImage``.
    """
    # Strip only a genuine trailing ".pdf" extension (any letter case).
    # Splitting on the first ".pdf" substring would truncate paths such as
    # "scans.pdf.d/report.pdf" and would leave "REPORT.PDF" untouched.
    stem, ext = os.path.splitext(pdf_path)
    if ext.lower() != '.pdf':
        stem = pdf_path

    with open(pdf_path, 'rb') as fp:
        pdf = PdfFileReader(fp)

        for pnum in range(pdf.numPages):
            print('processing page {}'.format(pnum))
            fout = '{0}__p_{1:03d}.png'.format(stem, pnum)

            # A new buffer and writer per page, so each in-memory document
            # contains exactly the one page added here.
            x = io.BytesIO()
            writer = PdfFileWriter(x)
            writer.addPage(pdf.getPage(pnum))
            writer.write()

            # getvalue() returns the whole buffer regardless of the current
            # stream position, so no explicit rewind is needed.
            z = QByteArray(x.getvalue())

            doc = popplerqt5.Poppler.Document.loadFromData(z)
            doc.setRenderHint(popplerqt5.Poppler.Document.TextAntialiasing)
            page = doc.page(0)  # the only page in the in-memory document
            img = page.renderToImage(dpi, dpi)
            img.save(fout)

# Script entry point: the first command-line argument is used as the PDF path.
if __name__ == "__main__":
    dump_pages(argv[1])

otherwise you only end up with a single PNG file regardless of what happens. Tried this just now with the same sample file I linked at the beginning of this issue and it works like a charm!

The case(s) that do not work are down to the handling of structural issues in the PDF itself that I'm still fighting with in #11 , but in terms of this issue I do believe this one can be safely closed, and there is even the beginnings of a test script out of it as well.

Thanks!

acsor commented 5 years ago

Thanks to you for the patience gathered so far in discussing several of these issues.