Seemingly infinite loop in PdfFileReader().getPage().extractText() on certain files. Workaround included in post.

I don't know if anyone else has run into this, but extractText() seems to loop infinitely on certain files, and even then only on certain pages of those files. Even when left running over a 3-day weekend, it remained stuck. I've attached a short sample script illustrating a workaround for whoever comes after me in search of a solution. It runs each page's extraction in a separate process using the multiprocessing module and enforces a timeout, terminating the process if it does not finish in time.

#this is a workaround for an infinite loop bug in pyPdf
from pyPdf import PdfFileReader
from multiprocessing import Process, Queue

def get_highest_page_number(pdf_path):
    """Return the number of pages in the PDF at *pdf_path*.

    The file is opened in binary mode and is always closed again, even if
    pyPdf raises while parsing the document.

    Args:
        pdf_path: Path of the PDF file to inspect.

    Returns:
        The page count reported by ``PdfFileReader.getNumPages()``.
    """
    # ``open`` in a ``with`` block replaces the Python 2-only ``file()``
    # builtin and guarantees the handle is released on any exception.
    with open(pdf_path, "rb") as pdf_handle:
        pdf_file = PdfFileReader(pdf_handle)
        if pdf_file.getIsEncrypted():
            # Try the empty password; the result of decrypt() is not checked
            # here, so a document with a real password will presumably fail
            # on the next call -- TODO confirm desired handling.
            pdf_file.decrypt("")
        return pdf_file.getNumPages()

def get_page_text(pdf_path, page, que):
    """Extract the text of one page and put it on *que*.

    Intended to run as the target of a separate process so that a hang in
    ``extractText()`` can be interrupted from the parent.

    Args:
        pdf_path: Path of the PDF file to read.
        page: Zero-based index of the page passed to ``getPage()``.
        que: Queue-like object; the extracted text is delivered with
            ``que.put()``.
    """
    # ``open`` in a ``with`` block replaces the Python 2-only ``file()``
    # builtin and guarantees the handle is released even if extraction raises.
    with open(pdf_path, "rb") as pdf_handle:
        pdf_file = PdfFileReader(pdf_handle)
        if pdf_file.getIsEncrypted():
            # Try the empty password; the result of decrypt() is not checked.
            pdf_file.decrypt("")
        page_text = pdf_file.getPage(page).extractText()
    # Hand the result over only after the file has been closed.
    que.put(page_text)

def read_pdf(pdf_path, timeout=10):
    """Extract the text of every page of a PDF, guarding against hangs.

    Each page is processed in its own child process. If a child does not
    deliver its result within *timeout* seconds it is terminated and a
    ``RuntimeError`` is raised.

    Args:
        pdf_path: Path of the PDF file to read.
        timeout: Seconds to wait for each page's text (default 10).

    Returns:
        A list with the extracted text of each page, in page order.

    Raises:
        RuntimeError: If a page's text is not received within *timeout*
            seconds (the extraction hung, or the child exited without
            producing a result).
    """
    # The "queue empty" exception lives in a differently named module on
    # Python 2 and Python 3; import locally so the file's imports are untouched.
    try:
        from queue import Empty  # Python 3
    except ImportError:
        from Queue import Empty  # Python 2

    page_texts = []
    for page in range(get_highest_page_number(pdf_path)):
        page_text_que = Queue()
        page_text_process = Process(
            target=get_page_text,
            args=(pdf_path, page, page_text_que),
        )
        page_text_process.start()
        try:
            # Read from the queue BEFORE joining: a child that has put a large
            # item on a multiprocessing queue cannot exit until the item has
            # been consumed, so joining first would misreport long pages as
            # timeouts. The timeout on get() also prevents blocking forever
            # when the child died without putting anything on the queue.
            page_text = page_text_que.get(timeout=timeout)
        except Empty:
            if page_text_process.is_alive():
                page_text_process.terminate()
            page_text_process.join()
            raise RuntimeError(
                "no text received for page %d of %r within %s seconds"
                % (page, pdf_path, timeout)
            )
        # The result has been received, so the child can finish; reap it.
        page_text_process.join()
        page_texts.append(page_text)
    return page_texts

def main():
    """Run the workaround against the sample file ``file.pdf``."""
    read_pdf("file.pdf")

# Run the sample only when executed as a script, not when imported.
# multiprocessing also needs this guard on platforms that start child
# processes by re-importing the main module (e.g. Windows).
if __name__ == "__main__":
    main()

I don't like having to re-open the handle for every page, but I really don't see another option at present.

mfenniak / pyPdf

Seemingly infinite loop on PdfFileReader().getPage().ExtractText() on certain files. Workaround included in post. #45