aws-solutions-library-samples / guidance-for-low-code-intelligent-document-processing-on-aws

This Guidance provides best practices for building and deploying an intelligent document processing (IDP) architecture that scales with workload demands.
https://aws.amazon.com/solutions/guidance/low-code-intelligent-document-processing-on-aws/
MIT No Attribution
40 stars 15 forks source link

PyPDF2 endstream error #29

Open — schadem opened this issue 1 year ago

schadem commented 1 year ago

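Saw this `PdfReadError` during document processing. It was raised in `split_and_save_pages` while the document splitter was adding a page to the PyPDF2 writer: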

{
  "errorMessage": "Unable to find 'endstream' marker for obj starting at 124920.",
  "errorType": "PdfReadError",
  "requestId": "cce07be2-b060-4ab7-90a1-d31c9163285f",
  "stackTrace": [
    "  File \"/var/task/main.py\", line 107, in lambda_handler\n    output_file_list = split_and_save_pages(\n",
    "  File \"/var/task/documentsplitter/documentsplitter.py\", line 53, in split_and_save_pages\n    writer.add_page(pdf_reader.pages[page_number - 1])\n",
    "  File \"/var/task/PyPDF2/_writer.py\", line 321, in add_page\n    return self._add_page(page, list.append, excluded_keys)\n",
    "  File \"/var/task/PyPDF2/_writer.py\", line 271, in _add_page\n    page = cast(\"PageObject\", page_org.clone(self, False, excluded_keys))\n",
    "  File \"/var/task/PyPDF2/generic/_data_structures.py\", line 181, in clone\n    d__._clone(self, pdf_dest, force_duplicate, ignore_fields)\n",
    "  File \"/var/task/PyPDF2/generic/_data_structures.py\", line 243, in _clone\n    v.clone(pdf_dest, force_duplicate, ignore_fields)\n",
    "  File \"/var/task/PyPDF2/generic/_data_structures.py\", line 181, in clone\n    d__._clone(self, pdf_dest, force_duplicate, ignore_fields)\n",
    "  File \"/var/task/PyPDF2/generic/_data_structures.py\", line 243, in _clone\n    v.clone(pdf_dest, force_duplicate, ignore_fields)\n",
    "  File \"/var/task/PyPDF2/generic/_data_structures.py\", line 181, in clone\n    d__._clone(self, pdf_dest, force_duplicate, ignore_fields)\n",
    "  File \"/var/task/PyPDF2/generic/_data_structures.py\", line 243, in _clone\n    v.clone(pdf_dest, force_duplicate, ignore_fields)\n",
    "  File \"/var/task/PyPDF2/generic/_base.py\", line 249, in clone\n    dup = obj.clone(pdf_dest, force_duplicate, ignore_fields)\n",
    "  File \"/var/task/PyPDF2/generic/_data_structures.py\", line 181, in clone\n    d__._clone(self, pdf_dest, force_duplicate, ignore_fields)\n",
    "  File \"/var/task/PyPDF2/generic/_data_structures.py\", line 243, in _clone\n    v.clone(pdf_dest, force_duplicate, ignore_fields)\n",
    "  File \"/var/task/PyPDF2/generic/_base.py\", line 247, in clone\n    obj = self.get_object()\n",
    "  File \"/var/task/PyPDF2/generic/_base.py\", line 259, in get_object\n    obj = self.pdf.get_object(self)\n",
    "  File \"/var/task/PyPDF2/_reader.py\", line 1260, in get_object\n    retval = read_object(self.stream, self)  # type: ignore\n",
    "  File \"/var/task/PyPDF2/generic/_data_structures.py\", line 1045, in read_object\n    return DictionaryObject.read_from_stream(stream, pdf, forced_encoding)\n",
    "  File \"/var/task/PyPDF2/generic/_data_structures.py\", line 448, in read_from_stream\n    data[\"__streamdata__\"] = read_unsized_from_steam(stream, pdf)\n",
    "  File \"/var/task/PyPDF2/generic/_data_structures.py\", line 354, in read_unsized_from_steam\n    raise PdfReadError(\n"
  ]
}