aws-samples / amazon-textract-textractor

Analyze documents with Amazon Textract and generate output in multiple formats.
Apache License 2.0
404 stars 145 forks source link

KeyError: 'Geometry' raised when an empty cell is found in `response_parser.py` #230

Open mhfarahani opened 1 year ago

mhfarahani commented 1 year ago

This is very similar to #195. Since that issue has been closed, I am creating a new one:

In #195, KeyError: 'Geometry' was addressed by adding a condition that returns None if "Geometry" is not in field["ValueDetection"]. The same error may happen if we can't find the box for field["LabelDetection"]. Here is the error:

---------------------------------------------------------------------------
KeyError                                  Traceback (most recent call last)
<ipython-input-6-06735bc9ce69> in <module>
----> 1 textract_response.expense_documents

/opt/conda/lib/python3.7/site-packages/textractor/entities/lazy_document.py in __getattr__(self, _LazyDocument__name)
     61                 self._textract_client,
     62             )
---> 63             self._document = parse(response)
     64             if self._images is not None:
     65                 for i, page in enumerate(self._document.pages):

/opt/conda/lib/python3.7/site-packages/textractor/parsers/response_parser.py in parse(response)
    965         return parse_analyze_id_response(response)
    966     if "ExpenseDocuments" in response:
--> 967         return parser_analyze_expense_response(response)
    968     else:
    969         return parse_document_api_response(response)

/opt/conda/lib/python3.7/site-packages/textractor/parsers/response_parser.py in parser_analyze_expense_response(response)
    923         summary_fields = []
    924         for summary_field in doc["SummaryFields"]:
--> 925             summary_fields.append(create_expense_from_field(summary_field, page))
    926             summary_fields[-1].raw_object = summary_field
    927 

/opt/conda/lib/python3.7/site-packages/textractor/parsers/response_parser.py in create_expense_from_field(field, page)
    877         value_expense = Expense(
    878             bbox=BoundingBox.from_normalized_dict(
--> 879                 field["ValueDetection"]["Geometry"]["BoundingBox"], spatial_object=page
    880             ),
    881             text=field["ValueDetection"]["Text"],

KeyError: 'Geometry'

It seems the same condition could be applied to field["LabelDetection"] for the case where "Geometry" does not exist. The code to fix the issue would be the following:

if "LabelDetection" in field:
    label_expense = Expense(
        # The conditional has to live inside the ``bbox=`` keyword argument;
        # a label without "Geometry" gets no bounding box instead of raising
        # KeyError: 'Geometry'.
        bbox=(
            None
            if "Geometry" not in field["LabelDetection"]
            else BoundingBox.from_normalized_dict(
                field["LabelDetection"]["Geometry"]["BoundingBox"],
                spatial_object=page,
            )
        ),
        text=field["LabelDetection"]["Text"],
        confidence=field["LabelDetection"]["Confidence"],
        page=page.page_num,
    )
    # Keep the raw Textract entry the expense was built from.
    label_expense.raw_object = field["LabelDetection"]
mhfarahani commented 1 year ago

After adding the block above and debugging locally, I ran into another issue: a key can exist in the field while its value is set to None.

- "LabelDetection" in field -> is True ... but field["LabelDetection"] is None
- "GroupProperties" in field -> is True ... but field["GroupProperties"] is not None

The specific document I was analyzing for this example was a Lyft receipt.

I am not sure whether this issue should be addressed in another section of the code, but to work around it I implemented the following changes:

def _expense_from_detection(detection, page: Page):
    """Build an ``Expense`` from a ``ValueDetection`` / ``LabelDetection`` entry.

    :param detection: Detection dictionary taken from an expense field, or
        ``None`` when the field has no such detection.
    :param page: Page passed as ``spatial_object`` to the bounding box; its
        ``page_num`` is recorded on the expense.
    :return: The ``Expense`` with ``raw_object`` set to ``detection``, or
        ``None`` when ``detection`` is ``None``.
    """
    if detection is None:
        return None

    # "Geometry" can be absent (or null) for a detection; such an expense
    # simply has no bounding box instead of raising KeyError: 'Geometry'.
    geometry = detection.get("Geometry")
    bbox = (
        None
        if geometry is None
        else BoundingBox.from_normalized_dict(
            geometry["BoundingBox"], spatial_object=page
        )
    )
    expense = Expense(
        bbox=bbox,
        text=detection["Text"],
        confidence=detection["Confidence"],
        page=page.page_num,
    )
    expense.raw_object = detection
    return expense


def create_expense_from_field(field: Dict, page: Page) -> ExpenseField:
    """Convert one raw expense field dictionary into an ``ExpenseField``.

    Every optional entry of ``field`` ("Type", "ValueDetection",
    "LabelDetection", "GroupProperties", "Currency") is handled the same way:
    a missing key and a key whose value is ``None`` are both treated as
    "not provided".

    :param field: Raw field dictionary (a summary field or a line item
        expense field).
    :param page: Page the field is attached to.
    :return: The ``ExpenseField`` built from the type, value, label, group
        properties and currency code found in ``field``.
    """
    type_info = field.get("Type")
    if type_info is not None:
        type_expense = ExpenseType(
            type_info["Text"], type_info["Confidence"], type_info
        )
    else:
        type_expense = None

    # Value and label share the same structure, so they share one builder.
    value_expense = _expense_from_detection(field.get("ValueDetection"), page)
    label_expense = _expense_from_detection(field.get("LabelDetection"), page)

    # ``or []`` covers both a missing key and an explicit null value.
    group_properties = [
        ExpenseGroupProperty(id=group_property["Id"], types=group_property["Types"])
        for group_property in field.get("GroupProperties") or []
    ]

    currency_info = field.get("Currency")
    currency = currency_info["Code"] if currency_info is not None else None

    return ExpenseField(
        type_expense,
        value_expense,
        group_properties=group_properties,
        label=label_expense,
        currency=currency,
        page=page.page_num,
    )

def parser_analyze_expense_response(response):
    """Parse a response containing "ExpenseDocuments" into a document.

    The blocks of all expense documents are temporarily gathered under
    ``response["Blocks"]`` so that ``parse_document_api_response`` can build
    the document; the key is removed again before returning. Each expense
    document is then converted into an ``ExpenseDocument`` and appended to
    the ``expense_documents`` of its page.

    :param response: Response dictionary with an "ExpenseDocuments" list.
    :return: The document produced by ``parse_document_api_response``, with
        expense documents attached to its pages and ``response`` stored on
        its ``response`` attribute.
    """
    response["Blocks"] = [
        block
        for doc in response["ExpenseDocuments"]
        for block in doc.get("Blocks", [])
    ]
    document = parse_document_api_response(response)

    for doc in response["ExpenseDocuments"]:
        raw_summary_fields = doc.get("SummaryFields")
        # FIXME (carried over from the original): the page is located through
        # the first summary field, so a document without summary fields is
        # skipped entirely, even if it has line items.
        if not raw_summary_fields:
            continue
        page = document.pages[raw_summary_fields[0]["PageNumber"] - 1]

        summary_fields = []
        for raw_summary_field in raw_summary_fields:
            expense_field = create_expense_from_field(raw_summary_field, page)
            expense_field.raw_object = raw_summary_field
            summary_fields.append(expense_field)

        line_items_groups = []
        for line_items_group in doc.get("LineItemGroups") or []:
            line_item_rows = []
            for i, line_item in enumerate(line_items_group["LineItems"]):
                row_expenses = []
                for line_item_field in line_item["LineItemExpenseFields"]:
                    expense_field = create_expense_from_field(line_item_field, page)
                    expense_field.raw_object = line_item_field
                    row_expenses.append(expense_field)
                line_item_rows.append(
                    LineItemRow(
                        index=i,
                        line_item_expense_fields=row_expenses,
                        page=page.page_num,
                    )
                )
            # Groups without any row are not kept.
            if not line_item_rows:
                continue
            line_items_groups.append(
                LineItemGroup(
                    index=line_items_group["LineItemGroupIndex"],
                    line_item_rows=line_item_rows,
                    page=page.page_num,
                )
            )

        bbox = BoundingBox.enclosing_bbox(
            bboxes=[s.bbox for s in summary_fields]
            + [g.bbox for g in line_items_groups],
            spatial_object=page,
        )
        expense_document = ExpenseDocument(
            summary_fields=summary_fields,
            line_items_groups=line_items_groups,
            bounding_box=bbox,
            page=page.page_num,
        )
        expense_document.raw_object = doc
        # Attach to the same page used for the bounding box and page number,
        # rather than to whatever page the last-iterated summary field names.
        page.expense_documents.append(expense_document)

    # Remove the temporary key so the stored response matches what was received.
    del response["Blocks"]
    document.response = response
    return document