aws-samples / amazon-textract-code-samples

Amazon Textract Code Samples
MIT No Attribution
406 stars 263 forks source link

Textract form type - not getting data in sequential order #35

Open utsavvc opened 2 years ago

utsavvc commented 2 years ago

Hello, Currently I am performing OCR on 1 page document over there I am having multiple same name entity and in front of it there is a checkbox. I am able to detect all values and the checkbox is selected or not using form in AWS textract but I am not getting any data in sequence. Below I have attached 2 files with same data but in both file it is detecting all entities but in random order. Here is the code I am using:

import boto3
import sys
import re
import json
from collections import defaultdict

def get_kv_map(file_name):
    with open(file_name, 'rb') as file:
        img_test = file.read()
        bytes_test = bytearray(img_test)
        print('Image loaded', file_name)

    # process using image bytes
    client = boto3.client('textract')
    response = client.analyze_document(Document={'Bytes': bytes_test}, FeatureTypes=['FORMS'])

    # Get the text blocks
    blocks = response['Blocks']

    # get key and value maps
    key_map = {}
    value_map = {}
    block_map = {}
    for block in blocks:
        block_id = block['Id']
        block_map[block_id] = block
        if block['BlockType'] == "KEY_VALUE_SET":
            if 'KEY' in block['EntityTypes']:
                key_map[block_id] = block
            else:
                value_map[block_id] = block
    return key_map, value_map, block_map

def get_kv_relationship(key_map, value_map, block_map):
    kvs = defaultdict(list)
    for block_id, key_block in key_map.items():
        value_block = find_value_block(key_block, value_map)
        key = get_text(key_block, block_map)
        val = get_text(value_block, block_map)

        kvs[key].append(val)
    return kvs

def find_value_block(key_block, value_map):
    for relationship in key_block['Relationships']:
        if relationship['Type'] == 'VALUE':
            for value_id in relationship['Ids']:
                value_block = value_map[value_id]
    return value_block

def get_text(result, blocks_map):
    text = ''
    if 'Relationships' in result:
        for relationship in result['Relationships']:
            if relationship['Type'] == 'CHILD':
                for child_id in relationship['Ids']:
                    word = blocks_map[child_id]
                    if word['BlockType'] == 'WORD':
                        text += word['Text'] + ' '
                    if word['BlockType'] == 'SELECTION_ELEMENT':
                        if word['SelectionStatus'] == 'SELECTED':
                            text += 'X '
    return text

def print_kvs(kvs):
    for key, value in kvs.items():
        print(key, ":", value)

def search_value(kvs, search_key):
    for key, value in kvs.items():
        if re.search(search_key, key, re.IGNORECASE):
            return value

def main(file_name):
    key_map, value_map, block_map = get_kv_map(file_name)

    # Get Key Value relationship
    kvs = get_kv_relationship(key_map, value_map, block_map)
    print("\n\n== FOUND KEY : VALUE pairs ===\n")
    print_kvs(kvs)
    return kvs

if __name__ == "__main__":
    file_name = sys.argv[1]
    d = main("./data.png")

file1.pdf file2.pdf

So how can I get the details in sequence rather than in random order:

For buyer entity this is data from 1 file:['', '', '', '', '', '', '', '', '', '', '', '', 'X ', '', 'X ', '', ''] For the same data this is response of buyer for other file: ['', '', '', '', '', '', '', '', '', 'X ', '', '', '', 'X ', '', '', '']