Hello,
Currently I am performing OCR on 1 page document over there I am having multiple same name entity and in front of it there is a checkbox. I am able to detect all values and the checkbox is selected or not using form in AWS textract but I am not getting any data in sequence.
Below I have attached 2 files with same data but in both file it is detecting all entities but in random order.
Here is the code I am using:
import boto3
import sys
import re
import json
from collections import defaultdict
def get_kv_map(file_name):
with open(file_name, 'rb') as file:
img_test = file.read()
bytes_test = bytearray(img_test)
print('Image loaded', file_name)
# process using image bytes
client = boto3.client('textract')
response = client.analyze_document(Document={'Bytes': bytes_test}, FeatureTypes=['FORMS'])
# Get the text blocks
blocks = response['Blocks']
# get key and value maps
key_map = {}
value_map = {}
block_map = {}
for block in blocks:
block_id = block['Id']
block_map[block_id] = block
if block['BlockType'] == "KEY_VALUE_SET":
if 'KEY' in block['EntityTypes']:
key_map[block_id] = block
else:
value_map[block_id] = block
return key_map, value_map, block_map
def get_kv_relationship(key_map, value_map, block_map):
kvs = defaultdict(list)
for block_id, key_block in key_map.items():
value_block = find_value_block(key_block, value_map)
key = get_text(key_block, block_map)
val = get_text(value_block, block_map)
kvs[key].append(val)
return kvs
def find_value_block(key_block, value_map):
for relationship in key_block['Relationships']:
if relationship['Type'] == 'VALUE':
for value_id in relationship['Ids']:
value_block = value_map[value_id]
return value_block
def get_text(result, blocks_map):
text = ''
if 'Relationships' in result:
for relationship in result['Relationships']:
if relationship['Type'] == 'CHILD':
for child_id in relationship['Ids']:
word = blocks_map[child_id]
if word['BlockType'] == 'WORD':
text += word['Text'] + ' '
if word['BlockType'] == 'SELECTION_ELEMENT':
if word['SelectionStatus'] == 'SELECTED':
text += 'X '
return text
def print_kvs(kvs):
for key, value in kvs.items():
print(key, ":", value)
def search_value(kvs, search_key):
for key, value in kvs.items():
if re.search(search_key, key, re.IGNORECASE):
return value
def main(file_name):
key_map, value_map, block_map = get_kv_map(file_name)
# Get Key Value relationship
kvs = get_kv_relationship(key_map, value_map, block_map)
print("\n\n== FOUND KEY : VALUE pairs ===\n")
print_kvs(kvs)
return kvs
if __name__ == "__main__":
file_name = sys.argv[1]
d = main("./data.png")
So how can I get the details in sequence rather than in random order:
For buyer entity this is data from 1 file:['', '', '', '', '', '', '', '', '', '', '', '', 'X ', '', 'X ', '', '']
For the same data this is response of buyer for other file: ['', '', '', '', '', '', '', '', '', 'X ', '', '', '', 'X ', '', '', '']
Hello, Currently I am performing OCR on 1 page document over there I am having multiple same name entity and in front of it there is a checkbox. I am able to detect all values and the checkbox is selected or not using form in AWS textract but I am not getting any data in sequence. Below I have attached 2 files with same data but in both file it is detecting all entities but in random order. Here is the code I am using:
file1.pdf file2.pdf
So how can I get the details in sequence rather than in random order:
For buyer entity this is data from 1 file:['', '', '', '', '', '', '', '', '', '', '', '', 'X ', '', 'X ', '', ''] For the same data this is response of buyer for other file: ['', '', '', '', '', '', '', '', '', 'X ', '', '', '', 'X ', '', '', '']