from PIL import Image
from tesserocr import PyTessBaseAPI, RIL
image = Image.open('/input_tiffs/one.tif')
with PyTessBaseAPI() as api:
    api.SetImage(image)
    ALL = api.GetUTF8Text()
    boxes = api.GetComponentImages(RIL.WORD, True)
    print(f'Found {len(boxes)} word image components.')
    words = []
    for i, (im, box, _, _) in enumerate(boxes):
        # im is a PIL image object
        # box is a dict with x, y, w and h keys
        api.SetRectangle(box['x'], box['y'], box['w'], box['h'])
        ocrResult = api.GetUTF8Text()
        conf = api.MeanTextConf()
        words.append(ocrResult)
    # ALL differs from words, with missing and incorrect words
Why do I get a different (and incorrect) body of text when I combine the words recognized at word level than when I call GetUTF8Text() on the entire image?
This is a tesseract-specific question rather than a tesserocr one; tesserocr is just a wrapper around the tesseract API. You're better off asking on StackOverflow imho.
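For what it's worth: SetRectangle re-runs recognition on each cropped region in isolation, so Tesseract no longer has the surrounding page layout and context it had on the full image, which is likely why the stitched-together words differ. A minimal sketch of an alternative, using tesserocr's result iterator to read word-level results out of the same recognition pass instead of re-OCRing each crop (same input file as above):

from PIL import Image
from tesserocr import PyTessBaseAPI, RIL, iterate_level

image = Image.open('/input_tiffs/one.tif')
with PyTessBaseAPI() as api:
    api.SetImage(image)
    api.Recognize()  # single recognition pass over the whole page
    full_text = api.GetUTF8Text()

    # Walk the result iterator at word level; no re-OCR of cropped rectangles
    words = []
    ri = api.GetIterator()
    for w in iterate_level(ri, RIL.WORD):
        word = w.GetUTF8Text(RIL.WORD)
        conf = w.Confidence(RIL.WORD)  # per-word confidence from the same pass
        if word:
            words.append(word)

    # words now come from the same pass as full_text, so joining them
    # should agree with the full-page result (up to whitespace).
    print(' '.join(words))

Since both full_text and words are read from one Recognize() call, any remaining differences are only in whitespace/line breaks, not in the recognized words themselves.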