NielsRogge / Transformers-Tutorials

This repository contains demos I made with the Transformers library by HuggingFace.

layoutlmv3, true inference, return corresponding text for detected labels #187

Closed aditya11ad closed 6 months ago

aditya11ad commented 2 years ago

hi,

In LayoutLMv3, after running inference, labels and boxes are generated.

How can I get the text (word) for a particular label?

Do I need to apply OCR inside each detected box, or is there an easier way?

Thanks in advance.

LucaMalagutti commented 2 years ago

An easy workaround is to keep track of the input data associated with your prediction. Then, given any predicted bounding box, search the input data for a matching input bounding box; when you find a match, the input entry gives you the word contained in that box.
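
For illustration, a minimal sketch of that matching step might look like the following (the names predicted_boxes, input_words, and input_boxes are hypothetical placeholders for your own data):

def words_for_boxes(predicted_boxes, input_words, input_boxes):
    # For every predicted box, look for an input box with identical coordinates
    # and collect the word that came with it.
    matched = []
    for pred_box in predicted_boxes:
        for word, box in zip(input_words, input_boxes):
            if list(box) == list(pred_box):
                matched.append(word)
                break
    return matched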

Malisha15 commented 1 year ago

@aditya11ad Hi, were you able to figure it out? I'd also like to know how you ran inference with the LayoutLMv3 model, because I could only run inference on the train and test documents for which we already have labels.

Monta79 commented 6 months ago

@Malisha15 Did you find a solution, please?

aditya11ad commented 6 months ago

Hi @Malisha15, sorry for the late reply.

Once your trained model is ready, you can use this inference code:

# Imports assumed by this snippet
import numpy as np
import torch
from PIL import Image, ImageDraw, ImageFont
from transformers import (
    LayoutLMv3FeatureExtractor,
    LayoutLMv3ForTokenClassification,
    LayoutLMv3Processor,
    LayoutLMv3TokenizerFast,
)

labels = ['date', 'invoice_num', 'total', 'others']
id2label = {i: label for i, label in enumerate(labels)}
label2id = {label: i for i, label in enumerate(labels)}

feature_extractor = LayoutLMv3FeatureExtractor.from_pretrained("microsoft/layoutlmv3-base")
tokenizer = LayoutLMv3TokenizerFast.from_pretrained("microsoft/layoutlmv3-base", apply_ocr=False)
processor = LayoutLMv3Processor.from_pretrained("microsoft/layoutlmv3-base", apply_ocr=False)
model = LayoutLMv3ForTokenClassification.from_pretrained("checkpoint-1000")  # the trained model checkpoint

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)
# --------------------------

def unnormalize_box(bbox, width, height):
    # Boxes come back normalized to a 0-1000 scale; convert them to pixel coordinates
    return [
        width * (bbox[0] / 1000),
        height * (bbox[1] / 1000),
        width * (bbox[2] / 1000),
        height * (bbox[3] / 1000),
    ]

def prediction(im_path):
    image = Image.open(im_path)
    image = image.convert("RGB")

    # The feature extractor runs OCR (Tesseract) to get words and boxes;
    # the processor then tokenizes them without re-running OCR (apply_ocr=False)
    encoding_feature_extractor = feature_extractor(image, return_tensors="pt")
    words, boxes = encoding_feature_extractor.words, encoding_feature_extractor.boxes
    encoding = processor(image, words, boxes=boxes, return_offsets_mapping=True, return_tensors="pt", truncation=True)
    offset_mapping = encoding.pop('offset_mapping')

    # Move all tensors to the same device as the model
    for k, v in encoding.items():
        encoding[k] = v.to(device)

    # Run inference without tracking gradients
    with torch.no_grad():
        outputs = model(**encoding)

    predictions = outputs.logits.argmax(-1).squeeze().tolist()
    token_boxes = encoding.bbox.squeeze().tolist()

    inp_ids = encoding.input_ids.squeeze().tolist()
    inp_words = [tokenizer.decode(i) for i in inp_ids]

    width, height = image.size
    # A token is a subword continuation if its character offset does not start at 0
    is_subword = np.array(offset_mapping.squeeze().tolist())[:, 0] != 0

    true_predictions = [id2label[pred]
                        for idx, pred in enumerate(predictions) if not is_subword[idx]]
    true_boxes = [unnormalize_box(box, width, height) for idx, box in enumerate(
        token_boxes) if not is_subword[idx]]

    # Merge subword pieces back into whole words
    true_words = []
    for idx, word in enumerate(inp_words):
        if not is_subword[idx]:
            true_words.append(word)
        else:
            true_words[-1] = true_words[-1] + word

    # Drop the special tokens at the start and end of the sequence
    true_predictions = true_predictions[1:-1]
    true_boxes = true_boxes[1:-1]
    true_words = true_words[1:-1]

    preds = []
    l_words = []
    bboxes = []

    # Keep only tokens that were not predicted as 'others'
    for i, label in enumerate(true_predictions):
        if label != 'others':
            preds.append(true_predictions[i])
            l_words.append(true_words[i])
            bboxes.append(true_boxes[i])

    # Concatenate the words that share a label into one string per label
    d = {}
    for idx, label in enumerate(preds):
        if label not in d:
            d[label] = l_words[idx]
        else:
            d[label] = d[label] + l_words[idx]
    d = {k: v.strip() for (k, v) in d.items()}

    draw = ImageDraw.Draw(image, "RGBA")
    font = ImageFont.load_default()

    label2color = {"invoice_num": 'red', "date": 'red',
                   "total": 'red', "others": 'green'}

    #if len(fields) == 0:
    #  dict_filtered = d
    #else:
    #  dict_filtered = dict((k, d[k]) for k in fields if k in d)
    #  preds = [i for i in preds if i in fields]
    #  bboxes = [bboxes[id] for id, i in enumerate(preds) if i in fields]

    for prediction, box in zip(preds, bboxes):
        draw.rectangle(box, outline=label2color[prediction], fill=(
            255, 255, 0, int(0.4 * 255)))
        draw.text((box[0]+10, box[1]-10), text=prediction,
                  fill=label2color[prediction], font=font)

    return d, image
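
For reference, calling the function might look like this (the image path here is just a placeholder):

d, annotated_image = prediction("path/to/invoice.jpg")
print(d)  # e.g. {'date': '...', 'invoice_num': '...', 'total': '...'}
annotated_image.save("annotated_invoice.jpg")
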
Monta79 commented 6 months ago

@aditya11ad I'm getting NameError: name 'fields' is not defined. Can you help me with this error, please?

aditya11ad commented 6 months ago

Hi @Monta79, you don't need 'fields' (I updated the code above). It was only useful when you explicitly pass the desired fields to extract, e.g. fields=['invoice_num', 'total']; otherwise it's not needed.
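
For illustration, a minimal sketch of that optional filtering step (an assumption about how the commented-out lines in the code above were meant to work, reusing the d, preds, and bboxes variables):

fields = ['invoice_num', 'total']  # labels you explicitly want to extract; leave empty to keep everything

if len(fields) == 0:
    dict_filtered = d
else:
    dict_filtered = {k: d[k] for k in fields if k in d}
    bboxes = [bboxes[idx] for idx, label in enumerate(preds) if label in fields]
    preds = [label for label in preds if label in fields]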

promaprogga commented 4 months ago

Hello @aditya11ad, this is my code:

id2label = {v: k for v, k in enumerate(labels)}
label2id = {k: v for v, k in enumerate(labels)}

feature_extractor = LayoutLMv3FeatureExtractor.from_pretrained( "microsoft/layoutlmv3-base")
tokenizer = LayoutLMv3TokenizerFast.from_pretrained( "microsoft/layoutlmv3-base", apply_ocr=False)
processor = LayoutLMv3Processor.from_pretrained( "microsoft/layoutlmv3-base", apply_ocr=False)
model = LayoutLMv3ForTokenClassification.from_pretrained("/content/drive/MyDrive/SSCL/layoutlm/model") 

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)
# --------------------------

def unnormalize_box(bbox, width, height):
    return [
        width * (bbox[0] / 1000),
        height * (bbox[1] / 1000),
        width * (bbox[2] / 1000),
        height * (bbox[3] / 1000),
    ]

def prediction(im_path):
    image = Image.open(im_path)
    image = image.convert("RGB")

    encoding_feature_extractor = feature_extractor(image, return_tensors="pt")
    words, boxes = encoding_feature_extractor.words, encoding_feature_extractor.boxes
    # Assuming processor is defined and example contains the necessary data
    encoding = processor(
        image, words, boxes=boxes, return_tensors="pt",
        truncation=True, stride=128, padding="max_length", 
        max_length=512, return_overflowing_tokens=True, return_offsets_mapping=True
    )

    offset_mapping = encoding.pop('offset_mapping')
    overflow_to_sample_mapping = encoding.pop('overflow_to_sample_mapping')

    # Convert pixel_values to a PyTorch tensor
    # Convert the list of numpy.ndarrays to a single numpy.ndarray and then to a PyTorch tensor
    pixel_values_list = encoding['pixel_values']
    pixel_values_array = np.stack(pixel_values_list)  # Use np.stack() to convert list of arrays to a single array
    encoding['pixel_values'] = torch.tensor(pixel_values_array)

    print(encoding['pixel_values'].shape) 
    # change the shape of pixel values
    x = []
    for i in range(0, len(encoding['pixel_values'])):
        x.append(encoding['pixel_values'][i])
    x = torch.stack(x)
    encoding['pixel_values'] = x
    print(encoding['pixel_values'].shape) 
    # Convert non-tensor items to PyTorch tensors if necessary
    for k, v in encoding.items():
        if not isinstance(v, torch.Tensor):
            encoding[k] = torch.tensor(v)

    # Print the shapes of all items in the encoding dictionary
    for k, v in encoding.items():
        print(f"{k}: {v.shape}")

    with torch.no_grad():
        outputs = model(**encoding)

    predictions = outputs.logits.argmax(-1).squeeze().tolist()
    token_boxes = encoding.bbox.squeeze().tolist()

    inp_ids = encoding.input_ids.squeeze().tolist()
    inp_words = [tokenizer.decode(i) for i in inp_ids]

    width, height = image.size
    is_subword = np.array(offset_mapping.squeeze().tolist())[:, 0] != 0

    true_predictions = [id2label[pred]
                        for idx, pred in enumerate(predictions) if not is_subword[idx]]
    true_boxes = [unnormalize_box(box, width, height) for idx, box in enumerate(
        token_boxes) if not is_subword[idx]]

    true_words = []
    for id, i in enumerate(inp_words):
        if not is_subword[id]:
            true_words.append(i)
        else:
            true_words[-1] = true_words[-1]+i

    true_predictions = true_predictions[1:-1]
    true_boxes = true_boxes[1:-1]
    true_words = true_words[1:-1]

    preds = []
    l_words = []
    bboxes = []

    for i, j in enumerate(true_predictions):
        if j != 'others':
            preds.append(true_predictions[i])
            l_words.append(true_words[i])
            bboxes.append(true_boxes[i])

    d = {}
    for id, i in enumerate(preds):
        if i not in d.keys():
            d[i] = l_words[id]
        else:
            d[i] = d[i]+l_words[id]
    d = {k: v.strip() for (k, v) in d.items()}

    draw = ImageDraw.Draw(image, "RGBA")
    font = ImageFont.load_default()

    label2color = {
        "other": "black",
        "name_key": "red",
        "ben_name": "orange",
        "add_key": "brown",
        "ben_add": "yellow",
        "invoice_key": "blue",
        "invoice_no": "violet",
        "amount_key": "green",
        "total_amount": "pink",
    }

    for prediction, box in zip(preds, bboxes):
        draw.rectangle(box, outline=label2color[prediction], fill=(
            255, 255, 0, int(0.4 * 255)))
        draw.text((box[0]+10, box[1]-10), text=prediction,
                  fill=label2color[prediction], font=font)

    return d, image

im_path = "/content/drive/MyDrive/SSCL/test/1045_pdf-375.jpg"

# Run prediction on the image
result, annotated_image = prediction(im_path)

# Save the annotated image
annotated_image.save("/content/drive/MyDrive/SSCL/layoutlm/annotated_1045_pdf-375.jpg")

# Display the annotated image
annotated_image.show()

But I'm getting this error; could you please help me with it?

/usr/local/lib/python3.10/dist-packages/transformers/models/layoutlmv3/feature_extraction_layoutlmv3.py:30: FutureWarning: The class LayoutLMv3FeatureExtractor is deprecated and will be removed in version 5 of Transformers. Please use LayoutLMv3ImageProcessor instead.
  warnings.warn(
torch.Size([2, 3, 224, 224])
torch.Size([2, 3, 224, 224])
input_ids: torch.Size([2, 512])
attention_mask: torch.Size([2, 512])
bbox: torch.Size([2, 512, 4])
pixel_values: torch.Size([2, 3, 224, 224])
/usr/local/lib/python3.10/dist-packages/transformers/modeling_utils.py:1052: FutureWarning: The device argument is deprecated and will be removed in v5 of Transformers.
  warnings.warn(

ValueError                                Traceback (most recent call last)
in <cell line: 134>()
    132
    133 # Run prediction on the image
--> 134 result, annotated_image = prediction(im_path)
    135
    136 # Save the annotated image

1 frames
in (.0)
     71 is_subword = np.array(offset_mapping.squeeze().tolist())[:, 0] != 0
     72
---> 73 true_predictions = [id2label[pred]
     74                     for idx, pred in enumerate(predictions) if not is_subword[idx]]
     75 true_boxes = [unnormalize_box(box, width, height) for idx, box in enumerate(

ValueError: The truth value of an array with more than one element is ambiguous. Use a.any() or a.all()