facebookresearch / detectron2

Detectron2 is a platform for object detection, segmentation and other visual recognition tasks.
https://detectron2.readthedocs.io/en/latest/
Apache License 2.0
29.32k stars 7.32k forks source link

Adjusting Model Confidence Level #5237

Closed Rahul6903 closed 3 months ago

Rahul6903 commented 3 months ago

import sys sys.path.append("unilm") sys.path.append("detectron2") import cv2 import numpy as np import matplotlib.pyplot as plt from unilm.dit.object_detection.ditod import add_vit_config import torch

from detectron2.config import CfgNode as CN from detectron2.config import get_cfg from detectron2.utils.visualizer import ColorMode, Visualizer from detectron2.data import MetadataCatalog from detectron2.engine import DefaultPredictor

import gradio as gr

# Step 1: instantiate config
cfg = get_cfg()
add_vit_config(cfg)
cfg.merge_from_file("cascade_dit_base.yml")

# Step 2: add model weights URL to config
cfg.MODEL.WEIGHTS = "publaynet_dit-b_cascade.pth"

# Step 3: set device
cfg.MODEL.DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

# Step 4: define model
# NOTE(review): the predictor is built from cfg as it stands at this point,
# so a test-time confidence cut-off has to be written into cfg before this
# line. In detectron2 that setting is presumably
# cfg.MODEL.ROI_HEADS.SCORE_THRESH_TEST -- confirm against the config
# reference for the model in cascade_dit_base.yml.
predictor = DefaultPredictor(cfg)

import json import pdfplumber

def extract_text_from_pdf_with_coordinates(pdf_path, page_number, x_min, y_min, x_max, y_max):
    """Return the text inside a rectangle on one page of a PDF.

    Args:
        pdf_path: Path of the PDF, passed to ``pdfplumber.open``.
        page_number: 1-based page number.
        x_min: Left edge of the rectangle.
        y_min: Top edge of the rectangle.
        x_max: Right edge of the rectangle.
        y_max: Bottom edge of the rectangle.

    Returns:
        The extracted text with surrounding whitespace stripped, followed by
        a single newline. If nothing is extracted, only the newline.

    Raises:
        ValueError: If ``page_number`` is smaller than 1, or the rectangle
            falls outside ``0..page.width`` / ``0..page.height``.
        IndexError: If ``page_number`` is past the last page.
    """
    # Without this guard, 0 or a negative number would silently index pages
    # from the end of the document instead of failing.
    if page_number < 1:
        raise ValueError("page_number must be 1 or greater")

    with pdfplumber.open(pdf_path) as pdf:
        page = pdf.pages[page_number - 1]  # Adjust for 0-based indexing

        # Check if the coordinates fall within the page bounds
        if x_min < 0 or y_min < 0 or x_max > page.width or y_max > page.height:
            raise ValueError("Coordinates are outside the page bounds")

        # Extract text from the specified coordinates. Treat a None result
        # (no text in the region, on pdfplumber versions that report it that
        # way) as empty text rather than crashing on .strip().
        text = page.crop((x_min, y_min, x_max, y_max)).extract_text() or ""

    return text.strip() + "\n"

# PDF file and 1-based page number whose text analyze_image looks up for each
# detected box.
pdf_path = "/Users/infx012941/Desktop/AI/pdf/AIM_chest-imaging.pdf"
page_number = 6

def analyze_image(img, score_threshold=0.0):
    """Detect layout regions in an image, save them as JSON and plot them.

    Runs ``predictor`` on ``img``, looks up the PDF text under every detected
    box via ``extract_text_from_pdf_with_coordinates`` (using the module-level
    ``pdf_path`` and ``page_number``), writes the boxes to
    ``pdf_output.json`` and shows them in a matplotlib figure.

    Args:
        img: Image array passed to ``predictor``; it is indexed as
            ``img[:, :, ::-1]``, so it must have at least three axes.
        score_threshold: Detections with a score below this value are
            discarded before drawing and export. The default of 0.0 applies
            no filtering, i.e. everything the predictor returns is kept.

    Returns:
        The image produced by the detectron2 ``Visualizer`` with its last
        axis reversed.

    Raises:
        ValueError: Propagated from the text extraction when a box lies
            outside the PDF page bounds.
    """
    dataset_name = cfg.DATASETS.TEST[0]
    md = MetadataCatalog.get(dataset_name)
    if dataset_name == 'icdar2019_test':
        md.set(thing_classes=["table"])
    else:
        md.set(thing_classes=["text", "title", "list", "table", "figure"])

    # Move the predictions to the CPU once; both the visualizer and the
    # export below read them from there.
    instances = predictor(img)["instances"].to("cpu")
    if score_threshold > 0:
        # Keep only sufficiently confident detections.
        instances = instances[instances.scores >= score_threshold]

    v = Visualizer(img[:, :, ::-1],
                   md,
                   scale=1.0,
                   instance_mode=ColorMode.SEGMENTATION)
    result = v.draw_instance_predictions(instances)
    result_image = result.get_image()[:, :, ::-1]

    # .tolist() converts the tensors to plain Python floats / ints so they
    # can be JSON-serialised and used as list indices.
    boxes = instances.pred_boxes.tensor.tolist()
    class_ids = instances.pred_classes.tolist()

    # NOTE(review): the boxes are in the coordinate system of img (they are
    # drawn directly on top of it below) and are handed to the PDF lookup
    # unchanged. That only lines up if img is a render of that page at one
    # pixel per PDF unit -- TODO confirm; otherwise the boxes need rescaling
    # to the page size before extracting text.
    bbox_data = []
    for (x_min, y_min, x_max, y_max), class_id in zip(boxes, class_ids):
        bbox_data.append({
            "class_name": md.thing_classes[class_id],
            "coordinates": {
                "x_min": x_min,
                "y_min": y_min,
                "x_max": x_max,
                "y_max": y_max
            },
            "contents": extract_text_from_pdf_with_coordinates(pdf_path, page_number, x_min, y_min, x_max, y_max)
        })

    # Convert bounding box data to JSON
    json_data = json.dumps(bbox_data, indent=4)
    file_path = "pdf_output.json"
    with open(file_path, "w") as json_file:
        json_file.write(json_data)

    # Plot the image
    plt.imshow(cv2.cvtColor(img, cv2.COLOR_BGR2RGB))

    # Plot each bounding box
    for bbox in bbox_data:
        coordinates = bbox["coordinates"]
        class_label = bbox["class_name"]
        x_min, y_min, x_max, y_max = coordinates["x_min"], coordinates["y_min"], coordinates["x_max"], coordinates["y_max"]
        plt.text(x_min, y_min - 2, class_label, fontsize=8, color='r')
        rect = plt.Rectangle((x_min, y_min), x_max - x_min, y_max - y_min, linewidth=1, edgecolor='r', facecolor='none')
        plt.gca().add_patch(rect)

    plt.axis('off')  # Turn off axis
    plt.show()
    # Close the plot to prevent the warning
    plt.close()
    return result_image

title = "Interactive demo: Document Layout Analysis with DiT"

description = "Demo for Microsoft's DiT, the Document Image Transformer for state-of-the-art document understanding tasks. This particular model is fine-tuned on PubLayNet, a large dataset for document layout analysis (read more at the links below). To use it, simply upload an image or use the example image below and click 'Submit'. Results will show up in a few seconds. If you want to make the output bigger, right-click on it and select 'Open image in new tab'."

# NOTE(review): only the visible link text is present here; any link markup
# this string originally carried appears to have been lost -- restore it from
# the original script if needed. Triple quotes keep the multi-line text a
# valid string literal.
article = """

Paper | Github Repo

| HuggingFace doc

"""

examples = [['publaynet_example.jpeg']]

css = ".output-image, .input-image, .image-preview {height: 600px !important}"

# description, article and examples are defined above but are not passed to
# the interface; only title and css are.
iface = gr.Interface(
    fn=analyze_image,
    inputs=gr.Image(type="numpy", label="document image"),
    outputs=gr.Image(type="numpy", label="annotated document"),
    title=title,
    css=css,
)
obj = iface.launch(debug=True)

Is it possible to adjust the model's confidence level? I am unable to obtain the complete set of labeled coordinates along with the corresponding text.

github-actions[bot] commented 3 months ago

You've chosen to report an unexpected problem or bug. Unless you already know the root cause of it, please include details about it by filling the issue template. The following information is missing: "Instructions To Reproduce the Issue and Full Logs"; "Your Environment";

github-actions[bot] commented 3 months ago

Requested information was not provided in 7 days, so we're closing this issue.

Please open new issue if information becomes available. Otherwise, use github discussions for free-form discussions.