Improvement idea - Githubissues

mh-put-00 commented 1 year ago

Hi!

I love your program.

I'm new to programming, but I modified "main.py" so that it takes a folder of images as input instead of a single image, and writes the highlighted text to an .md file.

My next idea is to modify the code so that it can accept several highlight colours at once, and also to use a GPT API to organise the text into proper notes. Here's the code for the "main.py" I used. Sorry if this is the wrong way to do this - still learning :)

import cv2
import numpy as np
from collections import namedtuple
from pathlib import Path
import pytesseract
from pytesseract import Output
import re
import os
from natsort import natsorted
import argparse

# Axis-aligned rectangle described by its minimum and maximum x/y coordinates.
# NOTE(review): not referenced anywhere in the visible code - confirm whether
# it is still needed.
Rectangle = namedtuple('Rectangle', ['xmin', 'ymin', 'xmax', 'ymax'])

class Levels:
    """Numeric codes found in the ``level`` column of the OCR data.

    Each entry returned by the OCR step carries one of these values, from
    the coarsest element (a page, 1) down to the finest (a single word, 5).
    """

    PAGE, BLOCK, PARAGRAPH, LINE, WORD = range(1, 6)

def image_to_data(img_src):
    """Run Tesseract OCR on an image and return the detailed result as a dict.

    The call is fixed to the English language model and page segmentation
    mode 6. ``Output.DICT`` asks pytesseract for a dictionary of parallel
    lists (``text``, ``level``, ``left``, ``top``, ``width``, ``height``, ...)
    rather than a raw TSV string.
    """
    ocr_options = {
        'lang': 'eng',
        'config': '--psm 6',
        'output_type': Output.DICT,
    }
    return pytesseract.image_to_data(img_src, **ocr_options)

def normalize_images(images):
    """Return the images with every 2-dimensional one expanded to 3 channels.

    Grayscale (2-D) images are converted with ``cv2.COLOR_GRAY2BGR``; images
    that already have more than two dimensions are passed through unchanged.
    """
    normalized = []
    for img in images:
        if img.ndim == 2:
            # Single-channel image: replicate it into a BGR image.
            img = cv2.cvtColor(img, cv2.COLOR_GRAY2BGR)
        normalized.append(img)
    return normalized

def threshold_image(img_src):
    """Convert a BGR image to grayscale and binarise it with Otsu's method."""
    gray = cv2.cvtColor(img_src, cv2.COLOR_BGR2GRAY)

    # cv2.threshold returns (threshold_used, image); only the image is needed.
    otsu_flags = cv2.THRESH_BINARY + cv2.THRESH_OTSU
    _, binary = cv2.threshold(gray, 0, 255, otsu_flags)
    return binary

def mask_image(img_src, lower, upper):
    """Build a binary mask of the pixels whose HSV value lies in a range.

    The image is converted from BGR to HSV, then every pixel that falls
    between ``lower`` and ``upper`` (each a 3-element HSV sequence) is marked
    in the returned mask.
    """
    hsv = cv2.cvtColor(img_src, cv2.COLOR_BGR2HSV)

    # Both bounds are converted to uint8 arrays before the range test.
    low_bound, high_bound = (np.array(bound, np.uint8) for bound in (lower, upper))

    return cv2.inRange(hsv, low_bound, high_bound)

def apply_mask(img_src, img_mask):
    """Return ``img_src`` AND-ed with itself, restricted by ``img_mask``."""
    # AND-ing the image with itself leaves pixel values unchanged; the mask
    # argument decides which pixels are written to the result.
    return cv2.bitwise_and(img_src, img_src, mask=img_mask)

def denoise_image(img_src):
    """Remove small specks from an image with one morphological opening.

    The opening uses a 5x5 rectangular structuring element.
    """
    open_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (5, 5))
    return cv2.morphologyEx(img_src, cv2.MORPH_OPEN, open_kernel, iterations=1)

def find_highlighted_words(img_mask, data_ocr, threshold_percentage=25):
    """Flag every OCR entry whose bounding box is sufficiently covered by the mask.

    A new ``'highlighted'`` list, parallel to ``data_ocr['text']``, is stored
    in ``data_ocr`` (the dict is modified in place and also returned). An
    entry is flagged when the number of non-zero mask pixels inside its box
    exceeds ``threshold_percentage`` percent of the box area.
    """
    flags = [False] * len(data_ocr['text'])
    data_ocr['highlighted'] = flags

    for idx, _ in enumerate(data_ocr['text']):
        left = data_ocr['left'][idx]
        top = data_ocr['top'][idx]
        width = data_ocr['width'][idx]
        height = data_ocr['height'][idx]

        # Minimum number of mask pixels required inside this box.
        min_pixels = (width * height * threshold_percentage) / 100
        box = img_mask[top:top + height, left:left + width]

        if cv2.countNonZero(box) > min_pixels:
            flags[idx] = True

    return data_ocr

def words_to_string(data_ocr):
    """Join the highlighted words of the OCR data into a single string.

    Page, block, paragraph and line entries become line breaks; word entries
    are emitted (followed by a space) only when they are non-empty and
    flagged in ``data_ocr['highlighted']``. Runs of consecutive line breaks
    are collapsed to one and the result is stripped of surrounding
    whitespace.
    """
    structural_levels = (
        Levels.PAGE, Levels.BLOCK, Levels.PARAGRAPH, Levels.LINE)
    pieces = []

    for idx, raw_text in enumerate(data_ocr['text']):
        if data_ocr['level'][idx] in structural_levels:
            pieces.append("\n")
        else:
            word = raw_text.strip()
            if word and data_ocr['highlighted'][idx]:
                pieces.append(f"{word} ")

    collapsed = re.sub(r'\n+', '\n', "".join(pieces))
    return collapsed.strip()

def main(args):
    """Extract the highlighted text of every image in a folder into one .md file.

    Args:
        args: Parsed command-line arguments with two attributes:
            ``img_dir`` (directory containing the input images) and
            ``output_dir`` (directory that receives ``highlighted_text.md``).

    Images are processed in natural sort order. Files that cannot be loaded
    are reported and skipped. The output directory is created if it does not
    exist yet.
    """
    img_dir = str(args.img_dir)

    # Compare lower-cased names so files such as "scan.JPG" are not skipped.
    image_suffixes = ('.png', '.jpg', '.jpeg')
    img_files = natsorted(
        [f for f in os.listdir(img_dir) if f.lower().endswith(image_suffixes)])

    # HSV bounds of the highlight colour to detect (the same for every image).
    # NOTE(review): presumably a yellow highlighter - confirm against samples.
    hsv_lower = [22, 30, 30]
    hsv_upper = [45, 255, 255]

    page_texts = []  # highlighted text of each successfully processed image

    for img_file in img_files:
        img_path = os.path.join(img_dir, img_file)
        img_orig = cv2.imread(img_path)

        # cv2.imread signals an unreadable file by returning None.
        if img_orig is None:
            print(f"Error: Unable to load image from {img_path}")
            continue

        # OCR runs on the binarised image; the colour mask is built from the
        # original, since thresholding discards the colour information.
        img_thresh = threshold_image(img_orig)
        data_ocr = image_to_data(img_thresh)
        img_mask = mask_image(img_orig, hsv_lower, hsv_upper)
        img_mask_denoised = denoise_image(img_mask)
        data_ocr = find_highlighted_words(
            img_mask_denoised, data_ocr, threshold_percentage=25)
        page_texts.append(words_to_string(data_ocr))

    # Each image's text is followed by a blank line in the output.
    highlighted_text = "".join(f"{text}\n\n" for text in page_texts)

    # Make sure the target directory exists before writing into it. An empty
    # path means the current directory, which needs no creation.
    if args.output_dir:
        os.makedirs(args.output_dir, exist_ok=True)

    # Save the highlighted text to a single .md file
    output_file_path = os.path.join(args.output_dir, "highlighted_text.md")
    with open(output_file_path, "w", encoding="utf-8") as output_file:
        output_file.write(highlighted_text)

    print("Highlighted text saved to:", output_file_path)

if __name__ == "__main__":
    # Command-line entry point: parse the two directory arguments, make sure
    # the output directory exists, then run the extraction.
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument('img_dir', type=str, help="Input directory containing images")
    parser.add_argument('output_dir', type=str, help="Output directory for the .md file")
    args = parser.parse_args()

    # The directory must exist before main() runs, because main() writes the
    # .md file into it.
    output_dir_path = Path(args.output_dir)
    output_dir_path.mkdir(parents=True, exist_ok=True)

    main(args)

mh-put-00 commented 1 year ago

Let me know what you think of this and if you think there's a better way to do this.

zirkelc commented 1 year ago

Hi @mh-put-00, I'm glad to hear that you like it! Thank you very much for your suggestions. It looks good; I think I would have done it the same way.

I see this is your first issue on GitHub - so a very warm welcome from my side! Would you like to submit your changes as an official Pull Request? I would of course help you out if you have any questions!

Let me know what you think!

zirkelc / pyhighlight-ocr

Improvement idea #1