atlas-nano / atlas-nano.github.io

Atlas group website for cmp.ucsd.edu based on Hugo/Wowchemy
https://hugoblox.com/templates/
MIT License
1 stars 9 forks source link

Need figures for publications pages #15

Open rramji opened 4 months ago

rramji commented 4 months ago

Publications pages should include the figures from the papers. See the script below:

import fitz  # PyMuPDF
import os

def _block_text(block):
    """Return the text of a block with every whitespace run collapsed to one space.

    Blocks without a "lines" entry (e.g. image blocks) yield an empty string.
    """
    pieces = (
        span.get("text", "")
        for line in block.get("lines", ())
        for span in line.get("spans", ())
    )
    return " ".join(" ".join(pieces).split())


def _find_caption(text_blocks, img_rect):
    """Return ``(caption, label)`` for the nearest "Figure" text block below the image.

    A block qualifies when it starts below the bottom edge of ``img_rect``,
    overlaps it horizontally, and its text contains "Figure". ``label`` is the
    caption text up to the first period (e.g. "Figure 1"). Both values are
    empty strings when no block qualifies.
    """
    caption = ""
    label = ""
    min_dist = float("inf")

    for block in text_blocks:
        # Image blocks also carry a bbox but have no text lines, so they can
        # never be a caption (and indexing block["lines"] would raise).
        if "bbox" not in block or "lines" not in block:
            continue

        x0, y0, x1, _ = block["bbox"]
        if y0 <= img_rect.y1:  # Not below the image
            continue
        if min(x1, img_rect.x1) - max(x0, img_rect.x0) <= 0:  # No horizontal overlap
            continue

        distance = y0 - img_rect.y1
        if distance >= min_dist:
            continue

        candidate = _block_text(block)
        if "Figure" not in candidate:
            continue

        # Caption and label are always updated together so the file name and
        # the caption text come from the same block.
        min_dist = distance
        caption = candidate
        label = candidate.split(".")[0].strip()

    return caption, label


def _safe_stem(label, max_length=80):
    """Convert a caption label into a file-name stem safe on common filesystems.

    Every character that is not alphanumeric, "-" or "_" becomes "_", and the
    result is truncated to ``max_length`` characters.
    """
    cleaned = "".join(ch if ch.isalnum() or ch in "-_" else "_" for ch in label.strip())
    return cleaned[:max_length].strip("_")


def _unique_stem(stem, used_stems):
    """Return ``stem``, or ``stem_2``, ``stem_3``, ... if it was already used in this run.

    Comparison is case-insensitive so the result is also unique on
    case-insensitive filesystems. The chosen stem is recorded in ``used_stems``.
    """
    candidate = stem
    counter = 2
    while candidate.casefold() in used_stems:
        candidate = f"{stem}_{counter}"
        counter += 1
    used_stems.add(candidate.casefold())
    return candidate


def extract_images_and_captions(pdf_path, output_directory):
    """Extract the images of a PDF and save each one next to its caption text.

    For every image placed on a page, the nearest text block below it that
    contains "Figure" is taken as the caption. Each image is written to
    ``output_directory`` together with a ``<stem>_caption.txt`` file:

    * captioned images are named after the caption label, e.g. ``Figure_1.png``;
    * the first image without a caption is named ``TOC_graphic.<ext>``;
    * further images without a caption are named ``unlabeled.<ext>``.

    When a name was already used during this run, a numeric suffix (``_2``,
    ``_3``, ...) is appended instead of overwriting the earlier file.

    Args:
        pdf_path: Path of the PDF file to read.
        output_directory: Directory for the extracted files; created if missing.

    Returns:
        None. Results are written to disk and a line is printed per image.
    """
    toc_graphic_assigned = False  # The first uncaptioned image is treated as the TOC graphic
    used_stems = set()  # Stems written so far, to avoid overwriting files

    # The context manager closes the document even if extraction fails midway.
    with fitz.open(pdf_path) as doc:
        os.makedirs(output_directory, exist_ok=True)

        for page_number, page in enumerate(doc):
            image_list = page.get_images(full=True)  # Image references
            if not image_list:
                continue
            text_blocks = page.get_text("dict")["blocks"]

            for img_index, img_info in enumerate(image_list):
                xref = img_info[0]
                img_rects = page.get_image_rects(xref)
                if not img_rects:
                    continue
                img_rect = img_rects[0]  # Assuming first rectangle is primary if multiple

                base_image = doc.extract_image(xref)
                image_bytes = base_image["image"]
                image_ext = base_image["ext"]

                caption, label = _find_caption(text_blocks, img_rect)

                if caption:
                    stem = _safe_stem(label) or f"image{page_number + 1}_{img_index + 1}"
                elif not toc_graphic_assigned:
                    stem = "TOC_graphic"
                    toc_graphic_assigned = True
                else:
                    stem = "unlabeled"

                stem = _unique_stem(stem, used_stems)
                image_filename = f"{stem}.{image_ext}"
                image_path = os.path.join(output_directory, image_filename)
                caption_path = os.path.join(output_directory, stem + "_caption.txt")

                with open(image_path, "wb") as img_file:
                    img_file.write(image_bytes)
                # Captions may contain non-ASCII characters; do not depend on
                # the platform's default encoding.
                with open(caption_path, "w", encoding="utf-8") as caption_file:
                    caption_file.write(caption)

                print(f"Saved {image_filename} with caption: {caption} at {image_path}")

# Example usage: pdf_path points at a local copy of the paper and
# output_directory is where the images and caption files are written.
pdf_path = '/home/robert/Downloads/xiang-et-al-2014-molecular-dynamics-simulations-of-polyamide-membrane-calcium-alginate-gel-and-their-interactions-in.pdf'
output_directory = './ExtractedImagesAndCaptions'

# Run the extraction only when executed as a script, so importing this
# module does not read the PDF or write files as a side effect.
if __name__ == "__main__":
    extract_images_and_captions(pdf_path, output_directory)
dowhep commented 4 months ago

I have added the generated figures to each publication's folder. I will work on how to display the figures.