kubeagi / arcadia

A diverse, simple, and secure all-in-one LLMOps platform
http://www.kubeagi.com/
Apache License 2.0
82 stars 23 forks source link

Extract the images from pdf, and then generate the QA list #609

Open ggservice007 opened 9 months ago

ggservice007 commented 9 months ago

what

Extract the images from pdf, and then generate the QA list

implement

direct

detail

bjwswang commented 9 months ago

This is an optimization of the current PDF QA generation.

Use case

When a user asks a question against a knowledge base, our chat server can respond with extra images that were extracted from the PDF and additionally indexed in our pg.

bjwswang commented 9 months ago

@ggservice007 @wangxinbiao Please provide more details on how you are gonna implement this.

ggservice007 commented 7 months ago

Use Qwen VL to recognize the content in the image. If it's in English, provide the translated Chinese content.

@bjwswang

ggservice007 commented 7 months ago

direct

Use a PDF library to extract the images directly.

reference

https://mp.weixin.qq.com/s/4mg59Sb7TzaoXVctEMJVWw

source code

def get_image_direct(pdf_path='aa.pdf', output_dir='./images', min_size=64):
    """
    Extract every sufficiently large figure of a PDF and save it as a PNG.

    Each ``LTFigure`` that pdfminer reports on a page, and whose width and
    height are both at least ``min_size``, is cropped out of the page into a
    one-page PDF (``<output_dir>/pdf/pdf_<page>_<n>.pdf``) and then rendered
    to ``<output_dir>/img/img_<page>_<n>.png`` with pdf2image.

    pdf2image needs the poppler command line tools:
        apt install poppler-utils

    Args:
        pdf_path: Path of the PDF file to read.
        output_dir: Root directory for the results; the ``pdf`` and ``img``
            sub-directories are created when missing.
        min_size: Minimum width and height (in pdfminer layout units) a
            figure must have to be exported.

    Returns:
        None. Progress is printed to stdout and results are written to disk.
    """
    import os

    # Reads the PDF pages and writes the cropped single-page PDFs.
    import pypdf
    # Rasterises a PDF file into PIL images (backed by poppler).
    from pdf2image import convert_from_path
    # Analyses the page layout so figures can be located.
    from pdfminer.high_level import extract_pages
    from pdfminer.layout import LTFigure

    print("get image direct")

    def crop_image(element, pageObj, file_name):
        """Crop ``pageObj`` to the bounding box of ``element`` and save it as a PDF."""
        # pdfminer reports (x0, y0) as the lower-left and (x1, y1) as the
        # upper-right corner of the element. This mutates the media box of
        # pageObj in place; every call sets both corners, so repeated crops
        # of the same page do not depend on each other.
        pageObj.mediabox.lower_left = (element.x0, element.y0)
        pageObj.mediabox.upper_right = (element.x1, element.y1)

        # Save the cropped page as a new single-page PDF.
        cropped_pdf_writer = pypdf.PdfWriter()
        cropped_pdf_writer.add_page(pageObj)
        with open(file_name, 'wb') as cropped_pdf_file:
            cropped_pdf_writer.write(cropped_pdf_file)

    def convert_to_images(input_file, output_file):
        """Render the first page of ``input_file`` to ``output_file`` as PNG."""
        images = convert_from_path(input_file)
        images[0].save(output_file, "PNG")

    # The target directories must exist before anything is written to them.
    cropped_dir = os.path.join(output_dir, 'pdf')
    image_dir = os.path.join(output_dir, 'img')
    os.makedirs(cropped_dir, exist_ok=True)
    os.makedirs(image_dir, exist_ok=True)

    # Keep the file open for the whole loop (the reader is handed the open
    # stream); the context manager closes it even when a page fails.
    with open(pdf_path, 'rb') as pdf_file:
        pdf_reader = pypdf.PdfReader(pdf_file)

        for pagenum, page in enumerate(extract_pages(pdf_path)):
            pageObj = pdf_reader.pages[pagenum]

            # Visit the figures of the page from top to bottom.
            figures = sorted(
                (element for element in page if isinstance(element, LTFigure)),
                key=lambda element: element.y1,
                reverse=True,
            )

            image_index = 0
            for element in figures:
                width = element.x1 - element.x0
                height = element.y1 - element.y0
                # Skip small figures (icons, decorations, ...).
                if width < min_size or height < min_size:
                    continue

                print(f"{width} x {height}")
                print(f"第{pagenum + 1}页: 第{image_index + 1}个图片")

                # Crop the figure out of the page ...
                file_name = os.path.join(
                    cropped_dir, f"pdf_{pagenum + 1}_{image_index + 1}.pdf")
                crop_image(element, pageObj, file_name)

                # ... and convert the cropped PDF into an image.
                output_file = os.path.join(
                    image_dir, f"img_{pagenum + 1}_{image_index + 1}.png")
                convert_to_images(file_name, output_file)
                image_index += 1

            # image_index already equals the number of exported figures.
            print(f"第{pagenum + 1}页: 包含{image_index}个图片")

# Run the extraction only when this file is executed as a script.
if __name__ == '__main__':
    get_image_direct()

result

image

However, it works well in WPS.

resolve it

Use the following libraries:

pypdf==4.1.0
Pillow==10.3.0

use the simple code

def extract_image_002(pdf_path='pdf/aa.pdf', page_index=2, output_dir='img'):
    """
    Save every image embedded in one page of a PDF to ``output_dir``.

    The images are taken from ``page.images`` of pypdf and written
    unchanged; each file is named ``<running number><image name>``.

    Args:
        pdf_path: Path of the PDF file to read.
        page_index: Zero-based index of the page whose images are exported.
        output_dir: Directory the image files are written to; it is created
            when missing.

    Returns:
        None. The images are written to disk.
    """
    import os

    from pypdf import PdfReader

    print("extract image 002")

    reader = PdfReader(pdf_path)
    page = reader.pages[page_index]

    # Writing would fail with FileNotFoundError if the directory were missing.
    os.makedirs(output_dir, exist_ok=True)

    for count, image_file_object in enumerate(page.images):
        # The running number keeps names unique within the page.
        target = os.path.join(output_dir, str(count) + image_file_object.name)
        with open(target, "wb") as fp:
            fp.write(image_file_object.data)

image is ok