kubeagi / core-library

Core library for kubeagi, providing APIs and an SDK in Python
Apache License 2.0
3 stars 3 forks source link

Extract the table from pdf, and then generate the QA list #42

Open ggservice007 opened 8 months ago

ggservice007 commented 8 months ago

how to

1. Extract the table from the PDF. 2. Save the table content in JSON format. 3. Save the table content in Markdown format.

implement

pdfplumber

detail #43

tabula-py

detail #44

table-transformer

detail in comment

combine

detail in comment

ggservice007 commented 8 months ago

table-transformer

what

Use table-transformer to extract tables from the PDF.

github

https://github.com/microsoft/table-transformer

pypi

https://pypi.org/project/table-transformer/

result

Cannot install this library.

pip install table-transformer==1.0.3
ERROR: Could not find a version that satisfies the requirement onnxruntime~=1.14.1 (from table-transformer) (from versions: 1.15.0, 1.15.1, 1.16.0, 1.16.1, 1.16.2, 1.16.3, 1.17.0, 1.17.1)

ERROR: No matching distribution found for onnxruntime~=1.14.1
ggservice007 commented 8 months ago

combine

reference

https://mp.weixin.qq.com/s/4mg59Sb7TzaoXVctEMJVWw

source code

def get_table_by_combine(pdf_path='财务报销管理细则-V1.00-202201.pdf'):
    """Print the first table of each page of a PDF as pipe-delimited rows.

    pdfminer supplies the layout elements of every page and pdfplumber
    locates and extracts the tables. For each page, a table is printed only
    when the pdfminer layout contains at least one ``LTRect`` element and
    pdfplumber finds at least one table on that page.

    NOTE(review): only the first table of a page (index 0) is ever
    extracted; later tables on the same page are ignored. This matches the
    behaviour of the original script, which never advanced its table index.

    Args:
        pdf_path: Path of the PDF file to scan. Defaults to the sample
            document the script was originally hard-coded to read.
    """
    # Layout analysis of each page (used to detect rectangle elements).
    from pdfminer.high_level import extract_pages
    from pdfminer.layout import LTRect
    # Table detection and text extraction.
    import pdfplumber

    print("=" * 18 +  "使用combine来提取表格" + "=" * 18)

    def table_converter(table):
        """Render a table (list of rows of cells) as '|'-delimited lines.

        ``None`` cells become the literal text 'None' and line breaks inside
        a cell are replaced by spaces so that each row stays on one line.
        Rows are joined with '\\n' and there is no trailing newline.
        """
        rendered_rows = []
        for row in table:
            cleaned_row = [
                'None' if item is None else item.replace('\n', ' ')
                for item in row
            ]
            rendered_rows.append('|' + '|'.join(cleaned_row) + '|')
        return '\n'.join(rendered_rows)

    # Open the document once and guarantee it is closed, even on errors.
    with pdfplumber.open(pdf_path) as pdf:
        for pagenum, page in enumerate(extract_pages(pdf_path)):
            print(f"第{pagenum + 1}页")

            # pdfplumber's view of the same page, used for table handling.
            page_tables = pdf.pages[pagenum]
            tables = page_tables.find_tables()

            # A table is reported only if the layout has a rectangle element.
            has_rect = any(isinstance(element, LTRect) for element in page)

            if tables and has_rect:
                # Reuse the already-open page instead of re-opening the PDF.
                table = page_tables.extract_tables()[0]
                table_string = table_converter(table)
                print('table_string:\n', table_string)

            print('\n\n')

# Run the table-extraction demo only when this file is executed as a script.
if __name__ == '__main__':
    get_table_by_combine()

result

|变更 序号|变更内容||变更|||变更||更改人|批准人|生效日期|备 注|
|None|None|None|前版|None|None|后版|None|None|None|None|None|
|None|None|None|本号|None|None|本号|None|None|None|None|None|
|1|新建|-|None|None|1.00|None|None|冯雪妮|黄启功|||
|2|||None|None||None|None|||||

image