Open ggservice007 opened 8 months ago
Use pdfplumber to extract tables from a PDF.
https://github.com/microsoft/table-transformer
https://pypi.org/project/table-transformer/
Cannot install this library.
pip install table-transformer==1.0.3
ERROR: Could not find a version that satisfies the requirement onnxruntime~=1.14.1 (from table-transformer) (from versions: 1.15.0, 1.15.1, 1.16.0, 1.16.1, 1.16.2, 1.16.3, 1.17.0, 1.17.1)
ERROR: No matching distribution found for onnxruntime~=1.14.1
https://mp.weixin.qq.com/s/4mg59Sb7TzaoXVctEMJVWw
def extract_table(pdf_path, page_num, table_num):
    """Return one table from one page of a PDF.

    Args:
        pdf_path: Path of the PDF file to read.
        page_num: Zero-based index into ``pdf.pages``.
        table_num: Zero-based index into the page's ``extract_tables()``
            result.

    Returns:
        The selected entry of ``page.extract_tables()``.

    Raises:
        IndexError: If ``page_num`` or ``table_num`` is out of range.
    """
    # Imported lazily so this module can be imported without pdfplumber.
    import pdfplumber

    # The context manager guarantees the PDF handle is released; the file is
    # reopened on every call, so leaving it open would leak one handle per
    # extracted table.
    with pdfplumber.open(pdf_path) as pdf:
        return pdf.pages[page_num].extract_tables()[table_num]


def table_converter(table):
    """Render a table as pipe-delimited text, one line per row.

    Every row becomes ``|cell|cell|...|``. ``None`` cells are written as the
    literal text ``None``, and line breaks inside a cell are replaced by a
    space so that each row stays on a single output line.

    Args:
        table: Iterable of rows, each an iterable of cells. Cells are
            normally ``str`` or ``None``; any other value is converted with
            ``str()``.

    Returns:
        The rendered rows joined by newlines, without a trailing newline.
        An empty table yields an empty string.
    """
    lines = []
    for row in table:
        cells = [
            'None' if cell is None else str(cell).replace('\n', ' ')
            for cell in row
        ]
        lines.append('|' + '|'.join(cells) + '|')
    return '\n'.join(lines)


def get_table_by_combine(pdf_path='财务报销管理细则-V1.00-202201.pdf'):
    """Print the first table of each PDF page that contains a rectangle.

    pdfminer supplies the page layout (used to detect ``LTRect`` elements)
    and pdfplumber locates and extracts the tables. For every page a header
    line is printed; when the page has at least one ``LTRect`` element and
    pdfplumber finds at least one table, the first table is printed in the
    format produced by ``table_converter``.

    Args:
        pdf_path: Path of the PDF to process. Defaults to the document the
            script was originally written for.
    """
    # Layout analysis of the PDF.
    from pdfminer.high_level import extract_pages
    from pdfminer.layout import LTRect
    # Table detection and extraction.
    import pdfplumber

    print("=" * 18 + "使用combine来提取表格" + "=" * 18)

    # Keep a single pdfplumber handle for the whole run and close it reliably,
    # even if a page fails to parse.
    with pdfplumber.open(pdf_path) as pdf:
        for pagenum, page in enumerate(extract_pages(pdf_path)):
            print(f"第{pagenum + 1}页")

            # Tables pdfplumber detects on this page.
            tables = pdf.pages[pagenum].find_tables()
            # A rectangle in the layout is used as the signal that a table
            # is drawn on the page.
            has_rect = any(isinstance(element, LTRect) for element in page)

            # NOTE(review): only table index 0 is ever extracted, so a page
            # with several tables prints just the first one -- confirm
            # whether the remaining tables should be handled too.
            if tables and has_rect:
                table = extract_table(pdf_path, pagenum, 0)
                # Convert the table into its structured string form.
                table_string = table_converter(table)
                print('table_string:\n', table_string)

            # NOTE(review): the pasted source lost its indentation; this
            # separator is assumed to be printed once per page -- confirm.
            print('\n\n')
# Run the extraction only when this file is executed as a script.
if __name__ == '__main__':
    get_table_by_combine()
|变更 序号|变更内容||变更|||变更||更改人|批准人|生效日期|备 注|
|None|None|None|前版|None|None|后版|None|None|None|None|None|
|None|None|None|本号|None|None|本号|None|None|None|None|None|
|1|新建|-|None|None|1.00|None|None|冯雪妮|黄启功|||
|2|||None|None||None|None|||||
how to
1. Extract the table from the PDF. 2. Save the table content in JSON format. 3. Save the table content in Markdown format.
implement
pdfplumber
detail #43
tabula-py
detail #44
table-transformer
detail in comment
combine
detail in comment