opendatalab / MinerU

A high-quality tool for converting PDF to Markdown and JSON.一站式开源高质量数据提取工具,将PDF转换成Markdown和JSON格式。
https://opendatalab.com/OpenSourceTools?tool=extract
GNU Affero General Public License v3.0
18.52k stars 1.32k forks source link

如何只提取pdf中的table,不需要文本也不需要图片,只需要表格呀,我按照默认的模式提取会直接提取所有的内容 #1103

Open guozhetao opened 2 hours ago

guozhetao commented 2 hours ago

image

myhloli commented 2 hours ago

用命令行跑,有个content_list.json,过滤里面的table元素就可以了

guozhetao commented 2 hours ago

大佬,我没有搜到content_list.json这个文件,我现在在用代码批量处理,之前一直没用到表格,现在不知道怎么单独提取 import os os.environ['CUDA_VISIBLE_DEVICES'] = '4' import json import time from loguru import logger from tqdm import tqdm

from magic_pdf.pipe.UNIPipe import UNIPipe from magic_pdf.pipe.OCRPipe import OCRPipe from magic_pdf.pipe.TXTPipe import TXTPipe from magic_pdf.rw.DiskReaderWriter import DiskReaderWriter import magic_pdf.model as model_config

# Enable MinerU's built-in analysis model. NOTE: the attribute is the
# dunder `__use_inside_model__` (checked later by this script); the
# underscores were stripped by markdown italics in the original paste,
# which would silently set an unrelated, unused attribute.
model_config.__use_inside_model__ = True

def json_md_dump(
    md_writer,
    pdf_name,
    md_content,
):
    """Persist the extracted markdown as ``<pdf_name>.md``.

    Args:
        md_writer: writer object exposing a ``path`` attribute and a
            ``write(content=..., path=...)`` method (DiskReaderWriter).
        pdf_name: base name of the source PDF, without extension.
        md_content: markdown string produced by the pipeline.
    """
    destination = os.path.join(md_writer.path, f"{pdf_name}.md")
    md_writer.write(content=md_content, path=destination)

def find_pdfs_in_directory(directory):
    """Recursively collect paths of all PDF files under ``directory``.

    Args:
        directory: root directory to walk.

    Returns:
        list[str]: paths (joined onto ``directory``) of every file whose
        name ends with ``.pdf``, case-insensitively.
    """
    # The pasted original had `pdfpaths = []` but appended to `pdf_paths`
    # (NameError), and `for root, , files` had lost the `_` placeholder
    # to markdown mangling — both fixed here.
    pdf_paths = []
    for root, _, files in os.walk(directory):
        for file in files:
            if file.lower().endswith('.pdf'):
                pdf_paths.append(os.path.join(root, file))
    return pdf_paths

def get_subfolders(directory):
    """Return the names of the immediate sub-directories of ``directory``."""
    subfolders = []
    for entry in os.listdir(directory):
        if os.path.isdir(os.path.join(directory, entry)):
            subfolders.append(entry)
    return subfolders

def pdf_parse_main(
    pdf_path_list: list,
    parse_method: str = 'auto',
    model_json_path: str = None,
    is_json_md_dump: bool = True,
    output_directory: str = None,
):
    """Parse a batch of PDFs with MinerU and dump one markdown file each.

    Args:
        pdf_path_list: paths of the PDF files to process (the original
            annotation said ``str``, but the body iterates a list).
        parse_method: one of ``'auto'``, ``'txt'``, ``'ocr'``.
        model_json_path: optional path to pre-computed model-inference
            JSON; when absent, the built-in model is run instead.
        is_json_md_dump: when True, write ``<pdf_name>.md`` next to the
            copied source PDF.
        output_directory: root output directory; one sub-folder is created
            per input sub-folder. Required — no default behavior if None.
    """
    if model_json_path:
        # Use a context manager so the file handle is not leaked.
        with open(model_json_path, "r", encoding="utf-8") as model_file:
            model_json = json.load(model_file)
    else:
        model_json = []

    for pdf_path in tqdm(pdf_path_list, desc="Processing PDFs", unit="file"):
        pdf_name = os.path.basename(pdf_path).split(".")[0]
        pdf_subfolder = os.path.basename(os.path.dirname(pdf_path))

        try:
            # Mirror the input sub-folder layout under output_directory.
            output_path = os.path.join(output_directory, pdf_subfolder)
            os.makedirs(output_path, exist_ok=True)

            # Context manager instead of a leaked open(...).read().
            with open(pdf_path, "rb") as pdf_file:
                pdf_bytes = pdf_file.read()

            md_writer = DiskReaderWriter(output_path)
            image_writer = DiskReaderWriter(output_path)  # Save images to the same output folder

            if parse_method == "auto":
                jso_useful_key = {"_pdf_type": "", "model_list": model_json}
                pipe = UNIPipe(pdf_bytes, jso_useful_key, image_writer)
            elif parse_method == "txt":
                pipe = TXTPipe(pdf_bytes, model_json, image_writer)
            elif parse_method == "ocr":
                pipe = OCRPipe(pdf_bytes, model_json, image_writer)
            else:
                logger.error("Unknown parse method, only auto, ocr, txt allowed")
                exit(1)

            pipe.pipe_classify()

            # Run the built-in layout model only when no pre-computed
            # model results were supplied.
            if not model_json:
                if model_config.__use_inside_model__:
                    pipe.pipe_analyze()
                else:
                    logger.error("Need model list input")
                    exit(1)

            pipe.pipe_parse()

            md_content = pipe.pipe_mk_markdown(pdf_name, drop_mode="none")

            if is_json_md_dump:
                json_md_dump(md_writer, pdf_name, md_content)

            # Copy the original PDF next to its markdown output.
            original_pdf_destination = os.path.join(output_path, os.path.basename(pdf_path))
            with open(original_pdf_destination, "wb") as f:
                f.write(pdf_bytes)

        except Exception as e:
            # Log and continue so one bad PDF does not abort the batch.
            logger.exception(e)

Test

# Entry point: find sub-folders not yet present in the output directory
# and process every PDF inside them. NOTE: the dunders in
# `if __name__ == '__main__':` were eaten by markdown italics in the
# original paste (`if name == 'main':`), which would never be true.
if __name__ == '__main__':
    start_time = time.time()
    directory = "Info_Extration/test_paper_1080/data3"  # Change this to your PDF directory
    output_directory = "Info_Extration/test_paper_1080/data3_md_fig"  # Change this to your desired output directory

    # Ensure the output root exists; on a fresh run get_subfolders()
    # would otherwise raise FileNotFoundError.
    os.makedirs(output_directory, exist_ok=True)

    # Get subfolder names in input and output directories
    input_subfolders = get_subfolders(directory)
    output_subfolders = get_subfolders(output_directory)

    logger.info(f"Found {len(input_subfolders)} subfolders in input directory.")
    logger.info(f"Found {len(output_subfolders)} subfolders in output directory.")

    # Identify subfolders that are in input but not in output
    subfolders_to_process = set(input_subfolders) - set(output_subfolders)

    logger.info(f"Found {len(subfolders_to_process)} subfolders to process: {subfolders_to_process}")

    # Collect unprocessed PDFs from the identified subfolders
    unprocessed_pdfs = []
    for subfolder in subfolders_to_process:
        subfolder_path = os.path.join(directory, subfolder)
        pdfs_in_subfolder = find_pdfs_in_directory(subfolder_path)
        unprocessed_pdfs.extend(pdfs_in_subfolder)

    logger.info(f"Found {len(unprocessed_pdfs)} unprocessed PDF files to process.")

    pdf_parse_main(unprocessed_pdfs, output_directory=output_directory)
    print('花费时间', time.time() - start_time)

大佬, 如果想改代码的话应该到哪个脚本或者json文件修改啊

myhloli commented 2 hours ago

https://github.com/opendatalab/MinerU/blob/master/demo/magic_pdf_parse_main.py 这个里面有