开启表格识别后巨慢,半个小时都处理不了一个4.5M 的pdf 文档

tao-xiaoxin commented 2 weeks ago

Description of the bug | 错误描述

配置文件如下:

{
    "bucket_info":{
        "bucket-name-1":["ak", "sk", "endpoint"],
        "bucket-name-2":["ak", "sk", "endpoint"]
    },
    "models-dir":"/data/ai_models/pdf_models/opendatalab/PDF-Extract-Kit/models",
    "device-mode":"cuda:1",
    "table-config": {
        "model": "TableMaster",
        "is_table_recog_enable": true,
        "max_time": 800
    }
}

设备配置如下:

(venv) root@notebook-1725881840-qmxrp-6fbdd9cf9b-wh6zk:/tmp/aitools#  nvidia-smi
Fri Oct 11 09:39:07 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.129.03             Driver Version: 535.129.03   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|=========================================+======================+======================|
|   0  NVIDIA H100 80GB HBM3          On  | 00000000:18:00.0 Off |                    0 |
| N/A   28C    P0             113W / 700W |   3866MiB / 81559MiB |      0%      Default |
|                                         |                      |             Disabled |
+-----------------------------------------+----------------------+----------------------+
|   1  NVIDIA H100 80GB HBM3          On  | 00000000:2A:00.0 Off |                    0 |
| N/A   31C    P0             113W / 700W |  20160MiB / 81559MiB |      0%      Default |
|                                         |                      |             Disabled |
+-----------------------------------------+----------------------+----------------------+
|   2  NVIDIA H100 80GB HBM3          On  | 00000000:3A:00.0 Off |                    0 |
| N/A   30C    P0             109W / 700W |  69298MiB / 81559MiB |      0%      Default |
|                                         |                      |             Disabled |
+-----------------------------------------+----------------------+----------------------+
|   3  NVIDIA H100 80GB HBM3          On  | 00000000:5D:00.0 Off |                    0 |
| N/A   25C    P0              66W / 700W |      7MiB / 81559MiB |      0%      Default |
|                                         |                      |             Disabled |
+-----------------------------------------+----------------------+----------------------+
|   4  NVIDIA H100 80GB HBM3          On  | 00000000:91:00.0 Off |                    0 |
| N/A   29C    P0             110W / 700W |    531MiB / 81559MiB |      0%      Default |
|                                         |                      |             Disabled |
+-----------------------------------------+----------------------+----------------------+
|   5  NVIDIA H100 80GB HBM3          On  | 00000000:E4:00.0 Off |                    0 |
| N/A   24C    P0              66W / 700W |      7MiB / 81559MiB |      0%      Default |
|                                         |                      |             Disabled |
+-----------------------------------------+----------------------+----------------------+

+---------------------------------------------------------------------------------------+
| Processes:                                                                            |
|  GPU   GI   CI        PID   Type   Process name                            GPU Memory |
|        ID   ID                                                             Usage      |
|=======================================================================================|
+---------------------------------------------------------------------------------------+

代码如下:

import os
import json
import subprocess
import copy
import logging
from typing import Dict
from magic_pdf.pipe.UNIPipe import UNIPipe
from magic_pdf.pipe.OCRPipe import OCRPipe
from magic_pdf.pipe.TXTPipe import TXTPipe
from magic_pdf.rw.DiskReaderWriter import DiskReaderWriter
import magic_pdf.model as model_config
from app.core.path_conf import BASE_DIR
from fastapi import HTTPException
from app.core.conf import settings

# Module-level logger named after this module.
logger = logging.getLogger(__name__)
# Flag set on the magic_pdf.model module; pdf_parse_main checks it before
# calling pipe.pipe_analyze() when no precomputed model JSON is supplied.
model_config.__use_inside_model__ = True

# Full path of the magic-pdf configuration file, located under BASE_DIR.
CONFIG_FILE_NAME = os.path.join(BASE_DIR, "magic-pdf-config.json")

def read_config():
    """
    Load the model configuration stored at ``CONFIG_FILE_NAME``.

    :return: the parsed JSON content of the configuration file
    :raises FileNotFoundError: if the configuration file does not exist
    """
    config_path = CONFIG_FILE_NAME

    if os.path.exists(config_path):
        with open(config_path, "r", encoding="utf-8") as fh:
            return json.load(fh)

    raise FileNotFoundError(f"{config_path} not found")

def json_md_dump(
        pipe,
        md_writer,
        pdf_name,
        content_list,
        md_content,
):
    """
    Persist the pipeline results through ``md_writer``.

    Four files are written, in this order, each prefixed with ``pdf_name``:
    ``_model.json`` (a deep copy of ``pipe.model_list``), ``_middle.json``
    (``pipe.pdf_mid_data``), ``_content_list.json`` (``content_list``) and
    ``.md`` (``md_content``). JSON output keeps non-ASCII characters as-is
    and is indented by 4 spaces.

    :param pipe: object exposing ``model_list`` and ``pdf_mid_data``
    :param md_writer: object exposing ``write(content=..., path=...)``
    :param pdf_name: prefix used for every output file name
    :param content_list: JSON-serialisable content list
    :param md_content: Markdown text written verbatim
    """

    def _write_json(payload, suffix):
        # Serialise right before writing so earlier files are already on
        # disk if a later payload turns out not to be serialisable.
        md_writer.write(
            content=json.dumps(payload, ensure_ascii=False, indent=4),
            path=f"{pdf_name}{suffix}",
        )

    # Raw model output.
    _write_json(copy.deepcopy(pipe.model_list), "_model.json")
    # Intermediate parsing result.
    _write_json(pipe.pdf_mid_data, "_middle.json")
    # Structured content list.
    _write_json(content_list, "_content_list.json")
    # Final Markdown document.
    md_writer.write(content=md_content, path=f"{pdf_name}.md")

def pdf_parse_main(
        pdf_bytes: bytes,
        parse_method: str = 'auto',
        model_json_path: str = None,
        pdf_name: str = None,
        is_json_md_dump: bool = True,
        output_dir: str = settings.DOCUMENTS_PROCESSED_DIR,
):
    """
    Convert a PDF into JSON and Markdown, writing the results below
    ``output_dir`` in a folder named after ``pdf_name``.

    :param pdf_bytes: raw bytes of the PDF file
    :param parse_method: one of ``auto``, ``ocr`` or ``txt`` (default ``auto``);
        try ``ocr`` if the default gives poor results
    :param model_json_path: path to an existing model-output JSON file for this
        PDF; when empty the built-in model is used. The PDF and the model JSON
        must correspond to each other
    :param pdf_name: name of the PDF; used as the output folder name and as the
        prefix of every output file
    :param is_json_md_dump: whether to write the results to disk (three
        ``.json`` files for the different stages plus one ``.md`` file)
    :param output_dir: directory under which the per-PDF output folder lives
    :return: tuple ``(output_path, content_list, md_content)``
    :raises HTTPException: status 401 with a specific message for an unknown
        ``parse_method``, or with a generic message for any other failure
    """
    try:
        output_path = os.path.join(output_dir, pdf_name)
        output_image_path = os.path.join(output_path, 'images')

        # Only the folder name of the image directory is kept so that image
        # links in the .md and content_list.json files are relative paths.
        image_path_parent = os.path.basename(output_image_path)

        if model_json_path:
            # Load the already-computed model output (a list) for this PDF.
            # The context manager guarantees the file handle is closed.
            with open(model_json_path, "r", encoding="utf-8") as f:
                model_json = json.load(f)
        else:
            model_json = []

        image_writer, md_writer = DiskReaderWriter(output_image_path), DiskReaderWriter(output_path)

        # Choose the parsing pipeline.
        if parse_method == "auto":
            jso_useful_key = {"_pdf_type": "", "model_list": model_json}
            pipe = UNIPipe(pdf_bytes, jso_useful_key, image_writer)
        elif parse_method == "txt":
            pipe = TXTPipe(pdf_bytes, model_json, image_writer)
        elif parse_method == "ocr":
            pipe = OCRPipe(pdf_bytes, model_json, image_writer)
        else:
            logger.error("unknown parse method, only auto, ocr, txt allowed")
            raise HTTPException(detail="PDF文件解析失败，请选择OCR、Auto、或者Txt模式！", status_code=401)

        # Classification step.
        pipe.pipe_classify()

        # Without precomputed model data, run the built-in model analysis.
        if not model_json:
            if model_config.__use_inside_model__:
                pipe.pipe_analyze()
            else:
                # NOTE(review): execution still continues to pipe_parse() in
                # this case, with an empty model list.
                logger.error("need model list input")

        # Parsing step.
        pipe.pipe_parse()

        # Build the content list and the Markdown output.
        content_list = pipe.pipe_mk_uni_format(image_path_parent, drop_mode="none")
        md_content = pipe.pipe_mk_markdown(image_path_parent, drop_mode="none")

        if is_json_md_dump:
            json_md_dump(pipe, md_writer, pdf_name, content_list, md_content)

        return output_path, content_list, md_content

    except HTTPException:
        # Already carries a caller-facing message (e.g. unknown parse method);
        # let it through instead of replacing it with the generic one below.
        raise
    except Exception as e:
        logger.exception(e)
        raise HTTPException(status_code=401, detail="PDF文件解析失败，请换个文件再试！") from e

import asyncio
import os
import shutil
import logging
from typing import Dict, Any, Tuple
from app.utils.oss_manager import oss_manager
from concurrent.futures import ProcessPoolExecutor
from pdf2docx import Converter
from magic_pdf.tools.common import do_parse
from app.utils.magic_pdf.pdf_to_md import pdf_parse_main, read_config as local_config
from magic_pdf.libs import config_reader
from magic_pdf.model.doc_analyze_by_custom_model import ModelSingleton
from app.core.conf import settings

# Module-level logger named after this module.
logger = logging.getLogger(__name__)

# Replace magic_pdf's config_reader.read_config with the project's own
# read_config (imported above as local_config), so code that looks up
# config_reader.read_config after this point gets the project's reader.
config_reader.read_config = local_config

class PDFService:
    """
    Service that parses a PDF with ``pdf_parse_main``, uploads the extracted
    images and the resulting Markdown through ``oss_manager``, and returns the
    collected results.
    """

    def __init__(self):
        # NOTE(review): max_workers and output_dir are not read by any method
        # visible in this class.
        self.max_workers = 10
        self.output_dir = settings.DOCUMENTS_PROCESSED_DIR

    def load_model(self):
        """
        Request both models from ``ModelSingleton`` so they are initialised
        ahead of the first parse.

        :return: ``True`` if both ``get_model`` calls succeeded, ``False`` if
            any exception was raised (the exception is logged)
        """
        try:
            model_manager = ModelSingleton()
            # presumably the first flag selects OCR mode — verify against
            # ModelSingleton.get_model
            model_manager.get_model(False, False)
            logger.info("txt_model init final")
            model_manager.get_model(True, False)
            logger.info("ocr_model init final")
            return True
        except Exception as e:
            logger.exception(e)
            return False

    @staticmethod
    async def process_image(output_path, content_list, md_content):
        """
        Upload every image referenced by ``content_list`` and rewrite the
        paths to the uploaded URLs.

        Items of type ``image`` or ``table`` with a non-empty ``img_path`` are
        resolved relative to ``output_path`` and uploaded. On success the
        item's ``img_path`` and the matching ``![](...)`` link in
        ``md_content`` are replaced by the returned URL; missing files and
        failed uploads are logged and skipped.

        :param output_path: directory the relative image paths are based on
        :param content_list: list of content items (dicts); updated in place
        :param md_content: Markdown text containing the image links
        :return: tuple ``(content_list, md_content)``
        """
        for item in content_list:
            if item['type'] not in ['image', 'table']:
                continue
            relative_img_path = item.get('img_path')
            if not relative_img_path:
                continue

            full_img_path = os.path.join(output_path, relative_img_path)
            if not os.path.exists(full_img_path):
                logger.error(f"Image file not found: {full_img_path}")
                continue

            success, oss_url = oss_manager.upload_file(full_img_path)
            if not success:
                logger.error(f"Failed to upload image to OSS: {full_img_path}")
                continue

            item['img_path'] = oss_url
            # Point the Markdown image link at the uploaded URL as well.
            md_content = md_content.replace(f"![]({relative_img_path})", f"![]({oss_url})")

        logger.info(f"PDF processing completed. Output path: {output_path}")
        return content_list, md_content

    @staticmethod
    async def process_output_file(md_content: str, pdf_name: str) -> str:
        """
        Upload the Markdown text as ``<pdf_name>.md``.

        :param md_content: Markdown text to upload (encoded as UTF-8)
        :param pdf_name: base name used for the uploaded file
        :return: the URL returned by ``oss_manager.upload_bytes_file``
        :raises Exception: with a generic user-facing message if encoding or
            uploading fails or no URL is returned; the original error is
            attached as ``__cause__``
        """
        try:
            md_bytes = md_content.encode('utf-8')
            oss_file_name = f"{pdf_name}.md"
            oss_url = oss_manager.upload_bytes_file(md_bytes, oss_file_name)
            if not oss_url:
                raise Exception("Failed to upload MD file to OSS")
            return oss_url
        except Exception as e:
            logger.error(f"Failed to process output file: {str(e)}")
            # Chain the cause so the root error is not lost in tracebacks.
            raise Exception("文件解析失败，请稍后再试！") from e

    @staticmethod
    def cleanup_output_directory(output_path: str):
        """
        Delete ``output_path`` recursively if it exists.

        Never raises: a missing directory is logged as a warning and any
        deletion error is logged as an error.

        :param output_path: directory to remove
        """
        try:
            if os.path.exists(output_path):
                shutil.rmtree(output_path)
                logger.info(f"Successfully deleted output directory: {output_path}")
            else:
                logger.warning(f"Output directory does not exist: {output_path}")
        except Exception as e:
            logger.error(f"Failed to delete output directory {output_path}: {str(e)}")

    async def process(self, file_bytes: bytes, pdf_name: str, parse_method: str) -> Dict[str, Any]:
        """
        Run the full pipeline for one PDF.

        :param file_bytes: raw bytes of the PDF
        :param pdf_name: name of the PDF, used for output folder and files
        :param parse_method: parse method forwarded to ``pdf_parse_main``
        :return: dict with keys ``md_file_url``, ``content_list``,
            ``md_content`` and ``word_file_url`` (``None`` when the Word
            conversion did not succeed)
        """
        # PDF -> Word conversion.
        # NOTE(review): convert_pdf_to_word is not defined in the visible part
        # of this class — presumably omitted from this excerpt; confirm.
        word_conversion_success, word_file_url = await self.convert_pdf_to_word(file_bytes, pdf_name)

        # NOTE(review): pdf_parse_main is called synchronously, so this
        # coroutine does not yield to the event loop while parsing runs.
        output_path, content_list, md_content = pdf_parse_main(pdf_bytes=file_bytes, pdf_name=pdf_name,
                                                               parse_method=parse_method)

        content_list, md_content = await self.process_image(output_path, content_list, md_content)

        md_file_url = await self.process_output_file(md_content, pdf_name)

        output_dict = {
            "md_file_url": md_file_url,
            "content_list": content_list,
            "md_content": md_content,
            "word_file_url": word_file_url if word_conversion_success else None
        }
        return output_dict

# Module-level PDFService instance created at import time.
pdf_service = PDFService()

个人感觉没有用GPU,而是用了CPU ,我想问下如何调整?pdf文档涉及机密,无法给出

How to reproduce the bug | 如何复现

0: 1888x1344 2 embeddings, 12.6ms Speed: 8.5ms preprocess, 12.6ms inference, 26.4ms postprocess per image at shape (1, 3, 1888, 1344) 2024-10-11 08:54:49.487 | INFO | magic_pdf.model.pdf_extract_kit:call:289 | process:204264 | thread:140588207540032 | formula nums: 2, mfr time: 0.53 2024-10-11 08:54:49.496 | INFO | magic_pdf.model.pdf_extract_kit:call:380 | process:204264 | thread:140588207540032 | ------------------table recognition processing begins----------------- 2024-10-11 08:58:01.023 | INFO | magic_pdf.model.pdf_extract_kit:call:390 | process:204264 | thread:140588207540032 | ------------table recognition processing ends within 191.52642560005188s----- 2024-10-11 08:58:01.024 | INFO | magic_pdf.model.pdf_extract_kit:call:407 | process:204264 | thread:140588207540032 | table cost: 191.53 2024-10-11 08:58:01.258 | INFO | magic_pdf.model.pdf_extract_kit:call:259 | process:204264 | thread:140588207540032 | layout detection cost: 0.23

0: 1888x1344 (no detections), 12.4ms Speed: 11.5ms preprocess, 12.4ms inference, 0.4ms postprocess per image at shape (1, 3, 1888, 1344) 2024-10-11 08:58:01.284 | INFO | magic_pdf.model.pdf_extract_kit:call:289 | process:204264 | thread:140588207540032 | formula nums: 0, mfr time: 0.0 2024-10-11 08:58:01.290 | INFO | magic_pdf.model.pdf_extract_kit:call:380 | process:204264 | thread:140588207540032 | ------------------table recognition processing begins----------------- 2024-10-11 09:01:07.418 | INFO | magic_pdf.model.pdf_extract_kit:call:390 | process:204264 | thread:140588207540032 | ------------table recognition processing ends within 186.12867164611816s----- 2024-10-11 09:01:07.419 | INFO | magic_pdf.model.pdf_extract_kit:call:407 | process:204264 | thread:140588207540032 | table cost: 186.13 2024-10-11 09:01:07.598 | INFO | magic_pdf.model.pdf_extract_kit:call:259 | process:204264 | thread:140588207540032 | layout detection cost: 0.18

0: 1888x1344 (no detections), 12.3ms Speed: 12.6ms preprocess, 12.3ms inference, 0.4ms postprocess per image at shape (1, 3, 1888, 1344) 2024-10-11 09:01:07.625 | INFO | magic_pdf.model.pdf_extract_kit:call:289 | process:204264 | thread:140588207540032 | formula nums: 0, mfr time: 0.0 2024-10-11 09:01:07.632 | INFO | magic_pdf.model.pdf_extract_kit:call:380 | process:204264 | thread:140588207540032 | ------------------table recognition processing begins----------------- 2024-10-11 09:04:13.809 | INFO | magic_pdf.model.pdf_extract_kit:call:390 | process:204264 | thread:140588207540032 | ------------table recognition processing ends within 186.17707657814026s----- 2024-10-11 09:04:13.810 | INFO | magic_pdf.model.pdf_extract_kit:call:407 | process:204264 | thread:140588207540032 | table cost: 186.18 2024-10-11 09:04:14.075 | INFO | magic_pdf.model.pdf_extract_kit:call:259 | process:204264 | thread:140588207540032 | layout detection cost: 0.26

0: 1888x1344 15 embeddings, 3 isolateds, 12.2ms Speed: 10.2ms preprocess, 12.2ms inference, 2.2ms postprocess per image at shape (1, 3, 1888, 1344) 2024-10-11 09:04:18.833 | INFO | magic_pdf.model.pdf_extract_kit:call:289 | process:204264 | thread:140588207540032 | formula nums: 18, mfr time: 4.39 2024-10-11 09:04:18.838 | INFO | magic_pdf.model.pdf_extract_kit:call:380 | process:204264 | thread:140588207540032 | ------------------table recognition processing begins----------------- 2024-10-11 09:07:25.832 | INFO | magic_pdf.model.pdf_extract_kit:call:390 | process:204264 | thread:140588207540032 | ------------table recognition processing ends within 186.99368476867676s----- 2024-10-11 09:07:25.832 | INFO | magic_pdf.model.pdf_extract_kit:call:407 | process:204264 | thread:140588207540032 | table cost: 187.0 2024-10-11 09:07:26.138 | INFO | magic_pdf.model.pdf_extract_kit:call:259 | process:204264 | thread:140588207540032 | layout detection cost: 0.3

0: 1888x1344 10 embeddings, 4 isolateds, 12.2ms Speed: 8.7ms preprocess, 12.2ms inference, 0.7ms postprocess per image at shape (1, 3, 1888, 1344) 2024-10-11 09:07:36.092 | INFO | magic_pdf.model.pdf_extract_kit:call:289 | process:204264 | thread:140588207540032 | formula nums: 14, mfr time: 0.88 2024-10-11 09:07:36.098 | INFO | magic_pdf.model.pdf_extract_kit:call:380 | process:204264 | thread:140588207540032 | ------------------table recognition processing begins----------------- 2024-10-11 09:10:42.828 | INFO | magic_pdf.model.pdf_extract_kit:call:390 | process:204264 | thread:140588207540032 | ------------table recognition processing ends within 186.73010873794556s----- 2024-10-11 09:10:42.828 | INFO | magic_pdf.model.pdf_extract_kit:call:407 | process:204264 | thread:140588207540032 | table cost: 186.73 2024-10-11 09:10:43.098 | INFO | magic_pdf.model.pdf_extract_kit:call:259 | process:204264 | thread:140588207540032 | layout detection cost: 0.27

0: 1888x1344 13 embeddings, 12.3ms Speed: 10.7ms preprocess, 12.3ms inference, 0.8ms postprocess per image at shape (1, 3, 1888, 1344) 2024-10-11 09:10:43.966 | INFO | magic_pdf.model.pdf_extract_kit:call:289 | process:204264 | thread:140588207540032 | formula nums: 13, mfr time: 0.82 2024-10-11 09:10:43.970 | INFO | magic_pdf.model.pdf_extract_kit:call:380 | process:204264 | thread:140588207540032 | ------------------table recognition processing begins----------------- 2024-10-11 09:13:52.121 | INFO | magic_pdf.model.pdf_extract_kit:call:390 | process:204264 | thread:140588207540032 | ------------table recognition processing ends within 188.15088176727295s----- 2024-10-11 09:13:52.122 | INFO | magic_pdf.model.pdf_extract_kit:call:380 | process:204264 | thread:140588207540032 | ------------------table recognition processing begins----------------- 2024-10-11 09:17:00.153 | INFO | magic_pdf.model.pdf_extract_kit:call:390 | process:204264 | thread:140588207540032 | ------------table recognition processing ends within 188.03010416030884s----- 2024-10-11 09:17:00.153 | INFO | magic_pdf.model.pdf_extract_kit:call:407 | process:204264 | thread:140588207540032 | table cost: 376.18 2024-10-11 09:17:00.409 | INFO | magic_pdf.model.pdf_extract_kit:call:259 | process:204264 | thread:140588207540032 | layout detection cost: 0.25

0: 1888x1344 21 embeddings, 12.3ms Speed: 10.8ms preprocess, 12.3ms inference, 0.8ms postprocess per image at shape (1, 3, 1888, 1344) 2024-10-11 09:17:00.731 | INFO | magic_pdf.model.pdf_extract_kit:call:289 | process:204264 | thread:140588207540032 | formula nums: 21, mfr time: 0.26 2024-10-11 09:17:00.735 | INFO | magic_pdf.model.pdf_extract_kit:call:380 | process:204264 | thread:140588207540032 | ------------------table recognition processing begins----------------- 2024-10-11 09:20:13.231 | INFO | magic_pdf.model.pdf_extract_kit:call:390 | process:204264 | thread:140588207540032 | ------------table recognition processing ends within 192.4964451789856s----- 2024-10-11 09:20:13.233 | INFO | magic_pdf.model.pdf_extract_kit:call:380 | process:204264 | thread:140588207540032 | ------------------table recognition processing begins----------------- 2024-10-11 09:23:22.363 | INFO | magic_pdf.model.pdf_extract_kit:call:390 | process:204264 | thread:140588207540032 | ------------table recognition processing ends within 189.1304304599762s----- 2024-10-11 09:23:22.365 | INFO | magic_pdf.model.pdf_extract_kit:call:380 | process:204264 | thread:140588207540032 | ------------------table recognition processing begins----------------- 2024-10-11 09:26:31.113 | INFO | magic_pdf.model.pdf_extract_kit:call:390 | process:204264 | thread:140588207540032 | ------------table recognition processing ends within 188.74734711647034s----- 2024-10-11 09:26:31.113 | INFO | magic_pdf.model.pdf_extract_kit:call:407 | process:204264 | thread:140588207540032 | table cost: 570.38 2024-10-11 09:26:31.394 | INFO | magic_pdf.model.pdf_extract_kit:call:259 | process:204264 | thread:140588207540032 | layout detection cost: 0.28

0: 1888x1344 16 embeddings, 12.3ms Speed: 10.9ms preprocess, 12.3ms inference, 0.9ms postprocess per image at shape (1, 3, 1888, 1344) 2024-10-11 09:26:31.666 | INFO | magic_pdf.model.pdf_extract_kit:call:289 | process:204264 | thread:140588207540032 | formula nums: 16, mfr time: 0.22 2024-10-11 09:26:31.670 | INFO | magic_pdf.model.pdf_extract_kit:call:380 | process:204264 | thread:140588207540032 | ------------------table recognition processing begins----------------- 2024-10-11 09:29:39.840 | INFO | magic_pdf.model.pdf_extract_kit:call:390 | process:204264 | thread:140588207540032 | ------------table recognition processing ends within 188.1698076725006s----- 2024-10-11 09:29:39.841 | INFO | magic_pdf.model.pdf_extract_kit:call:407 | process:204264 | thread:140588207540032 | table cost: 188.17 2024-10-11 09:29:40.118 | INFO | magic_pdf.model.pdf_extract_kit:call:259 | process:204264 | thread:140588207540032 | layout detection cost: 0.28

0: 1888x1344 32 embeddings, 12.3ms Speed: 10.5ms preprocess, 12.3ms inference, 0.9ms postprocess per image at shape (1, 3, 1888, 1344) 2024-10-11 09:29:40.742 | INFO | magic_pdf.model.pdf_extract_kit:call:289 | process:204264 | thread:140588207540032 | formula nums: 32, mfr time: 0.54 2024-10-11 09:29:40.747 | INFO | magic_pdf.model.pdf_extract_kit:call:380 | process:204264 | thread:140588207540032 | ------------------table recognition processing begins----------------- 2024-10-11 09:32:48.072 | INFO | magic_pdf.model.pdf_extract_kit:call:390 | process:204264 | thread:140588207540032 | ------------table recognition processing ends within 187.32563972473145s----- 2024-10-11 09:32:48.074 | INFO | magic_pdf.model.pdf_extract_kit:call:380 | process:204264 | thread:140588207540032 | ------------------table recognition processing begins-----------------

Operating system | 操作系统

Linux

Python version | Python 版本

3.10

Software version | 软件版本 (magic-pdf --version)

0.8.x

Device mode | 设备模式

cuda

myhloli commented 2 weeks ago

先卸载 paddlepaddle 和 paddlepaddle-gpu，再重新安装 paddlepaddle-gpu；paddlepaddle-gpu 记得选 3.0.0b1 版本。

tao-xiaoxin commented 2 weeks ago

机器无网络,包是通过离线安装的,是paddlepaddle-gpu 3.0.0b1 版本

myhloli commented 2 weeks ago

机器无网络,包是通过离线安装的,是paddlepaddle-gpu 3.0.0b1 版本

这样调整完你的环境应该确保没有cpu版本的paddle了，不会出现因为调用到cpu的paddle引起的速度下降问题，实际测试还是一个表格一百多秒吗？

tao-xiaoxin commented 2 weeks ago

机器无网络,包是通过离线安装的,是paddlepaddle-gpu 3.0.0b1 版本

这样调整完你的环境应该确保没有cpu版本的paddle了，不会出现因为调用到cpu的paddle引起的速度下降问题，实际测试还是一个表格一百多秒吗？

是的呢,还是100多s

tao-xiaoxin commented 2 weeks ago

机器无网络,包是通过离线安装的,是paddlepaddle-gpu 3.0.0b1 版本

这样调整完你的环境应该确保没有cpu版本的paddle了，不会出现因为调用到cpu的paddle引起的速度下降问题，实际测试还是一个表格一百多秒吗？

但是关闭表格识别后就是20s左右,开启后就巨慢了

myhloli commented 2 weeks ago

再确认下用户目录中的 magic-pdf.json 里表格的配置是 "model": "TableMaster" 吗？

tao-xiaoxin commented 2 weeks ago

再确认下用户目录中的 magic-pdf.json 里表格的配置是 "model": "TableMaster" 吗？

是的,上面代码和配置文件都给出了,

{
    "bucket_info":{
        "bucket-name-1":["ak", "sk", "endpoint"],
        "bucket-name-2":["ak", "sk", "endpoint"]
    },
    "models-dir":"/data/ai_models/pdf_models/opendatalab/PDF-Extract-Kit/models",
    "device-mode":"cuda:1",
    "table-config": {
        "model": "TableMaster",
        "is_table_recog_enable": true,
        "max_time": 800
    }
}

tao-xiaoxin commented 2 weeks ago

再确认下用户目录中的 magic-pdf.json 里表格的配置是 "model": "TableMaster" 吗？

设备用的是 6 张 H100 的卡，只用了其中一张。

myhloli commented 2 weeks ago

https://github.com/opendatalab/MinerU/blob/master/demo/small_ocr.pdf 下载这个pdf测试下ocr的速度，然后把日志贴一下看看？

tao-xiaoxin commented 2 weeks ago

https://github.com/opendatalab/MinerU/blob/master/demo/small_ocr.pdf 下载这个pdf测试下ocr的速度，然后把日志贴一下看看？

2024-10-11 10:25:13.164 | INFO     | magic_pdf.model.pdf_extract_kit:__call__:259 | process:213467 | thread:139747333814080 | layout detection cost: 1.12

0: 1888x1312 (no detections), 81.9ms
Speed: 14.1ms preprocess, 81.9ms inference, 7.3ms postprocess per image at shape (1, 3, 1888, 1312)
2024-10-11 10:25:13.724 | INFO     | magic_pdf.model.pdf_extract_kit:__call__:289 | process:213467 | thread:139747333814080 | formula nums: 0, mfr time: 0.0
2024-10-11 10:25:14.886 | INFO     | magic_pdf.model.pdf_extract_kit:__call__:372 | process:213467 | thread:139747333814080 | ocr cost: 1.14
2024-10-11 10:25:14.886 | INFO     | magic_pdf.model.pdf_extract_kit:__call__:407 | process:213467 | thread:139747333814080 | table cost: 0.0
2024-10-11 10:25:15.565 | INFO     | magic_pdf.model.pdf_extract_kit:__call__:259 | process:213467 | thread:139747333814080 | layout detection cost: 0.68

0: 1888x1312 4 embeddings, 12.4ms
Speed: 16.1ms preprocess, 12.4ms inference, 41.5ms postprocess per image at shape (1, 3, 1888, 1312)
2024-10-11 10:25:16.480 | INFO     | magic_pdf.model.pdf_extract_kit:__call__:289 | process:213467 | thread:139747333814080 | formula nums: 4, mfr time: 0.78
2024-10-11 10:25:16.684 | INFO     | magic_pdf.model.pdf_extract_kit:__call__:372 | process:213467 | thread:139747333814080 | ocr cost: 0.17
2024-10-11 10:25:16.685 | INFO     | magic_pdf.model.pdf_extract_kit:__call__:407 | process:213467 | thread:139747333814080 | table cost: 0.0
2024-10-11 10:25:17.220 | INFO     | magic_pdf.model.pdf_extract_kit:__call__:259 | process:213467 | thread:139747333814080 | layout detection cost: 0.54

0: 1888x1312 (no detections), 12.4ms
Speed: 10.6ms preprocess, 12.4ms inference, 0.5ms postprocess per image at shape (1, 3, 1888, 1312)
2024-10-11 10:25:17.246 | INFO     | magic_pdf.model.pdf_extract_kit:__call__:289 | process:213467 | thread:139747333814080 | formula nums: 0, mfr time: 0.0
2024-10-11 10:25:17.373 | INFO     | magic_pdf.model.pdf_extract_kit:__call__:372 | process:213467 | thread:139747333814080 | ocr cost: 0.11
2024-10-11 10:25:17.374 | INFO     | magic_pdf.model.pdf_extract_kit:__call__:407 | process:213467 | thread:139747333814080 | table cost: 0.0
2024-10-11 10:25:17.924 | INFO     | magic_pdf.model.pdf_extract_kit:__call__:259 | process:213467 | thread:139747333814080 | layout detection cost: 0.55

0: 1888x1312 (no detections), 12.4ms
Speed: 13.0ms preprocess, 12.4ms inference, 0.3ms postprocess per image at shape (1, 3, 1888, 1312)
2024-10-11 10:25:17.951 | INFO     | magic_pdf.model.pdf_extract_kit:__call__:289 | process:213467 | thread:139747333814080 | formula nums: 0, mfr time: 0.0
2024-10-11 10:25:18.088 | INFO     | magic_pdf.model.pdf_extract_kit:__call__:372 | process:213467 | thread:139747333814080 | ocr cost: 0.12
2024-10-11 10:25:18.088 | INFO     | magic_pdf.model.pdf_extract_kit:__call__:407 | process:213467 | thread:139747333814080 | table cost: 0.0
2024-10-11 10:25:18.608 | INFO     | magic_pdf.model.pdf_extract_kit:__call__:259 | process:213467 | thread:139747333814080 | layout detection cost: 0.52

0: 1888x1312 (no detections), 12.4ms
Speed: 13.4ms preprocess, 12.4ms inference, 0.3ms postprocess per image at shape (1, 3, 1888, 1312)
2024-10-11 10:25:18.635 | INFO     | magic_pdf.model.pdf_extract_kit:__call__:289 | process:213467 | thread:139747333814080 | formula nums: 0, mfr time: 0.0
2024-10-11 10:25:18.746 | INFO     | magic_pdf.model.pdf_extract_kit:__call__:372 | process:213467 | thread:139747333814080 | ocr cost: 0.1
2024-10-11 10:25:18.746 | INFO     | magic_pdf.model.pdf_extract_kit:__call__:407 | process:213467 | thread:139747333814080 | table cost: 0.0
2024-10-11 10:25:19.356 | INFO     | magic_pdf.model.pdf_extract_kit:__call__:259 | process:213467 | thread:139747333814080 | layout detection cost: 0.61

0: 1888x1312 (no detections), 12.4ms
Speed: 13.2ms preprocess, 12.4ms inference, 0.3ms postprocess per image at shape (1, 3, 1888, 1312)
2024-10-11 10:25:19.383 | INFO     | magic_pdf.model.pdf_extract_kit:__call__:289 | process:213467 | thread:139747333814080 | formula nums: 0, mfr time: 0.0
2024-10-11 10:25:19.541 | INFO     | magic_pdf.model.pdf_extract_kit:__call__:372 | process:213467 | thread:139747333814080 | ocr cost: 0.14
2024-10-11 10:25:19.541 | INFO     | magic_pdf.model.pdf_extract_kit:__call__:407 | process:213467 | thread:139747333814080 | table cost: 0.0
2024-10-11 10:25:20.114 | INFO     | magic_pdf.model.pdf_extract_kit:__call__:259 | process:213467 | thread:139747333814080 | layout detection cost: 0.57

0: 1888x1312 3 embeddings, 12.3ms
Speed: 12.1ms preprocess, 12.3ms inference, 0.8ms postprocess per image at shape (1, 3, 1888, 1312)
2024-10-11 10:25:20.422 | INFO     | magic_pdf.model.pdf_extract_kit:__call__:289 | process:213467 | thread:139747333814080 | formula nums: 3, mfr time: 0.24
2024-10-11 10:25:20.547 | INFO     | magic_pdf.model.pdf_extract_kit:__call__:372 | process:213467 | thread:139747333814080 | ocr cost: 0.11
2024-10-11 10:25:20.548 | INFO     | magic_pdf.model.pdf_extract_kit:__call__:407 | process:213467 | thread:139747333814080 | table cost: 0.0
2024-10-11 10:25:21.079 | INFO     | magic_pdf.model.pdf_extract_kit:__call__:259 | process:213467 | thread:139747333814080 | layout detection cost: 0.53

0: 1888x1312 (no detections), 12.4ms
Speed: 10.6ms preprocess, 12.4ms inference, 0.3ms postprocess per image at shape (1, 3, 1888, 1312)
2024-10-11 10:25:21.104 | INFO     | magic_pdf.model.pdf_extract_kit:__call__:289 | process:213467 | thread:139747333814080 | formula nums: 0, mfr time: 0.0
2024-10-11 10:25:21.226 | INFO     | magic_pdf.model.pdf_extract_kit:__call__:372 | process:213467 | thread:139747333814080 | ocr cost: 0.11
2024-10-11 10:25:21.226 | INFO     | magic_pdf.model.pdf_extract_kit:__call__:407 | process:213467 | thread:139747333814080 | table cost: 0.0
2024-10-11 10:25:21.226 | INFO     | magic_pdf.model.doc_analyze_by_custom_model:doc_analyze:136 | process:213467 | thread:139747333814080 | doc analyze cost: 9.185619831085205
2024-10-11 10:25:21.423 | INFO     | magic_pdf.pipe.UNIPipe:pipe_mk_uni_format:48 | process:213467 | thread:139747333814080 | uni_pipe mk content list finished
2024-10-11 10:25:21.427 | INFO     | magic_pdf.pipe.UNIPipe:pipe_mk_markdown:53 | process:213467 | thread:139747333814080 | uni_pipe mk mm_markdown finished

tao-xiaoxin commented 2 weeks ago

https://github.com/opendatalab/MinerU/blob/master/demo/small_ocr.pdf 下载这个pdf测试下ocr的速度，然后把日志贴一下看看？

很快的,20s 左右,

myhloli commented 2 weeks ago

ocr速度没问题，paddle是跑gpu模式的，表格解析慢的问题没法解释，除非有样本我们这边测试下看看。。。

tao-xiaoxin commented 2 weeks ago

ocr速度没问题，paddle是跑gpu模式的，表格解析慢的问题没法解释，除非有样本我们这边测试下看看。。。

给我个邮箱,我发你们

myhloli commented 2 weeks ago

ocr速度没问题，paddle是跑gpu模式的，表格解析慢的问题没法解释，除非有样本我们这边测试下看看。。。

给我个邮箱,我发你们

moe@myhloli.com

tao-xiaoxin commented 2 weeks ago

ocr速度没问题，paddle是跑gpu模式的，表格解析慢的问题没法解释，除非有样本我们这边测试下看看。。。

给我个邮箱,我发你们

moe@myhloli.com

大佬,已经发送到你们邮箱,请问您收到了吗,麻烦帮忙看看

myhloli commented 2 weeks ago

随便找了个文档测试了三张表格，速度正常的

2024-10-12 10:35:24.341 | INFO     | magic_pdf.model.pdf_extract_kit:__call__:287 - layout detection time: 2.46
2024-10-12 10:35:24.754 | INFO     | magic_pdf.model.pdf_extract_kit:__call__:295 - mfd time: 0.4
2024-10-12 10:35:25.104 | INFO     | magic_pdf.model.pdf_extract_kit:__call__:323 - formula nums: 4, mfr time: 0.35
2024-10-12 10:35:25.966 | INFO     | magic_pdf.model.pdf_extract_kit:__call__:347 - gc time: 0.86
2024-10-12 10:35:30.236 | INFO     | magic_pdf.model.pdf_extract_kit:__call__:431 - table time: 4.27
2024-10-12 10:35:30.236 | INFO     | magic_pdf.model.pdf_extract_kit:__call__:433 - -----page total time: 8.35-----
2024-10-12 10:35:31.753 | INFO     | magic_pdf.model.pdf_extract_kit:__call__:287 - layout detection time: 1.52
2024-10-12 10:35:31.871 | INFO     | magic_pdf.model.pdf_extract_kit:__call__:295 - mfd time: 0.11
2024-10-12 10:35:31.956 | INFO     | magic_pdf.model.pdf_extract_kit:__call__:323 - formula nums: 1, mfr time: 0.08
2024-10-12 10:35:32.149 | INFO     | magic_pdf.model.pdf_extract_kit:__call__:347 - gc time: 0.19
2024-10-12 10:35:35.615 | INFO     | magic_pdf.model.pdf_extract_kit:__call__:431 - table time: 3.47
2024-10-12 10:35:35.615 | INFO     | magic_pdf.model.pdf_extract_kit:__call__:433 - -----page total time: 5.38-----
2024-10-12 10:35:37.099 | INFO     | magic_pdf.model.pdf_extract_kit:__call__:287 - layout detection time: 1.48
2024-10-12 10:35:37.217 | INFO     | magic_pdf.model.pdf_extract_kit:__call__:295 - mfd time: 0.11
2024-10-12 10:35:37.734 | INFO     | magic_pdf.model.pdf_extract_kit:__call__:323 - formula nums: 17, mfr time: 0.52
2024-10-12 10:35:37.936 | INFO     | magic_pdf.model.pdf_extract_kit:__call__:347 - gc time: 0.2
2024-10-12 10:35:41.517 | INFO     | magic_pdf.model.pdf_extract_kit:__call__:431 - table time: 3.58
2024-10-12 10:35:41.517 | INFO     | magic_pdf.model.pdf_extract_kit:__call__:433 - -----page total time: 5.9-----
2024-10-12 10:35:41.662 | INFO     | magic_pdf.model.doc_analyze_by_custom_model:doc_analyze:152 - gc time: 0.14
2024-10-12 10:35:41.662 | INFO     | magic_pdf.model.doc_analyze_by_custom_model:doc_analyze:156 - doc analyze time: 19.78, speed: 0.15 pages/second

我这边是单卡3060ti，跑表格会显存溢出一些，使用内存代替显存会更慢一些，足量16G显存的设备应该会更快

tao-xiaoxin commented 2 weeks ago

随便找了个文档测试了三张表格，速度正常的

2024-10-12 10:35:24.341 | INFO     | magic_pdf.model.pdf_extract_kit:__call__:287 - layout detection time: 2.46
2024-10-12 10:35:24.754 | INFO     | magic_pdf.model.pdf_extract_kit:__call__:295 - mfd time: 0.4
2024-10-12 10:35:25.104 | INFO     | magic_pdf.model.pdf_extract_kit:__call__:323 - formula nums: 4, mfr time: 0.35
2024-10-12 10:35:25.966 | INFO     | magic_pdf.model.pdf_extract_kit:__call__:347 - gc time: 0.86
2024-10-12 10:35:30.236 | INFO     | magic_pdf.model.pdf_extract_kit:__call__:431 - table time: 4.27
2024-10-12 10:35:30.236 | INFO     | magic_pdf.model.pdf_extract_kit:__call__:433 - -----page total time: 8.35-----
2024-10-12 10:35:31.753 | INFO     | magic_pdf.model.pdf_extract_kit:__call__:287 - layout detection time: 1.52
2024-10-12 10:35:31.871 | INFO     | magic_pdf.model.pdf_extract_kit:__call__:295 - mfd time: 0.11
2024-10-12 10:35:31.956 | INFO     | magic_pdf.model.pdf_extract_kit:__call__:323 - formula nums: 1, mfr time: 0.08
2024-10-12 10:35:32.149 | INFO     | magic_pdf.model.pdf_extract_kit:__call__:347 - gc time: 0.19
2024-10-12 10:35:35.615 | INFO     | magic_pdf.model.pdf_extract_kit:__call__:431 - table time: 3.47
2024-10-12 10:35:35.615 | INFO     | magic_pdf.model.pdf_extract_kit:__call__:433 - -----page total time: 5.38-----
2024-10-12 10:35:37.099 | INFO     | magic_pdf.model.pdf_extract_kit:__call__:287 - layout detection time: 1.48
2024-10-12 10:35:37.217 | INFO     | magic_pdf.model.pdf_extract_kit:__call__:295 - mfd time: 0.11
2024-10-12 10:35:37.734 | INFO     | magic_pdf.model.pdf_extract_kit:__call__:323 - formula nums: 17, mfr time: 0.52
2024-10-12 10:35:37.936 | INFO     | magic_pdf.model.pdf_extract_kit:__call__:347 - gc time: 0.2
2024-10-12 10:35:41.517 | INFO     | magic_pdf.model.pdf_extract_kit:__call__:431 - table time: 3.58
2024-10-12 10:35:41.517 | INFO     | magic_pdf.model.pdf_extract_kit:__call__:433 - -----page total time: 5.9-----
2024-10-12 10:35:41.662 | INFO     | magic_pdf.model.doc_analyze_by_custom_model:doc_analyze:152 - gc time: 0.14
2024-10-12 10:35:41.662 | INFO     | magic_pdf.model.doc_analyze_by_custom_model:doc_analyze:156 - doc analyze time: 19.78, speed: 0.15 pages/second

我这边是单卡3060ti，跑表格会显存溢出一些，使用内存代替显存会更慢一些，足量16G显存的设备应该会更快

您是用的最新模型V1 版本模型吗?

myhloli commented 2 weeks ago

@tao-xiaoxin v1的模型只更新了一个yolo版本的layout，mineru项目目前没有更新v1模型，你可以直接在现有环境安装dev分支代码试试 pip install git+https://github.com/opendatalab/MinerU.git@dev

Tendo33 commented 2 weeks ago

安装paddlepaddle-gpu==3.0.0b1 会跟 torch 2.3.1有冲突，这样会有影响吗

torch 2.3.1 requires nvidia-cublas-cu12==12.1.3.1; platform_system == "Linux" and platform_machine == "x86_64", but you have nvidia-cublas-cu12 12.3.4.1 which is incompatible.
torch 2.3.1 requires nvidia-cuda-cupti-cu12==12.1.105; platform_system == "Linux" and platform_machine == "x86_64", but you have nvidia-cuda-cupti-cu12 12.3.101 which is incompatible.
torch 2.3.1 requires nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == "Linux" and platform_machine == "x86_64", but you have nvidia-cuda-nvrtc-cu12 12.3.107 which is incompatible.
torch 2.3.1 requires nvidia-cuda-runtime-cu12==12.1.105; platform_system == "Linux" and platform_machine == "x86_64", but you have nvidia-cuda-runtime-cu12 12.3.101 which is incompatible.
torch 2.3.1 requires nvidia-cudnn-cu12==8.9.2.26; platform_system == "Linux" and platform_machine == "x86_64", but you have nvidia-cudnn-cu12 9.0.0.312 which is incompatible.
torch 2.3.1 requires nvidia-cufft-cu12==11.0.2.54; platform_system == "Linux" and platform_machine == "x86_64", but you have nvidia-cufft-cu12 11.2.1.3 which is incompatible.
torch 2.3.1 requires nvidia-curand-cu12==10.3.2.106; platform_system == "Linux" and platform_machine == "x86_64", but you have nvidia-curand-cu12 10.3.5.147 which is incompatible.
torch 2.3.1 requires nvidia-cusolver-cu12==11.4.5.107; platform_system == "Linux" and platform_machine == "x86_64", but you have nvidia-cusolver-cu12 11.6.1.9 which is incompatible.
torch 2.3.1 requires nvidia-cusparse-cu12==12.1.0.106; platform_system == "Linux" and platform_machine == "x86_64", but you have nvidia-cusparse-cu12 12.3.1.170 which is incompatible.
torch 2.3.1 requires nvidia-nccl-cu12==2.20.5; platform_system == "Linux" and platform_machine == "x86_64", but you have nvidia-nccl-cu12 2.19.3 which is incompatible.
torch 2.3.1 requires nvidia-nvtx-cu12==12.1.105; platform_system == "Linux" and platform_machine == "x86_64", but you have nvidia-nvtx-cu12 12.4.127 which is incompatible.

myhloli commented 2 weeks ago

安装paddlepaddle-gpu==3.0.0b1 会跟 torch 2.3.1有冲突，这样会有影响吗

torch 2.3.1 requires nvidia-cublas-cu12==12.1.3.1; platform_system == "Linux" and platform_machine == "x86_64", but you have nvidia-cublas-cu12 12.3.4.1 which is incompatible.
torch 2.3.1 requires nvidia-cuda-cupti-cu12==12.1.105; platform_system == "Linux" and platform_machine == "x86_64", but you have nvidia-cuda-cupti-cu12 12.3.101 which is incompatible.
torch 2.3.1 requires nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == "Linux" and platform_machine == "x86_64", but you have nvidia-cuda-nvrtc-cu12 12.3.107 which is incompatible.
torch 2.3.1 requires nvidia-cuda-runtime-cu12==12.1.105; platform_system == "Linux" and platform_machine == "x86_64", but you have nvidia-cuda-runtime-cu12 12.3.101 which is incompatible.
torch 2.3.1 requires nvidia-cudnn-cu12==8.9.2.26; platform_system == "Linux" and platform_machine == "x86_64", but you have nvidia-cudnn-cu12 9.0.0.312 which is incompatible.
torch 2.3.1 requires nvidia-cufft-cu12==11.0.2.54; platform_system == "Linux" and platform_machine == "x86_64", but you have nvidia-cufft-cu12 11.2.1.3 which is incompatible.
torch 2.3.1 requires nvidia-curand-cu12==10.3.2.106; platform_system == "Linux" and platform_machine == "x86_64", but you have nvidia-curand-cu12 10.3.5.147 which is incompatible.
torch 2.3.1 requires nvidia-cusolver-cu12==11.4.5.107; platform_system == "Linux" and platform_machine == "x86_64", but you have nvidia-cusolver-cu12 11.6.1.9 which is incompatible.
torch 2.3.1 requires nvidia-cusparse-cu12==12.1.0.106; platform_system == "Linux" and platform_machine == "x86_64", but you have nvidia-cusparse-cu12 12.3.1.170 which is incompatible.
torch 2.3.1 requires nvidia-nccl-cu12==2.20.5; platform_system == "Linux" and platform_machine == "x86_64", but you have nvidia-nccl-cu12 2.19.3 which is incompatible.
torch 2.3.1 requires nvidia-nvtx-cu12==12.1.105; platform_system == "Linux" and platform_machine == "x86_64", but you have nvidia-nvtx-cu12 12.4.127 which is incompatible.

在linux环境，官方推荐安装使用cuda11.8的paddlepaddle-gpu==3.0.0b1以避免和torch产生环境冲突

tao-xiaoxin commented 2 weeks ago

@tao-xiaoxin v1的模型只更新了一个yolo版本的layout，mineru项目目前没有更新v1模型，你可以直接在现有环境安装dev分支代码试试 pip install git+https://github.com/opendatalab/MinerU.git@dev

大佬,我又发了一份解析乱码的,麻烦帮我看看

myhloli commented 2 weeks ago

@tao-xiaoxin v1的模型只更新了一个yolo版本的layout，mineru项目目前没有更新v1模型，你可以直接在现有环境安装dev分支代码试试 pip install git+https://github.com/opendatalab/MinerU.git@dev

大佬,我又发了一份解析乱码的,麻烦帮我看看

pdf直接复制文本出来就是乱码，可以在命令中加-m ocr强制使用ocr模式解决乱码问题

tao-xiaoxin commented 2 weeks ago

@tao-xiaoxin v1的模型只更新了一个yolo版本的layout，mineru项目目前没有更新v1模型，你可以直接在现有环境安装dev分支代码试试 pip install git+https://github.com/opendatalab/MinerU.git@dev

大佬,我在一个没有网络的机器上安装,通过源码安装dev 分支的代码执行了如下命令:

  cd MinerU/
  source ./venv/bin/activate
  pip install -e .
  magic_pdf

最后安装好以后提示命令不存在,这个怎么解决一下?

magic_pdf
bash: magic_pdf: command not found

myhloli commented 2 weeks ago

@tao-xiaoxin v1的模型只更新了一个yolo版本的layout，mineru项目目前没有更新v1模型，你可以直接在现有环境安装dev分支代码试试 pip install git+https://github.com/opendatalab/MinerU.git@dev

大佬,我在一个没有网络的机器上安装,通过源码安装dev 分支的代码执行了如下命令:
  cd MinerU/
  source ./venv/bin/activate
  pip install -e .
  magic_pdf
最后安装好以后提示命令不存在,这个怎么解决一下?
magic_pdf
bash: magic_pdf: command not found

magic-pdf

tao-xiaoxin commented 2 weeks ago

> magic-pdf

magic-pdf -p /gwm-tmp/experiments/pdf_decrypt/pdf_data/decrypted_a1dc3591061c7d36418dbf6c9df8149c.pdf -o ./pdf_data/  -m auto
2024-10-15 08:14:09.498 | INFO     | magic_pdf.libs.pdf_check:detect_invalid_chars:57 - cid_count: 0, text_len: 5687, cid_chars_radio: 0.0
2024-10-15 08:14:09.542 | ERROR    | magic_pdf.model.pdf_extract_kit:<module>:30 - No module named 'torchtext'
Traceback (most recent call last):

  File "/gwm-tmp/experiments/MinerU/venv/bin/magic-pdf", line 8, in <module>
    sys.exit(cli())
    │   │    └ <Command cli>
    │   └ <built-in function exit>
    └ <module 'sys' (built-in)>
  File "/gwm-tmp/experiments/MinerU/venv/lib/python3.10/site-packages/click/core.py", line 1157, in __call__
    return self.main(*args, **kwargs)
           │    │     │       └ {}
           │    │     └ ()
           │    └ <function BaseCommand.main at 0x7fd1c9f9f760>
           └ <Command cli>
  File "/gwm-tmp/experiments/MinerU/venv/lib/python3.10/site-packages/click/core.py", line 1078, in main
    rv = self.invoke(ctx)
         │    │      └ <click.core.Context object at 0x7fd1ca19dba0>
         │    └ <function Command.invoke at 0x7fd1c9fb0280>
         └ <Command cli>
  File "/gwm-tmp/experiments/MinerU/venv/lib/python3.10/site-packages/click/core.py", line 1434, in invoke
    return ctx.invoke(self.callback, **ctx.params)
           │   │      │    │           │   └ {'path': '/gwm-tmp/experiments/pdf_decrypt/pdf_data/decrypted_a1dc3591061c7d36418dbf6c9df8149c.pdf', 'output_dir': './pdf_dat...
           │   │      │    │           └ <click.core.Context object at 0x7fd1ca19dba0>
           │   │      │    └ <function cli at 0x7fd0732939a0>
           │   │      └ <Command cli>
           │   └ <function Context.invoke at 0x7fd1c9f9ef80>
           └ <click.core.Context object at 0x7fd1ca19dba0>
  File "/gwm-tmp/experiments/MinerU/venv/lib/python3.10/site-packages/click/core.py", line 783, in invoke
    return __callback(*args, **kwargs)
                       │       └ {'path': '/gwm-tmp/experiments/pdf_decrypt/pdf_data/decrypted_a1dc3591061c7d36418dbf6c9df8149c.pdf', 'output_dir': './pdf_dat...
                       └ ()

  File "/gwm-tmp/experiments/MinerU/magic_pdf/tools/cli.py", line 115, in cli
    parse_doc(path)
    │         └ '/gwm-tmp/experiments/pdf_decrypt/pdf_data/decrypted_a1dc3591061c7d36418dbf6c9df8149c.pdf'
    └ <function cli.<locals>.parse_doc at 0x7fd1ca1d0940>

  File "/gwm-tmp/experiments/MinerU/magic_pdf/tools/cli.py", line 96, in parse_doc
    do_parse(
    └ <function do_parse at 0x7fd1c571c790>

  File "/gwm-tmp/experiments/MinerU/magic_pdf/tools/common.py", line 82, in do_parse
    pipe.pipe_analyze()
    │    └ <function UNIPipe.pipe_analyze at 0x7fd073293250>
    └ <magic_pdf.pipe.UNIPipe.UNIPipe object at 0x7fd073273190>

  File "/gwm-tmp/experiments/MinerU/magic_pdf/pipe/UNIPipe.py", line 30, in pipe_analyze
    self.model_list = doc_analyze(self.pdf_bytes, ocr=False,
    │    │            │           │    └ b'%PDF-1.5\r%\xe2\xe3\xcf\xd3\r\n1 0 obj<</Contents 2 0 R/Type/Page/Parent 112 0 R/Rotate 0/MediaBox[0.0 0.0 595.275574 841.8...
    │    │            │           └ <magic_pdf.pipe.UNIPipe.UNIPipe object at 0x7fd073273190>
    │    │            └ <function doc_analyze at 0x7fd075143250>
    │    └ []
    └ <magic_pdf.pipe.UNIPipe.UNIPipe object at 0x7fd073273190>

  File "/gwm-tmp/experiments/MinerU/magic_pdf/model/doc_analyze_by_custom_model.py", line 123, in doc_analyze
    custom_model = model_manager.get_model(ocr, show_log, lang)
                   │             │         │    │         └ None
                   │             │         │    └ False
                   │             │         └ False
                   │             └ <function ModelSingleton.get_model at 0x7fd0751431c0>
                   └ <magic_pdf.model.doc_analyze_by_custom_model.ModelSingleton object at 0x7fd07294ccd0>

  File "/gwm-tmp/experiments/MinerU/magic_pdf/model/doc_analyze_by_custom_model.py", line 74, in get_model
    self._models[key] = custom_model_init(ocr=ocr, show_log=show_log, lang=lang)
    │    │       │      │                     │             │              └ None
    │    │       │      │                     │             └ False
    │    │       │      │                     └ False
    │    │       │      └ <function custom_model_init at 0x7fd0751430a0>
    │    │       └ (False, False, None)
    │    └ {}
    └ <magic_pdf.model.doc_analyze_by_custom_model.ModelSingleton object at 0x7fd07294ccd0>

  File "/gwm-tmp/experiments/MinerU/magic_pdf/model/doc_analyze_by_custom_model.py", line 94, in custom_model_init
    from magic_pdf.model.pdf_extract_kit import CustomPEKModel

  File "<frozen importlib._bootstrap>", line 1027, in _find_and_load
  File "<frozen importlib._bootstrap>", line 1006, in _find_and_load_unlocked
  File "<frozen importlib._bootstrap>", line 688, in _load_unlocked
  File "<frozen importlib._bootstrap_external>", line 883, in exec_module
  File "<frozen importlib._bootstrap>", line 241, in _call_with_frames_removed

> File "/gwm-tmp/experiments/MinerU/magic_pdf/model/pdf_extract_kit.py", line 17, in <module>
    import torchtext

ModuleNotFoundError: No module named 'torchtext'
2024-10-15 08:14:09.548 | ERROR    | magic_pdf.model.pdf_extract_kit:<module>:31 - Required dependency not installed, please install by 
"pip install magic-pdf[full] --extra-index-url https://myhloli.github.io/wheels/"

执行解析命令的时候就报错,提示这个了

myhloli commented 2 weeks ago

No module named 'torchtext' 安装dev分支需要使用pip install -e .[full]以确保正确安装所有依赖环境

tao-xiaoxin commented 2 weeks ago

pip install -e .[full]

Obtaining file:///gwm-tmp/experiments/MinerU
  Installing build dependencies ... done
  Checking if build backend supports build_editable ... done
  Getting requirements to build editable ... done
  Preparing editable metadata (pyproject.toml) ... done
Requirement already satisfied: boto3>=1.28.43 in ./venv/lib/python3.10/site-packages (from magic_pdf==0.8.0) (1.35.40)
Requirement already satisfied: Brotli>=1.1.0 in ./venv/lib/python3.10/site-packages (from magic_pdf==0.8.0) (1.1.0)
Requirement already satisfied: click>=8.1.7 in ./venv/lib/python3.10/site-packages (from magic_pdf==0.8.0) (8.1.7)
Requirement already satisfied: fast-langdetect==0.2.0 in ./venv/lib/python3.10/site-packages (from magic_pdf==0.8.0) (0.2.0)
Requirement already satisfied: loguru>=0.6.0 in ./venv/lib/python3.10/site-packages (from magic_pdf==0.8.0) (0.7.2)
Requirement already satisfied: numpy<2.0.0,>=1.21.6 in ./venv/lib/python3.10/site-packages (from magic_pdf==0.8.0) (1.26.4)
Requirement already satisfied: pdfminer.six==20231228 in ./venv/lib/python3.10/site-packages (from magic_pdf==0.8.0) (20231228)
Requirement already satisfied: pydantic<2.8.0,>=2.7.2 in ./venv/lib/python3.10/site-packages (from magic_pdf==0.8.0) (2.7.4)
Requirement already satisfied: PyMuPDF>=1.24.9 in ./venv/lib/python3.10/site-packages (from magic_pdf==0.8.0) (1.24.11)
Requirement already satisfied: scikit-learn>=1.0.2 in ./venv/lib/python3.10/site-packages (from magic_pdf==0.8.0) (1.5.2)
Requirement already satisfied: wordninja>=2.0.0 in ./venv/lib/python3.10/site-packages (from magic_pdf==0.8.0) (2.0.0)
Requirement already satisfied: torch<=2.3.1,>=2.2.2 in ./venv/lib/python3.10/site-packages (from magic_pdf==0.8.0) (2.3.1)
Requirement already satisfied: transformers in ./venv/lib/python3.10/site-packages (from magic_pdf==0.8.0) (4.45.2)
Requirement already satisfied: fasttext-wheel>=0.9.2 in ./venv/lib/python3.10/site-packages (from fast-langdetect==0.2.0->magic_pdf==0.8.0) (0.9.2)
Requirement already satisfied: robust-downloader>=0.0.2 in ./venv/lib/python3.10/site-packages (from fast-langdetect==0.2.0->magic_pdf==0.8.0) (0.0.2)
Requirement already satisfied: langdetect>=1.0.9 in ./venv/lib/python3.10/site-packages (from fast-langdetect==0.2.0->magic_pdf==0.8.0) (1.0.9)
Requirement already satisfied: charset-normalizer>=2.0.0 in ./venv/lib/python3.10/site-packages (from pdfminer.six==20231228->magic_pdf==0.8.0) (3.4.0)
Requirement already satisfied: cryptography>=36.0.0 in ./venv/lib/python3.10/site-packages (from pdfminer.six==20231228->magic_pdf==0.8.0) (43.0.1)
Collecting unimernet==0.2.1 (from magic_pdf==0.8.0)
  Downloading http://nexus.gwm.cn/repository/pypi-group/packages/unimernet/0.2.1/unimernet-0.2.1-py3-none-any.whl (2.3 MB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 2.3/2.3 MB 1.3 MB/s eta 0:00:00
Collecting ultralytics (from magic_pdf==0.8.0)
  Downloading http://nexus.gwm.cn/repository/pypi-group/packages/ultralytics/8.3.13/ultralytics-8.3.13-py3-none-any.whl (870 kB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 870.5/870.5 kB 1.6 MB/s eta 0:00:00
Collecting paddleocr==2.7.3 (from magic_pdf==0.8.0)
  Downloading http://nexus.gwm.cn/repository/pypi-group/packages/paddleocr/2.7.3/paddleocr-2.7.3-py3-none-any.whl (780 kB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 780.0/780.0 kB 1.8 MB/s eta 0:00:00
Collecting pypandoc (from magic_pdf==0.8.0)
  Downloading http://nexus.gwm.cn/repository/pypi-group/packages/pypandoc/1.14/pypandoc-1.14-py3-none-any.whl (21 kB)
Collecting struct-eqtable==0.1.0 (from magic_pdf==0.8.0)
  Downloading http://nexus.gwm.cn/repository/pypi-group/packages/struct-eqtable/0.1.0/struct_eqtable-0.1.0-py3-none-any.whl (8.5 kB)
INFO: pip is looking at multiple versions of magic-pdf[full] to determine which version is compatible with other requirements. This could take a while.
ERROR: Could not find a version that satisfies the requirement detectron2; extra == "full" (from magic-pdf[full]) (from versions: none)
ERROR: No matching distribution found for detectron2; extra == "full"

提示这个错误了

myhloli commented 2 weeks ago

pip install -e .[full] --extra-index-url https://wheels.myhloli.com

tao-xiaoxin commented 1 week ago

pip install -e .[full] --extra-index-url https://wheels.myhloli.com

大佬,我升级了一下torch 版本,然后就出现了如下错误, pip3 install --upgrade torch torchvision torchaudio 这个不支持最新版的torch 吗? 据说最新的torch 版本加载推理模型会快一点

2024-10-18 05:21:26.085 | ERROR    | app.services.document_service.pdf_service:load_model:34 | process:6093 | thread:139703377499968 | /gwm-tmp/aitools/venv/lib/python3.10/site-packages/torchtext/lib/libtorchtext.so: undefined symbol: _ZN5torch3jit17parseSchemaOrNameERKSs
Traceback (most recent call last):

  File "<string>", line 1, in <module>
  File "/opt/conda/lib/python3.10/multiprocessing/spawn.py", line 116, in spawn_main
    exitcode = _main(fd, parent_sentinel)
               │     │   └ 4
               │     └ 7
               └ <function _main at 0x7f0e9755c550>
  File "/opt/conda/lib/python3.10/multiprocessing/spawn.py", line 129, in _main
    return self._bootstrap(parent_sentinel)
           │    │          └ 4
           │    └ <function BaseProcess._bootstrap at 0x7f0e975295a0>
           └ <SpawnProcess name='SpawnProcess-1' parent=6028 started>
  File "/opt/conda/lib/python3.10/multiprocessing/process.py", line 314, in _bootstrap
    self.run()
    │    └ <function BaseProcess.run at 0x7f0e97528c10>
    └ <SpawnProcess name='SpawnProcess-1' parent=6028 started>
  File "/opt/conda/lib/python3.10/multiprocessing/process.py", line 108, in run
    self._target(*self._args, **self._kwargs)
    │    │        │    │        │    └ {'config': <uvicorn.config.Config object at 0x7f0e97a6ffa0>, 'target': <bound method Server.run of <uvicorn.server.Server obj...
    │    │        │    │        └ <SpawnProcess name='SpawnProcess-1' parent=6028 started>
    │    │        │    └ ()
    │    │        └ <SpawnProcess name='SpawnProcess-1' parent=6028 started>
    │    └ <function subprocess_started at 0x7f0e9707b760>
    └ <SpawnProcess name='SpawnProcess-1' parent=6028 started>
  File "/gwm-tmp/aitools/venv/lib/python3.10/site-packages/uvicorn/_subprocess.py", line 80, in subprocess_started
    target(sockets=sockets)
    │              └ [<socket.socket fd=3, family=AddressFamily.AF_INET, type=SocketKind.SOCK_STREAM, proto=0, laddr=('127.0.0.1', 9992)>]
    └ <bound method Server.run of <uvicorn.server.Server object at 0x7f0e97996260>>
  File "/gwm-tmp/aitools/venv/lib/python3.10/site-packages/uvicorn/server.py", line 65, in run
    return asyncio.run(self.serve(sockets=sockets))
           │       │   │    │             └ [<socket.socket fd=3, family=AddressFamily.AF_INET, type=SocketKind.SOCK_STREAM, proto=0, laddr=('127.0.0.1', 9992)>]
           │       │   │    └ <function Server.serve at 0x7f0e9707a7a0>
           │       │   └ <uvicorn.server.Server object at 0x7f0e97996260>
           │       └ <function run at 0x7f0e9755dc60>
           └ <module 'asyncio' from '/opt/conda/lib/python3.10/asyncio/__init__.py'>
  File "/opt/conda/lib/python3.10/asyncio/runners.py", line 44, in run
    return loop.run_until_complete(main)
           │    │                  └ <coroutine object Server.serve at 0x7f0e9702fa70>
           │    └ <function BaseEventLoop.run_until_complete at 0x7f0e973d17e0>
           └ <_UnixSelectorEventLoop running=True closed=False debug=False>
  File "/opt/conda/lib/python3.10/asyncio/base_events.py", line 636, in run_until_complete
    self.run_forever()
    │    └ <function BaseEventLoop.run_forever at 0x7f0e973d1750>
    └ <_UnixSelectorEventLoop running=True closed=False debug=False>
  File "/opt/conda/lib/python3.10/asyncio/base_events.py", line 603, in run_forever
    self._run_once()
    │    └ <function BaseEventLoop._run_once at 0x7f0e973d3250>
    └ <_UnixSelectorEventLoop running=True closed=False debug=False>
  File "/opt/conda/lib/python3.10/asyncio/base_events.py", line 1909, in _run_once
    handle._run()
    │      └ <function Handle._run at 0x7f0e9736ac20>
    └ <Handle <TaskStepMethWrapper object at 0x7f0e9708f280>()>
  File "/opt/conda/lib/python3.10/asyncio/events.py", line 80, in _run
    self._context.run(self._callback, *self._args)
    │    │            │    │           │    └ <member '_args' of 'Handle' objects>
    │    │            │    │           └ <Handle <TaskStepMethWrapper object at 0x7f0e9708f280>()>
    │    │            │    └ <member '_callback' of 'Handle' objects>
    │    │            └ <Handle <TaskStepMethWrapper object at 0x7f0e9708f280>()>
    │    └ <member '_context' of 'Handle' objects>
    └ <Handle <TaskStepMethWrapper object at 0x7f0e9708f280>()>
  File "/gwm-tmp/aitools/venv/lib/python3.10/site-packages/uvicorn/server.py", line 69, in serve
    await self._serve(sockets)
          │    │      └ [<socket.socket fd=3, family=AddressFamily.AF_INET, type=SocketKind.SOCK_STREAM, proto=0, laddr=('127.0.0.1', 9992)>]
          │    └ <function Server._serve at 0x7f0e9707a830>
          └ <uvicorn.server.Server object at 0x7f0e97996260>
  File "/gwm-tmp/aitools/venv/lib/python3.10/site-packages/uvicorn/server.py", line 76, in _serve
    config.load()
    │      └ <function Config.load at 0x7f0e970237f0>
    └ <uvicorn.config.Config object at 0x7f0e97a6ffa0>
  File "/gwm-tmp/aitools/venv/lib/python3.10/site-packages/uvicorn/config.py", line 434, in load
    self.loaded_app = import_from_string(self.app)
    │                 │                  │    └ 'app.main:app'
    │                 │                  └ <uvicorn.config.Config object at 0x7f0e97a6ffa0>
    │                 └ <function import_from_string at 0x7f0e9732cf70>
    └ <uvicorn.config.Config object at 0x7f0e97a6ffa0>
  File "/gwm-tmp/aitools/venv/lib/python3.10/site-packages/uvicorn/importer.py", line 19, in import_from_string
    module = importlib.import_module(module_str)
             │         │             └ 'app.main'
             │         └ <function import_module at 0x7f0f39d6cf70>
             └ <module 'importlib' from '/opt/conda/lib/python3.10/importlib/__init__.py'>
  File "/opt/conda/lib/python3.10/importlib/__init__.py", line 126, in import_module
    return _bootstrap._gcd_import(name[level:], package, level)
           │          │           │    │        │        └ 0
           │          │           │    │        └ None
           │          │           │    └ 0
           │          │           └ 'app.main'
           │          └ <function _gcd_import at 0x7f0f39f83400>
           └ <module '_frozen_importlib' (frozen)>
  File "<frozen importlib._bootstrap>", line 1050, in _gcd_import
  File "<frozen importlib._bootstrap>", line 1027, in _find_and_load
  File "<frozen importlib._bootstrap>", line 1006, in _find_and_load_unlocked
  File "<frozen importlib._bootstrap>", line 688, in _load_unlocked
  File "<frozen importlib._bootstrap_external>", line 883, in exec_module
  File "<frozen importlib._bootstrap>", line 241, in _call_with_frames_removed

  File "/gwm-tmp/aitools/app/main.py", line 13, in <module>
    app = register_app()
          └ <function register_app at 0x7f0e96fe8310>

  File "/gwm-tmp/aitools/app/core/registrar.py", line 48, in register_app
    initialize_models()
    └ <function initialize_models at 0x7f0cf0818ee0>

  File "/gwm-tmp/aitools/app/core/model_registrar.py", line 62, in initialize_models
    model_registry.initialize_models()
    │              └ <function ModelRegistry.initialize_models at 0x7f0cf0819000>
    └ <app.core.model_registrar.ModelRegistry object at 0x7f0cf0811bd0>

  File "/gwm-tmp/aitools/app/core/model_registrar.py", line 49, in initialize_models
    pdf_service.load_model()
    │           └ <function PDFService.load_model at 0x7f0cf0975240>
    └ <app.services.document_service.pdf_service.PDFService object at 0x7f0cf09377c0>

> File "/gwm-tmp/aitools/app/services/document_service/pdf_service.py", line 28, in load_model
    txt_model = model_manager.get_model(False, False)
                │             └ <function ModelSingleton.get_model at 0x7f0cf183ed40>
                └ <magic_pdf.model.doc_analyze_by_custom_model.ModelSingleton object at 0x7f0cf7b16fe0>

  File "/gwm-tmp/aitools/venv/lib/python3.10/site-packages/magic_pdf/model/doc_analyze_by_custom_model.py", line 63, in get_model
    self._models[key] = custom_model_init(ocr=ocr, show_log=show_log)
    │    │       │      │                     │             └ False
    │    │       │      │                     └ False
    │    │       │      └ <function custom_model_init at 0x7f0cf183ec20>
    │    │       └ (False, False)
    │    └ {}
    └ <magic_pdf.model.doc_analyze_by_custom_model.ModelSingleton object at 0x7f0cf7b16fe0>
  File "/gwm-tmp/aitools/venv/lib/python3.10/site-packages/magic_pdf/model/doc_analyze_by_custom_model.py", line 83, in custom_model_init
    from magic_pdf.model.pdf_extract_kit import CustomPEKModel
  File "/gwm-tmp/aitools/venv/lib/python3.10/site-packages/magic_pdf/model/pdf_extract_kit.py", line 15, in <module>
    import torchtext
  File "/gwm-tmp/aitools/venv/lib/python3.10/site-packages/torchtext/__init__.py", line 18, in <module>
    from torchtext import _extension  # noqa: F401
  File "/gwm-tmp/aitools/venv/lib/python3.10/site-packages/torchtext/_extension.py", line 64, in <module>
    _init_extension()
    └ <function _init_extension at 0x7f097ded84c0>
  File "/gwm-tmp/aitools/venv/lib/python3.10/site-packages/torchtext/_extension.py", line 58, in _init_extension
    _load_lib("libtorchtext")
    └ <function _load_lib at 0x7f097ded8430>
  File "/gwm-tmp/aitools/venv/lib/python3.10/site-packages/torchtext/_extension.py", line 50, in _load_lib
    torch.ops.load_library(path)
    │     │   │            └ PosixPath('/gwm-tmp/aitools/venv/lib/python3.10/site-packages/torchtext/lib/libtorchtext.so')
    │     │   └ <function _Ops.load_library at 0x7f0dda3516c0>
    │     └ <module 'torch.ops' from '_ops.py'>
    └ <module 'torch' from '/gwm-tmp/aitools/venv/lib/python3.10/site-packages/torch/__init__.py'>
  File "/gwm-tmp/aitools/venv/lib/python3.10/site-packages/torch/_ops.py", line 1350, in load_library
    ctypes.CDLL(path)
    │      │    └ '/gwm-tmp/aitools/venv/lib/python3.10/site-packages/torchtext/lib/libtorchtext.so'
    │      └ <class 'ctypes.CDLL'>
    └ <module 'ctypes' from '/opt/conda/lib/python3.10/ctypes/__init__.py'>
  File "/opt/conda/lib/python3.10/ctypes/__init__.py", line 374, in __init__
    self._handle = _dlopen(self._name, mode)
    │    │         │       │    │      └ 0
    │    │         │       │    └ '/gwm-tmp/aitools/venv/lib/python3.10/site-packages/torchtext/lib/libtorchtext.so'
    │    │         │       └ <CDLL '/gwm-tmp/aitools/venv/lib/python3.10/site-packages/torchtext/lib/libtorchtext.so', handle 0 at 0x7f097deec1c0>
    │    │         └ <built-in function dlopen>
    │    └ 0
    └ <CDLL '/gwm-tmp/aitools/venv/lib/python3.10/site-packages/torchtext/lib/libtorchtext.so', handle 0 at 0x7f097deec1c0>

OSError: /gwm-tmp/aitools/venv/lib/python3.10/site-packages/torchtext/lib/libtorchtext.so: undefined symbol: _ZN5torch3jit17parseSchemaOrNameERKSs

myhloli commented 1 week ago

不支持最新的，换成2.3.1

tao-xiaoxin commented 1 week ago

不支持最新的，换成2.3.1

大佬,我在一个没有网的环境解析pdf ,解析报错,好像是在下载什么东西,能告诉我一下需要下载什么文件、放到什么路径下吗?

2024-10-22 03:21:30.393 | INFO     | magic_pdf.model.pdf_extract_kit:__init__:215 - using models_dir: /data/ai_models/pdf_models/opendatalab/pdf-extract-kit-1.0/models
2024-10-22 03:23:41.099 | ERROR    | magic_pdf.tools.cli:parse_doc:109 - HTTPSConnectionPool(host='api.github.com', port=443): Max retries exceeded with url: /repos/ultralytics/assets/releases/tags/v8.3.0 (Caused by ConnectTimeoutError(<urllib3.connection.HTTPSConnection object at 0x7f5d4c732ce0>, 'Connection to api.github.com timed out. (connect timeout=None)'))
Traceback (most recent call last):

  File "/gwm-tmp/experiments/MinerU/venv/lib/python3.10/site-packages/urllib3/connection.py", line 199, in _new_conn
    sock = connection.create_connection(
           │          └ <function create_connection at 0x7f61ea4d0820>
           └ <module 'urllib3.util.connection' from '/gwm-tmp/experiments/MinerU/venv/lib/python3.10/site-packages/urllib3/util/connection...
  File "/gwm-tmp/experiments/MinerU/venv/lib/python3.10/site-packages/urllib3/util/connection.py", line 85, in create_connection
    raise err
          └ None
  File "/gwm-tmp/experiments/MinerU/venv/lib/python3.10/site-packages/urllib3/util/connection.py", line 73, in create_connection
    sock.connect(sa)
    │    │       └ ('20.205.243.168', 443)
    │    └ <method 'connect' of '_socket.socket' objects>
    └ <socket.socket [closed] fd=-1, family=AddressFamily.AF_INET, type=SocketKind.SOCK_STREAM, proto=6>

TimeoutError: [Errno 110] Connection timed out

The above exception was the direct cause of the following exception:

Traceback (most recent call last):

  File "/gwm-tmp/experiments/MinerU/venv/lib/python3.10/site-packages/urllib3/connectionpool.py", line 789, in urlopen
    response = self._make_request(
               │    └ <function HTTPConnectionPool._make_request at 0x7f61ea40bbe0>
               └ <urllib3.connectionpool.HTTPSConnectionPool object at 0x7f5d4c732950>
  File "/gwm-tmp/experiments/MinerU/venv/lib/python3.10/site-packages/urllib3/connectionpool.py", line 490, in _make_request
    raise new_e
          └ ConnectTimeoutError(<urllib3.connection.HTTPSConnection object at 0x7f5d4c732ce0>, 'Connection to api.github.com timed out. (...
  File "/gwm-tmp/experiments/MinerU/venv/lib/python3.10/site-packages/urllib3/connectionpool.py", line 466, in _make_request
    self._validate_conn(conn)
    │    │              └ <urllib3.connection.HTTPSConnection object at 0x7f5d4c732ce0>
    │    └ <function HTTPSConnectionPool._validate_conn at 0x7f61ea424040>
    └ <urllib3.connectionpool.HTTPSConnectionPool object at 0x7f5d4c732950>
  File "/gwm-tmp/experiments/MinerU/venv/lib/python3.10/site-packages/urllib3/connectionpool.py", line 1095, in _validate_conn
    conn.connect()
    │    └ <function HTTPSConnection.connect at 0x7f61ea408430>
    └ <urllib3.connection.HTTPSConnection object at 0x7f5d4c732ce0>
  File "/gwm-tmp/experiments/MinerU/venv/lib/python3.10/site-packages/urllib3/connection.py", line 693, in connect
    self.sock = sock = self._new_conn()
    │    │             │    └ <function HTTPConnection._new_conn at 0x7f61ea413ac0>
    │    │             └ <urllib3.connection.HTTPSConnection object at 0x7f5d4c732ce0>
    │    └ None
    └ <urllib3.connection.HTTPSConnection object at 0x7f5d4c732ce0>
  File "/gwm-tmp/experiments/MinerU/venv/lib/python3.10/site-packages/urllib3/connection.py", line 208, in _new_conn
    raise ConnectTimeoutError(
          └ <class 'urllib3.exceptions.ConnectTimeoutError'>

urllib3.exceptions.ConnectTimeoutError: (<urllib3.connection.HTTPSConnection object at 0x7f5d4c732ce0>, 'Connection to api.github.com timed out. (connect timeout=None)')

The above exception was the direct cause of the following exception:

Traceback (most recent call last):

  File "/gwm-tmp/experiments/MinerU/venv/lib/python3.10/site-packages/requests/adapters.py", line 667, in send
    resp = conn.urlopen(
           │    └ <function HTTPConnectionPool.urlopen at 0x7f61ea40bd90>
           └ <urllib3.connectionpool.HTTPSConnectionPool object at 0x7f5d4c732950>
  File "/gwm-tmp/experiments/MinerU/venv/lib/python3.10/site-packages/urllib3/connectionpool.py", line 843, in urlopen
    retries = retries.increment(
              │       └ <function Retry.increment at 0x7f61ea4d2e60>
              └ Retry(total=0, connect=None, read=False, redirect=None, status=None)
  File "/gwm-tmp/experiments/MinerU/venv/lib/python3.10/site-packages/urllib3/util/retry.py", line 519, in increment
    raise MaxRetryError(_pool, url, reason) from reason  # type: ignore[arg-type]
          │             │      │    │            └ ConnectTimeoutError(<urllib3.connection.HTTPSConnection object at 0x7f5d4c732ce0>, 'Connection to api.github.com timed out. (...
          │             │      │    └ ConnectTimeoutError(<urllib3.connection.HTTPSConnection object at 0x7f5d4c732ce0>, 'Connection to api.github.com timed out. (...
          │             │      └ '/repos/ultralytics/assets/releases/tags/v8.3.0'
          │             └ <urllib3.connectionpool.HTTPSConnectionPool object at 0x7f5d4c732950>
          └ <class 'urllib3.exceptions.MaxRetryError'>

urllib3.exceptions.MaxRetryError: HTTPSConnectionPool(host='api.github.com', port=443): Max retries exceeded with url: /repos/ultralytics/assets/releases/tags/v8.3.0 (Caused by ConnectTimeoutError(<urllib3.connection.HTTPSConnection object at 0x7f5d4c732ce0>, 'Connection to api.github.com timed out. (connect timeout=None)'))

During handling of the above exception, another exception occurred:

Traceback (most recent call last):

  File "/gwm-tmp/experiments/MinerU/venv/bin/magic-pdf", line 8, in <module>
    sys.exit(cli())
    │   │    └ <Command cli>
    │   └ <built-in function exit>
    └ <module 'sys' (built-in)>
  File "/gwm-tmp/experiments/MinerU/venv/lib/python3.10/site-packages/click/core.py", line 1157, in __call__
    return self.main(*args, **kwargs)
           │    │     │       └ {}
           │    │     └ ()
           │    └ <function BaseCommand.main at 0x7f61eb0776d0>
           └ <Command cli>
  File "/gwm-tmp/experiments/MinerU/venv/lib/python3.10/site-packages/click/core.py", line 1078, in main
    rv = self.invoke(ctx)
         │    │      └ <click.core.Context object at 0x7f61eb271b40>
         │    └ <function Command.invoke at 0x7f61eb08c1f0>
         └ <Command cli>
  File "/gwm-tmp/experiments/MinerU/venv/lib/python3.10/site-packages/click/core.py", line 1434, in invoke
    return ctx.invoke(self.callback, **ctx.params)
           │   │      │    │           │   └ {'path': './downloaded_pdf.pdf', 'output_dir': './output', 'method': 'auto', 'lang': None, 'debug_able': False, 'start_page_i...
           │   │      │    │           └ <click.core.Context object at 0x7f61eb271b40>
           │   │      │    └ <function cli at 0x7f6097bfa7a0>
           │   │      └ <Command cli>
           │   └ <function Context.invoke at 0x7f61eb076ef0>
           └ <click.core.Context object at 0x7f61eb271b40>
  File "/gwm-tmp/experiments/MinerU/venv/lib/python3.10/site-packages/click/core.py", line 783, in invoke
    return __callback(*args, **kwargs)
                       │       └ {'path': './downloaded_pdf.pdf', 'output_dir': './output', 'method': 'auto', 'lang': None, 'debug_able': False, 'start_page_i...
                       └ ()

  File "/gwm-tmp/experiments/MinerU/magic_pdf/tools/cli.py", line 115, in cli
    parse_doc(path)
    │         └ './downloaded_pdf.pdf'
    └ <function cli.<locals>.parse_doc at 0x7f61eb2a88b0>

> File "/gwm-tmp/experiments/MinerU/magic_pdf/tools/cli.py", line 96, in parse_doc
    do_parse(
    └ <function do_parse at 0x7f61e67fee60>

  File "/gwm-tmp/experiments/MinerU/magic_pdf/tools/common.py", line 82, in do_parse
    pipe.pipe_analyze()
    │    └ <function UNIPipe.pipe_analyze at 0x7f6097bfa050>
    └ <magic_pdf.pipe.UNIPipe.UNIPipe object at 0x7f61eb2724a0>

  File "/gwm-tmp/experiments/MinerU/magic_pdf/pipe/UNIPipe.py", line 30, in pipe_analyze
    self.model_list = doc_analyze(self.pdf_bytes, ocr=False,
    │    │            │           │    └ b'%PDF-1.5\r\n%\xb5\xb5\xb5\xb5\r\n1 0 obj\r\n<</Type/Catalog/Pages 2 0 R/Lang(zh-CN) /StructTreeRoot 156 0 R/MarkInfo<</Mark...
    │    │            │           └ <magic_pdf.pipe.UNIPipe.UNIPipe object at 0x7f61eb2724a0>
    │    │            └ <function doc_analyze at 0x7f6098fea050>
    │    └ []
    └ <magic_pdf.pipe.UNIPipe.UNIPipe object at 0x7f61eb2724a0>

  File "/gwm-tmp/experiments/MinerU/magic_pdf/model/doc_analyze_by_custom_model.py", line 123, in doc_analyze
    custom_model = model_manager.get_model(ocr, show_log, lang)
                   │             │         │    │         └ None
                   │             │         │    └ False
                   │             │         └ False
                   │             └ <function ModelSingleton.get_model at 0x7f6098fe9fc0>
                   └ <magic_pdf.model.doc_analyze_by_custom_model.ModelSingleton object at 0x7f6096fd7670>

  File "/gwm-tmp/experiments/MinerU/magic_pdf/model/doc_analyze_by_custom_model.py", line 74, in get_model
    self._models[key] = custom_model_init(ocr=ocr, show_log=show_log, lang=lang)
    │    │       │      │                     │             │              └ None
    │    │       │      │                     │             └ False
    │    │       │      │                     └ False
    │    │       │      └ <function custom_model_init at 0x7f6098fe9ea0>
    │    │       └ (False, False, None)
    │    └ {}
    └ <magic_pdf.model.doc_analyze_by_custom_model.ModelSingleton object at 0x7f6096fd7670>

  File "/gwm-tmp/experiments/MinerU/magic_pdf/model/doc_analyze_by_custom_model.py", line 106, in custom_model_init
    custom_model = CustomPEKModel(**model_input)
                   │                └ {'ocr': False, 'show_log': False, 'models_dir': '/data/ai_models/pdf_models/opendatalab/pdf-extract-kit-1.0/models', 'device'...
                   └ <class 'magic_pdf.model.pdf_extract_kit.CustomPEKModel'>

  File "/gwm-tmp/experiments/MinerU/magic_pdf/model/pdf_extract_kit.py", line 223, in __init__
    self.mfd_model = atom_model_manager.get_atom_model(
    │                │                  └ <function AtomModelSingleton.get_atom_model at 0x7f5d4c72dc60>
    │                └ <magic_pdf.model.pdf_extract_kit.AtomModelSingleton object at 0x7f6097241f90>
    └ <magic_pdf.model.pdf_extract_kit.CustomPEKModel object at 0x7f6096fd60b0>

  File "/gwm-tmp/experiments/MinerU/magic_pdf/model/pdf_extract_kit.py", line 118, in get_atom_model
    self._models[atom_model_name] = atom_model_init(model_name=atom_model_name, **kwargs)
    │    │       │                  │                          │                  └ {'mfd_weights': '/data/ai_models/pdf_models/opendatalab/pdf-extract-kit-1.0/models/MFD/weights.pt'}
    │    │       │                  │                          └ 'mfd'
    │    │       │                  └ <function atom_model_init at 0x7f5d4c72d990>
    │    │       └ 'mfd'
    │    └ {}
    └ <magic_pdf.model.pdf_extract_kit.AtomModelSingleton object at 0x7f6097241f90>

  File "/gwm-tmp/experiments/MinerU/magic_pdf/model/pdf_extract_kit.py", line 131, in atom_model_init
    atom_model = mfd_model_init(
                 └ <function mfd_model_init at 0x7f5d4c72c8b0>

  File "/gwm-tmp/experiments/MinerU/magic_pdf/model/pdf_extract_kit.py", line 56, in mfd_model_init
    mfd_model = YOLO(weight)
                │    └ '/data/ai_models/pdf_models/opendatalab/pdf-extract-kit-1.0/models/MFD/weights.pt'
                └ <class 'ultralytics.models.yolo.model.YOLO'>

  File "/gwm-tmp/experiments/MinerU/venv/lib/python3.10/site-packages/ultralytics/models/yolo/model.py", line 23, in __init__
    super().__init__(model=model, task=task, verbose=verbose)
                           │           │             └ False
                           │           └ None
                           └ '/data/ai_models/pdf_models/opendatalab/pdf-extract-kit-1.0/models/MFD/weights.pt'
  File "/gwm-tmp/experiments/MinerU/venv/lib/python3.10/site-packages/ultralytics/engine/model.py", line 145, in __init__
    self._load(model, task=task)
    │    │     │           └ None
    │    │     └ '/data/ai_models/pdf_models/opendatalab/pdf-extract-kit-1.0/models/MFD/weights.pt'
    │    └ <function Model._load at 0x7f60858ada20>
    └ YOLO()
  File "/gwm-tmp/experiments/MinerU/venv/lib/python3.10/site-packages/ultralytics/engine/model.py", line 285, in _load
    self.model, self.ckpt = attempt_load_one_weight(weights)
    │    │      │    │      │                       └ '/data/ai_models/pdf_models/opendatalab/pdf-extract-kit-1.0/models/MFD/weights.pt'
    │    │      │    │      └ <function attempt_load_one_weight at 0x7f6085856a70>
    │    │      │    └ None
    │    │      └ YOLO()
    │    └ None
    └ YOLO()
  File "/gwm-tmp/experiments/MinerU/venv/lib/python3.10/site-packages/ultralytics/nn/tasks.py", line 910, in attempt_load_one_weight
    ckpt, weight = torch_safe_load(weight)  # load ckpt
                   │               └ '/data/ai_models/pdf_models/opendatalab/pdf-extract-kit-1.0/models/MFD/weights.pt'
                   └ <function torch_safe_load at 0x7f60858567a0>
  File "/gwm-tmp/experiments/MinerU/venv/lib/python3.10/site-packages/ultralytics/nn/tasks.py", line 815, in torch_safe_load
    file = attempt_download_asset(weight)  # search online if missing locally
           │                      └ '/data/ai_models/pdf_models/opendatalab/pdf-extract-kit-1.0/models/MFD/weights.pt'
           └ <function attempt_download_asset at 0x7f6085abad40>
  File "/gwm-tmp/experiments/MinerU/venv/lib/python3.10/site-packages/ultralytics/utils/downloads.py", line 457, in attempt_download_asset
    tag, assets = get_github_assets(repo, release)
                  │                 │     └ 'v8.3.0'
                  │                 └ 'ultralytics/assets'
                  └ <function get_github_assets at 0x7f6085abacb0>
  File "/gwm-tmp/experiments/MinerU/venv/lib/python3.10/site-packages/ultralytics/utils/downloads.py", line 402, in get_github_assets
    r = requests.get(url)  # github api
        │        │   └ 'https://api.github.com/repos/ultralytics/assets/releases/tags/v8.3.0'
        │        └ <function get at 0x7f6098520e50>
        └ <module 'requests' from '/gwm-tmp/experiments/MinerU/venv/lib/python3.10/site-packages/requests/__init__.py'>
  File "/gwm-tmp/experiments/MinerU/venv/lib/python3.10/site-packages/requests/api.py", line 73, in get
    return request("get", url, params=params, **kwargs)
           │              │           │         └ {}
           │              │           └ None
           │              └ 'https://api.github.com/repos/ultralytics/assets/releases/tags/v8.3.0'
           └ <function request at 0x7f60984cab00>
  File "/gwm-tmp/experiments/MinerU/venv/lib/python3.10/site-packages/requests/api.py", line 59, in request
    return session.request(method=method, url=url, **kwargs)
           │       │              │           │      └ {'params': None}
           │       │              │           └ 'https://api.github.com/repos/ultralytics/assets/releases/tags/v8.3.0'
           │       │              └ 'get'
           │       └ <function Session.request at 0x7f60985205e0>
           └ <requests.sessions.Session object at 0x7f60971508e0>
  File "/gwm-tmp/experiments/MinerU/venv/lib/python3.10/site-packages/requests/sessions.py", line 589, in request
    resp = self.send(prep, **send_kwargs)
           │    │    │       └ {'timeout': None, 'allow_redirects': True, 'proxies': OrderedDict(), 'stream': False, 'verify': True, 'cert': None}
           │    │    └ <PreparedRequest [GET]>
           │    └ <function Session.send at 0x7f6098520a60>
           └ <requests.sessions.Session object at 0x7f60971508e0>
  File "/gwm-tmp/experiments/MinerU/venv/lib/python3.10/site-packages/requests/sessions.py", line 703, in send
    r = adapter.send(request, **kwargs)
        │       │    │          └ {'timeout': None, 'proxies': OrderedDict(), 'stream': False, 'verify': True, 'cert': None}
        │       │    └ <PreparedRequest [GET]>
        │       └ <function HTTPAdapter.send at 0x7f60984f7eb0>
        └ <requests.adapters.HTTPAdapter object at 0x7f5d4c730850>
  File "/gwm-tmp/experiments/MinerU/venv/lib/python3.10/site-packages/requests/adapters.py", line 688, in send
    raise ConnectTimeout(e, request=request)
          │                         └ <PreparedRequest [GET]>
          └ <class 'requests.exceptions.ConnectTimeout'>

requests.exceptions.ConnectTimeout: HTTPSConnectionPool(host='api.github.com', port=443): Max retries exceeded with url: /repos/ultralytics/assets/releases/tags/v8.3.0 (Caused by ConnectTimeoutError(<urllib3.connection.HTTPSConnection object at 0x7f5d4c732ce0>, 'Connection to api.github.com timed out. (connect timeout=None)'))

myhloli commented 1 week ago

模型路径指定错了：从日志可以看到 models-dir 指向的是 pdf-extract-kit-1.0 的模型目录，而我们目前还没有兼容 PDF-Extract-Kit 1.0 版本的模型。

opendatalab / MinerU