VikParuchuri / marker

Convert PDF to markdown quickly with high accuracy
https://www.datalab.to
GNU General Public License v3.0
14.15k stars 720 forks source link

The processing is still very slow. Is the entire program executed in parallel or serially? #125

Closed xuboot closed 1 month ago

xuboot commented 1 month ago

The processing is still very slow. Is the entire program executed in parallel or serially?

xuboot commented 1 month ago

@VikParuchuri Hello, this is the configuration I changed. Can it be split into parallel processing so that it is faster, and can it also support concurrency?

from typing import Optional, List, Dict, Literal, Union
from dotenv import find_dotenv
from pydantic import BaseSettings, Extra, Field
import torch
from pydantic_core.core_schema import computed_field

class Settings(BaseSettings):
    """Configuration values for the PDF-to-markdown pipeline.

    Plain attributes carry hard-coded defaults. Attributes declared with
    ``Field(..., env=...)`` name the environment variable associated with
    them, and ``Config.env_file`` points at a ``local.env`` file located
    with ``find_dotenv``.

    The derived values (``TORCH_DEVICE_MODEL``, ``CUDA``, ``MODEL_DTYPE``,
    ``TEXIFY_DTYPE``) are read-only properties computed from the fields of
    the instance they are accessed on.
    """

    # General
    TORCH_DEVICE: Optional[str] = Field(None, env="TORCH_DEVICE")  # Let PyTorch decide the best device
    IMAGE_DPI: int = 96  # DPI to render images pulled from pdf at
    EXTRACT_IMAGES: bool = True  # Extract images from pdfs and save them

    # Instance-level property, like CUDA / MODEL_DTYPE / TEXIFY_DTYPE below,
    # so that it reads the TORCH_DEVICE value held by this instance.
    @computed_field
    @property
    def TORCH_DEVICE_MODEL(self) -> str:
        """Return the device string the models should run on.

        An explicitly configured ``TORCH_DEVICE`` is returned unchanged;
        otherwise ``"cuda"`` is chosen when ``torch.cuda.is_available()``
        reports a usable GPU, and ``"cpu"`` is the fallback.
        """
        if self.TORCH_DEVICE is not None:
            return self.TORCH_DEVICE
        return "cuda" if torch.cuda.is_available() else "cpu"

    INFERENCE_RAM: int = 15  # Reduced to avoid OOM errors on T4
    VRAM_PER_TASK: float = 3.75  # Adjusted based on T4 VRAM availability
    DEFAULT_LANG: str = "English"  # Default language we assume files to be in

    # Maps an accepted MIME type to the short file-type name used for it.
    SUPPORTED_FILETYPES: Dict[str, str] = {
        "application/pdf": "pdf",
    }

    # Text line Detection
    DETECTOR_BATCH_SIZE: Optional[int] = Field(None, env="DETECTOR_BATCH_SIZE")
    SURYA_DETECTOR_DPI: int = 96
    DETECTOR_POSTPROCESSING_CPU_WORKERS: int = 4

    # OCR
    # NOTE(review): the empty-string entry looks like a character that was
    # lost when this config was copied — confirm the intended value.
    INVALID_CHARS: List[str] = [chr(0xfffd), ""]
    OCR_ENGINE: Optional[Literal["surya", "ocrmypdf"]] = Field("ocrmypdf", env="OCR_ENGINE")
    OCR_ALL_PAGES: bool = False  # Run OCR on every page even if text can be extracted

    # Surya
    SURYA_OCR_DPI: int = 96
    RECOGNITION_BATCH_SIZE: Optional[int] = Field(None, env="RECOGNITION_BATCH_SIZE")

    # Tesseract
    OCR_PARALLEL_WORKERS: int = 2  # How many CPU workers to use for OCR
    TESSERACT_TIMEOUT: int = 20  # When to give up on OCR
    TESSDATA_PREFIX: str = ""

    # Texify model
    TEXIFY_MODEL_MAX: int = 384  # Max inference length for texify
    TEXIFY_TOKEN_BUFFER: int = 256  # Number of tokens to buffer above max for texify
    TEXIFY_DPI: int = 96  # DPI to render images at
    TEXIFY_BATCH_SIZE: Optional[int] = Field(None, env="TEXIFY_BATCH_SIZE")
    TEXIFY_MODEL_NAME: str = "vikp/texify"

    # Layout model
    SURYA_LAYOUT_DPI: int = 96
    BAD_SPAN_TYPES: List[str] = [
        "Caption",
        "Footnote",
        "Page-footer",
        "Page-header",
        "Picture",
    ]
    LAYOUT_MODEL_CHECKPOINT: str = "vikp/surya_layout2"
    BBOX_INTERSECTION_THRESH: float = 0.7  # How much the layout and pdf bboxes need to overlap to be the same
    LAYOUT_BATCH_SIZE: Optional[int] = Field(None, env="LAYOUT_BATCH_SIZE")

    # Ordering model
    SURYA_ORDER_DPI: int = 96
    ORDER_BATCH_SIZE: Optional[int] = Field(None, env="ORDER_BATCH_SIZE")
    ORDER_MAX_BBOXES: int = 255

    # Final editing model
    EDITOR_BATCH_SIZE: Optional[int] = Field(None, env="EDITOR_BATCH_SIZE")
    EDITOR_MAX_LENGTH: int = 1024
    EDITOR_MODEL_NAME: str = "vikp/pdf_postprocessor_t5"
    ENABLE_EDITOR_MODEL: bool = False  # The editor model can create false positives
    EDITOR_CUTOFF_THRESH: float = 0.9  # Ignore predictions below this probability

    # Ray
    RAY_CACHE_PATH: Optional[str] = Field(None, env="RAY_CACHE_PATH")
    RAY_CORES_PER_WORKER: int = 1  # How many cpu cores to allocate per worker

    # Debug
    DEBUG: bool = Field(False, env="DEBUG")
    DEBUG_DATA_FOLDER: Optional[str] = Field(None, env="DEBUG_DATA_FOLDER")
    DEBUG_LEVEL: int = 0  # 0 to 2, 2 means log everything

    @computed_field
    @property
    def CUDA(self) -> bool:
        """Return True when the resolved device string contains ``"cuda"``."""
        return "cuda" in self.TORCH_DEVICE_MODEL

    @computed_field
    @property
    def MODEL_DTYPE(self) -> torch.dtype:
        """Return ``torch.bfloat16`` on a CUDA device, ``torch.float32`` otherwise."""
        if self.CUDA:
            return torch.bfloat16
        else:
            return torch.float32

    @computed_field
    @property
    def TEXIFY_DTYPE(self) -> torch.dtype:
        """Return ``torch.float32`` when the device is exactly ``"cpu"``, else ``torch.float16``."""
        return torch.float32 if self.TORCH_DEVICE_MODEL == "cpu" else torch.float16

    class Config:
        # Path of the "local.env" file as located by find_dotenv.
        env_file = find_dotenv("local.env")
        extra = "ignore"

# Create an instance of the settings

# Module-level Settings instance, constructed with no explicit overrides.
settings = Settings()

VikParuchuri commented 1 month ago

You can run in parallel using the `marker` or `chunk_convert` scripts; see the README.