Closed xuboot closed 1 month ago
@VikParuchuri Hello, this is the configuration I changed. Can it be split into parallel processing so that it runs faster, and can it also handle concurrency?

from typing import Optional, List, Dict, Literal, Union

from dotenv import find_dotenv
# NOTE(review): `computed_field` must be the decorator exported by `pydantic`
# (v2). `pydantic_core.core_schema.computed_field` is a schema-builder function
# and cannot be used as a decorator. On pydantic v2, `BaseSettings` is provided
# by the separate `pydantic-settings` package - confirm the installed version.
from pydantic import BaseSettings, Extra, Field, computed_field
import torch
class Settings(BaseSettings):
    """Typed configuration for the PDF conversion pipeline.

    Each attribute is a settings field with a default value. The inner
    ``Config`` points ``env_file`` at whatever ``find_dotenv("local.env")``
    locates and sets ``extra = "ignore"``.

    Device- and dtype-related values (``TORCH_DEVICE_MODEL``, ``CUDA``,
    ``MODEL_DTYPE``, ``TEXIFY_DTYPE``) are derived per instance from
    ``TORCH_DEVICE`` and from what ``torch`` reports as available.
    """

    # General
    TORCH_DEVICE: Optional[str] = Field(None, env="TORCH_DEVICE")  # Let PyTorch decide the best device
    IMAGE_DPI: int = 96  # DPI to render images pulled from pdf at
    EXTRACT_IMAGES: bool = True  # Extract images from pdfs and save them

    @computed_field
    @property
    def TORCH_DEVICE_MODEL(self) -> str:
        """Return the device string the models should be placed on.

        An explicitly configured ``TORCH_DEVICE`` always wins; otherwise this
        is ``"cuda"`` when ``torch.cuda.is_available()`` and ``"cpu"`` if not.
        """
        # This is an instance property (not a class-level one) so that it sees
        # the TORCH_DEVICE value actually loaded for this settings instance.
        if self.TORCH_DEVICE is not None:
            return self.TORCH_DEVICE
        return "cuda" if torch.cuda.is_available() else "cpu"

    INFERENCE_RAM: int = 15  # Reduced to avoid OOM errors on T4
    VRAM_PER_TASK: float = 3.75  # Adjusted based on T4 VRAM availability
    DEFAULT_LANG: str = "English"  # Default language we assume files to be in

    SUPPORTED_FILETYPES: Dict[str, str] = {
        "application/pdf": "pdf",
    }

    # Text line Detection
    DETECTOR_BATCH_SIZE: Optional[int] = Field(None, env="DETECTOR_BATCH_SIZE")
    SURYA_DETECTOR_DPI: int = 96
    DETECTOR_POSTPROCESSING_CPU_WORKERS: int = 4

    # OCR
    # NOTE(review): the second entry is an empty string here; it may be a
    # character that was lost when this file was copied - confirm.
    INVALID_CHARS: List[str] = [chr(0xfffd), ""]
    OCR_ENGINE: Optional[Literal["surya", "ocrmypdf"]] = Field("ocrmypdf", env="OCR_ENGINE")
    OCR_ALL_PAGES: bool = False  # Run OCR on every page even if text can be extracted

    # Surya
    SURYA_OCR_DPI: int = 96
    RECOGNITION_BATCH_SIZE: Optional[int] = Field(None, env="RECOGNITION_BATCH_SIZE")

    # Tesseract
    OCR_PARALLEL_WORKERS: int = 2  # How many CPU workers to use for OCR
    TESSERACT_TIMEOUT: int = 20  # When to give up on OCR
    TESSDATA_PREFIX: str = ""

    # Texify model
    TEXIFY_MODEL_MAX: int = 384  # Max inference length for texify
    TEXIFY_TOKEN_BUFFER: int = 256  # Number of tokens to buffer above max for texify
    TEXIFY_DPI: int = 96  # DPI to render images at
    TEXIFY_BATCH_SIZE: Optional[int] = Field(None, env="TEXIFY_BATCH_SIZE")
    TEXIFY_MODEL_NAME: str = "vikp/texify"

    # Layout model
    SURYA_LAYOUT_DPI: int = 96
    BAD_SPAN_TYPES: List[str] = [
        "Caption",
        "Footnote",
        "Page-footer",
        "Page-header",
        "Picture",
    ]
    LAYOUT_MODEL_CHECKPOINT: str = "vikp/surya_layout2"
    BBOX_INTERSECTION_THRESH: float = 0.7  # How much the layout and pdf bboxes need to overlap to be the same
    LAYOUT_BATCH_SIZE: Optional[int] = Field(None, env="LAYOUT_BATCH_SIZE")

    # Ordering model
    SURYA_ORDER_DPI: int = 96
    ORDER_BATCH_SIZE: Optional[int] = Field(None, env="ORDER_BATCH_SIZE")
    ORDER_MAX_BBOXES: int = 255

    # Final editing model
    EDITOR_BATCH_SIZE: Optional[int] = Field(None, env="EDITOR_BATCH_SIZE")
    EDITOR_MAX_LENGTH: int = 1024
    EDITOR_MODEL_NAME: str = "vikp/pdf_postprocessor_t5"
    ENABLE_EDITOR_MODEL: bool = False  # The editor model can create false positives
    EDITOR_CUTOFF_THRESH: float = 0.9  # Ignore predictions below this probability

    # Ray
    RAY_CACHE_PATH: Optional[str] = Field(None, env="RAY_CACHE_PATH")
    RAY_CORES_PER_WORKER: int = 1  # How many cpu cores to allocate per worker

    # Debug
    DEBUG: bool = Field(False, env="DEBUG")
    DEBUG_DATA_FOLDER: Optional[str] = Field(None, env="DEBUG_DATA_FOLDER")
    DEBUG_LEVEL: int = 0  # 0 to 2, 2 means log everything

    @computed_field
    @property
    def CUDA(self) -> bool:
        """Return True when the resolved model device string contains "cuda"."""
        return "cuda" in self.TORCH_DEVICE_MODEL

    @computed_field
    @property
    def MODEL_DTYPE(self) -> torch.dtype:
        """Return ``torch.bfloat16`` on CUDA devices and ``torch.float32`` otherwise."""
        if self.CUDA:
            return torch.bfloat16
        else:
            return torch.float32

    @computed_field
    @property
    def TEXIFY_DTYPE(self) -> torch.dtype:
        """Return ``torch.float32`` on "cpu" and ``torch.float16`` on any other device."""
        return torch.float32 if self.TORCH_DEVICE_MODEL == "cpu" else torch.float16

    class Config:
        env_file = find_dotenv("local.env")
        extra = "ignore"
# Shared module-level instance, built once when this module is imported.
settings = Settings()
You can run in parallel using the `marker` or `chunk_convert` scripts; see the README.
The processing is still very slow. Is the entire program executed in parallel or serially?