Open victorjourne opened 3 months ago
Extrait de code de `extract_text_and_ocr` qui peut aider :
# Excerpt from extract_text_and_ocr: collect per-page content into `result`,
# reusing `current_cache` where possible and preparing a page image when the
# page has too little extractable text.
# NOTE(review): indentation was lost in the paste and is reconstructed here;
# `result`, `current_cache`, `ocr`, `clean_pdf_text`, `Iterable` and `Image`
# are defined outside this excerpt.
with pymupdf.open(pdf_path) as pdf_document:
    # "all" expands to every page number, 1-based (1..page_count inclusive).
    if selected_pages == "all":
        selected_pages = tuple(range(1, pdf_document.page_count + 1))
    # NOTE(review): this assert is skipped when Python runs with -O, so it
    # should not be relied on as input validation.
    assert isinstance(selected_pages, Iterable), f"{selected_pages=}"
    for page_num in selected_pages:
        # Pages already present in the cache are copied over as-is.
        if page_num in current_cache:
            result[page_num] = current_cache[page_num]
            continue
        # page_num is 1-based; the document API takes a 0-based page index.
        text = pdf_document.get_page_text(page_num - 1)
        # [0] keeps only the text part of what clean_pdf_text returns.
        # OCR is chosen when the stripped, cleaned text is at most 30
        # characters long and OCR has not been disabled with "no_ocr".
        use_OCR = len(clean_pdf_text(text)[0].strip()) <= 30 and ocr != "no_ocr"
        if not use_OCR:
            # Enough embedded text: store the raw (uncleaned) extracted text.
            page_content = text
            result[page_num] = page_content
        else:
            # Render the page to a raster image (0-based page index again).
            page = pdf_document.load_page(page_num - 1)
            # NOTE(review): 800 dpi is a high rendering resolution; confirm
            # the memory/time cost per page is acceptable.
            image = page.get_pixmap(dpi=800)
            # NOTE(review): assumes the pixmap samples are 3-channel RGB
            # without alpha, to match the "RGB" mode passed here; confirm.
            # The excerpt ends here, so what consumes image_pil is not shown.
            image_pil = Image.frombytes("RGB", [image.width, image.height], image.samples)
Deux options pour le PDF :