getomni-ai / zerox

Zero shot pdf OCR with gpt-4o-mini
https://getomni.ai/ocr-demo
MIT License
2.9k stars 132 forks source link

Python example script gives error on "None" select_pages #57

Open swombat opened 10 hours ago

swombat commented 10 hours ago
from pyzerox import zerox
import os
import json
import asyncio
import argparse

# Parse command-line arguments: a single positional argument naming the input.
parser = argparse.ArgumentParser(description='Process a PDF file with zerox.')
parser.add_argument('file_path', type=str, help='The file path or URL of the PDF to process.')
args = parser.parse_args()

# Use the file_path from the command-line arguments
file_path = args.file_path

### Model Setup (Use only Vision Models) Refer: https://docs.litellm.ai/docs/providers ###

## Placeholder for additional model kwargs which might be required for some
## models; main() forwards this dict to zerox() as **kwargs.
kwargs = {}

## System prompt to use for the vision model; None is passed to zerox() as-is.
custom_system_prompt = None

# To override, uncomment and edit the line below:
# custom_system_prompt = "For the below pdf page, do something..something..." ## example

###################### Example for OpenAI ######################
model = "gpt-4o-mini" ## openai model
# Placeholder value -- presumably the real key was redacted for posting.
os.environ["OPENAI_API_KEY"] = "*snip*"

###################### For other providers refer: https://docs.litellm.ai/docs/providers ######################

# Define main async entrypoint
async def main():
    """Call zerox on ``file_path`` with the module-level settings and return its result."""
    ## process only some pages or all
    ## NOTE: with the pyzerox version shown in this report's traceback, None
    ## reaches sorted(select_pages) inside zerox and raises TypeError.
    select_pages = None ## None for all, but could be int or list(int) page numbers (1 indexed)

    output_dir = "./output_test" ## directory to save the consolidated markdown file
    result = await zerox(file_path=file_path, model=model, output_dir=output_dir,
                        custom_system_prompt=custom_system_prompt,select_pages=select_pages, **kwargs)
    return result

# Run the async entrypoint to completion; asyncio.run hands back main()'s return value.
result = asyncio.run(main())

# Print whatever zerox returned (described by the author as the markdown result).
print(result)

I adjusted the script slightly to just use OpenAI and then added a command line argument parser.

When I run it on a test.pdf in the current folder I get:

✗ python3.11 lib/scripts/zerox.py test.pdf
Traceback (most recent call last):
  File "/Users/danieltenner/dev/bamboo/lib/scripts/zerox.py", line 44, in <module>
    result = asyncio.run(main())
             ^^^^^^^^^^^^^^^^^^^
  File "/Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/asyncio/runners.py", line 190, in run
    return runner.run(main)
           ^^^^^^^^^^^^^^^^
  File "/Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/asyncio/runners.py", line 118, in run
    return self._loop.run_until_complete(task)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/asyncio/base_events.py", line 653, in run_until_complete
    return future.result()
           ^^^^^^^^^^^^^^^
  File "/Users/danieltenner/dev/bamboo/lib/scripts/zerox.py", line 38, in main
    result = await zerox(file_path=file_path, model=model, output_dir=output_dir,
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/site-packages/pyzerox/core/zerox.py", line 91, in zerox
    select_pages = sorted(select_pages)
                   ^^^^^^^^^^^^^^^^^^^^
TypeError: 'NoneType' object is not iterable

There's probably an easy fix for this, but I guess a lot of people might run into it, so it might be worth documenting. I'm on a Mac running Sonoma 14.3.1. I usually write Ruby code, which is why I put the script in lib/scripts/ in my Rails app.

swombat commented 10 hours ago

So if I set the select_pages to something like:

select_pages = list(range(1, 10))

It passes. But obviously I won't know the number of pages ahead of time. If I just give a high number of pages (e.g. 100) I get:

  File "/Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/site-packages/pyzerox/processor/utils.py", line 94, in create_selected_pages_pdf
    raise PageNumberOutOfBoundError(extra_info={"input_pdf_num_pages":total_pages,
pyzerox.errors.exceptions.PageNumberOutOfBoundError: 
    The page number(s) provided is out of bound. Please provide a valid page number(s).
     (Extra Info: {'input_pdf_num_pages': 11, 'select_pages': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99], 'invalid_page_numbers': [12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99]})
swombat commented 9 hours ago

OK, so I got the script to work with PDFs. But despite the docs claiming that Zerox is able to use LibreOffice and GraphicsMagick to convert docx/etc. to image-based PDFs, I could not get that working.

Here is my script, which works on Mac after pip-installing py-zerox, python-docx, odfpy, openpyxl, and python-pptx.

from pyzerox import zerox
import os
import json
import asyncio
import argparse
from PyPDF2 import PdfReader
from docx import Document
from odf.opendocument import load as load_odt
from odf.text import P
import csv

def get_file_page_count(file_path):
    """Return a page count, or a rough stand-in for one, for *file_path*.

    Only PDFs yield a true page count. For every other handled format the
    value is a proxy (paragraphs, lines, or rows) and will generally be
    larger than the number of rendered pages.

    Args:
        file_path: Path to a local file. The format is picked from the file
            extension, compared case-insensitively.

    Returns:
        int: Pages for ``.pdf``; paragraphs for ``.docx``/``.odt``/``.ott``;
        lines for ``.txt``; rows for ``.csv``/``.tsv``.

    Raises:
        ValueError: If the extension is not one of the formats above.
    """
    extension = os.path.splitext(file_path)[1].lower()

    if extension == '.pdf':
        with open(file_path, 'rb') as file:
            return len(PdfReader(file).pages)

    if extension == '.docx':
        return len(Document(file_path).paragraphs)  # Approximation for pages

    if extension in ('.odt', '.ott'):
        return len(load_odt(file_path).getElementsByType(P))  # Approximation for pages

    if extension == '.txt':
        with open(file_path, 'r') as file:
            # Count lazily instead of loading every line into memory.
            return sum(1 for _ in file)  # Lines as a proxy for pages

    if extension in ('.csv', '.tsv'):
        # The csv module requires newline='' so that it, not the text layer,
        # interprets line endings (including ones embedded in quoted fields).
        with open(file_path, 'r', newline='') as file:
            delimiter = ',' if extension == '.csv' else '\t'
            return sum(1 for _ in csv.reader(file, delimiter=delimiter))  # Rows as a proxy for pages

    # Add more branches for other formats like .doc, .xls, .ppt, etc.
    raise ValueError(f"Unsupported file format: {extension}")

# Parse command-line arguments
parser = argparse.ArgumentParser(description='Process a PDF file with zerox.')
# The path is handed to os.path.splitext()/open() by get_file_page_count, so
# only local files work here -- the help text no longer promises URL support.
parser.add_argument('file_path', type=str, help='The path of the local file to process.')
parser.add_argument('--open_ai_key', type=str, required=True, help='The OpenAI API key.')
parser.add_argument('--model', type=str, required=False, help='The model to use. (default gpt-4o-mini)')
# This value is passed to zerox() as output_dir: it is a directory, not a file name.
parser.add_argument('--output_dir', type=str, required=False, help='The output directory. (default ./output_test)')
args = parser.parse_args()

# Use the file_path from the command-line arguments
file_path = args.file_path

### Model Setup (Use only Vision Models) Refer: https://docs.litellm.ai/docs/providers ###

## placeholder for additional model kwargs which might be required for some models
kwargs = {}

## system prompt to use for the vision model
custom_system_prompt = None

# to override
# custom_system_prompt = "For the below pdf page, do something..something..." ## example

###################### Example for OpenAI ######################
## openai model; a missing or empty --model falls back to the default
model = args.model or "gpt-4o-mini"
os.environ["OPENAI_API_KEY"] = args.open_ai_key

###################### For other providers refer: https://docs.litellm.ai/docs/providers ######################

# Define main async entrypoint
async def main():
    """Run zerox over every counted page of ``file_path`` and return its result."""
    # Explicit 1-indexed page list spanning the whole document.
    page_total = get_file_page_count(file_path)
    select_pages = list(range(1, page_total + 1))

    # Directory to save the consolidated markdown file.
    output_dir = args.output_dir or "./output_test"

    return await zerox(
        file_path=file_path,
        model=model,
        output_dir=output_dir,
        custom_system_prompt=custom_system_prompt,
        select_pages=select_pages,
        **kwargs,
    )

# Run the async entrypoint to completion; asyncio.run hands back main()'s return value.
result = asyncio.run(main())

# Print whatever zerox returned (described by the author as the markdown result).
print(result)

Unfortunately, despite the hopeful start, it only seems to be able to process PDFs, even with libreoffice and graphicsmagick installed via brew.

swombat commented 8 hours ago

Alright, done!

I'm a Ruby/Rails coder (though I've dabbled in Python) and using this in the context of a Rails app so please forgive the hackiness, but here's a script that will correctly take in most of the listed formats (I skipped txt because why would you need to OCR that??) and convert it to PDF (no need to convert it into images afaict), and then OCR it using Zerox. Temp files go into ./tmp/zerox/ by default. Model, OpenAI key, and even the custom system prompt can be specified as arguments.

Just pass in the filename like:

python3.11 lib/scripts/zerox.py test-pdf.pdf --open_ai_key=<your key>

Here's the full script. Use/abuse as you see fit.

Requirements (Linux):

apt-get install libreoffice
pip install py-zerox PyPDF2

Requirements (Mac):

brew install libreoffice
pip install py-zerox PyPDF2

And finally the script:

from pyzerox import zerox
import os
import asyncio
import argparse
from PyPDF2 import PdfReader
import subprocess
import shutil

def get_pdf_page_range(file_path):
    """Return ``[1, 2, ..., N]`` for the N-page PDF stored at *file_path*."""
    with open(file_path, 'rb') as pdf_file:
        page_total = len(PdfReader(pdf_file).pages)
    # Page numbers are 1-based and inclusive of the last page.
    return list(range(1, page_total + 1))

def get_libreoffice_command():
    """Return the name of the LibreOffice executable available on ``PATH``.

    ``libreoffice`` is tried first and ``soffice`` second.

    Raises:
        FileNotFoundError: If neither executable can be located.
    """
    # Order matters: the first candidate that resolves wins.
    for candidate in ('libreoffice', 'soffice'):
        if shutil.which(candidate):
            return candidate
    raise FileNotFoundError("Neither 'libreoffice' nor 'soffice' was found on the system.")

def convert_to_pdf(source_path, output_dir):
    """Convert a document to PDF using headless LibreOffice.

    Args:
        source_path: Path of the document to convert (any format the
            installed LibreOffice can open, not only DOCX).
        output_dir: Directory passed to LibreOffice via ``--outdir``.

    Returns:
        The path of the generated PDF -- ``output_dir`` joined with the
        source's base name and a ``.pdf`` suffix -- or ``None`` if the
        conversion failed or produced no file.

    Raises:
        FileNotFoundError: Propagated from ``get_libreoffice_command`` when
            no LibreOffice executable is available.
    """
    print(f"Converting {source_path} to PDF...")
    command = [
        get_libreoffice_command(), '--headless', '--convert-to', 'pdf',
        '--outdir', output_dir, source_path,
    ]
    # Keep the try body to the one call whose failure is handled here.
    try:
        subprocess.run(command, check=True)
    except subprocess.CalledProcessError as e:
        print(f"Error converting {source_path} to PDF: {e}")
        return None

    pdf_filename = os.path.splitext(os.path.basename(source_path))[0] + '.pdf'
    pdf_path = os.path.join(output_dir, pdf_filename)

    # A zero exit status alone does not prove a PDF was written (LibreOffice
    # may report success even when it could not load the source), so confirm
    # the file exists before handing the path back.
    if not os.path.isfile(pdf_path):
        print(f"Error converting {source_path} to PDF: {pdf_path} was not created.")
        return None

    print(f"Converted {source_path} to PDF successfully.")
    return pdf_path

# Parse command-line arguments
parser = argparse.ArgumentParser(description='Convert a document to PDF if needed and OCR it with zerox.')
# The path is inspected with os.path.splitext() and later opened locally to
# count pages, so the help text only promises local files.
parser.add_argument('file_path', type=str, help='The path of the PDF or office document to process.')
# Falling back to the environment keeps the key out of shell history and the
# process list; passing --open_ai_key explicitly still works as before.
parser.add_argument('--open_ai_key', type=str, default=os.environ.get("OPENAI_API_KEY"),
                    help='The OpenAI API key. (default: the OPENAI_API_KEY environment variable)')
parser.add_argument('--model', type=str, required=False, help='The model to use. (default gpt-4o-mini)')
# This value is passed to zerox() as output_dir and to LibreOffice as --outdir:
# it is a directory, not a file name.
parser.add_argument('--output_dir', type=str, required=False, help='The output directory, also used for converted PDFs. (default ./tmp/zerox)')
parser.add_argument('--custom_system_prompt', type=str, required=False, help='The custom prompt to use. (default None)')
args = parser.parse_args()

if not args.open_ai_key:
    # Same exit path argparse uses for a missing required argument.
    parser.error("an OpenAI API key is required: pass --open_ai_key or set OPENAI_API_KEY")

# Use the file_path from the command-line arguments
file_path = args.file_path

# Formats that are first converted to PDF with LibreOffice
convertible_formats = [
    "doc", "docx", "odt", "ott", "rtf", "html", "htm", "xml", "wps", "wpd",
    "xls", "xlsx", "ods", "ots", "csv", "tsv", "ppt", "pptx", "odp", "otp"
]

## directory to save the consolidated markdown file (and any converted PDF)
output_dir = args.output_dir or "./tmp/zerox"

# Lower-cased extension without the leading dot, computed once.
_extension = os.path.splitext(file_path)[1].lower().lstrip('.')

if _extension in convertible_formats:
    file_path = convert_to_pdf(file_path, output_dir)
    if file_path is None:
        # convert_to_pdf already printed the reason; stop here instead of
        # letting None reach open() further down.
        raise SystemExit(1)
elif _extension != 'pdf':
    print(f"File {file_path} with extension {_extension} is not convertible to PDF.")
    # SystemExit directly: the exit() helper is injected by the site module
    # and is not guaranteed to exist.
    raise SystemExit(1)

### Model Setup (Use only Vision Models) Refer: https://docs.litellm.ai/docs/providers ###

## placeholder for additional model kwargs which might be required for some models
kwargs = {}

## system prompt to use for the vision model (empty string is treated as None)
custom_system_prompt = args.custom_system_prompt or None

###################### Example for OpenAI ######################
## openai model; a missing or empty --model falls back to the default
model = args.model or "gpt-4o-mini"
os.environ["OPENAI_API_KEY"] = args.open_ai_key

###################### For other providers refer: https://docs.litellm.ai/docs/providers ######################

# Define main async entrypoint
async def main():
    """OCR every page of ``file_path`` with zerox and return what zerox returns."""
    # Explicit list of all 1-based page numbers in the PDF.
    every_page = get_pdf_page_range(file_path)

    return await zerox(
        file_path=file_path,
        model=model,
        output_dir=output_dir,
        custom_system_prompt=custom_system_prompt,
        select_pages=every_page,
        **kwargs,
    )

# Run the async entrypoint to completion; asyncio.run hands back main()'s return value.
result = asyncio.run(main())

# Print whatever zerox returned (described by the author as the markdown result).
print(result)