nlmatics / nlm-ingestor

This repo provides the server-side code that the llmsherpa API connects to. It includes parsers for various file formats.
https://www.nlmatics.com
Apache License 2.0
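For reference, a typical client talks to this server through llmsherpa's LayoutPDFReader. The sketch below is illustrative only (not from this issue): it assumes the ingestor is reachable at the usual llmsherpa parseDocument URL (adjust host/port to your deployment), and "sample.pdf" is a placeholder path.

from llmsherpa.readers import LayoutPDFReader

# Assumes a locally running nlm-ingestor exposing the parseDocument endpoint;
# the URL and "sample.pdf" path are placeholders for illustration.
llmsherpa_api_url = "http://localhost:5010/api/parseDocument?renderFormat=all"
pdf_reader = LayoutPDFReader(llmsherpa_api_url)
doc = pdf_reader.read_pdf("sample.pdf")

# Print the text of each parsed chunk
for chunk in doc.chunks():
    print(chunk.to_context_text())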

For anyone hoping to deploy this as a lambda #56

Open dgonier opened 2 months ago

dgonier commented 2 months ago

Dockerfile

# syntax=docker/dockerfile:experimental

FROM python:3.11-bookworm

RUN apt-get update && apt-get -y --no-install-recommends install libgomp1

ENV APP_HOME /app

# install Java
RUN mkdir -p /usr/share/man/man1 && \
    apt-get update -y && \
    apt-get install -y openjdk-17-jre-headless

# install essential packages
RUN apt-get install -y \
    libxml2-dev libxslt-dev \
    build-essential libmagic-dev

# install tesseract
RUN apt-get install -y \
    tesseract-ocr \
    lsb-release \
    && echo "deb https://notesalexp.org/tesseract-ocr5/$(lsb_release -cs)/ $(lsb_release -cs) main" | tee /etc/apt/sources.list.d/notesalexp.list > /dev/null \
    && apt-get update -oAcquire::AllowInsecureRepositories=true \
    && apt-get install notesalexp-keyring -oAcquire::AllowInsecureRepositories=true -y --allow-unauthenticated \
    && apt-get update \
    && apt-get install -y \
    tesseract-ocr libtesseract-dev \
    && wget -P /usr/share/tesseract-ocr/5/tessdata/ https://github.com/tesseract-ocr/tessdata/raw/main/eng.traineddata

RUN apt-get install unzip -y && \
    apt-get install git -y && \
    apt-get autoremove -y

WORKDIR ${APP_HOME}

COPY ./requirements.txt ./requirements.txt

RUN pip install --upgrade pip setuptools
RUN apt-get install -y libmagic1
RUN mkdir -p -m 0600 ~/.ssh && ssh-keyscan github.com >> ~/.ssh/known_hosts
RUN pip install -r requirements.txt

# Set NLTK Data directory environment variable to ensure it uses a known location
RUN mkdir -p /usr/local/share/nltk_data && chmod a+rwx /usr/local/share/nltk_data
ENV NLTK_DATA /usr/local/share/nltk_data

# Download necessary NLTK data using the defined base directory
RUN python -m nltk.downloader -d /usr/local/share/nltk_data stopwords
RUN python -m nltk.downloader -d /usr/local/share/nltk_data punkt

RUN pip install awslambdaric

COPY . ./

ENTRYPOINT [ "/usr/local/bin/python", "-m", "awslambdaric" ]

# Set up the command for the Lambda handler
CMD [ "handler.parse" ]

handler.py

import base64
import json
import tempfile
import os
import traceback
from werkzeug.utils import secure_filename
from nlm_ingestor.ingestor import ingestor_api
from nlm_utils.utils import file_utils
import subprocess
import time
import threading

def parse_document(file_content, filename, render_format="all", use_new_indent_parser=False, apply_ocr=False):
    parse_options = {
        "parse_and_render_only": True,
        "render_format": render_format,
        "use_new_indent_parser": use_new_indent_parser,
        "parse_pages": (),
        "apply_ocr": apply_ocr
    }

    try:
        # Create a temporary file to save the decoded content
        tempfile_handler, tmp_file_path = tempfile.mkstemp(suffix=os.path.splitext(filename)[1])
        with os.fdopen(tempfile_handler, 'wb') as tmp_file:
            tmp_file.write(file_content)

        # calculate the file properties
        props = file_utils.extract_file_properties(tmp_file_path)
        print(f"Parsing document: {filename}")
        return_dict, _ = ingestor_api.ingest_document(
            filename,
            tmp_file_path,
            props["mimeType"],
            parse_options=parse_options,
        )
        return return_dict or {}

    except Exception as e:
        traceback.print_exc()
        return {"status": "fail", "reason": str(e)}

    finally:
        if os.path.exists(tmp_file_path):
            os.unlink(tmp_file_path)

def read_output(process):
    while True:
        output = process.stdout.readline()
        if output == '':
            break
        print(output.strip())

def start_tika():
    print('see jar', os.path.exists("jars/tika-server-standard-nlm-modified-2.4.1_v6.jar"))
    tika_path = "jars/tika-server-standard-nlm-modified-2.4.1_v6.jar"
    java_path = "/usr/bin/java"  # Use the common path for Java
    process = subprocess.Popen([java_path, "-jar", tika_path], stdout=subprocess.PIPE, stderr=subprocess.STDOUT, text=True)

    thread = threading.Thread(target=read_output, args=(process,))
    # thread.start()

    # Main thread can perform other tasks here, or wait for the output thread to finish
    # thread.join()
    print("Tika Server process completed.")

# Call this function early in your Lambda handler

import requests

def test_tika():
    try:
        response = requests.get('http://localhost:9998/tika')
        if response.status_code == 200:
            print("Tika Server is reachable and ready!")
            return True
        else:
            print("Tika Server is not ready. Status Code:", response.status_code)
            return False
    except Exception as e:
        print("Failed to connect to Tika Server:", str(e))
        return False

def parse(event, context):
    print(context)
    if 'body' not in event:
        return {
            "statusCode": 400,
            "body": json.dumps({"message": "No data provided"})
        }
    start_tika()

    working = test_tika()
    while not working:
        time.sleep(3)
        working = test_tika()

    # Decode the file from base64
    file_content = base64.b64decode(event['body'])
    filename = "uploaded_document.pdf"  # This needs to be passed or inferred some way

    # Extract additional parameters
    params = event.get('queryStringParameters', {})
    render_format = params.get('render_format', 'all')
    use_new_indent_parser = params.get('use_new_indent_parser', 'no') == 'yes'
    apply_ocr = params.get('apply_ocr', 'no') == 'yes'

    # Process the document
    result = parse_document(
        file_content, filename, render_format, use_new_indent_parser, apply_ocr
    )

    return {
        "statusCode": 200,
        "return_dict": result
    }
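Once deployed, the function can be exercised with boto3 using the same event shape. This is a sketch under assumptions (not from the original post): the function name and file path are placeholders. Note that synchronous Lambda invocations cap the request payload at roughly 6 MB, so larger documents would need to be passed by reference (e.g. an S3 key) rather than as inline base64.

import base64
import json

import boto3

lambda_client = boto3.client("lambda")

# Build the event the handler expects: base64 file content in "body"
# plus optional query string parameters.
with open("sample.pdf", "rb") as f:  # placeholder path
    event = {
        "body": base64.b64encode(f.read()).decode("utf-8"),
        "queryStringParameters": {"render_format": "all", "use_new_indent_parser": "yes"},
    }

# "nlm-ingestor-parser" is a placeholder function name
response = lambda_client.invoke(
    FunctionName="nlm-ingestor-parser",
    Payload=json.dumps(event).encode("utf-8"),
)
result = json.loads(response["Payload"].read())
print(result.get("statusCode"))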
jpbalarini commented 1 month ago

Hi @dgonier do you mind pasting your code again? It seems that the format has been messed up. Thanks!