This repo provides the server-side code that the llmsherpa API connects to. It includes parsers for various file formats.
Apache License 2.0
923 stars 112 forks

For anyone hoping to deploy this as a lambda #56

Open dgonier opened 2 months ago

dgonier commented 2 months ago



FROM python:3.11-bookworm
RUN apt-get update && apt-get -y --no-install-recommends install libgomp1
ENV APP_HOME=/app

# install Java
RUN mkdir -p /usr/share/man/man1 && \
    apt-get update -y && \
    apt-get install -y openjdk-17-jre-headless

# install essential packages
RUN apt-get install -y \
    libxml2-dev libxslt-dev \
    build-essential libmagic-dev

# install tesseract
# NOTE(review): the pasted text lost every URL in this section. The apt
# repository URL and the traineddata download URL below are filled in from the
# usual notesalexp / tesseract-ocr tessdata locations -- confirm both before
# building.
RUN apt-get install -y \
    tesseract-ocr \
    lsb-release \
    && echo "deb https://notesalexp.org/tesseract-ocr5/$(lsb_release -cs)/ $(lsb_release -cs) main" \
    | tee /etc/apt/sources.list.d/notesalexp.list > /dev/null \
    && apt-get update -oAcquire::AllowInsecureRepositories=true \
    && apt-get install notesalexp-keyring -oAcquire::AllowInsecureRepositories=true -y --allow-unauthenticated \
    && apt-get update \
    && apt-get install -y \
    tesseract-ocr libtesseract-dev \
    && wget -P /usr/share/tesseract-ocr/5/tessdata/ https://github.com/tesseract-ocr/tessdata/raw/main/eng.traineddata
RUN apt-get install unzip -y && \
    apt-get install git -y && \
    apt-get autoremove -y
WORKDIR ${APP_HOME}
COPY ./requirements.txt ./requirements.txt
RUN pip install --upgrade pip setuptools
RUN apt-get install -y libmagic1
# NOTE(review): the host passed to ssh-keyscan was also lost in the paste;
# github.com is assumed here -- confirm.
RUN mkdir -p -m 0600 ~/.ssh && ssh-keyscan github.com >> ~/.ssh/known_hosts
RUN pip install -r requirements.txt

# Set NLTK Data directory environment variable to ensure it uses a known location
RUN mkdir -p /usr/local/share/nltk_data && chmod a+rwx /usr/local/share/nltk_data
ENV NLTK_DATA=/usr/local/share/nltk_data

# Download necessary NLTK data using the defined base directory
RUN python -m nltk.downloader -d /usr/local/share/nltk_data stopwords
RUN python -m nltk.downloader -d /usr/local/share/nltk_data punkt
RUN pip install awslambdaric

COPY . ./

ENTRYPOINT [ "/usr/local/bin/python", "-m", "awslambdaric" ]

# Set up the command for the Lambda handler
CMD [ "handler.parse" ]

import base64
import json
import os
import subprocess
import tempfile
import threading
import time
import traceback

from nlm_ingestor.ingestor import ingestor_api
from nlm_utils.utils import file_utils
from werkzeug.utils import secure_filename

def parse_document(file_content, filename, render_format="all", use_new_indent_parser=False, apply_ocr=False):
    """Write document bytes to a temp file, run the ingestor on it, return the result.

    NOTE(review): the pasted original lost several lines: the ``try:``
    opener, the body of the ``with`` block, the argument list of
    ``ingest_document`` and the cleanup under ``finally``. Those parts are
    reconstructed from the surviving fragments; confirm them against the
    author's real code.

    Args:
        file_content: Raw bytes of the document.
        filename: Name of the document; its extension becomes the temp-file
            suffix and the name is printed before parsing.
        render_format: Stored as ``parse_options["render_format"]``.
        use_new_indent_parser: Stored as
            ``parse_options["use_new_indent_parser"]``.
        apply_ocr: Stored as ``parse_options["apply_ocr"]``.

    Returns:
        The first element returned by ``ingestor_api.ingest_document`` (or
        ``{}`` when it is falsy), or ``{"status": "fail", "reason": <str>}``
        when any exception is raised along the way.
    """
    parse_options = {
        "parse_and_render_only": True,
        "render_format": render_format,
        "use_new_indent_parser": use_new_indent_parser,
        "parse_pages": (),
        "apply_ocr": apply_ocr,
    }

    # Bound before the try so the cleanup below never hits an unbound name
    # if mkstemp itself fails.
    tmp_file_path = None
    try:
        # Create a temporary file to save the decoded content
        tempfile_handler, tmp_file_path = tempfile.mkstemp(suffix=os.path.splitext(filename)[1])
        with os.fdopen(tempfile_handler, 'wb') as tmp_file:
            # NOTE(review): the body of this block was lost in the paste;
            # writing the payload is what the comment above implies.
            tmp_file.write(file_content)

        # calculate the file properties
        props = file_utils.extract_file_properties(tmp_file_path)
        print(f"Parsing document: {filename}")
        # NOTE(review): the argument list was lost in the paste. The call
        # below assumes (name, path, mime type, parse_options=...) and a
        # "mimeType" key in props -- verify against ingestor_api.
        return_dict, _ = ingestor_api.ingest_document(
            filename,
            tmp_file_path,
            props["mimeType"],
            parse_options=parse_options,
        )
        return return_dict or {}

    except Exception as e:
        return {"status": "fail", "reason": str(e)}

    finally:
        # NOTE(review): only the existence check survived the paste; removing
        # the temp file is the presumed action.
        if tmp_file_path is not None and os.path.exists(tmp_file_path):
            os.unlink(tmp_file_path)

def read_output(process):
    """Print each line of ``process.stdout`` (stripped) until end of stream.

    Args:
        process: Object exposing a text-mode ``stdout`` stream with a
            ``readline()`` method, e.g. a ``subprocess.Popen`` opened with
            ``stdout=subprocess.PIPE, text=True``.
    """
    # readline() yields '' only at EOF; a blank line arrives as '\n', so it
    # does not end the loop and is printed as an empty line.
    for output in iter(process.stdout.readline, ''):
        print(output.strip())

def start_tika():
    """Launch the bundled Tika server jar as a background Java process.

    The child's stdout and stderr are merged into one text pipe, and a
    daemon thread forwards that output to this process's stdout via
    ``read_output``. The function returns as soon as the process has been
    spawned; it does not wait for the server to accept requests.
    """
    tika_path = "jars/tika-server-standard-nlm-modified-2.4.1_v6.jar"
    java_path = "/usr/bin/java"  # Use the common path for Java
    print('see jar', os.path.exists(tika_path))
    process = subprocess.Popen(
        [java_path, "-jar", tika_path],
        stdout=subprocess.PIPE,
        stderr=subprocess.STDOUT,
        text=True,
    )

    # The pipe must be drained: with stdout=PIPE and no reader, the child
    # blocks on write once the OS pipe buffer is full. A daemon thread does
    # the reading so it never keeps the interpreter alive on its own.
    thread = threading.Thread(target=read_output, args=(process,), daemon=True)
    thread.start()

    # The server has only been launched at this point, not finished.
    print("Tika Server process started.")

# Call start_tika() early in your Lambda handler.

import requests

def test_tika():
    """Probe the local Tika server and report whether it is ready.

    Returns:
        True when ``GET http://localhost:9998/tika`` answers with HTTP 200;
        False on any other status code or when the request raises.
    """
    try:
        # Bounded wait: without a timeout the call can block indefinitely if
        # the port accepts the connection but the server never responds.
        response = requests.get('http://localhost:9998/tika', timeout=5)
    except Exception as e:
        # Best-effort probe: any failure simply means "not ready yet".
        print("Failed to connect to Tika Server:", str(e))
        return False

    if response.status_code == 200:
        print("Tika Server is reachable and ready!")
        return True

    print("Tika Server is not ready. Status Code:", response.status_code)
    return False

def parse(event, context):
    """Lambda handler: decode the uploaded document and run the parser on it.

    Args:
        event: Invocation payload. ``event["body"]`` holds the
            base64-encoded document. ``event["queryStringParameters"]``
            (optional, may be ``None``) can carry ``render_format``,
            ``use_new_indent_parser`` ("yes"/"no") and ``apply_ocr``
            ("yes"/"no").
        context: Lambda context object; it is only printed.

    Returns:
        ``{"statusCode": 400, "body": ...}`` when the event has no body,
        ``{"statusCode": 503, "body": ...}`` when the Tika server does not
        become reachable in time, otherwise ``{"statusCode": 200,
        "return_dict": <result of parse_document>}``.
    """
    print(context)
    if 'body' not in event:
        return {
            "statusCode": 400,
            "body": json.dumps({"message": "No data provided"})
        }

    # Only spawn Tika when the probe fails, so an invocation that finds the
    # server already answering does not launch a second copy.
    if not test_tika():
        start_tika()
        # Poll with a pause and an upper bound instead of spinning forever.
        # NOTE(review): 120 s / 0.5 s are arbitrary choices -- tune to the
        # function's configured timeout.
        deadline = time.monotonic() + 120
        while not test_tika():
            if time.monotonic() >= deadline:
                return {
                    "statusCode": 503,
                    "body": json.dumps({"message": "Tika Server did not become ready"})
                }
            time.sleep(0.5)

    # Decode the file from base64
    file_content = base64.b64decode(event['body'])
    filename = "uploaded_document.pdf"  # This needs to be passed or inferred some way

    # Extract additional parameters. The key can be present with a null
    # value, so fall back to an empty dict in that case as well.
    params = event.get('queryStringParameters') or {}
    render_format = params.get('render_format', 'all')
    use_new_indent_parser = params.get('use_new_indent_parser', 'no') == 'yes'
    apply_ocr = params.get('apply_ocr', 'no') == 'yes'

    # Process the document
    result = parse_document(
        file_content, filename, render_format, use_new_indent_parser, apply_ocr
    )

    return {
        "statusCode": 200,
        "return_dict": result
    }
jpbalarini commented 1 month ago

Hi @dgonier do you mind pasting your code again? It seems that the format has been messed up. Thanks!