# Download necessary NLTK data using the defined base directory
# Pre-fetch NLTK corpora into the image so the Lambda runtime needs no network access at invoke time
RUN python -m nltk.downloader -d /usr/local/share/nltk_data stopwords
RUN python -m nltk.downloader -d /usr/local/share/nltk_data punkt
# awslambdaric provides the Lambda Runtime Interface Client used by the image ENTRYPOINT
RUN pip install awslambdaric
import base64
import json
import tempfile
import os
import traceback
from werkzeug.utils import secure_filename
from nlm_ingestor.ingestor import ingestor_api
from nlm_utils.utils import file_utils
import subprocess
import os
import time
import threading
# thread.start()
# Main thread can perform other tasks here, or wait for the output thread to finish
# thread.join()
# NOTE(review): stray top-level statement — this print appears detached from the
# Tika subprocess/thread code it presumably belonged to; confirm intended placement.
print("Tika Server process completed.")
# Call this function early in your Lambda handler
import requests
def test_tika():
    """Probe the local Tika server.

    Returns:
        bool: True if http://localhost:9998/tika answers with HTTP 200,
        False on any other status or on a connection failure.
    """
    try:
        # A timeout is required: without it, requests.get can block indefinitely
        # while Tika is still starting, hanging the Lambda invocation.
        response = requests.get('http://localhost:9998/tika', timeout=5)
        if response.status_code == 200:
            print("Tika Server is reachable and ready!")
            return True
        else:
            print("Tika Server is not ready. Status Code:", response.status_code)
            return False
    except Exception as e:
        print("Failed to connect to Tika Server:", str(e))
        return False
def parse(event, context):
    """AWS Lambda entry point.

    Decodes a base64 document from event['body'], ensures the local Tika
    server is running, and parses the document via parse_document().

    Args:
        event: API-Gateway-style event dict; 'body' holds the base64 payload,
            'queryStringParameters' holds optional parse options.
        context: Lambda context object (only printed for diagnostics).

    Returns:
        dict: 400 response when no body is supplied, otherwise a 200 response
        carrying the parse result.
    """
    print(context)
    if 'body' not in event:
        return {
            "statusCode": 400,
            "body": json.dumps({"message": "No data provided"})
        }
    start_tika()
    # Poll until Tika answers; the overall Lambda timeout bounds this loop.
    working = test_tika()
    while not working:
        time.sleep(3)
        working = test_tika()
    # Decode the file from base64
    file_content = base64.b64decode(event['body'])
    filename = "uploaded_document.pdf"  # This needs to be passed or inferred some way
    # Extract additional parameters.
    # API Gateway sets queryStringParameters to None (not absent) when no query
    # string is sent, so `or {}` is required to avoid AttributeError on .get().
    params = event.get('queryStringParameters') or {}
    render_format = params.get('render_format', 'all')
    use_new_indent_parser = params.get('use_new_indent_parser', 'no') == 'yes'
    apply_ocr = params.get('apply_ocr', 'no') == 'yes'
    # Process the document
    result = parse_document(
        file_content, filename, render_format, use_new_indent_parser, apply_ocr
    )
    # NOTE(review): the success path returns "return_dict" while the error path
    # returns "body" — API Gateway proxy integrations expect "body"; confirm
    # which invoker consumes this response before changing the key.
    return {
        "statusCode": 200,
        "return_dict": result
    }
# Dockerfile
# syntax=docker/dockerfile:experimental
# Base image with Python 3.11; libgomp1 is a runtime dependency of the ML stack
FROM python:3.11-bookworm
RUN apt-get update && apt-get -y --no-install-recommends install libgomp1
# Use key=value form (legacy space-separated ENV is deprecated)
ENV APP_HOME=/app
# install Java
# /usr/share/man/man1 must exist or the JRE postinst script fails on slim Debian images
RUN mkdir -p /usr/share/man/man1 && \
    apt-get update -y && \
    apt-get install -y openjdk-17-jre-headless
# install essential packages
# Build/runtime libraries for lxml (libxml2/libxslt) and python-magic (libmagic)
RUN apt-get install -y \
    libxml2-dev libxslt-dev \
    build-essential libmagic-dev
# install tesseract
# Install tesseract 5 from the notesalexp repository and fetch English traineddata.
# NOTE(review): --allow-unauthenticated / AllowInsecureRepositories is required to
# bootstrap the notesalexp keyring, but is a trust-on-first-use risk — confirm.
RUN apt-get install -y \
        tesseract-ocr \
        lsb-release \
    && echo "deb https://notesalexp.org/tesseract-ocr5/$(lsb_release -cs)/ $(lsb_release -cs) main" | tee /etc/apt/sources.list.d/notesalexp.list > /dev/null \
    && apt-get update -oAcquire::AllowInsecureRepositories=true \
    && apt-get install notesalexp-keyring -oAcquire::AllowInsecureRepositories=true -y --allow-unauthenticated \
    && apt-get update \
    && apt-get install -y \
        tesseract-ocr libtesseract-dev \
    && wget -P /usr/share/tesseract-ocr/5/tessdata/ https://github.com/tesseract-ocr/tessdata/raw/main/eng.traineddata
RUN apt-get install unzip -y && \
    apt-get install git -y && \
    apt-get autoremove -y
WORKDIR ${APP_HOME}
# Copy the dependency manifest alone first so the pip layer caches until it changes
COPY ./requirements.txt ./requirements.txt
RUN pip install --upgrade pip setuptools
RUN apt-get install -y libmagic1
# Trust github.com for any git+ssh requirements
RUN mkdir -p -m 0600 ~/.ssh && ssh-keyscan github.com >> ~/.ssh/known_hosts
RUN pip install -r requirements.txt
# Set NLTK Data directory environment variable to ensure it uses a known location
# World-writable NLTK data dir so the (non-root) Lambda runtime can read it
RUN mkdir -p /usr/local/share/nltk_data && chmod a+rwx /usr/local/share/nltk_data
# Use key=value form (legacy space-separated ENV is deprecated)
ENV NLTK_DATA=/usr/local/share/nltk_data
# Download necessary NLTK data using the defined base directory
# Pre-fetch NLTK corpora so the Lambda needs no network access at invoke time
RUN python -m nltk.downloader -d /usr/local/share/nltk_data stopwords
RUN python -m nltk.downloader -d /usr/local/share/nltk_data punkt
# Lambda Runtime Interface Client; --no-cache-dir keeps the pip cache out of the layer
RUN pip install --no-cache-dir awslambdaric
COPY . ./
# awslambdaric bridges the Lambda Runtime API to the Python handler below
ENTRYPOINT [ "/usr/local/bin/python", "-m", "awslambdaric" ]
# Set up the command for the Lambda handler
CMD [ "handler.parse" ]
# handler.py
# Standard-library and project imports for the Lambda handler.
# (Original line had all imports collapsed onto one line — a syntax error —
# and imported os twice; one statement per line, duplicate removed.)
import base64
import json
import tempfile
import os
import traceback
from werkzeug.utils import secure_filename
from nlm_ingestor.ingestor import ingestor_api
from nlm_utils.utils import file_utils
import subprocess
import time
import threading
def parse_document(file_content, filename, render_format="all", use_new_indent_parser=False, apply_ocr=False):
    """Assemble ingestor parse options for a document.

    NOTE(review): this function appears truncated in the dump — only the
    options dict survives. The original presumably writes file_content to a
    temp file and invokes ingestor_api with these options; as written it
    implicitly returns None. Confirm against the original handler.py.
    """
    parse_options = {
        "parse_and_render_only": True,
        "render_format": render_format,
        "use_new_indent_parser": use_new_indent_parser,
        "parse_pages": (),
        "apply_ocr": apply_ocr
    }
def read_output(process):
    """Drain a subprocess's stdout line by line, echoing each line.

    Blocks until the stream reaches EOF (readline() returns ''), so it is
    intended to run on a background thread.

    Args:
        process: an object with a text-mode readable ``stdout`` attribute
            (e.g. a subprocess.Popen created with text=True).
    """
    while True:
        output = process.stdout.readline()
        if output == '':  # '' (not '\n') signals EOF on a text stream
            break
        print(output.strip())
def start_tika():
    """Launch the bundled Tika server jar as a background subprocess.

    NOTE(review): the output-reader thread is created but never started here
    (thread.start()/thread.join() appear commented out elsewhere in the dump);
    confirm whether it should be started, else the Popen pipe may fill and
    block the Java process.
    """
    print('see jar', os.path.exists("jars/tika-server-standard-nlm-modified-2.4.1_v6.jar"))
    tika_path = "jars/tika-server-standard-nlm-modified-2.4.1_v6.jar"
    java_path = "/usr/bin/java"  # Use the common path for Java
    # Merge stderr into stdout so a single reader sees all server output.
    process = subprocess.Popen([java_path, "-jar", tika_path],
                               stdout=subprocess.PIPE,
                               stderr=subprocess.STDOUT,
                               text=True)
    thread = threading.Thread(target=read_output, args=(process,))
# Call this function early in your Lambda handler
import requests
def test_tika():
    """Probe the local Tika server.

    Returns:
        bool: True if http://localhost:9998/tika answers with HTTP 200,
        False on any other status or on a connection failure.
    """
    try:
        # A timeout is required: without it, requests.get can block indefinitely
        # while Tika is still starting, hanging the Lambda invocation.
        response = requests.get('http://localhost:9998/tika', timeout=5)
        if response.status_code == 200:
            print("Tika Server is reachable and ready!")
            return True
        else:
            print("Tika Server is not ready. Status Code:", response.status_code)
            return False
    except Exception as e:
        print("Failed to connect to Tika Server:", str(e))
        return False
def parse(event, context): print(context) if 'body' not in event: return { "statusCode": 400, "body": json.dumps({"message": "No data provided"}) } start_tika()