Unstructured-IO / unstructured-api

Apache License 2.0
509 stars 108 forks source link

500 Server Error when parsing some PDF #465

Open AmrAhmedElagoz opened 1 week ago

AmrAhmedElagoz commented 1 week ago

Describe the bug

INFO:__main__:Detected file type: application/pdf
INFO:__main__:Sending request to https://api.unstructured.io/general/v0/general
INFO:__main__:Headers: {'Accept': 'application/json', 'unstructured-api-key': 'xxx'}
INFO:__main__:File being sent: Mahmoud_Gamal_Resume.pdf
INFO:__main__:Response status code: 500
INFO:__main__:Response headers: {'Date': 'Sun, 29 Sep 2024 21:51:15 GMT', 'Content-Type': 'application/json', 'Content-Length': '47', 'Connection': 'keep-alive', 'server': 'uvicorn'}
ERROR:__main__:500 Internal Server Error: {"detail":"'6114cee903d6a72fa0370b97d042b71c'"}
ERROR:__main__:Error details: { "detail": "'6114cee903d6a72fa0370b97d042b71c'" }
HTTP error occurred: 500 Server Error: Internal Server Error for url: https://api.unstructured.io/general/v0/general

To Reproduce: a minimal script that reproduces the error:

import requests
import json
import logging
import magic

# Configure the root logger so INFO-level (and higher) records are emitted.
logging.basicConfig(level=logging.INFO)
# Module-level logger, named after the current module.
logger = logging.getLogger(__name__)

def is_pdf(file_path):
    """Return True when libmagic reports *file_path* as ``application/pdf``.

    The detected MIME type is logged at INFO level before the comparison.
    """
    detector = magic.Magic(mime=True)
    detected_type = detector.from_file(file_path)
    logger.info(f"Detected file type: {detected_type}")
    return detected_type == "application/pdf"

def parse_pdf(api_key, file_path):
    """Upload a PDF to the Unstructured general endpoint and return the result.

    Args:
        api_key: Value sent in the ``unstructured-api-key`` request header.
        file_path: Path of the file to upload; it is first checked with
            ``is_pdf``.

    Returns:
        The decoded JSON body on success. On any failure a human-readable
        error string is returned instead of raising, so callers can simply
        print the result.
    """
    if not is_pdf(file_path):
        return "Error: The provided file is not a PDF."

    url = "https://api.unstructured.io/general/v0/general"

    headers = {
        "Accept": "application/json",
        "unstructured-api-key": api_key,
    }
    # Log a masked copy of the headers so the real API key never ends up in
    # log files or pasted bug reports.
    loggable_headers = {**headers, "unstructured-api-key": "***"}

    try:
        with open(file_path, "rb") as file:
            files = {"files": (file_path, file, "application/pdf")}

            logger.info(f"Sending request to {url}")
            logger.info(f"Headers: {loggable_headers}")
            logger.info(f"File being sent: {file_path}")

            response = requests.post(url, headers=headers, files=files)

        logger.info(f"Response status code: {response.status_code}")
        logger.info(f"Response headers: {response.headers}")

        response.raise_for_status()

        return response.json()
    except requests.exceptions.HTTPError as http_err:
        # Use the response attached to the exception rather than the local
        # variable, which is unbound if the error was raised before assignment.
        failed = http_err.response
        if failed is not None and failed.status_code == 500:
            logger.error(f"500 Internal Server Error: {failed.text}")
            try:
                error_details = failed.json()
                logger.error(f"Error details: {json.dumps(error_details, indent=2)}")
            except ValueError:
                # ValueError is the common base of the JSON decode errors
                # raised by both the stdlib json module and simplejson.
                logger.error("Could not parse error response as JSON")
        return f"HTTP error occurred: {http_err}"
    except requests.exceptions.RequestException as err:
        return f"An error occurred: {err}"
    except Exception as e:
        # Last-resort boundary: report the failure as a string like the
        # other branches instead of crashing the script.
        return f"An unexpected error occurred: {e}"

def main():
    """Parse the sample resume with a placeholder API key and print the result."""
    print(parse_pdf("YOUR_API_KEY_HERE", "Mahmoud_Gamal_Resume.pdf"))

# Run the reproduction only when this file is executed as a script.
if __name__ == "__main__":
    main()

Environment:

Additional context: I attached one of the PDFs that produces this error. As a side note, LlamaParse parsed this PDF without any problems: Mahmoud_Gamal_Resume.pdf