run-llama / llama_parse

Parse files for optimal RAG
https://www.llamaindex.ai
MIT License
3.01k stars 287 forks source link

Sub-images extracted as separate files instead of the single image they are supposed to be #374

Open tkcoding opened 2 months ago

tkcoding commented 2 months ago

Describe the bug An image in the PDF was extracted as multiple separate sub-image files instead of the single figure it should be.

Files mattergen.pdf

Job ID 72f9de41-28a4-4957-85e6-decfea552889

Screenshots image

Client: Please remove untested options:

Options

import glob
import os
from typing import List

from llama_index.core import StorageContext
from llama_index.core import VectorStoreIndex
from llama_index.core.node_parser import LlamaParseJsonNodeParser
from llama_index.core.node_parser import MarkdownElementNodeParser
from llama_index.core.query_engine import SubQuestionQueryEngine
from llama_index.core.schema import BaseNode, TextNode, Document
from llama_index.core.schema import ImageDocument
from llama_index.core.tools import QueryEngineTool, ToolMetadata
from llama_index.embeddings.openai import OpenAIEmbedding
from llama_index.llms.openai import OpenAI
from llama_parse import LlamaParse
# Parsing instruction text; the script passes it to the parser wrapper it constructs.
ins = "\nExtract images and caption as accurate as possible.\n"

class llama_document_parser(object):
    """Small wrapper around a ``LlamaParse`` client for JSON parsing and image extraction."""

    def __init__(self, parsing_ins):
        """Create the underlying ``LlamaParse`` client.

        Args:
            parsing_ins: Instruction text forwarded to ``LlamaParse`` as its
                ``parsing_instruction`` argument.
        """
        self.parser = LlamaParse(
            # Honour the constructor argument rather than a module-level global,
            # so different instances can use different instructions.
            parsing_instruction=parsing_ins,
            verbose=True,
            ignore_errors=False,
            do_not_cache=True,
        )

    def get_image_text_nodes(self, download_path: str, json_objs: List[dict]):
        """Fetch the images of a parsed document and wrap each one as an image document.

        Args:
            download_path: Folder handed to ``self.parser.get_images`` as its
                ``download_path``.
            json_objs: JSON result objects handed to ``self.parser.get_images``.

        Returns:
            A list with one ``ImageDocument`` per image dict returned by the
            parser, each built from that dict's ``"path"`` entry. Empty when the
            parser reports no images.
        """
        image_dicts = self.parser.get_images(json_objs, download_path=download_path)
        return [
            ImageDocument(image_path=image_dict["path"])
            for image_dict in image_dicts
        ]

    def document_processing_llamaparse(self, file_name: str, image_output_folder: str):
        """Parse a document with LlamaParse and return its extracted pages as JSON.

        Args:
            file_name: Path of the document handed to ``self.parser.get_json_result``.
            image_output_folder: Folder that receives the extracted images; it is
                created (including missing parent folders) when absent.

        Returns:
            The ``"pages"`` entry of the first JSON result object.
        """
        json_objs = self.parser.get_json_result(file_name)
        json_list = json_objs[0]["pages"]
        print(json_list)

        # makedirs copes with nested targets such as "docs/mattergen/" and with a
        # folder that already exists, where a bare mkdir would raise.
        os.makedirs(image_output_folder, exist_ok=True)

        # Called for its image-extraction step; the resulting documents are not
        # part of this method's return value.
        self.get_image_text_nodes(image_output_folder, json_objs)
        return json_list

# Documents to process: where each source file lives and where its images go.
paper_dict = [
    {
        "image_folder": "docs/mattergen/",
        "doc_folder": "docs/example_documents/",
        "fname": "mattergen.pdf",
    },
]

# Only the first entry is processed.
_paper = paper_dict[0]
image_folder = _paper["image_folder"]
doc_folder = _paper["doc_folder"]
fname = _paper["fname"]

llama_parser = llama_document_parser(parsing_ins=ins)

# Run LlamaParse on the document; the folder strings already end in "/",
# so plain concatenation yields the full file path.
json_list = llama_parser.document_processing_llamaparse(
    file_name=doc_folder + fname,
    image_output_folder=image_folder,
)

Additional context Add any additional context about the problem here.

tkcoding commented 1 month ago

Hi @hexapode, is this bug reproducible, and is there any plan to improve this? Image extraction is one of the crucial features LlamaParse provides, and I consider this issue a deal breaker.

TonySimonovsky commented 4 days ago

I would also like to know if there is a workaround.