DS4SD / docling

Get your documents ready for gen AI
https://ds4sd.github.io/docling
MIT License
10.46k stars 507 forks source link

Docx cannot get pic info #391

Open Zhengyu-Ju opened 2 days ago

Zhengyu-Ju commented 2 days ago

question

When I use WordFormatOption() to define a converter, I cannot get any picture info. Looking at the code of class SimplePipeline, I found that there is no property for enabling picture generation. When I pass the PDF pipeline_options to WordFormatOption, I still cannot get any pictures.

code

pipeline_options = PdfPipelineOptions() pipeline_options.do_ocr = False pipeline_options.do_table_structure = True pipeline_options.table_structure_options.do_cell_matching = True pipeline_options.generate_picture_images = True artifacts_path = "/tmp/pycharm_project_763/fileReader/docling-models" pipeline_options.artifacts_path = artifacts_path

pipeline_options = PdfPipelineOptions(artifacts_path=artifacts_path)

doc_converter = DocumentConverter( format_options={ InputFormat.PDF: PdfFormatOption( pipeline_options=pipeline_options, backend=PyPdfiumDocumentBackend ), InputFormat.DOCX: WordFormatOption( pipeline_cls=SimplePipeline # , backend=MsWordDocumentBackend ),

InputFormat.ASCIIDOC:

}

) result = doc_converter.convert(source) print(result.document.pictures)

jokus-pokus commented 1 day ago

Hi @Zhengyu-Ju what do you want to do with the docx document? Can you provide a sample docx?

Zhengyu-Ju commented 1 day ago

example file

test.docx

code

source = '/home/ubuntu/test.docx' pipeline_options = PdfPipelineOptions() pipeline_options.do_ocr = False pipeline_options.do_table_structure = True pipeline_options.table_structure_options.do_cell_matching = True pipeline_options.generate_picture_images = True artifacts_path = "/tmp/pycharm_project_763/fileReader/docling-models" pipeline_options.artifacts_path = artifacts_path pipeline_options.ocr_options.model_storage_directory = '/tmp/pycharm_project_763/fileReader/craft' pipeline_options.ocr_options.lang = ['ch_sim','en'] doc_converter = DocumentConverter( format_options={ InputFormat.PDF: PdfFormatOption( pipeline_options=pipeline_options ), InputFormat.DOCX: WordFormatOption( pipeline_cls=SimplePipeline # , backend=MsWordDocumentBackend ),

InputFormat.ASCIIDOC:

}

) result = doc_converter.convert(source)

question

I cannot get any picture info in the variable `result`.

dolfim-ibm commented 1 day ago

Did you try this example? https://ds4sd.github.io/docling/examples/export_figures/

Zhengyu-Ju commented 1 day ago

I have tried using the pipeline_options to define WordFormatOption before, like this: DocumentConverter( format_options={ InputFormat.PDF: PdfFormatOption( pipeline_options=pipeline_options ), InputFormat.DOCX: WordFormatOption( pipeline_options=pipeline_options ),

} ) but I still get no picture info.

jokus-pokus commented 17 hours ago

Pls try the example @dolfim-ibm linked. I've done tests with the example, which worked

Zhengyu-Ju commented 8 hours ago
from docling.datamodel.pipeline_options import PdfPipelineOptions
# Scale factor assigned to PdfPipelineOptions.images_scale in main();
# a scale of 1 corresponds to a standard 72 DPI image.
IMAGE_RESOLUTION_SCALE = 2.0
import logging
import time
from pathlib import Path
from docling_core.types.doc import ImageRefMode, PictureItem, TableItem
from docling.datamodel.base_models import FigureElement, InputFormat, Table
from docling.datamodel.pipeline_options import PdfPipelineOptions
from docling.document_converter import DocumentConverter, PdfFormatOption
_log = logging.getLogger(__name__)
def _save_png(image, target):
    """Write *image* to *target* as a PNG file.

    Returns False without creating any file when *image* is None, so callers
    can skip items that carry no image data instead of crashing on them.
    """
    if image is None:
        return False
    with target.open("wb") as fp:
        image.save(fp, format="PNG")
    return True


def main():
    """Convert the input document and export its page, table and picture images.

    Writes into the ``scratch`` directory: one PNG per page, one PNG per table
    and per picture element, and a markdown export with embedded images.
    Pages or elements that carry no image are skipped with a warning instead
    of raising ``AttributeError``.
    """
    logging.basicConfig(level=logging.INFO)

    input_doc_path = Path("/home/ubuntu/test.docx")
    output_dir = Path("scratch")

    # Important: For operating with page images, we must keep them, otherwise the DocumentConverter
    # will destroy them for cleaning up memory.
    # This is done by setting PdfPipelineOptions.images_scale, which also defines the scale of images.
    # scale=1 correspond of a standard 72 DPI image
    # The PdfPipelineOptions.generate_* are the selectors for the document elements which will be enriched
    # with the image field
    pipeline_options = PdfPipelineOptions()
    pipeline_options.images_scale = IMAGE_RESOLUTION_SCALE
    pipeline_options.generate_page_images = True
    pipeline_options.generate_picture_images = True
    artifacts_path = "/tmp/pycharm_project_763/fileReader/docling-models"
    pipeline_options.artifacts_path = artifacts_path
    # NOTE(review): these options are registered for InputFormat.PDF only, while
    # the input above is a .docx file; presumably they are not applied to DOCX
    # conversion -- confirm against the docling format-option documentation.
    doc_converter = DocumentConverter(
        format_options={
            InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options)
        }
    )

    start_time = time.time()

    conv_res = doc_converter.convert(input_doc_path)

    output_dir.mkdir(parents=True, exist_ok=True)
    doc_filename = conv_res.input.file.stem

    # Save page images
    for page in conv_res.document.pages.values():
        page_image = page.image.pil_image if page.image is not None else None
        page_image_filename = output_dir / f"{doc_filename}-{page.page_no}.png"
        if not _save_png(page_image, page_image_filename):
            _log.warning("Page %s has no image; skipping.", page.page_no)

    # Save images of figures and tables
    table_counter = 0
    picture_counter = 0
    for element, _level in conv_res.document.iterate_items():
        if isinstance(element, TableItem):
            table_counter += 1
            element_image_filename = (
                output_dir / f"{doc_filename}-table-{table_counter}.png"
            )
            # Fetch the image before opening the file so that a missing image
            # does not leave an empty PNG behind.
            if not _save_png(
                element.get_image(conv_res.document), element_image_filename
            ):
                _log.warning("Table %d has no image; skipping.", table_counter)

        if isinstance(element, PictureItem):
            picture_counter += 1
            element_image_filename = (
                output_dir / f"{doc_filename}-picture-{picture_counter}.png"
            )
            if not _save_png(
                element.get_image(conv_res.document), element_image_filename
            ):
                _log.warning("Picture %d has no image; skipping.", picture_counter)

    # Save markdown with embedded pictures
    content_md = conv_res.document.export_to_markdown(image_mode=ImageRefMode.EMBEDDED)
    md_filename = output_dir / f"{doc_filename}-with-images.md"
    # Explicit UTF-8 so non-ASCII document text is written identically
    # regardless of the platform's default locale encoding.
    with md_filename.open("w", encoding="utf-8") as fp:
        fp.write(content_md)

    elapsed = time.time() - start_time

    _log.info("Document converted and figures exported in %.2f seconds.", elapsed)
# Run the export only when this file is executed as a script, so that importing
# the module does not trigger a document conversion as a side effect.
if __name__ == "__main__":
    main()

Question: I only changed the model path and the file path, but the generated md file has no pictures. Can you see what's wrong with my code?