Open Zhengyu-Ju opened 2 days ago
Hi @Zhengyu-Ju what do you want to do with the docx document? Can you provide a sample docx?
# Document to convert.
source = '/home/ubuntu/test.docx'

# Local directory holding the docling model artifacts.
artifacts_path = "/tmp/pycharm_project_763/fileReader/docling-models"

# PDF pipeline settings: OCR off, table-structure recognition with cell
# matching on, and picture images requested.
pipeline_options = PdfPipelineOptions()
pipeline_options.do_ocr = False
pipeline_options.do_table_structure = True
pipeline_options.table_structure_options.do_cell_matching = True
pipeline_options.generate_picture_images = True
pipeline_options.artifacts_path = artifacts_path

# OCR settings are still filled in even though do_ocr is False above.
pipeline_options.ocr_options.model_storage_directory = '/tmp/pycharm_project_763/fileReader/craft'
pipeline_options.ocr_options.lang = ['ch_sim', 'en']

# PDF input receives the pipeline options; DOCX input only selects
# SimplePipeline and is given no pipeline options at all.
pdf_format_option = PdfFormatOption(pipeline_options=pipeline_options)
docx_format_option = WordFormatOption(
    pipeline_cls=SimplePipeline  # , backend=MsWordDocumentBackend
)

doc_converter = DocumentConverter(
    format_options={
        InputFormat.PDF: pdf_format_option,
        InputFormat.DOCX: docx_format_option,
    }
)

result = doc_converter.convert(source)
I cannot get any picture info from the variable `result`.
Did you try this example? https://ds4sd.github.io/docling/examples/export_figures/
I have tried using the pipeline_options to define the WordFormatOption before, like this:
DocumentConverter(
    format_options={
        InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options),
        InputFormat.DOCX: WordFormatOption(pipeline_options=pipeline_options),
    }
)
but I still get no picture info.
Please try the example @dolfim-ibm linked. I have tested that example, and it worked.
from docling.datamodel.pipeline_options import PdfPipelineOptions
IMAGE_RESOLUTION_SCALE = 2.0
import logging
import time
from pathlib import Path
from docling_core.types.doc import ImageRefMode, PictureItem, TableItem
from docling.datamodel.base_models import FigureElement, InputFormat, Table
from docling.datamodel.pipeline_options import PdfPipelineOptions
from docling.document_converter import DocumentConverter, PdfFormatOption
_log = logging.getLogger(__name__)
def main():
    """Convert the input document and export its images and Markdown.

    Writes into ``output_dir``:
      * one PNG per page that carries a page image,
      * one PNG per table item and per picture item that yields an image,
      * a Markdown export of the document with embedded pictures.

    Pages or items without an image are skipped with a warning instead of
    aborting the run. The total elapsed time is logged at the end.
    """
    logging.basicConfig(level=logging.INFO)

    input_doc_path = Path("/home/ubuntu/test.docx")
    output_dir = Path("scratch")

    # Important: For operating with page images, we must keep them, otherwise the DocumentConverter
    # will destroy them for cleaning up memory.
    # This is done by setting PdfPipelineOptions.images_scale, which also defines the scale of images.
    # scale=1 corresponds to a standard 72 DPI image.
    # The PdfPipelineOptions.generate_* are the selectors for the document elements which will be
    # enriched with the image field.
    pipeline_options = PdfPipelineOptions()
    pipeline_options.images_scale = IMAGE_RESOLUTION_SCALE
    pipeline_options.generate_page_images = True
    pipeline_options.generate_picture_images = True
    artifacts_path = "/tmp/pycharm_project_763/fileReader/docling-models"
    pipeline_options.artifacts_path = artifacts_path

    # NOTE(review): these options are registered for InputFormat.PDF only, while
    # input_doc_path above is a .docx file. Presumably the DOCX input is handled
    # with the converter's defaults and never sees these options -- confirm.
    doc_converter = DocumentConverter(
        format_options={
            InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options)
        }
    )

    start_time = time.time()

    conv_res = doc_converter.convert(input_doc_path)

    output_dir.mkdir(parents=True, exist_ok=True)
    doc_filename = conv_res.input.file.stem

    # Save page images. A page may carry no image at all, so guard the access
    # rather than failing on ``None.pil_image``.
    for page in conv_res.document.pages.values():
        page_image = page.image.pil_image if page.image is not None else None
        _save_png(page_image, output_dir / f"{doc_filename}-{page.page_no}.png")

    # Save images of figures and tables. The counters advance for every item,
    # including skipped ones, so file numbering matches the item order.
    table_counter = 0
    picture_counter = 0
    for element, _level in conv_res.document.iterate_items():
        if isinstance(element, TableItem):
            table_counter += 1
            _save_png(
                element.get_image(conv_res.document),
                output_dir / f"{doc_filename}-table-{table_counter}.png",
            )

        if isinstance(element, PictureItem):
            picture_counter += 1
            _save_png(
                element.get_image(conv_res.document),
                output_dir / f"{doc_filename}-picture-{picture_counter}.png",
            )

    # Save markdown with embedded pictures. The encoding is explicit so that
    # non-ASCII document text does not depend on the platform default.
    content_md = conv_res.document.export_to_markdown(image_mode=ImageRefMode.EMBEDDED)
    md_filename = output_dir / f"{doc_filename}-with-images.md"
    md_filename.write_text(content_md, encoding="utf-8")

    elapsed = time.time() - start_time

    _log.info("Document converted and figures exported in %.2f seconds.", elapsed)


def _save_png(image, path):
    """Write *image* to *path* as PNG.

    Returns True when the file was written. When *image* is None, nothing is
    written (no empty file is left behind), a warning is logged, and False is
    returned.
    """
    if image is None:
        _log.warning("No image available for %s; skipping.", path.name)
        return False
    with path.open("wb") as fp:
        image.save(fp, format="PNG")
    return True
# Run the conversion only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
Question: I only changed the model path and the file path, but the generated Markdown file has no pictures. Can you see what is wrong with my code?
question
When I use WordFormatOption() to define a converter, I cannot get any picture info. I looked at the code of the SimplePipeline class and found that it has no property for enabling picture generation. When I pass the PDF pipeline_options to WordFormatOption, I still cannot get any pictures.
code
pipeline_options = PdfPipelineOptions() pipeline_options.do_ocr = False pipeline_options.do_table_structure = True pipeline_options.table_structure_options.do_cell_matching = True pipeline_options.generate_picture_images = True artifacts_path = "/tmp/pycharm_project_763/fileReader/docling-models" pipeline_options.artifacts_path = artifacts_path
pipeline_options = PdfPipelineOptions(artifacts_path=artifacts_path)
doc_converter = DocumentConverter( format_options={ InputFormat.PDF: PdfFormatOption( pipeline_options=pipeline_options, backend=PyPdfiumDocumentBackend ), InputFormat.DOCX: WordFormatOption( pipeline_cls=SimplePipeline # , backend=MsWordDocumentBackend ),
InputFormat.ASCIIDOC:
) result = doc_converter.convert(source) print(result.document.pictures)