Open ezscode opened 1 day ago
When I extract content from a URL, each image is only represented by the placeholder <!-- image -->. I want to save the image information, such as its path or URL, in the .doctags or .json output.
<!-- image -->
.doctags
.json
If this is already supported, could anybody show me the code?
My current code looks like this:
# Convert a single HTML page with Docling, print every document item, and
# export the result as JSON, Markdown and DocTags files.

import json
import logging
import time
from pathlib import Path

from docling.backend.html_backend import HTMLDocumentBackend
from docling.datamodel.base_models import InputFormat
from docling.pipeline.simple_pipeline import PipelineOptions
from docling.document_converter import DocumentConverter, SimplePipeline, HTMLFormatOption
from docling_core.types.doc import (
    PictureItem,
    TextItem,
)

_log = logging.getLogger(__name__)
logging.basicConfig(level=logging.INFO)

# NOTE(review): this pipeline instance is never handed to the converter below,
# which receives the pipeline *class* instead.
options = PipelineOptions()
s_pip = SimplePipeline(options)

# Only HTML input is accepted (InputFormat.IMAGE was left disabled).
html_option = HTMLFormatOption(
    pipeline_cls=SimplePipeline,  # the class itself, not an instance
    backend=HTMLDocumentBackend,
)
doc_converter = DocumentConverter(
    allowed_formats=[InputFormat.HTML],
    format_options={InputFormat.HTML: html_option},
)

url = 'https://huggingface.co/blog/aya-expanse'

# Time the conversion call only.
started = time.time()
conv_result = doc_converter.convert(url)
elapsed = time.time() - started

# Dump each item's type and nesting level, plus its text (text items) or
# its label (picture items).
for item, level in conv_result.document.iterate_items():
    print('-- ', type(item), level)
    if isinstance(item, TextItem):
        print(item.text)
    elif isinstance(item, PictureItem):
        print('-- ', item.label)

_log.info(f"Document converted in {elapsed:.2f} seconds.")

# Export results.
output_dir = Path("03-pdf/docli/scratch")
output_dir.mkdir(parents=True, exist_ok=True)
doc_filename = conv_result.input.file.stem
print('-- output_dir : ', output_dir, doc_filename)

# Deep Search document JSON format.
json_path = output_dir / f"{doc_filename}.json"
json_path.write_text(
    json.dumps(conv_result.document.export_to_dict(), ensure_ascii=False),
    encoding="utf-8",
)

# Markdown format.
markdown_path = output_dir / f"{doc_filename}.md"
markdown_path.write_text(
    conv_result.document.export_to_markdown(),
    encoding="utf-8",
)

# Document Tags format.
doctags_path = output_dir / f"{doc_filename}.doctags"
doctags_path.write_text(
    conv_result.document.export_to_document_tokens(),
    encoding="utf-8",
)
When I extract content from a URL, each image is only represented by the placeholder <!-- image -->. I want to save the image information, such as its path or URL, in the .doctags or .json output.
If this is already supported, could anybody show me the code?
My current code looks like this: