DS4SD / docling

Get your documents ready for gen AI
https://ds4sd.github.io/docling
MIT License
10.48k stars 507 forks source link

Support Image path/url #405

Open ezscode opened 1 day ago

ezscode commented 1 day ago

When I extract content from a URL, each image is exported only as the placeholder <!-- image -->. I would like to save image information, such as its path or URL, in the .doctags or .json output.


If this feature is already supported, could anybody show me the code?

My current code looks like this:

import json
import logging
import time
from pathlib import Path
from docling.backend.html_backend import HTMLDocumentBackend 
from docling.datamodel.base_models import InputFormat
from docling.pipeline.simple_pipeline import PipelineOptions  

from docling.document_converter import DocumentConverter, SimplePipeline, HTMLFormatOption 

from docling_core.types.doc import ( 
    PictureItem, 
    TextItem,
)

# Module-level logger for this script; the root logger is set to INFO so the
# timing message emitted after conversion is shown.
_log = logging.getLogger(__name__)

logging.basicConfig(level=logging.INFO)

# Only HTML input is allowed. No pipeline instance is created here: the
# format option receives the pipeline *class*, so a hand-built
# SimplePipeline(PipelineOptions()) would never be used by the converter.
doc_converter = DocumentConverter(
    allowed_formats=[InputFormat.HTML],
    format_options={
        InputFormat.HTML: HTMLFormatOption(
            pipeline_cls=SimplePipeline,  # class, not instance
            backend=HTMLDocumentBackend,
        ),
    },
)

url = 'https://huggingface.co/blog/aya-expanse'

# Measure the conversion with a monotonic clock: time.time() follows the wall
# clock and can jump if the system time is adjusted mid-run.
started = time.perf_counter()
conv_result = doc_converter.convert(url)
elapsed = time.perf_counter() - started

# Walk every item of the converted document together with its nesting level.
for item, level in conv_result.document.iterate_items():
    print('-- ', type(item), level)
    if isinstance(item, TextItem):
        print(item.text)
    elif isinstance(item, PictureItem):
        # Only the label is printed for pictures; no path/URL is read here.
        print('-- ', item.label)

# Lazy %-style arguments produce the same message as the former f-string.
_log.info("Document converted in %.2f seconds.", elapsed)

## Export results
# All exports go into one directory, created on demand, and share the stem of
# the converted input as their base file name.
output_dir = Path("03-pdf/docli/scratch")
output_dir.mkdir(parents=True, exist_ok=True)
doc_filename = conv_result.input.file.stem

print('-- output_dir : ', output_dir, doc_filename )

# One entry per output format: file extension and a callable that renders the
# document to text. Order matches the files written: Deep Search JSON,
# Markdown, then Document Tags.
exporters = (
    ("json", lambda doc: json.dumps(doc.export_to_dict(), ensure_ascii=False)),
    ("md", lambda doc: doc.export_to_markdown()),
    ("doctags", lambda doc: doc.export_to_document_tokens()),
)

for extension, render in exporters:
    target = output_dir / f"{doc_filename}.{extension}"
    # The file is opened before rendering, as in the three separate blocks
    # this loop replaces.
    with target.open("w", encoding="utf-8") as fp:
        fp.write(render(conv_result.document))