Closed ZengJin123 closed 5 months ago
我的服务是通过命令行直接启动的,不是docker启动的
server log testing.. {'parse_and_render_only': True, 'render_format': 'all', 'use_new_indent_parser': True, 'parse_pages': (), 'apply_ocr': False} error uploading file, stacktrace: Traceback (most recent call last): File "/data/nlm-ingestor/nlm_ingestor/ingestion_daemon/main.py", line 48, in parse_document ingest_status, return_dict = ingestor_api.ingest_document( File "/data/nlm-ingestor/nlm_ingestor/ingestor/ingestor_api.py", line 37, in ingest_document pdfi = pdf_ingestor.PDFIngestor(doc_location, parse_options) File "/data/nlm-ingestor/nlm_ingestor/ingestor/pdf_ingestor.py", line 35, in init blocks, _block_texts, _sents, _file_data, result, page_dim, num_pages = parse_blocks( File "/data/nlm-ingestor/nlm_ingestor/ingestor/pdf_ingestor.py", line 172, in parse_blocks parsed_doc = visual_ingestor.Doc(pages, ignore_blocks, render_format) File "/data/nlm-ingestor/nlm_ingestor/ingestor/visual_ingestor/visual_ingestor.py", line 117, in init self.parse(pages) File "/data/nlm-ingestor/nlm_ingestor/ingestor/visual_ingestor/visual_ingestor.py", line 155, in parse page_style = pages[page_idx].attrs.get("style", None) or pages[0].attrs["style"] KeyError: 'style'
error uploading file, stacktrace: Traceback (most recent call last): File "/data/nlm-ingestor/nlm_ingestor/ingestion_daemon/main.py", line 48, in parse_document ingest_status, return_dict = ingestor_api.ingest_document( File "/data/nlm-ingestor/nlm_ingestor/ingestor/ingestor_api.py", line 37, in ingest_document pdfi = pdf_ingestor.PDFIngestor(doc_location, parse_options) File "/data/nlm-ingestor/nlm_ingestor/ingestor/pdf_ingestor.py", line 35, in init blocks, _block_texts, _sents, _file_data, result, page_dim, num_pages = parse_blocks( File "/data/nlm-ingestor/nlm_ingestor/ingestor/pdf_ingestor.py", line 172, in parse_blocks parsed_doc = visual_ingestor.Doc(pages, ignore_blocks, render_format) File "/data/nlm-ingestor/nlm_ingestor/ingestor/visual_ingestor/visual_ingestor.py", line 117, in init self.parse(pages) File "/data/nlm-ingestor/nlm_ingestor/ingestor/visual_ingestor/visual_ingestor.py", line 155, in parse page_style = pages[page_idx].attrs.get("style", None) or pages[0].attrs["style"] KeyError: 'style' Traceback (most recent call last): File "/data/nlm-ingestor/nlm_ingestor/ingestion_daemon/main.py", line 48, in parse_document ingest_status, return_dict = ingestor_api.ingest_document( File "/data/nlm-ingestor/nlm_ingestor/ingestor/ingestor_api.py", line 37, in ingest_document pdfi = pdf_ingestor.PDFIngestor(doc_location, parse_options) File "/data/nlm-ingestor/nlm_ingestor/ingestor/pdf_ingestor.py", line 35, in init blocks, _block_texts, _sents, _file_data, result, page_dim, num_pages = parse_blocks( File "/data/nlm-ingestor/nlm_ingestor/ingestor/pdf_ingestor.py", line 172, in parse_blocks parsed_doc = visual_ingestor.Doc(pages, ignore_blocks, render_format) File "/data/nlm-ingestor/nlm_ingestor/ingestor/visual_ingestor/visual_ingestor.py", line 117, in init self.parse(pages) File "/data/nlm-ingestor/nlm_ingestor/ingestor/visual_ingestor/visual_ingestor.py", line 155, in parse page_style = pages[page_idx].attrs.get("style", None) or pages[0].attrs["style"] 
KeyError: 'style' 127.0.0.1 - - [20/Feb/2024 17:27:40] "POST /api/parseDocument?useNewIndentParser=yes HTTP/1.1" 500 -
Your Tika server is not running.
These lines add the `style` property to the `<p>` tags, so something is going wrong here.
Happens in the pdf ingestor.
new_p, changed = style_utils.format_p_tag(orig_p, filter_out_pattern,
filter_ls_pattern, soup)
# Create string out of dictionary
p["style"] = ";".join([":".join([key, str(val)]) for key, val in input_style.items()])
My code:

```python
from llmsherpa.readers import LayoutPDFReader
import os, sys

directory_path = "/data/pdf_test/llmsherpa"
sys.path.insert(0, directory_path)
llmsherpa_api_url = "http://localhost:5001/api/parseDocument?renderFormat=all&useNewIndentParser=true"
pdf_url = "https://solutions.weblite.ca/pdfocrx/scansmpl.pdf"
do_ocr = True
if do_ocr:
    llmsherpa_api_url = llmsherpa_api_url + "&applyOcr=yes"

pdf_reader = LayoutPDFReader(llmsherpa_api_url)
doc = pdf_reader.read_pdf(pdf_url)
print(doc.to_html())
```
Error reporting:
Traceback (most recent call last):
File "/data/pdf_test/t.py", line 13, in <module>
doc = pdf_reader.read_pdf(pdf_url)
File "/root/anaconda3/envs/nlm/lib/python3.9/site-packages/llmsherpa/readers/file_reader.py", line 73, in read_pdf
blocks = response_json['return_dict']['result']['blocks']
KeyError: 'return_dict'