Unstructured-IO / unstructured

Open source libraries and APIs to build custom preprocessing pipelines for labeling, training, or production machine learning pipelines.
https://www.unstructured.io/
Apache License 2.0
9.21k stars 764 forks source link

bug/<short-name> Unstructured Partition PDF, tesseract ERROR!!! #3789

Open suhaif314 opened 22 hours ago

suhaif314 commented 22 hours ago

Describe the bug I am trying to call partition_pdf from the unstructured library, but it fails with a TesseractNotFoundError. I have tried several fixes, such as adding the Tesseract path to the global PATH environment variable and setting the path for the OCR agent.

To Reproduce

raw_pdf_element = partition_pdf( filename= r"C:\Users\Documents\Practice_myself\data\2206.01062.pdf", strategy='hi_res', extract_images_in_pdf=True, extract_image_block_types=["Image", "table"], extract_image_block_to_payload=False, extract_image_block_output_dir='extracted_data'

)

Expected behavior I expect partition_pdf to find tesseract.exe and process the PDF, but it is not able to locate it.

Screenshots image image

Environment Info { "name": "TesseractNotFoundError", "message": "tesseract is not installed or it's not in your PATH. See README file for more information.", "stack": "--------------------------------------------------------------------------- FileNotFoundError Traceback (most recent call last) File c:\Users\Documents\Practice_myself\Extraction\Unstructured\.venv\Lib\site-packages\unstructured_pytesseract\pytesseract.py:451, in get_tesseract_version() 450 try: --> 451 output = subprocess.check_output( 452 [tesseract_cmd, '--version'], 453 stderr=subprocess.STDOUT, 454 env=environ, 455 stdin=subprocess.DEVNULL, 456 ) 457 except OSError:

File C:\Program Files\Python312\Lib\subprocess.py:466, in check_output(timeout, *popenargs, *kwargs) 464 kwargs['input'] = empty --> 466 return run(popenargs, stdout=PIPE, timeout=timeout, check=True, 467 **kwargs).stdout

File C:\Program Files\Python312\Lib\subprocess.py:548, in run(input, capture_output, timeout, check, *popenargs, *kwargs) 546 kwargs['stderr'] = PIPE --> 548 with Popen(popenargs, **kwargs) as process: 549 try:

File C:\Program Files\Python312\Lib\subprocess.py:1026, in Popen.init(self, args, bufsize, executable, stdin, stdout, stderr, preexec_fn, close_fds, shell, cwd, env, universal_newlines, startupinfo, creationflags, restore_signals, start_new_session, pass_fds, user, group, extra_groups, encoding, errors, text, umask, pipesize, process_group) 1023 self.stderr = io.TextIOWrapper(self.stderr, 1024 encoding=encoding, errors=errors) -> 1026 self._execute_child(args, executable, preexec_fn, close_fds, 1027 pass_fds, cwd, env, 1028 startupinfo, creationflags, shell, 1029 p2cread, p2cwrite, 1030 c2pread, c2pwrite, 1031 errread, errwrite, 1032 restore_signals, 1033 gid, gids, uid, umask, 1034 start_new_session, process_group) 1035 except: 1036 # Cleanup if the child failed starting.

File C:\Program Files\Python312\Lib\subprocess.py:1538, in Popen._execute_child(self, args, executable, preexec_fn, close_fds, pass_fds, cwd, env, startupinfo, creationflags, shell, p2cread, p2cwrite, c2pread, c2pwrite, errread, errwrite, unused_restore_signals, unused_gid, unused_gids, unused_uid, unused_umask, unused_start_new_session, unused_process_group) 1537 try: -> 1538 hp, ht, pid, tid = _winapi.CreateProcess(executable, args, 1539 # no special security 1540 None, None, 1541 int(not close_fds), 1542 creationflags, 1543 env, 1544 cwd, 1545 startupinfo) 1546 finally: 1547 # Child is launched. Close the parent's copy of those pipe 1548 # handles that only the child should have open. You need (...) 1551 # pipe will not close when the child process exits and the 1552 # ReadFile will hang.

FileNotFoundError: [WinError 2] The system cannot find the file specified

During handling of the above exception, another exception occurred:

TesseractNotFoundError Traceback (most recent call last) Cell In[21], line 1 ----> 1 raw_pdf_element = partition_pdf( 2 filename= r\"C:\Users\Documents\Practice_myself\data\2206.01062.pdf\", 3 strategy='hi_res', 4 extract_images_in_pdf=True, 5 extract_image_block_types=[\"Image\", \"table\"], 6 extract_image_block_to_payload=False, 7 extract_image_block_output_dir='extracted_data' 8 9 )

File c:\Users\Documents\Practice_myself\Extraction\Unstructured\.venv\Lib\site-packages\unstructured\documents\elements.py:578, in process_metadata..decorator..wrapper(*args, kwargs) 576 @functools.wraps(func) 577 def wrapper(*args: _P.args, *kwargs: _P.kwargs) -> list[Element]: --> 578 elements = func(args, kwargs) 579 call_args = get_call_args_applying_defaults(func, *args, **kwargs) 581 unique_element_ids: bool = call_args.get(\"unique_element_ids\", False)

File c:\Users\Documents\Practice_myself\Extraction\Unstructured\.venv\Lib\site-packages\unstructured\file_utils\filetype.py:725, in add_filetype..decorator..wrapper(*args, kwargs) 723 @functools.wraps(func) 724 def wrapper(*args: _P.args, *kwargs: _P.kwargs) -> list[Element]: --> 725 elements = func(args, kwargs) 727 for element in elements: 728 # NOTE(robinson) - Attached files have already run through this logic 729 # in their own partitioning function 730 if element.metadata.attached_to_filename is None:

File c:\Users\Documents\Practice_myself\Extraction\Unstructured\.venv\Lib\site-packages\unstructured\file_utils\filetype.py:683, in add_metadata..wrapper(*args, kwargs) 681 @functools.wraps(func) 682 def wrapper(*args: _P.args, *kwargs: _P.kwargs) -> list[Element]: --> 683 elements = func(args, kwargs) 684 call_args = get_call_args_applying_defaults(func, *args, **kwargs) 686 if call_args.get(\"metadata_filename\"):

File c:\Users\Documents\Practice_myself\Extraction\Unstructured\.venv\Lib\site-packages\unstructured\chunking\dispatch.py:74, in add_chunking_strategy..wrapper(*args, kwargs) 71 \"\"\"The decorated function is replaced with this one.\"\"\" 73 # -- call the partitioning function to get the elements -- ---> 74 elements = func(*args, *kwargs) 76 # -- look for a chunking-strategy argument -- 77 call_args = get_call_args_applying_defaults(func, args, kwargs)

File c:\Users\Documents\Practice_myself\Extraction\Unstructured\.venv\Lib\site-packages\unstructured\partition\pdf.py:209, in partition_pdf(filename, file, include_page_breaks, strategy, infer_table_structure, ocr_languages, languages, metadata_filename, metadata_last_modified, chunking_strategy, hi_res_model_name, extract_images_in_pdf, extract_image_block_types, extract_image_block_output_dir, extract_image_block_to_payload, starting_page_number, extract_forms, form_extraction_skip_tables, kwargs) 205 exactly_one(filename=filename, file=file) 207 languages = check_language_args(languages or [], ocr_languages) --> 209 return partition_pdf_or_image( 210 filename=filename, 211 file=file, 212 include_page_breaks=include_page_breaks, 213 strategy=strategy, 214 infer_table_structure=infer_table_structure, 215 languages=languages, 216 metadata_last_modified=metadata_last_modified, 217 hi_res_model_name=hi_res_model_name, 218 extract_images_in_pdf=extract_images_in_pdf, 219 extract_image_block_types=extract_image_block_types, 220 extract_image_block_output_dir=extract_image_block_output_dir, 221 extract_image_block_to_payload=extract_image_block_to_payload, 222 starting_page_number=starting_page_number, 223 extract_forms=extract_forms, 224 form_extraction_skip_tables=form_extraction_skip_tables, 225 kwargs, 226 )

File c:\Users\Documents\Practice_myself\Extraction\Unstructured\.venv\Lib\site-packages\unstructured\partition\pdf.py:305, in partition_pdf_or_image(filename, file, is_image, include_page_breaks, strategy, infer_table_structure, languages, metadata_last_modified, hi_res_model_name, extract_images_in_pdf, extract_image_block_types, extract_image_block_output_dir, extract_image_block_to_payload, starting_page_number, extract_forms, form_extraction_skip_tables, kwargs) 303 with warnings.catch_warnings(): 304 warnings.simplefilter(\"ignore\") --> 305 elements = _partition_pdf_or_image_local( 306 filename=filename, 307 file=spooled_to_bytes_io_if_needed(file), 308 is_image=is_image, 309 infer_table_structure=infer_table_structure, 310 include_page_breaks=include_page_breaks, 311 languages=languages, 312 ocr_languages=ocr_languages, 313 metadata_last_modified=metadata_last_modified or last_modified, 314 hi_res_model_name=hi_res_model_name, 315 pdf_text_extractable=pdf_text_extractable, 316 extract_images_in_pdf=extract_images_in_pdf, 317 extract_image_block_types=extract_image_block_types, 318 extract_image_block_output_dir=extract_image_block_output_dir, 319 extract_image_block_to_payload=extract_image_block_to_payload, 320 starting_page_number=starting_page_number, 321 extract_forms=extract_forms, 322 form_extraction_skip_tables=form_extraction_skip_tables, 323 kwargs, 324 ) 325 out_elements = _process_uncategorized_text_elements(elements) 327 elif strategy == PartitionStrategy.FAST:

File c:\Users\Documents\Practice_myself\Extraction\Unstructured\.venv\Lib\site-packages\unstructured\utils.py:216, in requires_dependencies..decorator..wrapper(*args, kwargs) 213 @wraps(func) 214 def wrapper(*args: _P.args, *kwargs: _P.kwargs): 215 run_check() --> 216 return func(args, kwargs)

File c:\Users\Documents\Practice_myself\Extraction\Unstructured\.venv\Lib\site-packages\unstructured\partition\pdf.py:626, in _partition_pdf_or_image_local(filename, file, is_image, infer_table_structure, include_page_breaks, languages, ocr_languages, ocr_mode, model_name, hi_res_model_name, pdf_image_dpi, metadata_last_modified, pdf_text_extractable, extract_images_in_pdf, extract_image_block_types, extract_image_block_output_dir, extract_image_block_to_payload, analysis, analyzed_image_output_dir_path, starting_page_number, extract_forms, form_extraction_skip_tables, pdf_hi_res_max_pages, **kwargs) 619 # NOTE(christine): merged_document_layout = extracted_layout + inferred_layout 620 merged_document_layout = merge_inferred_with_extracted_layout( 621 inferred_document_layout=inferred_document_layout, 622 extracted_layout=extracted_layout, 623 hi_res_model_name=hi_res_model_name, 624 ) --> 626 final_document_layout = process_file_with_ocr( 627 filename, 628 merged_document_layout, 629 extracted_layout=extracted_layout, 630 is_image=is_image, 631 infer_table_structure=infer_table_structure, 632 ocr_languages=ocr_languages, 633 ocr_mode=ocr_mode, 634 pdf_image_dpi=pdf_image_dpi, 635 ocr_layout_dumper=ocr_layout_dumper, 636 ) 637 else: 638 inferred_document_layout = process_data_with_model( 639 file, 640 is_image=is_image, 641 model_name=hi_res_model_name, 642 pdf_image_dpi=pdf_image_dpi, 643 )

File c:\Users\Documents\Practice_myself\Extraction\Unstructured\.venv\Lib\site-packages\unstructured\utils.py:216, in requires_dependencies..decorator..wrapper(*args, kwargs) 213 @wraps(func) 214 def wrapper(*args: _P.args, *kwargs: _P.kwargs): 215 run_check() --> 216 return func(args, kwargs)

File c:\Users\Documents\Practice_myself\Extraction\Unstructured\.venv\Lib\site-packages\unstructured\partition\pdf_image\ocr.py:178, in process_file_with_ocr(filename, out_layout, extracted_layout, is_image, infer_table_structure, ocr_languages, ocr_mode, pdf_image_dpi, ocr_layout_dumper) 176 except Exception as e: 177 if os.path.isdir(filename) or os.path.isfile(filename): --> 178 raise e 179 else: 180 raise FileNotFoundError(f'File \"{filename}\" not found!') from e

File c:\Users\Documents\Practice_myself\Extraction\Unstructured\.venv\Lib\site-packages\unstructured\partition\pdf_image\ocr.py:165, in process_file_with_ocr(filename, out_layout, extracted_layout, is_image, infer_table_structure, ocr_languages, ocr_mode, pdf_image_dpi, ocr_layout_dumper) 163 extracted_regions = extracted_layout[i] if i < len(extracted_layout) else None 164 with PILImage.open(image_path) as image: --> 165 merged_page_layout = supplement_page_layout_with_ocr( 166 page_layout=out_layout.pages[i], 167 image=image, 168 infer_table_structure=infer_table_structure, 169 ocr_languages=ocr_languages, 170 ocr_mode=ocr_mode, 171 extracted_regions=extracted_regions, 172 ocr_layout_dumper=ocr_layout_dumper, 173 ) 174 merged_page_layouts.append(merged_page_layout) 175 return DocumentLayout.from_pages(merged_page_layouts)

File c:\Users\Documents\Practice_myself\Extraction\Unstructured\.venv\Lib\site-packages\unstructured\utils.py:216, in requires_dependencies..decorator..wrapper(*args, kwargs) 213 @wraps(func) 214 def wrapper(*args: _P.args, *kwargs: _P.kwargs): 215 run_check() --> 216 return func(args, kwargs)

File c:\Users\Documents\Practice_myself\Extraction\Unstructured\.venv\Lib\site-packages\unstructured\partition\pdf_image\ocr.py:203, in supplement_page_layout_with_ocr(page_layout, image, infer_table_structure, ocr_languages, ocr_mode, extracted_regions, ocr_layout_dumper) 201 ocr_agent = OCRAgent.get_agent(language=ocr_languages) 202 if ocr_mode == OCRMode.FULL_PAGE.value: --> 203 ocr_layout = ocr_agent.get_layout_from_image(image) 204 if ocr_layout_dumper: 205 ocr_layout_dumper.add_ocred_page(ocr_layout)

File c:\Users\Documents\Practice_myself\Extraction\Unstructured\.venv\Lib\site-packages\unstructured\partition\utils\ocr_models\tesseract_ocr.py:50, in OCRAgentTesseract.get_layout_from_image(self, image) 48 trace_logger.detail(\"Processing entire page OCR with tesseract...\") 49 zoom = 1 ---> 50 ocr_df: pd.DataFrame = unstructured_pytesseract.image_to_data( 51 np.array(image), 52 lang=self.language, 53 output_type=Output.DATAFRAME, 54 ) 55 ocr_df = ocr_df.dropna() 57 # tesseract performance degrades when the text height is out of the preferred zone so we 58 # zoom the image (in or out depending on estimated text height) for optimum OCR results 59 # but this needs to be evaluated based on actual use case as the optimum scaling also 60 # depend on type of characters (font, language, etc); be careful about this 61 # functionality

File c:\Users\Documents\Practice_myself\Extraction\Unstructured\.venv\Lib\site-packages\unstructured_pytesseract\pytesseract.py:590, in image_to_data(image, lang, config, nice, output_type, timeout, pandas_config) 576 def image_to_data( 577 image, 578 lang=None, (...) 583 pandas_config=None, 584 ): 585 \"\"\" 586 Returns string containing box boundaries, confidences, 587 and other information. Requires Tesseract 3.05+ 588 \"\"\" --> 590 if get_tesseract_version(cached=True) < TESSERACT_MIN_VERSION: 591 raise TSVNotSupported() 593 config = f'-c tessedit_create_tsv=1 {config.strip()}'

File c:\User\Documents\Practice_myself\Extraction\Unstructured\.venv\Lib\site-packages\unstructured_pytesseract\pytesseract.py:163, in run_once..wrapper(*args, kwargs) 160 @wraps(func) 161 def wrapper(*args, *kwargs): 162 if not kwargs.pop('cached', False) or wrapper._result is wrapper: --> 163 wrapper._result = func(args, kwargs) 164 return wrapper._result

File c:\Users\Documents\Practice_myself\Extraction\Unstructured\.venv\Lib\site-packages\unstructured_pytesseract\pytesseract.py:458, in get_tesseract_version() 451 output = subprocess.check_output( 452 [tesseract_cmd, '--version'], 453 stderr=subprocess.STDOUT, 454 env=environ, 455 stdin=subprocess.DEVNULL, 456 ) 457 except OSError: --> 458 raise TesseractNotFoundError() 460 raw_version = output.decode(DEFAULT_ENCODING) 461 strversion, * = raw_version.lstrip(string.printable[10:]).partition(' ')

TesseractNotFoundError: tesseract is not installed or it's not in your PATH. See README file for more information." }

Additional context Add any other context about the problem here.