Describe the bug
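I am trying to call `partition_pdf` from the unstructured library, but it fails with `TesseractNotFoundError`. I have tried several fixes, such as adding the Tesseract install directory to the global PATH environment variable and setting the Tesseract path for the OCR agent, but the error persists.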
Expected behavior
I expect `partition_pdf` to locate tesseract.exe and process the PDF. Instead, tesseract.exe cannot be found, so the PDF is never read.
Screenshots
Environment Info
{
"name": "TesseractNotFoundError",
"message": "tesseract is not installed or it's not in your PATH. See README file for more information.",
"stack": "---------------------------------------------------------------------------
FileNotFoundError Traceback (most recent call last)
File c:\Users\Documents\Practice_myself\Extraction\Unstructured\.venv\Lib\site-packages\unstructured_pytesseract\pytesseract.py:451, in get_tesseract_version()
450 try:
--> 451 output = subprocess.check_output(
452 [tesseract_cmd, '--version'],
453 stderr=subprocess.STDOUT,
454 env=environ,
455 stdin=subprocess.DEVNULL,
456 )
457 except OSError:
File C:\Program Files\Python312\Lib\subprocess.py:1538, in Popen._execute_child(self, args, executable, preexec_fn, close_fds, pass_fds, cwd, env, startupinfo, creationflags, shell, p2cread, p2cwrite, c2pread, c2pwrite, errread, errwrite, unused_restore_signals, unused_gid, unused_gids, unused_uid, unused_umask, unused_start_new_session, unused_process_group)
1537 try:
-> 1538 hp, ht, pid, tid = _winapi.CreateProcess(executable, args,
1539 # no special security
1540 None, None,
1541 int(not close_fds),
1542 creationflags,
1543 env,
1544 cwd,
1545 startupinfo)
1546 finally:
1547 # Child is launched. Close the parent's copy of those pipe
1548 # handles that only the child should have open. You need
(...)
1551 # pipe will not close when the child process exits and the
1552 # ReadFile will hang.
FileNotFoundError: [WinError 2] The system cannot find the file specified
During handling of the above exception, another exception occurred:
File c:\Users\Documents\Practice_myself\Extraction\Unstructured\.venv\Lib\site-packages\unstructured\file_utils\filetype.py:725, in add_filetype.<locals>.decorator.<locals>.wrapper(*args, **kwargs)
723 @functools.wraps(func)
724 def wrapper(*args: _P.args, **kwargs: _P.kwargs) -> list[Element]:
--> 725 elements = func(*args, **kwargs)
727 for element in elements:
728 # NOTE(robinson) - Attached files have already run through this logic
729 # in their own partitioning function
730 if element.metadata.attached_to_filename is None:
File c:\Users\Documents\Practice_myself\Extraction\Unstructured\.venv\Lib\site-packages\unstructured\file_utils\filetype.py:683, in add_metadata.<locals>.wrapper(*args, **kwargs)
681 @functools.wraps(func)
682 def wrapper(*args: _P.args, **kwargs: _P.kwargs) -> list[Element]:
--> 683 elements = func(*args, **kwargs)
684 call_args = get_call_args_applying_defaults(func, *args, **kwargs)
686 if call_args.get(\"metadata_filename\"):
File c:\Users\Documents\Practice_myself\Extraction\Unstructured\.venv\Lib\site-packages\unstructured\chunking\dispatch.py:74, in add_chunking_strategy.<locals>.wrapper(*args, **kwargs)
71 \"\"\"The decorated function is replaced with this one.\"\"\"
73 # -- call the partitioning function to get the elements --
---> 74 elements = func(*args, **kwargs)
76 # -- look for a chunking-strategy argument --
77 call_args = get_call_args_applying_defaults(func, *args, **kwargs)
File c:\Users\Documents\Practice_myself\Extraction\Unstructured\.venv\Lib\site-packages\unstructured\partition\pdf_image\ocr.py:178, in process_file_with_ocr(filename, out_layout, extracted_layout, is_image, infer_table_structure, ocr_languages, ocr_mode, pdf_image_dpi, ocr_layout_dumper)
176 except Exception as e:
177 if os.path.isdir(filename) or os.path.isfile(filename):
--> 178 raise e
179 else:
180 raise FileNotFoundError(f'File \"{filename}\" not found!') from e
File c:\Users\Documents\Practice_myself\Extraction\Unstructured\.venv\Lib\site-packages\unstructured\partition\pdf_image\ocr.py:203, in supplement_page_layout_with_ocr(page_layout, image, infer_table_structure, ocr_languages, ocr_mode, extracted_regions, ocr_layout_dumper)
201 ocr_agent = OCRAgent.get_agent(language=ocr_languages)
202 if ocr_mode == OCRMode.FULL_PAGE.value:
--> 203 ocr_layout = ocr_agent.get_layout_from_image(image)
204 if ocr_layout_dumper:
205 ocr_layout_dumper.add_ocred_page(ocr_layout)
File c:\Users\Documents\Practice_myself\Extraction\Unstructured\.venv\Lib\site-packages\unstructured\partition\utils\ocr_models\tesseract_ocr.py:50, in OCRAgentTesseract.get_layout_from_image(self, image)
48 trace_logger.detail(\"Processing entire page OCR with tesseract...\")
49 zoom = 1
---> 50 ocr_df: pd.DataFrame = unstructured_pytesseract.image_to_data(
51 np.array(image),
52 lang=self.language,
53 output_type=Output.DATAFRAME,
54 )
55 ocr_df = ocr_df.dropna()
57 # tesseract performance degrades when the text height is out of the preferred zone so we
58 # zoom the image (in or out depending on estimated text height) for optimum OCR results
59 # but this needs to be evaluated based on actual use case as the optimum scaling also
60 # depend on type of characters (font, language, etc); be careful about this
61 # functionality
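Describe the bug I am trying to call `partition_pdf` from the unstructured library, but it fails with `TesseractNotFoundError`. I have tried several fixes, such as adding the Tesseract install directory to the global PATH environment variable and setting the Tesseract path for the OCR agent, but the error persists.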
To Reproduce
raw_pdf_element = partition_pdf( filename= r"C:\Users\Documents\Practice_myself\data\2206.01062.pdf", strategy='hi_res', extract_images_in_pdf=True, extract_image_block_types=["Image", "table"], extract_image_block_to_payload=False, extract_image_block_output_dir='extracted_data'
)
Expected behavior I expect `partition_pdf` to locate tesseract.exe and process the PDF. Instead, tesseract.exe cannot be found, so the PDF is never read.
Screenshots
Environment Info { "name": "TesseractNotFoundError", "message": "tesseract is not installed or it's not in your PATH. See README file for more information.", "stack": "--------------------------------------------------------------------------- FileNotFoundError Traceback (most recent call last) File c:\Users\Documents\Practice_myself\Extraction\Unstructured\.venv\Lib\site-packages\unstructured_pytesseract\pytesseract.py:451, in get_tesseract_version() 450 try: --> 451 output = subprocess.check_output( 452 [tesseract_cmd, '--version'], 453 stderr=subprocess.STDOUT, 454 env=environ, 455 stdin=subprocess.DEVNULL, 456 ) 457 except OSError:
File C:\Program Files\Python312\Lib\subprocess.py:466, in check_output(timeout, *popenargs, **kwargs) 464 kwargs['input'] = empty --> 466 return run(*popenargs, stdout=PIPE, timeout=timeout, check=True, 467 **kwargs).stdout
File C:\Program Files\Python312\Lib\subprocess.py:548, in run(input, capture_output, timeout, check, *popenargs, **kwargs) 546 kwargs['stderr'] = PIPE --> 548 with Popen(*popenargs, **kwargs) as process: 549 try:
File C:\Program Files\Python312\Lib\subprocess.py:1026, in Popen.__init__(self, args, bufsize, executable, stdin, stdout, stderr, preexec_fn, close_fds, shell, cwd, env, universal_newlines, startupinfo, creationflags, restore_signals, start_new_session, pass_fds, user, group, extra_groups, encoding, errors, text, umask, pipesize, process_group) 1023 self.stderr = io.TextIOWrapper(self.stderr, 1024 encoding=encoding, errors=errors) -> 1026 self._execute_child(args, executable, preexec_fn, close_fds, 1027 pass_fds, cwd, env, 1028 startupinfo, creationflags, shell, 1029 p2cread, p2cwrite, 1030 c2pread, c2pwrite, 1031 errread, errwrite, 1032 restore_signals, 1033 gid, gids, uid, umask, 1034 start_new_session, process_group) 1035 except: 1036 # Cleanup if the child failed starting.
File C:\Program Files\Python312\Lib\subprocess.py:1538, in Popen._execute_child(self, args, executable, preexec_fn, close_fds, pass_fds, cwd, env, startupinfo, creationflags, shell, p2cread, p2cwrite, c2pread, c2pwrite, errread, errwrite, unused_restore_signals, unused_gid, unused_gids, unused_uid, unused_umask, unused_start_new_session, unused_process_group) 1537 try: -> 1538 hp, ht, pid, tid = _winapi.CreateProcess(executable, args, 1539 # no special security 1540 None, None, 1541 int(not close_fds), 1542 creationflags, 1543 env, 1544 cwd, 1545 startupinfo) 1546 finally: 1547 # Child is launched. Close the parent's copy of those pipe 1548 # handles that only the child should have open. You need (...) 1551 # pipe will not close when the child process exits and the 1552 # ReadFile will hang.
FileNotFoundError: [WinError 2] The system cannot find the file specified
During handling of the above exception, another exception occurred:
TesseractNotFoundError Traceback (most recent call last) Cell In[21], line 1 ----> 1 raw_pdf_element = partition_pdf( 2 filename= r\"C:\Users\Documents\Practice_myself\data\2206.01062.pdf\", 3 strategy='hi_res', 4 extract_images_in_pdf=True, 5 extract_image_block_types=[\"Image\", \"table\"], 6 extract_image_block_to_payload=False, 7 extract_image_block_output_dir='extracted_data' 8 9 )
File c:\Users\Documents\Practice_myself\Extraction\Unstructured\.venv\Lib\site-packages\unstructured\documents\elements.py:578, in process_metadata.<locals>.decorator.<locals>.wrapper(*args, **kwargs)
576 @functools.wraps(func)
577 def wrapper(*args: _P.args, **kwargs: _P.kwargs) -> list[Element]:
--> 578 elements = func(*args, **kwargs)
579 call_args = get_call_args_applying_defaults(func, *args, **kwargs)
581 unique_element_ids: bool = call_args.get(\"unique_element_ids\", False)
File c:\Users\Documents\Practice_myself\Extraction\Unstructured\.venv\Lib\site-packages\unstructured\file_utils\filetype.py:725, in add_filetype.<locals>.decorator.<locals>.wrapper(*args, **kwargs)
723 @functools.wraps(func)
724 def wrapper(*args: _P.args, **kwargs: _P.kwargs) -> list[Element]:
--> 725 elements = func(*args, **kwargs)
727 for element in elements:
728 # NOTE(robinson) - Attached files have already run through this logic
729 # in their own partitioning function
730 if element.metadata.attached_to_filename is None:
File c:\Users\Documents\Practice_myself\Extraction\Unstructured\.venv\Lib\site-packages\unstructured\file_utils\filetype.py:683, in add_metadata.<locals>.wrapper(*args, **kwargs)
681 @functools.wraps(func)
682 def wrapper(*args: _P.args, **kwargs: _P.kwargs) -> list[Element]:
--> 683 elements = func(*args, **kwargs)
684 call_args = get_call_args_applying_defaults(func, *args, **kwargs)
686 if call_args.get(\"metadata_filename\"):
File c:\Users\Documents\Practice_myself\Extraction\Unstructured\.venv\Lib\site-packages\unstructured\chunking\dispatch.py:74, in add_chunking_strategy.<locals>.wrapper(*args, **kwargs)
71 \"\"\"The decorated function is replaced with this one.\"\"\"
73 # -- call the partitioning function to get the elements --
---> 74 elements = func(*args, **kwargs)
76 # -- look for a chunking-strategy argument --
77 call_args = get_call_args_applying_defaults(func, *args, **kwargs)
File c:\Users\Documents\Practice_myself\Extraction\Unstructured\.venv\Lib\site-packages\unstructured\partition\pdf.py:209, in partition_pdf(filename, file, include_page_breaks, strategy, infer_table_structure, ocr_languages, languages, metadata_filename, metadata_last_modified, chunking_strategy, hi_res_model_name, extract_images_in_pdf, extract_image_block_types, extract_image_block_output_dir, extract_image_block_to_payload, starting_page_number, extract_forms, form_extraction_skip_tables, kwargs) 205 exactly_one(filename=filename, file=file) 207 languages = check_language_args(languages or [], ocr_languages) --> 209 return partition_pdf_or_image( 210 filename=filename, 211 file=file, 212 include_page_breaks=include_page_breaks, 213 strategy=strategy, 214 infer_table_structure=infer_table_structure, 215 languages=languages, 216 metadata_last_modified=metadata_last_modified, 217 hi_res_model_name=hi_res_model_name, 218 extract_images_in_pdf=extract_images_in_pdf, 219 extract_image_block_types=extract_image_block_types, 220 extract_image_block_output_dir=extract_image_block_output_dir, 221 extract_image_block_to_payload=extract_image_block_to_payload, 222 starting_page_number=starting_page_number, 223 extract_forms=extract_forms, 224 form_extraction_skip_tables=form_extraction_skip_tables, 225 kwargs, 226 )
File c:\Users\Documents\Practice_myself\Extraction\Unstructured\.venv\Lib\site-packages\unstructured\partition\pdf.py:305, in partition_pdf_or_image(filename, file, is_image, include_page_breaks, strategy, infer_table_structure, languages, metadata_last_modified, hi_res_model_name, extract_images_in_pdf, extract_image_block_types, extract_image_block_output_dir, extract_image_block_to_payload, starting_page_number, extract_forms, form_extraction_skip_tables, kwargs) 303 with warnings.catch_warnings(): 304 warnings.simplefilter(\"ignore\") --> 305 elements = _partition_pdf_or_image_local( 306 filename=filename, 307 file=spooled_to_bytes_io_if_needed(file), 308 is_image=is_image, 309 infer_table_structure=infer_table_structure, 310 include_page_breaks=include_page_breaks, 311 languages=languages, 312 ocr_languages=ocr_languages, 313 metadata_last_modified=metadata_last_modified or last_modified, 314 hi_res_model_name=hi_res_model_name, 315 pdf_text_extractable=pdf_text_extractable, 316 extract_images_in_pdf=extract_images_in_pdf, 317 extract_image_block_types=extract_image_block_types, 318 extract_image_block_output_dir=extract_image_block_output_dir, 319 extract_image_block_to_payload=extract_image_block_to_payload, 320 starting_page_number=starting_page_number, 321 extract_forms=extract_forms, 322 form_extraction_skip_tables=form_extraction_skip_tables, 323 kwargs, 324 ) 325 out_elements = _process_uncategorized_text_elements(elements) 327 elif strategy == PartitionStrategy.FAST:
File c:\Users\Documents\Practice_myself\Extraction\Unstructured\.venv\Lib\site-packages\unstructured\utils.py:216, in requires_dependencies.<locals>.decorator.<locals>.wrapper(*args, **kwargs)
213 @wraps(func)
214 def wrapper(*args: _P.args, **kwargs: _P.kwargs):
215 run_check()
--> 216 return func(*args, **kwargs)
File c:\Users\Documents\Practice_myself\Extraction\Unstructured\.venv\Lib\site-packages\unstructured\partition\pdf.py:626, in _partition_pdf_or_image_local(filename, file, is_image, infer_table_structure, include_page_breaks, languages, ocr_languages, ocr_mode, model_name, hi_res_model_name, pdf_image_dpi, metadata_last_modified, pdf_text_extractable, extract_images_in_pdf, extract_image_block_types, extract_image_block_output_dir, extract_image_block_to_payload, analysis, analyzed_image_output_dir_path, starting_page_number, extract_forms, form_extraction_skip_tables, pdf_hi_res_max_pages, **kwargs) 619 # NOTE(christine): merged_document_layout = extracted_layout + inferred_layout 620 merged_document_layout = merge_inferred_with_extracted_layout( 621 inferred_document_layout=inferred_document_layout, 622 extracted_layout=extracted_layout, 623 hi_res_model_name=hi_res_model_name, 624 ) --> 626 final_document_layout = process_file_with_ocr( 627 filename, 628 merged_document_layout, 629 extracted_layout=extracted_layout, 630 is_image=is_image, 631 infer_table_structure=infer_table_structure, 632 ocr_languages=ocr_languages, 633 ocr_mode=ocr_mode, 634 pdf_image_dpi=pdf_image_dpi, 635 ocr_layout_dumper=ocr_layout_dumper, 636 ) 637 else: 638 inferred_document_layout = process_data_with_model( 639 file, 640 is_image=is_image, 641 model_name=hi_res_model_name, 642 pdf_image_dpi=pdf_image_dpi, 643 )
File c:\Users\Documents\Practice_myself\Extraction\Unstructured\.venv\Lib\site-packages\unstructured\utils.py:216, in requires_dependencies.<locals>.decorator.<locals>.wrapper(*args, **kwargs)
213 @wraps(func)
214 def wrapper(*args: _P.args, **kwargs: _P.kwargs):
215 run_check()
--> 216 return func(*args, **kwargs)
File c:\Users\Documents\Practice_myself\Extraction\Unstructured\.venv\Lib\site-packages\unstructured\partition\pdf_image\ocr.py:178, in process_file_with_ocr(filename, out_layout, extracted_layout, is_image, infer_table_structure, ocr_languages, ocr_mode, pdf_image_dpi, ocr_layout_dumper) 176 except Exception as e: 177 if os.path.isdir(filename) or os.path.isfile(filename): --> 178 raise e 179 else: 180 raise FileNotFoundError(f'File \"{filename}\" not found!') from e
File c:\Users\Documents\Practice_myself\Extraction\Unstructured\.venv\Lib\site-packages\unstructured\partition\pdf_image\ocr.py:165, in process_file_with_ocr(filename, out_layout, extracted_layout, is_image, infer_table_structure, ocr_languages, ocr_mode, pdf_image_dpi, ocr_layout_dumper) 163 extracted_regions = extracted_layout[i] if i < len(extracted_layout) else None 164 with PILImage.open(image_path) as image: --> 165 merged_page_layout = supplement_page_layout_with_ocr( 166 page_layout=out_layout.pages[i], 167 image=image, 168 infer_table_structure=infer_table_structure, 169 ocr_languages=ocr_languages, 170 ocr_mode=ocr_mode, 171 extracted_regions=extracted_regions, 172 ocr_layout_dumper=ocr_layout_dumper, 173 ) 174 merged_page_layouts.append(merged_page_layout) 175 return DocumentLayout.from_pages(merged_page_layouts)
File c:\Users\Documents\Practice_myself\Extraction\Unstructured\.venv\Lib\site-packages\unstructured\utils.py:216, in requires_dependencies.<locals>.decorator.<locals>.wrapper(*args, **kwargs)
213 @wraps(func)
214 def wrapper(*args: _P.args, **kwargs: _P.kwargs):
215 run_check()
--> 216 return func(*args, **kwargs)
File c:\Users\Documents\Practice_myself\Extraction\Unstructured\.venv\Lib\site-packages\unstructured\partition\pdf_image\ocr.py:203, in supplement_page_layout_with_ocr(page_layout, image, infer_table_structure, ocr_languages, ocr_mode, extracted_regions, ocr_layout_dumper) 201 ocr_agent = OCRAgent.get_agent(language=ocr_languages) 202 if ocr_mode == OCRMode.FULL_PAGE.value: --> 203 ocr_layout = ocr_agent.get_layout_from_image(image) 204 if ocr_layout_dumper: 205 ocr_layout_dumper.add_ocred_page(ocr_layout)
File c:\Users\Documents\Practice_myself\Extraction\Unstructured\.venv\Lib\site-packages\unstructured\partition\utils\ocr_models\tesseract_ocr.py:50, in OCRAgentTesseract.get_layout_from_image(self, image) 48 trace_logger.detail(\"Processing entire page OCR with tesseract...\") 49 zoom = 1 ---> 50 ocr_df: pd.DataFrame = unstructured_pytesseract.image_to_data( 51 np.array(image), 52 lang=self.language, 53 output_type=Output.DATAFRAME, 54 ) 55 ocr_df = ocr_df.dropna() 57 # tesseract performance degrades when the text height is out of the preferred zone so we 58 # zoom the image (in or out depending on estimated text height) for optimum OCR results 59 # but this needs to be evaluated based on actual use case as the optimum scaling also 60 # depend on type of characters (font, language, etc); be careful about this 61 # functionality
File c:\Users\Documents\Practice_myself\Extraction\Unstructured\.venv\Lib\site-packages\unstructured_pytesseract\pytesseract.py:590, in image_to_data(image, lang, config, nice, output_type, timeout, pandas_config) 576 def image_to_data( 577 image, 578 lang=None, (...) 583 pandas_config=None, 584 ): 585 \"\"\" 586 Returns string containing box boundaries, confidences, 587 and other information. Requires Tesseract 3.05+ 588 \"\"\" --> 590 if get_tesseract_version(cached=True) < TESSERACT_MIN_VERSION: 591 raise TSVNotSupported() 593 config = f'-c tessedit_create_tsv=1 {config.strip()}'
File c:\Users\Documents\Practice_myself\Extraction\Unstructured\.venv\Lib\site-packages\unstructured_pytesseract\pytesseract.py:163, in run_once.<locals>.wrapper(*args, **kwargs)
160 @wraps(func)
161 def wrapper(*args, **kwargs):
162 if not kwargs.pop('cached', False) or wrapper._result is wrapper:
--> 163 wrapper._result = func(*args, **kwargs)
164 return wrapper._result
File c:\Users\Documents\Practice_myself\Extraction\Unstructured\.venv\Lib\site-packages\unstructured_pytesseract\pytesseract.py:458, in get_tesseract_version() 451 output = subprocess.check_output( 452 [tesseract_cmd, '--version'], 453 stderr=subprocess.STDOUT, 454 env=environ, 455 stdin=subprocess.DEVNULL, 456 ) 457 except OSError: --> 458 raise TesseractNotFoundError() 460 raw_version = output.decode(DEFAULT_ENCODING) 461 str_version, *_ = raw_version.lstrip(string.printable[10:]).partition(' ')
TesseractNotFoundError: tesseract is not installed or it's not in your PATH. See README file for more information." }
Additional context Add any other context about the problem here.