i changed the code slightly to point it to a directory with pdf files ..
loader = DirectoryLoader("Q:/", recursive=True)
And keep getting the following errors ..
I have tried:
pip3 install pdf2image pdfminer.six
somewhere else advised
pip install unstructured==0.7.12
however then I got "pytesseract.pytesseract.TesseractNotFoundError: tesseract is not installed or it's not in your PATH. See README file for more information."
so I did a pip install tesseract
And now I end up back with the poppler error
File "C:\Users\User\AppData\Local\Programs\Python\Python311\Lib\site-packages\pdf2image\pdf2image.py", line 568, in pdfinfo_from_path
proc = Popen(command, env=env, stdout=PIPE, stderr=PIPE)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "C:\Users\User\AppData\Local\Programs\Python\Python311\Lib\subprocess.py", line 1026, in init
self._execute_child(args, executable, preexec_fn, close_fds,
File "C:\Users\User\AppData\Local\Programs\Python\Python311\Lib\subprocess.py", line 1538, in _execute_child
hp, ht, pid, tid = _winapi.CreateProcess(executable, args,
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
FileNotFoundError: [WinError 2] The system cannot find the file specified
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "C:\Users\User\Downloads\chatgpt-retrieval-main\chatgpt-retrieval-main\chatgpt.py", line 37, in
index = VectorstoreIndexCreator(vectorstore_kwargs={"persist_directory":"persist"}).from_loaders([loader])
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "C:\Users\User\AppData\Local\Programs\Python\Python311\Lib\site-packages\langchain\indexes\vectorstore.py", line 81, in from_loaders
docs.extend(loader.load())
^^^^^^^^^^^^^
File "C:\Users\User\AppData\Local\Programs\Python\Python311\Lib\site-packages\langchain\document_loaders\directory.py", line 156, in load
self.load_file(i, p, docs, pbar)
File "C:\Users\User\AppData\Local\Programs\Python\Python311\Lib\site-packages\langchain\document_loaders\directory.py", line 105, in load_file
raise e
File "C:\Users\User\AppData\Local\Programs\Python\Python311\Lib\site-packages\langchain\document_loaders\directory.py", line 99, in load_file
sub_docs = self.loader_cls(str(item), self.loader_kwargs).load()
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "C:\Users\User\AppData\Local\Programs\Python\Python311\Lib\site-packages\langchain\document_loaders\unstructured.py", line 86, in load
elements = self._get_elements()
^^^^^^^^^^^^^^^^^^^^
File "C:\Users\User\AppData\Local\Programs\Python\Python311\Lib\site-packages\langchain\document_loaders\unstructured.py", line 172, in _get_elements
return partition(filename=self.file_path, self.unstructured_kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "C:\Users\User\AppData\Local\Programs\Python\Python311\Lib\site-packages\unstructured\partition\auto.py", line 180, in partition
elements = partition_pdf(
^^^^^^^^^^^^^^
File "C:\Users\User\AppData\Local\Programs\Python\Python311\Lib\site-packages\unstructured\documents\elements.py", line 138, in wrapper
elements = func(*args, kwargs)
^^^^^^^^^^^^^^^^^^^^^
File "C:\Users\User\AppData\Local\Programs\Python\Python311\Lib\site-packages\unstructured\file_utils\filetype.py", line 519, in wrapper
elements = func(*args, *kwargs)
^^^^^^^^^^^^^^^^^^^^^
File "C:\Users\User\AppData\Local\Programs\Python\Python311\Lib\site-packages\unstructured\partition\pdf.py", line 83, in partition_pdf
return partition_pdf_or_image(
^^^^^^^^^^^^^^^^^^^^^^^
File "C:\Users\User\AppData\Local\Programs\Python\Python311\Lib\site-packages\unstructured\partition\pdf.py", line 141, in partition_pdf_or_image
return _partition_pdf_or_image_with_ocr(
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "C:\Users\User\AppData\Local\Programs\Python\Python311\Lib\site-packages\unstructured\utils.py", line 43, in wrapper
return func(args, kwargs)
^^^^^^^^^^^^^^^^^^^^^
File "C:\Users\User\AppData\Local\Programs\Python\Python311\Lib\site-packages\unstructured\partition\pdf.py", line 353, in _partition_pdf_or_image_with_ocr
document = pdf2image.convert_from_path(filename)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "C:\Users\User\AppData\Local\Programs\Python\Python311\Lib\site-packages\pdf2image\pdf2image.py", line 127, in convert_from_path
page_count = pdfinfo_from_path(
^^^^^^^^^^^^^^^^^^
File "C:\Users\User\AppData\Local\Programs\Python\Python311\Lib\site-packages\pdf2image\pdf2image.py", line 594, in pdfinfo_from_path
raise PDFInfoNotInstalledError(
pdf2image.exceptions.PDFInfoNotInstalledError: Unable to get page count. Is poppler installed and in PATH?
i changed the code slightly to point it to a directory with pdf files .. loader = DirectoryLoader("Q:/", recursive=True)
And keep getting the following errors ..
I have tried: pip3 install pdf2image pdfminer.six somewhere else advised pip install unstructured==0.7.12
however then I got "pytesseract.pytesseract.TesseractNotFoundError: tesseract is not installed or it's not in your PATH. See README file for more information." so I did a pip install tesseract
And now I end up back with the poppler error
File "C:\Users\User\AppData\Local\Programs\Python\Python311\Lib\site-packages\pdf2image\pdf2image.py", line 568, in pdfinfo_from_path proc = Popen(command, env=env, stdout=PIPE, stderr=PIPE) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ File "C:\Users\User\AppData\Local\Programs\Python\Python311\Lib\subprocess.py", line 1026, in init self._execute_child(args, executable, preexec_fn, close_fds, File "C:\Users\User\AppData\Local\Programs\Python\Python311\Lib\subprocess.py", line 1538, in _execute_child hp, ht, pid, tid = _winapi.CreateProcess(executable, args, ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ FileNotFoundError: [WinError 2] The system cannot find the file specified
During handling of the above exception, another exception occurred:
Traceback (most recent call last): File "C:\Users\User\Downloads\chatgpt-retrieval-main\chatgpt-retrieval-main\chatgpt.py", line 37, in
index = VectorstoreIndexCreator(vectorstore_kwargs={"persist_directory":"persist"}).from_loaders([loader])
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "C:\Users\User\AppData\Local\Programs\Python\Python311\Lib\site-packages\langchain\indexes\vectorstore.py", line 81, in from_loaders
docs.extend(loader.load())
^^^^^^^^^^^^^
File "C:\Users\User\AppData\Local\Programs\Python\Python311\Lib\site-packages\langchain\document_loaders\directory.py", line 156, in load
self.load_file(i, p, docs, pbar)
File "C:\Users\User\AppData\Local\Programs\Python\Python311\Lib\site-packages\langchain\document_loaders\directory.py", line 105, in load_file
raise e
File "C:\Users\User\AppData\Local\Programs\Python\Python311\Lib\site-packages\langchain\document_loaders\directory.py", line 99, in load_file
sub_docs = self.loader_cls(str(item), self.loader_kwargs).load()
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "C:\Users\User\AppData\Local\Programs\Python\Python311\Lib\site-packages\langchain\document_loaders\unstructured.py", line 86, in load
elements = self._get_elements()
^^^^^^^^^^^^^^^^^^^^
File "C:\Users\User\AppData\Local\Programs\Python\Python311\Lib\site-packages\langchain\document_loaders\unstructured.py", line 172, in _get_elements
return partition(filename=self.file_path, self.unstructured_kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "C:\Users\User\AppData\Local\Programs\Python\Python311\Lib\site-packages\unstructured\partition\auto.py", line 180, in partition
elements = partition_pdf(
^^^^^^^^^^^^^^
File "C:\Users\User\AppData\Local\Programs\Python\Python311\Lib\site-packages\unstructured\documents\elements.py", line 138, in wrapper
elements = func(*args, kwargs)
^^^^^^^^^^^^^^^^^^^^^
File "C:\Users\User\AppData\Local\Programs\Python\Python311\Lib\site-packages\unstructured\file_utils\filetype.py", line 519, in wrapper
elements = func(*args, *kwargs)
^^^^^^^^^^^^^^^^^^^^^
File "C:\Users\User\AppData\Local\Programs\Python\Python311\Lib\site-packages\unstructured\partition\pdf.py", line 83, in partition_pdf
return partition_pdf_or_image(
^^^^^^^^^^^^^^^^^^^^^^^
File "C:\Users\User\AppData\Local\Programs\Python\Python311\Lib\site-packages\unstructured\partition\pdf.py", line 141, in partition_pdf_or_image
return _partition_pdf_or_image_with_ocr(
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "C:\Users\User\AppData\Local\Programs\Python\Python311\Lib\site-packages\unstructured\utils.py", line 43, in wrapper
return func(args, kwargs)
^^^^^^^^^^^^^^^^^^^^^
File "C:\Users\User\AppData\Local\Programs\Python\Python311\Lib\site-packages\unstructured\partition\pdf.py", line 353, in _partition_pdf_or_image_with_ocr
document = pdf2image.convert_from_path(filename)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "C:\Users\User\AppData\Local\Programs\Python\Python311\Lib\site-packages\pdf2image\pdf2image.py", line 127, in convert_from_path
page_count = pdfinfo_from_path(
^^^^^^^^^^^^^^^^^^
File "C:\Users\User\AppData\Local\Programs\Python\Python311\Lib\site-packages\pdf2image\pdf2image.py", line 594, in pdfinfo_from_path
raise PDFInfoNotInstalledError(
pdf2image.exceptions.PDFInfoNotInstalledError: Unable to get page count. Is poppler installed and in PATH?