Checked other resources
[X] I added a very descriptive title to this issue.
[X] I searched the LangChain documentation with the integrated search.
[X] I used the GitHub search to find a similar question and didn't find it.
[X] I am sure that this is a bug in LangChain rather than my code.
[X] The bug is not resolved by updating to the latest stable version of LangChain (or the specific integration package).
Example Code
```python
import os

from langchain_community.document_loaders import DirectoryLoader, PDFMinerLoader, PyPDFLoader
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import Chroma
from langchain_text_splitters import RecursiveCharacterTextSplitter

from constants import CHROMA_SETTINGS

persist_directory = "db"


def main():
    for root, dirs, files in os.walk("docs"):
        for file in files:
            if file.endswith(".pdf"):
                print(file)
                loader = PyPDFLoader(os.path.join(root, file))
                documents = loader.load()  # moved inside the if statement so documents are loaded only for PDFs

    print("splitting into chunks")
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=100)
    texts = text_splitter.split_documents(documents)

    # create embeddings here
    print("Loading sentence transformers model")
    embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")

    # create vector store here
    print("Creating embeddings. May take some minutes...")
    db = Chroma.from_documents(texts, embeddings, persist_directory=persist_directory, client_settings=CHROMA_SETTINGS)
    db.persist()
    db = None
    print("Ingestion complete! You can now run privateGPT.py to query your documents")


if __name__ == "__main__":
    main()
```
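For context, `constants.py` is not included above. Judging from the values printed at the start of the run (`duckdb+parquet`, `db`, `True` — I am not sure what the third value corresponds to), I assume `CHROMA_SETTINGS` is a legacy duckdb+parquet settings object roughly like the sketch below. This is my guess at the file, written against the old chromadb `Settings` fields, not the actual contents:

```python
# constants.py -- my assumption of roughly what it contains, not the actual file.
from chromadb.config import Settings

PERSIST_DIRECTORY = "db"

# Legacy duckdb+parquet configuration in the style of the old privateGPT examples;
# newer chromadb releases may reject or ignore these fields, which could be
# related to the AttributeError below.
CHROMA_SETTINGS = Settings(
    chroma_db_impl="duckdb+parquet",
    persist_directory=PERSIST_DIRECTORY,
)
```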
Error Message and Stack Trace (if applicable)
(env) PS C:\Users\USER\Documents\models\chatToPdf\Chat-with-PDF-Chatbot> & c:/Users/USER/Documents/models/chatToPdf/Chat-with-PDF-Chatbot/env/Scripts/python.exe c:/Users/USER/Documents/models/chatToPdf/Chat-with-PDF-Chatbot/ingest.py
duckdb+parquet
db
True
Applied Natural Language Processing with Python_ Implementing Machine Learning and Deep Learning Algorithms for Natural Language Processing ( PDFDrive ).pdf
splitting into chunks
Loading sentence transformers model
C:\Users\USER\Documents\models\chatToPdf\Chat-with-PDF-Chatbot\env\Lib\site-packages\transformers\tokenization_utils_base.py:1601: FutureWarning: clean_up_tokenization_spaces was not set. It will be set to
True by default. This behavior will be depracted in transformers v4.45, and will be then set to False by default. For more details check this issue: https://github.com/huggingface/transformers/issues/31884
warnings.warn(
Creating embeddings. May take some minutes...
Traceback (most recent call last):
  File "c:\Users\USER\Documents\models\chatToPdf\Chat-with-PDF-Chatbot\ingest.py", line 32, in <module>
    main()
  File "c:\Users\USER\Documents\models\chatToPdf\Chat-with-PDF-Chatbot\ingest.py", line 25, in main
    db = Chroma.from_documents(texts, embeddings, persist_directory=persist_directory, client_settings=CHROMA_SETTINGS)
  File "C:\Users\USER\Documents\models\chatToPdf\Chat-with-PDF-Chatbot\env\Lib\site-packages\langchain_community\vectorstores\chroma.py", line 878, in from_documents
    return cls.from_texts(
  File "C:\Users\USER\Documents\models\chatToPdf\Chat-with-PDF-Chatbot\env\Lib\site-packages\langchain_community\vectorstores\chroma.py", line 814, in from_texts
  File "C:\Users\USER\Documents\models\chatToPdf\Chat-with-PDF-Chatbot\env\Lib\site-packages\langchain_core\_api\deprecation.py", line 205, in warn_if_direct_instance
  File "C:\Users\USER\Documents\models\chatToPdf\Chat-with-PDF-Chatbot\env\Lib\site-packages\langchain_community\vectorstores\chroma.py", line 122, in __init__
  File "C:\Users\USER\Documents\models\chatToPdf\Chat-with-PDF-Chatbot\env\Lib\site-packages\chromadb\__init__.py", line 334, in Client
  File "C:\Users\USER\Documents\models\chatToPdf\Chat-with-PDF-Chatbot\env\Lib\site-packages\chromadb\api\client.py", line 56, in __init__
    super().__init__(settings=settings)
  File "C:\Users\USER\Documents\models\chatToPdf\Chat-with-PDF-Chatbot\env\Lib\site-packages\chromadb\api\shared_system_client.py", line 18, in __init__
    self._identifier = SharedSystemClient._get_identifier_from_settings(settings)
  File "C:\Users\USER\Documents\models\chatToPdf\Chat-with-PDF-Chatbot\env\Lib\site-packages\chromadb\api\shared_system_client.py", line 47, in _get_identifier_from_settings
    api_impl = settings.chroma_api_impl
  File "C:\Users\USER\Documents\models\chatToPdf\Chat-with-PDF-Chatbot\env\Lib\site-packages\pydantic\_internal\_model_construction.py", line 237, in __getattr__
    raise AttributeError(item)
AttributeError: chroma_api_impl
(env) PS C:\Users\USER\Documents\models\chatToPdf\Chat-with-PDF-Chatbot>
Description
I am trying to build a chat-with-PDF application, but the ingestion step keeps failing with Chroma-related errors. With the code above, the `Chroma.from_documents(..., client_settings=CHROMA_SETTINGS)` call raises `AttributeError: chroma_api_impl`.
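The failure seems isolated to the vector-store creation step. I would expect the stripped-down call below to hit the same error (the input text is a placeholder; only `client_settings` should matter here), though I have not re-run it in this exact form:

```python
# Stripped-down version of the failing call -- my expectation, not re-verified.
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import Chroma

from constants import CHROMA_SETTINGS  # same legacy settings object as in ingest.py

embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
db = Chroma.from_texts(
    ["hello world"],                   # placeholder instead of the PDF chunks
    embeddings,
    persist_directory="db",
    client_settings=CHROMA_SETTINGS,   # this is where AttributeError: chroma_api_impl appears
)
```

Dropping `client_settings` and passing only `persist_directory` looks like it might sidestep the error, but I have not confirmed that, and I am not sure whether it keeps the duckdb+parquet configuration.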
System Info
System Information
OS: Windows
OS Version: 10.0.19045
Python Version: 3.11.4 (tags/v3.11.4:d2340ef, Jun 7 2023, 05:45:37) [MSC v.1934 64 bit (AMD64)]
Package Information
Optional packages not installed
Other Dependencies