langchain-ai / chat-langchain

https://chat.langchain.com
MIT License
5.09k stars 1.19k forks source link

3MB txt failed FAISS.from_documents(). OpenAI limit met? Can I divide & conquer big files to get the vectorstore? #28

Open jneygor8504 opened 1 year ago

jneygor8504 commented 1 year ago

Code:

```python
from dotenv import load_dotenv
load_dotenv()

from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.document_loaders import UnstructuredFileLoader
from langchain.vectorstores.faiss import FAISS
from langchain.embeddings import OpenAIEmbeddings
import pickle

# Load Data
loader = UnstructuredFileLoader("a.txt")
raw_documents = loader.load()

# Split text
text_splitter = RecursiveCharacterTextSplitter()
documents = text_splitter.split_documents(raw_documents)

# Load Data to vectorstore
embeddings = OpenAIEmbeddings()
vectorstore = FAISS.from_documents(documents, embeddings)
```


error:

(base) PS F:\test> python .\inges.py D:\miniconda3\lib\site-packages\requests__init.py:102: RequestsDependencyWarning: urllib3 (1.26.8) or chardet (5.1.0)/charset_normalizer (2.1.1) doesn't match a supported version! warnings.warn("urllib3 ({}) or chardet ({})/charset_normalizer ({}) doesn't match a supported " Traceback (most recent call last): File "F:\test\ingest.py", line 22, in raw_documents = loader.load() File "D:\miniconda3\lib\site-packages\langchain\document_loaders\unstructured.py", line 38, in load elements = self._get_elements() File "D:\miniconda3\lib\site-packages\langchain\document_loaders\unstructured.py", line 70, in _get_elements return partition(filename=self.file_path) File "D:\miniconda3\lib\site-packages\unstructured\partition\auto.py", line 58, in partition return partition_text(filename=filename, file=file) File "D:\miniconda3\lib\site-packages\unstructured\partition\text.py", line 41, in partition_text file_text = f.read() File "D:\miniconda3\lib\codecs.py", line 322, in decode (result, consumed) = self._buffer_decode(data, self.errors, final) UnicodeDecodeError: 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte (base) PS F:\test> python .\ingestNeovim.py D:\miniconda3\lib\site-packages\requests__init.py:102: RequestsDependencyWarning: urllib3 (1.26.8) or chardet (5.1.0)/charset_normalizer (2.1.1) doesn't match a supported version! warnings.warn("urllib3 ({}) or chardet ({})/charset_normalizer ({}) doesn't match a supported " (base) PS F:\test> python .\ingestNeovim.py >1.log D:\miniconda3\lib\site-packages\requests__init.py:102: RequestsDependencyWarning: urllib3 (1.26.8) or chardet (5.1.0)/charset_normalizer (2.1.1) doesn't match a supported version! 
warnings.warn("urllib3 ({}) or chardet ({})/charset_normalizer ({}) doesn't match a supported " (base) PS F:\test> python .\ingestNeovim.py D:\miniconda3\lib\site-packages\requests__init.py:102: RequestsDependencyWarning: urllib3 (1.26.8) or chardet (5.1.0)/charset_normalizer (2.1.1) doesn't match a supported version! warnings.warn("urllib3 ({}) or chardet ({})/charset_normalizer ({}) doesn't match a supported " Retrying langchain.embeddings.openai.embed_with_retry.._completion_with_retry in 4.0 seconds as it raised APIError: Internal error { "error": { "message": "Internal error", "type": "internal_error", "param": null, "code": "internal_error" } } 500 {'error': {'message': 'Internal error', 'type': 'internal_error', 'param': None, 'code': 'internal_error'}} {'Date': 'Sat, 11 Mar 2023 12:41:21 GMT', 'Content-Type': 'application/json; charset=utf-8', 'Content-Length': '152', 'Connection': 'keep-alive', 'Vary': 'Origin', 'X-Request-Id': 'c3d10e2343a2b3408ed03a5c4ffe61a3', 'Strict-Transport-Security': 'max-age=15724800; includeSubDomains'}. Retrying langchain.embeddings.openai.embed_with_retry.._completion_with_retry in 4.0 seconds as it raised APIError: Internal error { "error": { "message": "Internal error", "type": "internal_error", "param": null, "code": "internal_error" } } 500 {'error': {'message': 'Internal error', 'type': 'internal_error', 'param': None, 'code': 'internal_error'}} {'Date': 'Sat, 11 Mar 2023 12:42:18 GMT', 'Content-Type': 'application/json; charset=utf-8', 'Content-Length': '152', 'Connection': 'keep-alive', 'Vary': 'Origin', 'X-Request-Id': '8ad519d0b7c0acac76dff60931b41a86', 'Strict-Transport-Security': 'max-age=15724800; includeSubDomains'}. 
Retrying langchain.embeddings.openai.embed_with_retry.._completion_with_retry in 4.0 seconds as it raised APIError: Internal error { "error": { "message": "Internal error", "type": "internal_error", "param": null, "code": "internal_error" } } 500 {'error': {'message': 'Internal error', 'type': 'internal_error', 'param': None, 'code': 'internal_error'}} {'Date': 'Sat, 11 Mar 2023 12:43:02 GMT', 'Content-Type': 'application/json; charset=utf-8', 'Content-Length': '152', 'Connection': 'keep-alive', 'Vary': 'Origin', 'X-Request-Id': '9e4d69bd26627466aff64ef69b1379cf', 'Strict-Transport-Security': 'max-age=15724800; includeSubDomains'}. Retrying langchain.embeddings.openai.embed_with_retry.._completion_with_retry in 8.0 seconds as it raised APIError: Internal error { "error": { "message": "Internal error", "type": "internal_error", "param": null, "code": "internal_error" } } 500 {'error': {'message': 'Internal error', 'type': 'internal_error', 'param': None, 'code': 'internal_error'}} {'Date': 'Sat, 11 Mar 2023 12:43:49 GMT', 'Content-Type': 'application/json; charset=utf-8', 'Content-Length': '152', 'Connection': 'keep-alive', 'Vary': 'Origin', 'X-Request-Id': '1fce29a7209b4fe29836a86db29ab6ae', 'Strict-Transport-Security': 'max-age=15724800; includeSubDomains'}. Retrying langchain.embeddings.openai.embed_with_retry.._completion_with_retry in 10.0 seconds as it raised APIError: Internal error { "error": { "message": "Internal error", "type": "internal_error", "param": null, "code": "internal_error" } } 500 {'error': {'message': 'Internal error', 'type': 'internal_error', 'param': None, 'code': 'internal_error'}} {'Date': 'Sat, 11 Mar 2023 12:44:50 GMT', 'Content-Type': 'application/json; charset=utf-8', 'Content-Length': '152', 'Connection': 'keep-alive', 'Vary': 'Origin', 'X-Request-Id': 'f10ec050ead37c907556b22b91fd888e', 'Strict-Transport-Security': 'max-age=15724800; includeSubDomains'}. 
Traceback (most recent call last): File "F:\test\ingestNeovim.py", line 31, in vectorstore = FAISS.from_documents(documents, embeddings) File "D:\miniconda3\lib\site-packages\langchain\vectorstores\base.py", line 113, in from_documents return cls.from_texts(texts, embedding, metadatas=metadatas, kwargs) File "D:\miniconda3\lib\site-packages\langchain\vectorstores\faiss.py", line 250, in from_texts embeddings = embedding.embed_documents(texts) File "D:\miniconda3\lib\site-packages\langchain\embeddings\openai.py", line 203, in embed_documents response = embed_with_retry( File "D:\miniconda3\lib\site-packages\langchain\embeddings\openai.py", line 53, in embed_with_retry return _completion_with_retry(kwargs) File "D:\miniconda3\lib\site-packages\tenacity\init__.py", line 289, in wrapped_f return self(f, *args, **kw) File "D:\miniconda3\lib\site-packages\tenacity\init.py", line 379, in call do = self.iter(retry_state=retry_state) File "D:\miniconda3\lib\site-packages\tenacity\init__.py", line 325, in iter raise retry_exc.reraise() File "D:\miniconda3\lib\site-packages\tenacity\init__.py", line 158, in reraise raise self.last_attempt.result() File "D:\miniconda3\lib\concurrent\futures_base.py", line 451, in result return self.get_result() File "D:\miniconda3\lib\concurrent\futures_base.py", line 403, in get_result raise self._exception File "D:\miniconda3\lib\site-packages\tenacity\init.py", line 382, in call__ result = fn(args, kwargs) File "D:\miniconda3\lib\site-packages\langchain\embeddings\openai.py", line 51, in _completion_with_retry return embeddings.client.create(kwargs) File "D:\miniconda3\lib\site-packages\openai\api_resources\embedding.py", line 34, in create response = super().create(args, **kwargs) File "D:\miniconda3\lib\site-packages\openai\api_resources\abstract\engine_apiresource.py", line 115, in create response, , api_key = requestor.request( File "D:\miniconda3\lib\site-packages\openai\api_requestor.py", line 181, in request resp, got_stream = 
self._interpret_response(result, stream) File "D:\miniconda3\lib\site-packages\openai\api_requestor.py", line 396, in _interpret_response self._interpret_response_line( File "D:\miniconda3\lib\site-packages\openai\api_requestor.py", line 429, in _interpret_response_line raise self.handle_error_response( openai.error.APIError: Internal error { "error": { "message": "Internal error", "type": "internal_error", "param": null, "code": "internal_error" } } 500 {'error': {'message': 'Internal error', 'type': 'internal_error', 'param': None, 'code': 'internal_error'}} {'Date': 'Sat, 11 Mar 2023 12:45:47 GMT', 'Content-Type': 'application/json; charset=utf-8', 'Content-Length': '152', 'Connection': 'keep-alive', 'Vary': 'Origin', 'X-Request-Id': '39e42631e12228f19c485055b24bb8fe', 'Strict-Transport-Security': 'max-age=15724800; includeSubDomains'}

SalehHindi commented 1 year ago

I am getting the same error. What did you do to solve this?