Closed YunchengLiang closed 10 months ago
This is the latest release 0.0.30
Can you provide your code to reproduce?
Can you provide your code to reproduce?
Yes for sure!
# Reproduction script: ingest an online PDF into an embedchain OpenSourceApp,
# using a HuggingFace tokenizer as the chunker's length function.
from embedchain import OpenSourceApp

chat_bot = OpenSourceApp()

from transformers import AutoTokenizer, AutoModel

# Tokenizer for all-MiniLM-L6-v2 — chunk sizes below are measured in its tokens.
tokenizer = AutoTokenizer.from_pretrained('sentence-transformers/all-MiniLM-L6-v2')


def huggingface_tokenizer_length(text: str) -> int:
    """Return the number of tokens the HF tokenizer produces for *text*."""
    return len(tokenizer.encode(text))


from embedchain.config import AddConfig, ChunkerConfig

# chunk_size/chunk_overlap are token counts because of the custom length_function.
chunker_config = ChunkerConfig(chunk_size=230, chunk_overlap=20, length_function=huggingface_tokenizer_length)
pdf_url = 'https://www.rogers.com/cms/pdf/en/Consumer_SUG_V20.pdf'  # online resources
chat_bot.add('pdf_file', pdf_url, config=AddConfig(chunker=chunker_config))
I can't reproduce. It works for me.
def use_pysqlite3():
    """
    Swap std-lib sqlite3 with pysqlite3.

    Installs ``pysqlite3-binary`` at runtime and registers it under the
    ``sqlite3`` key in ``sys.modules`` so later importers (e.g. chromadb)
    transparently get the newer SQLite build.
    NOTE(review): modules that imported ``sqlite3`` *before* this call still
    hold the old module object — call this as early as possible.
    """
    import subprocess
    import sys

    # Runtime pip install: workaround for platforms whose system SQLite is too
    # old for chromadb's requirements.
    subprocess.check_call([sys.executable, "-m", "pip", "install", "pysqlite3-binary"])
    __import__("pysqlite3")
    # Re-register the freshly imported module under the stdlib name.
    sys.modules["sqlite3"] = sys.modules.pop("pysqlite3")


# Don't be surprised if this doesn't log as you expect, because the logger is instantiated after the import
use_pysqlite3()
# Reproduction script variant: same as before but with a reset() of the
# persisted vector store before re-creating the app.
from embedchain import OpenSourceApp

chat_bot = OpenSourceApp()
chat_bot.reset()  # wipe any previously persisted vector-DB state
chat_bot = OpenSourceApp()  # re-create the app after the reset

from transformers import AutoTokenizer, AutoModel

tokenizer = AutoTokenizer.from_pretrained('sentence-transformers/all-MiniLM-L6-v2')


def huggingface_tokenizer_length(text: str) -> int:
    """Token count of *text* under the all-MiniLM-L6-v2 tokenizer."""
    return len(tokenizer.encode(text))


from embedchain.config import AddConfig, ChunkerConfig

chunker_config = ChunkerConfig(chunk_size=230, chunk_overlap=20, length_function=huggingface_tokenizer_length)
pdf_url = 'https://www.rogers.com/cms/pdf/en/Consumer_SUG_V20.pdf'  # online resources
chat_bot.add('pdf_file', pdf_url, config=AddConfig(chunker=chunker_config))
maybe the reset is the key?
I can't reproduce. It works for me.
# Quoted reproduction script, restored from its collapsed single-line form.
def use_pysqlite3():
    """Swap std-lib sqlite3 with pysqlite3."""
    import subprocess
    import sys
    subprocess.check_call([sys.executable, "-m", "pip", "install", "pysqlite3-binary"])
    __import__("pysqlite3")
    sys.modules["sqlite3"] = sys.modules.pop("pysqlite3")


# Don't be surprised if this doesn't log as you expect, because the logger is instantiated after the import
use_pysqlite3()

from embedchain import OpenSourceApp

chat_bot = OpenSourceApp()
chat_bot.reset()
chat_bot = OpenSourceApp()

from transformers import AutoTokenizer, AutoModel

tokenizer = AutoTokenizer.from_pretrained('sentence-transformers/all-MiniLM-L6-v2')


def huggingface_tokenizer_length(text: str) -> int:
    """Token count of *text* under the all-MiniLM-L6-v2 tokenizer."""
    return len(tokenizer.encode(text))


from embedchain.config import AddConfig, ChunkerConfig

chunker_config = ChunkerConfig(chunk_size=230, chunk_overlap=20, length_function=huggingface_tokenizer_length)
pdf_url = 'https://www.rogers.com/cms/pdf/en/Consumer_SUG_V20.pdf'  # online resources
chat_bot.add('pdf_file', pdf_url, config=AddConfig(chunker=chunker_config))
maybe the reset is the key?
I tried the reset but seems it is not the problem. I have some questions
TypeError: init_index(): incompatible function arguments. The following argument types are supported:
1. (self: hnswlib.Index, max_elements: int, M: int = 16, ef_construction: int = 200, random_seed: int = 100, allow_replace_deleted: bool = False) -> None
Invoked with: <hnswlib.Index(space='l2', dim=384)>; kwargs: max_elements=1000, ef_construction=100, M=16, is_persistent_index=True, persistence_location='db\\89cdd1e6-5c9c-49f0-9b16-96170d70598a'
Does this error suggest that arguments "is_persistent_index" and "persistence_location" are not supported arguments?
- Are we using the same libraries ( chromadb-0.4.2 embedchain-0.0.30) ?
I'm using the main branch of this repository. I don't think that makes the difference.
The error I get is
Is this the whole error? Where is the traceback?
I guess we need someone else to try to reproduce this; as I said, I can't reproduce your error. I don't know which package is throwing the error, or whether this is even an embedchain issue.
I get a new error now when I run your file
# Maintainer's test script: same reproduction but with DEBUG logging enabled
# and the newer add() signature (URL as the first positional argument).
from embedchain import OpenSourceApp
from embedchain.config import OpenSourceAppConfig

# DEBUG level so embedchain emits chunking/DB details while reproducing.
config = OpenSourceAppConfig(log_level="DEBUG")
chat_bot = OpenSourceApp(config=config)
chat_bot.reset()  # clear persisted Chroma state before re-adding
chat_bot = OpenSourceApp(config=config)

from transformers import AutoModel, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("sentence-transformers/all-MiniLM-L6-v2")


def huggingface_tokenizer_length(text: str) -> int:
    """Token count of *text* under the all-MiniLM-L6-v2 tokenizer."""
    return len(tokenizer.encode(text))


from embedchain.config import AddConfig, ChunkerConfig

chunker_config = ChunkerConfig(chunk_size=230, chunk_overlap=20, length_function=huggingface_tokenizer_length)
pdf_url = "https://www.rogers.com/cms/pdf/en/Consumer_SUG_V20.pdf"  # online resources
chat_bot.add(pdf_url, config=AddConfig(chunker=chunker_config))
error:
ERROR:chromadb.telemetry.posthog:Failed to send telemetry event client_start: module 'chromadb' has no attribute 'get_settings'
Found model file at /home/carl/.cache/gpt4all/orca-mini-3b.ggmlv3.q4_0.bin
llama.cpp: loading model from /home/carl/.cache/gpt4all/orca-mini-3b.ggmlv3.q4_0.bin
llama_model_load_internal: format = ggjt v3 (latest)
llama_model_load_internal: n_vocab = 32000
llama_model_load_internal: n_ctx = 2048
llama_model_load_internal: n_embd = 3200
llama_model_load_internal: n_mult = 240
llama_model_load_internal: n_head = 32
llama_model_load_internal: n_layer = 26
llama_model_load_internal: n_rot = 100
llama_model_load_internal: ftype = 2 (mostly Q4_0)
llama_model_load_internal: n_ff = 8640
llama_model_load_internal: n_parts = 1
llama_model_load_internal: model size = 3B
llama_model_load_internal: ggml ctx size = 0.06 MB
llama_model_load_internal: mem required = 2862.72 MB (+ 682.00 MB per state)
llama_new_context_with_model: kv self size = 650.00 MB
Found model file at /home/carl/.cache/gpt4all/orca-mini-3b.ggmlv3.q4_0.bin
llama.cpp: loading model from /home/carl/.cache/gpt4all/orca-mini-3b.ggmlv3.q4_0.bin
llama_model_load_internal: format = ggjt v3 (latest)
llama_model_load_internal: n_vocab = 32000
llama_model_load_internal: n_ctx = 2048
llama_model_load_internal: n_embd = 3200
llama_model_load_internal: n_mult = 240
llama_model_load_internal: n_head = 32
llama_model_load_internal: n_layer = 26
llama_model_load_internal: n_rot = 100
llama_model_load_internal: ftype = 2 (mostly Q4_0)
llama_model_load_internal: n_ff = 8640
llama_model_load_internal: n_parts = 1
llama_model_load_internal: model size = 3B
llama_model_load_internal: ggml ctx size = 0.06 MB
llama_model_load_internal: mem required = 2862.72 MB (+ 682.00 MB per state)
llama_new_context_with_model: kv self size = 650.00 MB
Traceback (most recent call last):
File "/home/carl/code/embedchain/test.py", line 24, in <module>
chat_bot.add(pdf_url, config=AddConfig(chunker=chunker_config))
File "/home/carl/code/embedchain/embedchain/embedchain.py", line 62, in add
self.load_and_embed(data_formatter.loader, data_formatter.chunker, source, metadata)
File "/home/carl/code/embedchain/embedchain/embedchain.py", line 86, in load_and_embed
existing_docs = self.collection.get(
^^^^^^^^^^^^^^^^^^^^
File "/home/carl/code/embedchain/.venv/lib/python3.11/site-packages/chromadb/api/models/Collection.py", line 134, in get
return self._client._get(
^^^^^^^^^^^^^^^^^^
File "/home/carl/code/embedchain/.venv/lib/python3.11/site-packages/chromadb/api/segment.py", line 312, in _get
metadata_segment = self._manager.get_segment(collection_id, MetadataReader)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/home/carl/code/embedchain/.venv/lib/python3.11/site-packages/chromadb/segment/impl/manager/local.py", line 106, in get_segment
segment = next(filter(lambda s: s["type"] in known_types, segments))
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
StopIteration
This is using an unmerged PR. I'm just posting this to keep an eye on it, because it works with the normal App. edit: tested in main, same error.
The error I get is
Is this the whole error? Where is the traceback?
I guess we need someone else to try to reproduce this; as I said, I can't reproduce your error. I don't know which package is throwing the error, or whether this is even an embedchain issue.
yeah.... i will ask one of my coworker to do the same and see what happens, will come back to this thread asap
I get a new error now when I run your file
# Quoted test script, restored from its collapsed single-line form.
from embedchain import OpenSourceApp
from embedchain.config import OpenSourceAppConfig

config = OpenSourceAppConfig(log_level="DEBUG")
chat_bot = OpenSourceApp(config=config)
chat_bot.reset()
chat_bot = OpenSourceApp(config=config)

from transformers import AutoModel, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("sentence-transformers/all-MiniLM-L6-v2")


def huggingface_tokenizer_length(text: str) -> int:
    """Token count of *text* under the all-MiniLM-L6-v2 tokenizer."""
    return len(tokenizer.encode(text))


from embedchain.config import AddConfig, ChunkerConfig

chunker_config = ChunkerConfig(chunk_size=230, chunk_overlap=20, length_function=huggingface_tokenizer_length)
pdf_url = "https://www.rogers.com/cms/pdf/en/Consumer_SUG_V20.pdf"  # online resources
chat_bot.add(pdf_url, config=AddConfig(chunker=chunker_config))
error:
ERROR:chromadb.telemetry.posthog:Failed to send telemetry event client_start: module 'chromadb' has no attribute 'get_settings' Found model file at /home/carl/.cache/gpt4all/orca-mini-3b.ggmlv3.q4_0.bin llama.cpp: loading model from /home/carl/.cache/gpt4all/orca-mini-3b.ggmlv3.q4_0.bin llama_model_load_internal: format = ggjt v3 (latest) llama_model_load_internal: n_vocab = 32000 llama_model_load_internal: n_ctx = 2048 llama_model_load_internal: n_embd = 3200 llama_model_load_internal: n_mult = 240 llama_model_load_internal: n_head = 32 llama_model_load_internal: n_layer = 26 llama_model_load_internal: n_rot = 100 llama_model_load_internal: ftype = 2 (mostly Q4_0) llama_model_load_internal: n_ff = 8640 llama_model_load_internal: n_parts = 1 llama_model_load_internal: model size = 3B llama_model_load_internal: ggml ctx size = 0.06 MB llama_model_load_internal: mem required = 2862.72 MB (+ 682.00 MB per state) llama_new_context_with_model: kv self size = 650.00 MB Found model file at /home/carl/.cache/gpt4all/orca-mini-3b.ggmlv3.q4_0.bin llama.cpp: loading model from /home/carl/.cache/gpt4all/orca-mini-3b.ggmlv3.q4_0.bin llama_model_load_internal: format = ggjt v3 (latest) llama_model_load_internal: n_vocab = 32000 llama_model_load_internal: n_ctx = 2048 llama_model_load_internal: n_embd = 3200 llama_model_load_internal: n_mult = 240 llama_model_load_internal: n_head = 32 llama_model_load_internal: n_layer = 26 llama_model_load_internal: n_rot = 100 llama_model_load_internal: ftype = 2 (mostly Q4_0) llama_model_load_internal: n_ff = 8640 llama_model_load_internal: n_parts = 1 llama_model_load_internal: model size = 3B llama_model_load_internal: ggml ctx size = 0.06 MB llama_model_load_internal: mem required = 2862.72 MB (+ 682.00 MB per state) llama_new_context_with_model: kv self size = 650.00 MB Traceback (most recent call last): File "/home/carl/code/embedchain/test.py", line 24, in <module> chat_bot.add(pdf_url, config=AddConfig(chunker=chunker_config)) File 
"/home/carl/code/embedchain/embedchain/embedchain.py", line 62, in add self.load_and_embed(data_formatter.loader, data_formatter.chunker, source, metadata) File "/home/carl/code/embedchain/embedchain/embedchain.py", line 86, in load_and_embed existing_docs = self.collection.get( ^^^^^^^^^^^^^^^^^^^^ File "/home/carl/code/embedchain/.venv/lib/python3.11/site-packages/chromadb/api/models/Collection.py", line 134, in get return self._client._get( ^^^^^^^^^^^^^^^^^^ File "/home/carl/code/embedchain/.venv/lib/python3.11/site-packages/chromadb/api/segment.py", line 312, in _get metadata_segment = self._manager.get_segment(collection_id, MetadataReader) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ File "/home/carl/code/embedchain/.venv/lib/python3.11/site-packages/chromadb/segment/impl/manager/local.py", line 106, in get_segment segment = next(filter(lambda s: s["type"] in known_types, segments)) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ StopIteration
This is using an unmerged PR. I'm just posting this to keep an eye on it, because it works with the normal App. edit: tested in main, same error.
It works fine on my coworker's computer... I guess something is wrong with my configuration. I will try using a new conda kernel.
Closing this as its very old.
🐛 Describe the bug