chroma-core / chroma

the AI-native open-source embedding database
https://www.trychroma.com/
Apache License 2.0
14.58k stars 1.22k forks source link

vectorstore.similarity_search_with_score(query="query", k=4): error KeyError: 8 #260

Closed sergerdn closed 1 year ago

sergerdn commented 1 year ago

What am I missing here? I'm not sure whether I posted this in the right project; it might be a bug on the LangChain side.

[tool.poetry.dependencies]
python = "^3.10"
click = "^8.1.3"
langchain = "^0.0.126"
python-dotenv = "^1.0.0"
openai = "^0.27.2"
pypdf = "^3.7.0"
setuptools = "^67.6.1"
chromadb = {git = "ssh://git@github.com/chroma-core/chroma.git", rev = "main"}
C:\Users\Administrator\AppData\Local\pypoetry\Cache\virtualenvs\chatpine-dataloader-GLxZdGXB-py3.10\Scripts\python.exe "C:/Program Files/JetBrains/PyCharm 2022.3.2/plugins/python/helpers/pydev/pydevd.py" --multiprocess --qt-support=auto --client 127.0.0.1 --port 60732 --file D:\Projects\ChatPine\ChatPine-DataLoader\main_chromadb.py 
Connected to pydev debugger (build 231.8109.197)
INFO:chromadb:Running Chroma using direct local API.
WARNING:chromadb:Using embedded DuckDB with persistence: data will be stored in: D:\Projects\ChatPine\ChatPine-DataLoader\db
INFO:clickhouse_connect.driver.ctypes:Successfully imported ClickHouse Connect C data optimizations
INFO:clickhouse_connect.driver.ctypes:Successfully import ClickHouse Connect C/Numpy optimizations
INFO:clickhouse_connect.json_impl:Using python library for writing JSON byte strings
INFO:chromadb.db.duckdb:loaded in 56 embeddings
INFO:chromadb.db.duckdb:loaded in 1 collections
INFO:chromadb.db.duckdb:collection with name langchain_store already exists, returning existing collection
DEBUG:openai:message='Request to OpenAI API' method=post path=https://api.openai.com/v1/engines/text-embedding-ada-002/embeddings
DEBUG:openai:api_version=None data='{"input": ["FREDERICK"], "encoding_format": "base64"}' message='Post details'
DEBUG:openai:message='OpenAI API response' path=https://api.openai.com/v1/engines/text-embedding-ada-002/embeddings processing_ms=18 request_id=b0e58e9a2381b36fb523da78076f3161 response_code=200
DEBUG:chromadb.db.index.hnswlib:time to pre process our knn query: 0.0
DEBUG:chromadb.db.index.hnswlib:time to run knn query: 0.0
Traceback (most recent call last):
  File "C:\Program Files\JetBrains\PyCharm 2022.3.2\plugins\python\helpers\pydev\pydevd.py", line 1496, in _exec
    pydev_imports.execfile(file, globals, locals)  # execute the script
  File "C:\Program Files\JetBrains\PyCharm 2022.3.2\plugins\python\helpers\pydev\_pydev_imps\_pydev_execfile.py", line 18, in execfile
    exec(compile(contents+"\n", file, 'exec'), glob, loc)
  File "D:\Projects\ChatPine\ChatPine-DataLoader\main_chromadb.py", line 66, in <module>
    main()
  File "D:\Projects\ChatPine\ChatPine-DataLoader\main_chromadb.py", line 62, in main
    query_chromadb()
  File "D:\Projects\ChatPine\ChatPine-DataLoader\main_chromadb.py", line 57, in query_chromadb
    vectorstore.similarity_search_with_score(query="FREDERICK", k=8)
  File "D:\Projects\Pycharm\sergerdn\langchain\langchain\vectorstores\chroma.py", line 179, in similarity_search_with_score
    results = self._collection.query(
  File "C:\Users\Administrator\AppData\Local\pypoetry\Cache\virtualenvs\chatpine-dataloader-GLxZdGXB-py3.10\lib\site-packages\chromadb\api\models\Collection.py", line 203, in query
    return self._client._query(
  File "C:\Users\Administrator\AppData\Local\pypoetry\Cache\virtualenvs\chatpine-dataloader-GLxZdGXB-py3.10\lib\site-packages\chromadb\api\local.py", line 247, in _query
    uuids, distances = self._db.get_nearest_neighbors(
  File "C:\Users\Administrator\AppData\Local\pypoetry\Cache\virtualenvs\chatpine-dataloader-GLxZdGXB-py3.10\lib\site-packages\chromadb\db\clickhouse.py", line 521, in get_nearest_neighbors
    uuids, distances = index.get_nearest_neighbors(embeddings, n_results, ids)
  File "C:\Users\Administrator\AppData\Local\pypoetry\Cache\virtualenvs\chatpine-dataloader-GLxZdGXB-py3.10\lib\site-packages\chromadb\db\index\hnswlib.py", line 250, in get_nearest_neighbors
    ids = [[self._label_to_id[label] for label in labels] for labels in database_labels]
  File "C:\Users\Administrator\AppData\Local\pypoetry\Cache\virtualenvs\chatpine-dataloader-GLxZdGXB-py3.10\lib\site-packages\chromadb\db\index\hnswlib.py", line 250, in <listcomp>
    ids = [[self._label_to_id[label] for label in labels] for labels in database_labels]
  File "C:\Users\Administrator\AppData\Local\pypoetry\Cache\virtualenvs\chatpine-dataloader-GLxZdGXB-py3.10\lib\site-packages\chromadb\db\index\hnswlib.py", line 250, in <listcomp>
    ids = [[self._label_to_id[label] for label in labels] for labels in database_labels]
KeyError: 8
INFO:chromadb.db.duckdb:PersistentDuckDB del, about to run persist
INFO:chromadb.db.duckdb:Persisting DB to disk, putting it in the save folder: D:\Projects\ChatPine\ChatPine-DataLoader\db

Process finished with exit code 1
import logging
import os

import chromadb
from dotenv import load_dotenv
from langchain.document_loaders import PyPDFLoader
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import Chroma

# Anchor the on-disk store to this script's own directory (a "db" folder next
# to the file) so its location does not depend on the current working directory.
ABS_PATH = os.path.dirname(os.path.abspath(__file__))
DB_DIR = os.path.join(ABS_PATH, "db")

# Load settings from a .env file (python-dotenv), then enable DEBUG-level
# logging through the root logger configuration.
load_dotenv()
logging.basicConfig(level=logging.DEBUG)

def get_documents():
    """Return whatever ``PyPDFLoader.load()`` produces for the PDF fixture.

    The fixture path is relative, so it is resolved against the current
    working directory of the process.
    """
    pdf_loader = PyPDFLoader("fixtures/pdf/MorseVsFrederick.pdf")
    return pdf_loader.load()

def init_chromadb():
    """Fill the "langchain_store" collection and persist it under ``DB_DIR``.

    Steps, in order: build the Chroma client settings, create the OpenAI
    embedding wrapper, open the vector store, add the documents returned by
    ``get_documents()``, call ``persist()`` and print the store object.
    """
    settings = chromadb.config.Settings(
        chroma_db_impl="duckdb+parquet",
        persist_directory=DB_DIR,
        anonymized_telemetry=False,
    )
    openai_embeddings = OpenAIEmbeddings()

    store_options = {
        "collection_name": "langchain_store",
        "embedding_function": openai_embeddings,
        "client_settings": settings,
        "persist_directory": DB_DIR,
    }
    store = Chroma(**store_options)

    # The embedding object is also forwarded as a keyword argument, exactly as
    # in the original call.
    docs = get_documents()
    store.add_documents(documents=docs, embedding=openai_embeddings)
    store.persist()
    print(store)

def query_chromadb():
    """Open the persisted "langchain_store" collection and run one scored query.

    The store is opened with the same settings used for ingestion
    (duckdb+parquet under ``DB_DIR``). The search result is not used or
    returned; the function exists only to trigger the query.
    """
    settings = chromadb.config.Settings(
        chroma_db_impl="duckdb+parquet",
        persist_directory=DB_DIR,
        anonymized_telemetry=False,
    )
    openai_embeddings = OpenAIEmbeddings()

    store_options = {
        "collection_name": "langchain_store",
        "embedding_function": openai_embeddings,
        "client_settings": settings,
        "persist_directory": DB_DIR,
    }
    store = Chroma(**store_options)

    # This is the call the reporter marked as failing with the KeyError.
    store.similarity_search_with_score(query="FREDERICK", k=4)

def main() -> None:
    """Script entry point: run the similarity query via ``query_chromadb()``."""
    # Ingestion is left disabled here; presumably the store was populated by an
    # earlier run of init_chromadb() -- TODO confirm before relying on it.
    # init_chromadb()
    query_chromadb()

# Run main() only when this file is executed as a script, not when imported.
if __name__ == '__main__':
    main()
sergerdn commented 1 year ago

I tested the query with the client, but I received the same error.

def query_chromadb_direct():
    """Query "langchain_store" through the chromadb client, bypassing LangChain.

    The query text is embedded with ``OpenAIEmbeddings`` and the resulting
    vector is passed to ``collection.query``; the raw result is printed.
    """
    settings = chromadb.config.Settings(
        chroma_db_impl="duckdb+parquet",
        persist_directory=DB_DIR,
        anonymized_telemetry=False,
    )
    chroma_client = chromadb.Client(settings=settings)
    store_collection = chroma_client.get_collection("langchain_store")

    # Embed the query text up front, then ask for the two nearest neighbours.
    openai_embeddings = OpenAIEmbeddings()
    query_vector = openai_embeddings.embed_query("FREDERICK")
    hits = store_collection.query(query_embeddings=query_vector, n_results=2)

    print(hits)
C:\Users\Administrator\AppData\Local\pypoetry\Cache\virtualenvs\chatpine-dataloader-GLxZdGXB-py3.10\Scripts\python.exe D:\Projects\ChatPine\ChatPine-DataLoader\main_chromadb.py 
INFO:chromadb:Running Chroma using direct local API.
WARNING:chromadb:Using embedded DuckDB with persistence: data will be stored in: D:\Projects\ChatPine\ChatPine-DataLoader\db
INFO:clickhouse_connect.driver.ctypes:Successfully imported ClickHouse Connect C data optimizations
INFO:clickhouse_connect.driver.ctypes:Successfully import ClickHouse Connect C/Numpy optimizations
INFO:clickhouse_connect.json_impl:Using python library for writing JSON byte strings
INFO:chromadb.db.duckdb:loaded in 56 embeddings
INFO:chromadb.db.duckdb:loaded in 1 collections
WARNING:chromadb.api.models.Collection:No embedding_function provided, using default embedding function: SentenceTransformerEmbeddingFunction
INFO:sentence_transformers.SentenceTransformer:Load pretrained SentenceTransformer: all-MiniLM-L6-v2
INFO:sentence_transformers.SentenceTransformer:Use pytorch device: cpu
DEBUG:openai:message='Request to OpenAI API' method=post path=https://api.openai.com/v1/engines/text-embedding-ada-002/embeddings
DEBUG:openai:api_version=None data='{"input": ["FREDERICK"], "encoding_format": "base64"}' message='Post details'
DEBUG:openai:message='OpenAI API response' path=https://api.openai.com/v1/engines/text-embedding-ada-002/embeddings processing_ms=15 request_id=efcbef269f83499e492a4093cffee704 response_code=200
DEBUG:chromadb.db.index.hnswlib:time to pre process our knn query: 0.0
DEBUG:chromadb.db.index.hnswlib:time to run knn query: 0.0
Traceback (most recent call last):
  File "D:\Projects\ChatPine\ChatPine-DataLoader\main_chromadb.py", line 89, in <module>
    main()
  File "D:\Projects\ChatPine\ChatPine-DataLoader\main_chromadb.py", line 85, in main
    query_chromadb_direct()
  File "D:\Projects\ChatPine\ChatPine-DataLoader\main_chromadb.py", line 74, in query_chromadb_direct
    results = collection.query(
  File "C:\Users\Administrator\AppData\Local\pypoetry\Cache\virtualenvs\chatpine-dataloader-GLxZdGXB-py3.10\lib\site-packages\chromadb\api\models\Collection.py", line 203, in query
    return self._client._query(
  File "C:\Users\Administrator\AppData\Local\pypoetry\Cache\virtualenvs\chatpine-dataloader-GLxZdGXB-py3.10\lib\site-packages\chromadb\api\local.py", line 247, in _query
    uuids, distances = self._db.get_nearest_neighbors(
  File "C:\Users\Administrator\AppData\Local\pypoetry\Cache\virtualenvs\chatpine-dataloader-GLxZdGXB-py3.10\lib\site-packages\chromadb\db\clickhouse.py", line 521, in get_nearest_neighbors
    uuids, distances = index.get_nearest_neighbors(embeddings, n_results, ids)
  File "C:\Users\Administrator\AppData\Local\pypoetry\Cache\virtualenvs\chatpine-dataloader-GLxZdGXB-py3.10\lib\site-packages\chromadb\db\index\hnswlib.py", line 250, in get_nearest_neighbors
    ids = [[self._label_to_id[label] for label in labels] for labels in database_labels]
  File "C:\Users\Administrator\AppData\Local\pypoetry\Cache\virtualenvs\chatpine-dataloader-GLxZdGXB-py3.10\lib\site-packages\chromadb\db\index\hnswlib.py", line 250, in <listcomp>
    ids = [[self._label_to_id[label] for label in labels] for labels in database_labels]
  File "C:\Users\Administrator\AppData\Local\pypoetry\Cache\virtualenvs\chatpine-dataloader-GLxZdGXB-py3.10\lib\site-packages\chromadb\db\index\hnswlib.py", line 250, in <listcomp>
    ids = [[self._label_to_id[label] for label in labels] for labels in database_labels]
KeyError: 8
INFO:chromadb.db.duckdb:PersistentDuckDB del, about to run persist
INFO:chromadb.db.duckdb:Persisting DB to disk, putting it in the save folder: D:\Projects\ChatPine\ChatPine-DataLoader\db

Process finished with exit code 1
sergerdn commented 1 year ago

I ran poetry update to update the package from Git to the latest version, and that appears to have fixed the problem. I think one of your latest commits fixed it.