chroma-core / chroma

the AI-native open-source embedding database
https://www.trychroma.com/
Apache License 2.0
15.45k stars 1.3k forks source link

[Bug]: Chroma stops writing data after number of rows #2876

Closed DavoodHakimi closed 1 month ago

DavoodHakimi commented 1 month ago

What happened?

I have a strange problem: when I write a CSV into a Chroma DB with chromadb.PersistentClient, the code just stops after writing 99 rows, without any error or other output. This is my code:

import time
from os import listdir
from os.path import isfile, join

import chromadb
import pandas as pd
from langchain_huggingface import HuggingFaceEmbeddings
from tqdm import tqdm

# Hugging Face embedding model used to vectorise each document
# (any other Hugging Face embedding model name can be substituted).
TEXT_EMBEDDING_MODEL = "BAAI/bge-m3"
VECTOR_DB_NAME = "my_collection"  # name of the Chroma collection
VECTOR_DB_PATH = "./DB/"  # directory handed to the persistent Chroma client

# Regular files located directly inside the data directory.
docs_path = "./data/"
file_names = [f for f in listdir(docs_path) if isfile(join(docs_path, f))]

chroma_client = chromadb.PersistentClient(path=VECTOR_DB_PATH)
collection = chroma_client.get_or_create_collection(name=VECTOR_DB_NAME)
# Ids already stored, used to avoid writing duplicate documents. Only the ids
# are needed here, so ``include=[]`` avoids loading every stored document and
# metadata record into memory.
existing_ids = set(collection.get(include=[])["ids"])

text_embedder = HuggingFaceEmbeddings(
    model_name=TEXT_EMBEDDING_MODEL,
)

def write_2_DB(filename):
    """Embed the rows of an Excel sheet and add the new ones to Chroma.

    The first column of the sheet supplies the document id and the second
    column the document text. Rows whose id is already present in the
    module-level ``existing_ids`` set are skipped; every other row is
    embedded with ``text_embedder`` and written to ``collection`` one row
    at a time.

    Args:
        filename: Path of the spreadsheet, passed to ``pandas.read_excel``.

    Side effects:
        Prints progress to stdout, writes to ``collection`` and records
        every successfully written id in ``existing_ids``. A failure while
        writing a row is printed and the loop continues with the next row.
    """
    print("Trying to Read", filename)

    df = pd.read_excel(filename)

    print("\n Number of rows:", len(df))  # len() counts data rows only, not the header
    print("\n Loaded Successfully!")

    # Column 0 holds the ids, column 1 the text to embed.
    cols = df.columns
    id_column = df[cols[0]]
    text_column = df[cols[1]]

    for idx in df.index:
        print(idx)
        doc_id = str(id_column.loc[idx])
        if doc_id in existing_ids:
            continue

        doc_text = str(text_column.loc[idx])
        vector = text_embedder.embed_query(doc_text)

        ids_list = [doc_id]
        try:
            print(ids_list)
            # Every argument is a one-element list so ids, embeddings,
            # documents and metadatas stay aligned.
            collection.add(
                embeddings=[vector],
                documents=[doc_text],
                ids=ids_list,
                metadatas=[{"id": doc_id}],
            )
        except Exception as e:
            print(f"Error writing to Chroma: {e}")
        else:
            # Remember the id so a repeated id later in this sheet (or in a
            # later call) is not embedded and written a second time.
            existing_ids.add(doc_id)
    print("all done!")

# Example usage: load the rows of ./test.xlsx into the Chroma collection.
write_2_DB("./test.xlsx")

Versions

Python 3.10 , Chroma 0.5.11

Relevant log output

No response