Closed nishanthc-nd closed 1 year ago
Should work:
import logging
import os
import chromadb
from dotenv import load_dotenv
from langchain.document_loaders import PyPDFLoader
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import Chroma
load_dotenv()  # pull OPENAI_API_KEY and friends from a local .env file
logging.basicConfig(level=logging.DEBUG)
# Persist the Chroma database in a "db" directory next to this script.
ABS_PATH = os.path.dirname(os.path.abspath(__file__))
DB_DIR = os.path.join(ABS_PATH, "db")
def get_documents():
    """Load the Morse v. Frederick fixture PDF and return its pages."""
    loader = PyPDFLoader("fixtures/pdf/MorseVsFrederick.pdf")
    return loader.load()
def init_chromadb():
    """Build the persistent Chroma store and index the PDF documents."""
    # DuckDB + parquet backend persisted under DB_DIR; telemetry disabled.
    settings = chromadb.config.Settings(
        chroma_db_impl="duckdb+parquet",
        persist_directory=DB_DIR,
        anonymized_telemetry=False,
    )
    embedder = OpenAIEmbeddings()
    store = Chroma(
        collection_name="langchain_store",
        embedding_function=embedder,
        client_settings=settings,
        persist_directory=DB_DIR,
    )
    store.add_documents(documents=get_documents(), embedding=embedder)
    store.persist()  # flush the index to disk so later runs can load it
    print(store)
def query_chromadb():
    """Open the persisted Chroma store and run a similarity search.

    Returns:
        The list of (Document, score) pairs for the query so callers can
        inspect the matches.
    """
    client_settings = chromadb.config.Settings(
        chroma_db_impl="duckdb+parquet",
        persist_directory=DB_DIR,
        anonymized_telemetry=False
    )
    embeddings = OpenAIEmbeddings()
    vectorstore = Chroma(
        collection_name="langchain_store",
        embedding_function=embeddings,
        client_settings=client_settings,
        persist_directory=DB_DIR,
    )
    # Bug fix: the search result was computed but never used; capture,
    # print, and return it so the query actually reports its matches.
    result = vectorstore.similarity_search_with_score(query="FREDERICK", k=4)
    print(result)
    return result
def main():
    """Index the documents, then query the freshly persisted store."""
    init_chromadb()
    query_chromadb()


if __name__ == '__main__':
    main()
I tried the code given by @sergerdn still not working. The place where I am retrieving the persistent storage is in RetrievalQA method.
collection_name="long-docs" persist_directory="/content/sample_data/chromadb/" client_settings = Settings( chroma_db_impl="duckdb+parquet", persist_directory=persist_directory, # Optional, defaults to .chromadb/ in the current directory anonymized_telemetry=False )
vectorstore = Chroma( collection_name=collection_name, embedding_function=embeddings, client_settings=client_settings, persist_directory=persist_directory, )
Then for QA qar = RetrievalQA.from_chain_type(llm=local_llm, chain_type="stuff", retriever = vectorstore.as_retriever(), chain_type_kwargs=chain_type_kwargs,return_source_documents=True)
The vectorstore here is not accessible. I have persisted the db using persist(). Yet I see this error. Error: NoIndexException: Index not found, please create an instance before querying
Index folder structure : chromadb index chroma-collections.parquet chroma-embeddings.parquet
@kavlata
Can you confirm whether you tried to run my code with no modifications and whether it did not work as expected?
@sergerdn The following code didn't work. I ran this on Google Colab.
import chromadb from chromadb.config import Settings collection_name="long-docs" persist_directory="/content/sample_data/chromadb/" client_settings = Settings( chroma_db_impl="duckdb+parquet", persist_directory=persist_directory, # Optional, defaults to .chromadb/ in the current directory anonymized_telemetry=False )
vectorstore = Chroma( collection_name=collection_name, embedding_function=embeddings, client_settings=client_settings, persist_directory=persist_directory, ) result = vectorstore.similarity_search_with_score(query="contract", k=4) print(result)
Error: NoIndexException: Index not found, please create an instance before querying
Please provide me with your full code for reproducing errors, including the code for inserting data into ChromaDB. Additionally, please use backticks (`) when you post your code.
Thanks @sergerdn for guiding. Python code is below.
# NOTE(review): reformatted — the original paste had its indentation stripped.
from langchain.text_splitter import CharacterTextSplitter, TextSplitter  # , NLTKTextSplitter
from llama_index import SimpleDirectoryReader
from langchain.embeddings.huggingface import HuggingFaceEmbeddings
from langchain.vectorstores import Chroma
import chromadb
from chromadb.config import Settings

# Load the source documents (keep them OUTSIDE the Chroma persist directory).
documents = SimpleDirectoryReader('/content/sample_data/', required_exts='.txt').load_langchain_documents()

from langchain.text_splitter import RecursiveCharacterTextSplitter

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=400,
    chunk_overlap=20,
    length_function=len,
    separators=["\n\n", "\n", " ", ""],
)
texts = text_splitter.split_documents(documents)

# Bug fix: `embeddings` was used below but never defined in this script;
# instantiate the HuggingFace embedder that the rest of the code relies on.
embeddings = HuggingFaceEmbeddings()

collection_name = "long-docs"
persist_directory = "/content/sample_data/chromadb/"

# First pass: build the store from the chunked documents and persist it.
docsearch_db = Chroma.from_documents(
    texts,
    embeddings,
    collection_name=collection_name,
    persist_directory=persist_directory,
)
docsearch_db.persist()

# Second pass: reopen the persisted store with explicit client settings.
collection_name = "long-docs"
persist_directory = "/content/sample_data/chromadb/"
client_settings = Settings(
    chroma_db_impl="duckdb+parquet",
    persist_directory=persist_directory,  # Optional, defaults to .chromadb/ in the current directory
    anonymized_telemetry=False
)
vectorstore = Chroma(
    collection_name=collection_name,
    embedding_function=embeddings,
    client_settings=client_settings,
    persist_directory=persist_directory,
)
result = vectorstore.similarity_search_with_score(query="contract", k=4)
print(result)
Do not use the same directory for both the Chrome database and documents under any circumstances. I believe they should be in different directories:
documents = SimpleDirectoryReader('/content/sample_data/source_docs/',required_exts='.txt').load_langchain_documents()
persist_directory="/content/sample_data/chromadb/"
I will look into your code ASAP.
@kavlata
Look at my code, I use:
# settings for ChromaDB
client_settings = chromadb.config.Settings(
chroma_db_impl="duckdb+parquet",
persist_directory=DB_DIR,
anonymized_telemetry=False
)
# create instance
vectorstore = Chroma(
collection_name="langchain_store",
embedding_function=embeddings,
client_settings=client_settings,
persist_directory=DB_DIR,
)
# add docs
vectorstore.add_documents(documents=get_documents(), embedding=embeddings)
vectorstore.persist()
You use another API:
texts = text_splitter.split_documents(documents)
# You don't have a database at the moment. The referee for you has encountered an error.
# We seem to have encountered a bug or an undocumented feature
# as it does not match the expected behaviour of creating a DB, which should have been already created.
docsearch_db = Chroma.from_documents(
texts, embeddings,
collection_name=collection_name, persist_directory=persist_directory
)
Could you please use my api and confirm whether it works on your end?
Also, please provide any links to the documentation that you are reading when you write your scripts. This is important because if we have some documented API in the documentation, but it does not work as expected, I believe it is a bug.
I have tested my code once again and can confirm that it is working correctly.
import logging
import os
import chromadb
from dotenv import load_dotenv
from langchain.document_loaders import PyPDFLoader
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import Chroma
load_dotenv()  # load OPENAI_API_KEY from a local .env file
logging.basicConfig(level=logging.DEBUG)
# The Chroma database lives in a "db" directory next to this script.
ABS_PATH = os.path.dirname(os.path.abspath(__file__))
DB_DIR = os.path.join(ABS_PATH, "db")
def get_documents():
    """Read every page of the fixture PDF into langchain documents."""
    pdf = PyPDFLoader("fixtures/pdf/MorseVsFrederick.pdf")
    return pdf.load()
def init_chromadb():
    """Index the PDF pages into a persisted Chroma collection on disk."""
    embeddings = OpenAIEmbeddings()
    vectorstore = Chroma(
        collection_name="langchain_store",
        embedding_function=embeddings,
        # DuckDB + parquet backend, persisted under DB_DIR, telemetry off.
        client_settings=chromadb.config.Settings(
            chroma_db_impl="duckdb+parquet",
            persist_directory=DB_DIR,
            anonymized_telemetry=False,
        ),
        persist_directory=DB_DIR,
    )
    vectorstore.add_documents(documents=get_documents(), embedding=embeddings)
    vectorstore.persist()
def query_chromadb():
    """Reopen the persisted store and print the top-4 matches for 'FREDERICK'."""
    settings = chromadb.config.Settings(
        chroma_db_impl="duckdb+parquet",
        persist_directory=DB_DIR,
        anonymized_telemetry=False,
    )
    store = Chroma(
        collection_name="langchain_store",
        embedding_function=OpenAIEmbeddings(),
        client_settings=settings,
        persist_directory=DB_DIR,
    )
    matches = store.similarity_search_with_score(query="FREDERICK", k=4)
    print(matches)
def main():
    # First run only: also call init_chromadb() to build the database.
    # init_chromadb()
    query_chromadb()


if __name__ == '__main__':
    main()
INFO:chromadb:Running Chroma using direct local API.
WARNING:chromadb:Using embedded DuckDB with persistence: data will be stored in: research/db
INFO:clickhouse_connect.driver.ctypes:Successfully imported ClickHouse Connect C data optimizations
INFO:clickhouse_connect.driver.ctypes:Successfully import ClickHouse Connect C/Numpy optimizations
INFO:clickhouse_connect.json_impl:Using python library for writing JSON byte strings
INFO:chromadb.db.duckdb:loaded in 56 embeddings
INFO:chromadb.db.duckdb:loaded in 1 collections
INFO:chromadb.db.duckdb:collection with name langchain_store already exists, returning existing collection
DEBUG:openai:message='Request to OpenAI API' method=post path=https://api.openai.com/v1/engines/text-embedding-ada-002/embeddings
DEBUG:openai:api_version=None data='{"input": ["FREDERICK"], "encoding_format": "base64"}' message='Post details'
DEBUG:openai:message='OpenAI API response' path=https://api.openai.com/v1/engines/text-embedding-ada-002/embeddings processing_ms=165 request_id=525955271cefffd5ab3e60aa99a7a5b4 response_code=200
DEBUG:chromadb.db.index.hnswlib:time to pre process our knn query: 3.0994415283203125e-06
DEBUG:chromadb.db.index.hnswlib:time to run knn query: 0.00028586387634277344
INFO:chromadb.db.duckdb:PersistentDuckDB del, about to run persist
INFO:chromadb.db.duckdb:Persisting DB to disk, putting it in the save folder: research/db
[(Document(page_content='551US2\nUnit:\n$U68\n[09-20-11 18:50:10] PAGES PGT:\nOPIN\n400 \nMORSE \nv. \nFREDERICK \nOpinion of the Court \nso āclearly establishedā that a reasonable principal in Morseās \nposition would have understood that her actions were uncon\xad\nstitutional, and that Morse was therefore not entitled to \nqualiļ¬ed immunity. \nId., \nat 1123ā1125. \nWe granted certiorari on two questions: whether Freder\xad\nick \nhad a First Amendment right to wield his banner, and, if \nso, whether that right was so clearly established that the \nprincipal may be held liable for damages. 549 U. S. 1075 \n(2006). We resolve the ļ¬rst question against Frederick, and \ntherefore have no occasion to reach the second.\n1 \nII \nAt the outset, we reject Frederickās argument that this is \nnot a school speech caseāas has every other authority to \naddress the question. See App. 22ā23 (Principal Morse); \nApp. to Pet. for Cert. 63a (superintendent); \nid., \nat 69a (school \nboard); \nid., \nat 34aā35a (District Court); 439 F. 3d, at 1117 \n(Ninth Circuit). The event occurred during normal school \nhours. It was sanctioned by Principal Morse āas an ap\xad\nproved social event or class trip,ā App. 22ā23, and the school \ndistrictās rules expressly provide that pupils in āapproved \nsocial events and class trips are subject to district rules for \n1 \nJustice\n \nBreyer\n \nwould rest decision on qualiļ¬ed immunity without \nreaching the underlying First Amendment question. The problem with \nthis approach is the rather signiļ¬cant one that it is inadequate to decide \nthe case before us. Qualiļ¬ed immunity shields public ofļ¬cials from money \ndamages only. See \nWood \nv. \nStrickland, \n420 U. S. 308, 314, n. 6 (1975). In \nthis case, Frederick asked not just for damages, but also for declaratory \nand injunctive relief. App. 13. 
\nJustice\n \nBreyer\nās proposed decision on \nqualiļ¬ed immunity grounds would dispose of the damages claims, but \nFrederickās other claims would remain unaddressed. To get around that \nproblem, \nJustice\n \nBreyer\n \nhypothesizes that Frederickās suspensionāthe \ntarget of his request for injunctive reliefāāmay well be justiļ¬ed on non\xad\nspeech-related grounds.ā See \npost, \nat 433 (opinion concurring in judg\xad\nment in part and dissenting in part). That hypothesis was never consid\xad\nered by the courts below, never raised by any of the parties, and is belied \nby the record, which nowhere suggests that the suspension would have \nbeen justiļ¬ed solely on non-speech-related grounds. ', metadata={'source': 'fixtures/pdf/MorseVsFrederick.pdf', 'page': 7}), 0.3957888185977936), (Document(page_content='551US2\nUnit:\n$U68\n[09-20-11 18:50:10] PAGES PGT:\nOPIN\n442 \nMORSE \nv. \nFREDERICK \nStevens,\n \nJ\n.,\n \ndissenting \ndenburg, \n395 U. S., at 447ā448, yet would permit a listenerās \nperceptions to determine which speech deserved constitu\xad\ntional protection.\n5 \nSuch a peculiar doctrine is alien to our case law. In \nAbrams \nv. \nUnited States, \n250 U. S. 616 (1919), this Court \nafļ¬rmed the conviction of a group of Russian ārebels, revolu\xad\ntionists, [and] anarchists,ā \nid., \nat 617ā618 (internal quotation \nmarks omitted), on the ground that the leaļ¬ets they distrib\xad\nuted were thought to āincite, provoke and encourage resist\xad\nance to the United States,ā \nid., \nat 617 (internal quotation \nmarks omitted). Yet Justice Holmesā dissentāwhich has \nemphatically carried the dayānever inquired into the rea\xad\nsonableness of the United Statesā judgment that the leaļ¬ets \nwould likely undermine the war effort. 
The dissent instead \nridiculed that judgment: ā[N]obody can suppose that the sur\xad\nreptitious publishing of a silly leaļ¬et by an unknown man, \nwithout more, would present any immediate danger that its \nopinions would hinder the success of the government arms \nor have any appreciable tendency to do so.ā \nId., \nat 628. In \nThomas \nv. \nCollins, \n323 U. S. 516 (1945) (opinion for the Court \nby Rutledge, J.), we overturned the conviction of a union \norganizer who violated a restraining order prohibiting him \nfrom exhorting workers. In so doing, we held that the dis\xad\ntinction between advocacy and incitement could not depend \non how one of those workers might have understood the or\xad\nganizerās speech. That would āpu[t] the speaker in these cir\xad\ncumstances wholly at the mercy of the varied understanding \nof his hearers and consequently of whatever inference may \n5 \nThe reasonableness of the view that Frederickās message was unpro\xad\ntected speech is relevant to ascertaining whether qualiļ¬ed immunity \nshould shield the principal from liability, not to whether her actions vio\xad\nlated Frederickās constitutional rights. Cf. \nSaucier \nv. \nKatz, \n533 U. S. 194, \n202 (2001) (āThe relevant, dispositive inquiry in determining whether a \nright is clearly established is whether it would be clear to a reasonable \nofļ¬cer that his conduct was unlawful in the situation he confrontedā). ', metadata={'source': 'fixtures/pdf/MorseVsFrederick.pdf', 'page': 49}), 0.39652174711227417), (Document(page_content='551US2\nUnit:\n$U68\n[09-20-11 18:50:10] PAGES PGT:\nOPIN\n393 \nOCTOBER TERM, 2006 \nSyllabus \nMORSE \net\n \nal.\n \nv. \nFREDERICK \ncertiorari\n \nto\n \nthe\n \nunited\n \nstates\n \ncourt\n \nof\n \nappeals\n \nfor\n \nthe\n \nninth\n \ncircuit\n \nNo. 06ā278. 
Argued March 19, 2007āDecided June 25, 2007 \nAt a school-sanctioned and school-supervised event, petitioner Morse, the \nh\nigh school principal, saw students unfurl a banner stating āBONG \nHiTS 4 JESUS,ā which she regarded as promoting illegal drug use. \nConsistent with established school policy prohibiting such messages at \nschool events, Morse directed the students to take down the banner. \nWhen one of the students who had brought the banner to the eventā \nrespondent Frederickārefused, Morse conļ¬scated the banner and later \nsuspended him. The school superintendent upheld the suspension, ex\xad\nplaining, \ninter alia, \nthat Frederick was disciplined because his banner \nappeared to advocate illegal drug use in violation of school policy. Peti\xad\ntioner school board also upheld the suspension. Frederick ļ¬led suit \nunder 42 U. S. C. Ā§ 1983, alleging that the school board and Morse had \nviolated his First Amendment rights. The District Court granted peti\xad\ntioners summary judgment, ruling that they were entitled to qualiļ¬ed \nimmunity and that they had not infringed Frederickās speech rights. \nThe Ninth Circuit reversed. Accepting that Frederick acted during a \nschool-authorized activity and that the banner expressed a positive sen\xad\ntiment about marijuana use, the court nonetheless found a First Amend\xad\nment violation because the school punished Frederick without demon\xad\nstrating that his speech threatened substantial disruption. It also \nconcluded that Morse was not entitled to qualiļ¬ed immunity because \nFrederickās right to display the banner was so clearly established that \na reasonable principal in Morseās position would have understood that \nher actions were unconstitutional. 
\nHeld: \nBecause \nschools may take steps to safeguard those entrusted to \ntheir care from speech that can reasonably be regarded as encouraging \nillegal drug use, the school ofļ¬cials in this case did not violate the First \nAmendment by conļ¬scating the pro-drug banner and suspending Fred\xad\nerick. Pp. 400ā410. \n(a) \nFrederickās argument that this is not a school speech case is re\xad\njec\nted. The event in question occurred during normal school hours and \nwas sanctioned by Morse as an approved social event at which the dis\xad\ntrictās student conduct rules expressly applied. Teachers and adminis\xad\ntrators were among the students and were charged with supervising \nthem. Frederick stood among other students across the street from ', metadata={'source': 'fixtures/pdf/MorseVsFrederick.pdf', 'page': 0}), 0.40352582931518555), (Document(page_content='551US2\nUnit:\n$U68\n[09-20-11 18:50:10] PAGES PGT:\nOPIN\n398 \nMORSE \nv. \nFREDERICK \nOpinion of the Court \nPrincipal Morse immediately crossed the street and de\xad\nmanded that the banner be taken down. Everyone but \nFrederick complied. Morse conļ¬scated the banner and told \nFrederick to report to her ofļ¬ce, where she suspended him \nfor 10 days. Morse later explained that she told Frederick \nto take the banner down because she thought it encouraged \nillegal drug use, in violation of established school policy. \nJuneau School Board Policy No. 5520 states: āThe Board spe\xad\nciļ¬cally prohibits any assembly or public expression that \n. . . advocates the use of substances that are illegal to \nm i n o r s.... ā \nId., \nat 53a. In addition, Juneau School Board \nPolicy No. 5850 subjects ā[p]upils who participate in ap\xad\nproved social events and class tripsā to the same student \nconduct rules that apply during the regular school program. \nId., \nat 58a. 
\nFrederick administratively appealed his suspension, but \nthe \nJuneau School District Superintendent upheld it, limiting \nit to time served (eight days). In a memorandum setting \nforth his reasons, the superintendent determined that Fred\xad\nerick had displayed his banner āin the midst of his fellow \nstudents, during school hours, at a school-sanctioned activ\xad\nity.ā \nId., \nat 63a. He further explained that Frederick āwas \nnot disciplined because the principal of the school ādisagreedā \nwith his message, but because his speech appeared to advo\xad\ncate the use of illegal drugs.ā \nId., \nat 61a. \nThe superintendent continued: \nāThe common-sense understanding of the phrase \nā\nbong hitsā is that it is a reference to a means of smoking \nmarijuana. Given [Frederickās] inability or unw\nilling\xad\nness to express any other credible meaning for the \nphrase, I can only agree with the principal and countless \nothers who saw the banner as advocating the use of ille\xad\ngal drugs. [Frederickās] speech was not political. He \nwas not advocating the legalization of marijuana or pro\xad\nmoting a religious belief. He was displaying a fairly \nsilly message promoting illegal drug usage in the midst ', metadata={'source': 'fixtures/pdf/MorseVsFrederick.pdf', 'page': 5}), 0.4070335924625397)]
Process finished with exit code 0
pyproject.toml
:
[tool.poetry.dependencies]
python = "^3.10"
langchain = "^0.0.132"
#chromadb = {git = "ssh://git@github.com/hwchase17/langchain.git", rev = "main"}
python-dotenv = "^1.0.0"
openai = "^0.27.2"
pypdf = "^3.7.0"
elasticsearch = "^8.6.2"
setuptools = "^67.6.1"
chromadb = "^0.3.17"
#chromadb = {git = "ssh://git@github.com/chroma-core/chroma.git", rev = "main"}
Thanks so very much @sergerdn It works now.
add_documents and then persist() is working. Thanks !!!
The documentation I was referring to is below
@kavlata
It seems like we have a bug in the code.
Facing the same issue even after following steps from here. Also installed libs with the exact version as specified.
chromadb.errors.NoIndexException: Index not found, please create an instance before querying
It works if I call like this:
def main():
init_chromadb()
query_chromadb()
==> This essentially loads the documents, persist it, and then makes query to vectorstore.
But fails if I restart the Jupyter kernel and run with below code:
def main():
query_chromadb()
==> This tries to load the vectorstore and makes query to vectorstore.
Any suggestion please, how can I fix?
@chintan-donda
Try creating the directory for the database first.
@sergerdn When should I create the directory — at the init_chromadb() step or the query_chromadb() step? Can you please share a sample code snippet?
@sergerdn When to create directory? At the time of init_chromadb() step or query_chromadb()? Can u pls share the sample code snippet?
Why not create the directory if it does not already exist before doing anything? What could be the problem? I'm sorry, but I don't understand you.
@sergerdn I'm already creating the directory if not exist, before you suggested. Still the same issue.
@sergerdn I'm already creating the directory if not exist, before you suggested. Still the same issue.
Post your code to repeat your problem. Please do not use a Python notebook, give me a script.
@chintan-donda
Tested and it works. Make sure to uncomment init_chromadb()
during the first run to create the database with documents. During the second run, only execute query_chromadb
.
Please do not use Python notebooks, as they create isolated environments, and you may not have the same environment during a second run.
import json
import logging
import os
import re
import chromadb
from dotenv import load_dotenv
from fastapi.encoders import jsonable_encoder
from langchain.document_loaders import PyPDFLoader
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import Chroma
load_dotenv()  # load OPENAI_API_KEY from a local .env file
logging.basicConfig(level=logging.DEBUG)
# Persist the Chroma database in a "db" directory next to this script.
ABS_PATH = os.path.dirname(os.path.abspath(__file__))
DB_DIR = os.path.join(ABS_PATH, "db")
def replace_newlines_and_spaces(text):
    """Collapse newlines and runs of whitespace into single spaces."""
    # Newlines become spaces first, then any whitespace run is squeezed
    # down to one space so page text reads as a single flowing line.
    without_newlines = text.replace("\n", " ")
    return re.sub(r'\s+', ' ', without_newlines)
def get_documents():
    """Load fixtures/pdf/MorseVsFrederick.pdf and return its pages."""
    loader = PyPDFLoader("fixtures/pdf/MorseVsFrederick.pdf")
    return loader.load()
def init_chromadb():
    """Create the persistent Chroma store and index the cleaned PDF pages."""
    # os.makedirs(exist_ok=True) avoids the check-then-create race of the
    # original `if not exists: os.mkdir(...)` and also creates any missing
    # parent directories.
    os.makedirs(DB_DIR, exist_ok=True)
    client_settings = chromadb.config.Settings(
        chroma_db_impl="duckdb+parquet",
        persist_directory=DB_DIR,
        anonymized_telemetry=False
    )
    embeddings = OpenAIEmbeddings()
    vectorstore = Chroma(
        collection_name="langchain_store",
        embedding_function=embeddings,
        client_settings=client_settings,
        persist_directory=DB_DIR,
    )
    # Normalize whitespace in each page before indexing.
    documents = []
    for doc in get_documents():
        doc.page_content = replace_newlines_and_spaces(doc.page_content)
        documents.append(doc)
    vectorstore.add_documents(documents=documents, embedding=embeddings)
    vectorstore.persist()  # flush to disk so a later run can query it
    print(vectorstore)
def query_chromadb():
    """Load the persisted store and print the matches as indented JSON.

    Raises:
        Exception: if DB_DIR has not been created/populated yet.
    """
    if not os.path.exists(DB_DIR):
        raise Exception(f"{DB_DIR} does not exist, nothing can be queried")
    settings = chromadb.config.Settings(
        chroma_db_impl="duckdb+parquet",
        persist_directory=DB_DIR,
        anonymized_telemetry=False
    )
    embedder = OpenAIEmbeddings()
    store = Chroma(
        collection_name="langchain_store",
        embedding_function=embedder,
        client_settings=settings,
        persist_directory=DB_DIR,
    )
    hits = store.similarity_search_with_score(query="who is FREDERICK?", k=4)
    jsonable_result = jsonable_encoder(hits)
    print(json.dumps(jsonable_result, indent=2))
def main():
    # Uncomment init_chromadb() on the first run to build the database.
    #init_chromadb()
    query_chromadb()


if __name__ == '__main__':
    main()
@sergerdn Thanks for the code snippet. With exactly your code snippet, it still fails with the same error chromadb.errors.NoIndexException: Index not found, please create an instance before querying
.
Below is my configuration:
chromadb==0.3.17
fastapi==0.95.1
langchain==0.0.132
python-dotenv==1.0.0
Python 3.9.6
MacBook Pro M1, OS - Ventura 13.3.1
It's working fine for me with the below code and lib versions:
import json
import logging
import os
import re
import sys
from langchain.text_splitter import CharacterTextSplitter
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import Chroma
from langchain.document_loaders import PyPDFLoader
from fastapi.encoders import jsonable_encoder
from dotenv import load_dotenv
load_dotenv()  # load OPENAI_API_KEY from a local .env file
logging.basicConfig(level=logging.DEBUG)
# The Chroma database is persisted in a "db" directory next to this script.
ABS_PATH = os.path.dirname(os.path.abspath(__file__))
DB_DIR = os.path.join(ABS_PATH, "db")
def replace_newlines_and_spaces(text):
    """Flatten all whitespace (including newlines) to single spaces."""
    return re.sub(r'\s+', ' ', text.replace("\n", " "))
def get_documents():
    """Return the pages of the fixture PDF as langchain documents."""
    pdf_loader = PyPDFLoader("fixtures/pdf/MorseVsFrederick.pdf")
    return pdf_loader.load()
def init_chromadb():
    """Rebuild the Chroma index from scratch: wipe DB_DIR, chunk, embed, persist."""
    import shutil  # only needed when recreating the index

    # Delete existing index directory and recreate the directory.
    if os.path.exists(DB_DIR):
        shutil.rmtree(DB_DIR, ignore_errors=True)
    # Bug fix: os.makedirs with exist_ok=True also creates missing parent
    # directories and cannot race with the rmtree above (os.mkdir would
    # fail in both cases).
    os.makedirs(DB_DIR, exist_ok=True)
    documents = []
    for num, doc in enumerate(get_documents()):
        doc.page_content = replace_newlines_and_spaces(doc.page_content)
        documents.append(doc)
    # Split the documents into chunks
    text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
    texts = text_splitter.split_documents(documents)
    # Select which embeddings we want to use
    embeddings = OpenAIEmbeddings()
    # Create the vectorstore to use as the index
    vectorstore = Chroma.from_documents(texts, embeddings, persist_directory=DB_DIR)
    vectorstore.persist()
    print(vectorstore)
    # Drop the reference so Chroma's destructor runs its final persist.
    vectorstore = None
def query_chromadb():
    """Load the persisted vector store from DB_DIR and print matches as JSON."""
    if not os.path.exists(DB_DIR):
        raise Exception(f"{DB_DIR} does not exist, nothing can be queried")
    # Select which embeddings we want to use
    store = Chroma(persist_directory=DB_DIR, embedding_function=OpenAIEmbeddings())
    matches = store.similarity_search_with_score(query="who is FREDERICK?", k=4)
    print(json.dumps(jsonable_encoder(matches), indent=2))
def main():
    """Build the index, then immediately query it."""
    init_chromadb()
    query_chromadb()


if __name__ == '__main__':
    main()
Note: init_chromadb()
creates a subdirectory with name index
under the DB_DIR
. If it doesn't create it then you will get the error chromadb.errors.NoIndexException: Index not found, please create an instance before querying
.
But with the above code, it'd work without any issue.
Libs installed and their version:
langchain==0.0.132
fastapi==0.95.1
python-dotenv==1.0.0
Python 3.9.6
MacBook Pro M1, OS - Ventura 13.3.1
@chintan-donda
Tested and it works. Make sure to uncomment
init_chromadb()
during the first run to create the database with documents. During the second run, only executequery_chromadb
.Please do not use Python notebooks, as they create isolated environments, and you may not have the same environment during a second run.
import json import logging import os import re import chromadb from dotenv import load_dotenv from fastapi.encoders import jsonable_encoder from langchain.document_loaders import PyPDFLoader from langchain.embeddings import OpenAIEmbeddings from langchain.vectorstores import Chroma load_dotenv() logging.basicConfig(level=logging.DEBUG) ABS_PATH = os.path.dirname(os.path.abspath(__file__)) DB_DIR = os.path.join(ABS_PATH, "db") def replace_newlines_and_spaces(text): # Replace all newline characters with spaces text = text.replace("\n", " ") # Replace multiple spaces with a single space text = re.sub(r'\s+', ' ', text) return text def get_documents(): return PyPDFLoader("fixtures/pdf/MorseVsFrederick.pdf").load() def init_chromadb(): if not os.path.exists(DB_DIR): os.mkdir(DB_DIR) client_settings = chromadb.config.Settings( chroma_db_impl="duckdb+parquet", persist_directory=DB_DIR, anonymized_telemetry=False ) embeddings = OpenAIEmbeddings() vectorstore = Chroma( collection_name="langchain_store", embedding_function=embeddings, client_settings=client_settings, persist_directory=DB_DIR, ) documents = [] for num, doc in enumerate(get_documents()): doc.page_content = replace_newlines_and_spaces(doc.page_content) documents.append(doc) vectorstore.add_documents(documents=documents, embedding=embeddings) vectorstore.persist() print(vectorstore) def query_chromadb(): if not os.path.exists(DB_DIR): raise Exception(f"{DB_DIR} does not exist, nothing can be queried") client_settings = chromadb.config.Settings( chroma_db_impl="duckdb+parquet", persist_directory=DB_DIR, anonymized_telemetry=False ) embeddings = OpenAIEmbeddings() vectorstore = Chroma( collection_name="langchain_store", embedding_function=embeddings, client_settings=client_settings, persist_directory=DB_DIR, ) result = vectorstore.similarity_search_with_score(query="who is FREDERICK?", k=4) jsonable_result = jsonable_encoder(result) print(json.dumps(jsonable_result, indent=2)) def main(): #init_chromadb() 
query_chromadb() if __name__ == '__main__': main()
The notebook does not work; you are right.
Try to comment "collection_name="langchain_store"," in query_chromadb()
I used the from_texts method to persist my embeddings, as shown below:
embedder = OpenAIEmbeddings()
# Build the store directly from raw texts + metadata and persist it to a
# per-dataset directory.
db = Chroma.from_texts(
    texts=embedding_input['texts'],
    embedding=embedder,
    metadatas=embedding_input['metadatas'],
    ids=embedding_input['ids'],
    persist_directory=f'.../embeddings/{dir_name}_chroma',
)
db.persist()
The persist_directory is populated as shown below:
|--- {dir_name}_chroma |---|--- index |-------|--- id_to_uuid_9f0.... |-------|--- index_9f5... |-------|--- index_metadata_9f0... |-------|--- uuid_to_id_9f0... |---|--- chroma-collections.parquet |---|--- chroma-embeddings.parquet
If I comment the db.persist()
line of code the directory remains the same.
I use the following code, based on the previous replies, to fetch the embeddings from the directory
import chromadb
from langchain.vectorstores import Chroma
from langchain.embeddings.openai import Embeddings, OpenAIEmbeddings

collection_name = 'col_name'
dir_name = '/dir/dir1/dir2'

# Reopen the persisted DuckDB + parquet store that lives under dir_name.
client_settings = chromadb.config.Settings(
    chroma_db_impl="duckdb+parquet",
    persist_directory=dir_name,
    anonymized_telemetry=False
)
db = Chroma(
    collection_name=collection_name,
    embedding_function=OpenAIEmbeddings(),
    client_settings=client_settings,
    persist_directory=dir_name,
)
result = db.similarity_search_with_score(query="profit", k=4)
print(result)
I still get the following error:
NoIndexException: Index not found, please create an instance before querying
@dsaks9 Create the dir_name directory first before the call to Chroma.
import os
import chromadb
from langchain.vectorstores import Chroma
from langchain.embeddings.openai import Embeddings, OpenAIEmbeddings

collection_name = 'col_name'
dir_name = '/dir/dir1/dir2'

# Bug fix: the original deleted dir_name (shutil.rmtree) right before
# querying, which destroys the persisted index and guarantees the very
# NoIndexException being debugged. Only create the directory if it is
# missing; never wipe it before a query.
os.makedirs(dir_name, exist_ok=True)

client_settings = chromadb.config.Settings(
    chroma_db_impl="duckdb+parquet",
    persist_directory=dir_name,
    anonymized_telemetry=False
)
embeddings = OpenAIEmbeddings()
db = Chroma(
    collection_name=collection_name,
    embedding_function=embeddings,
    client_settings=client_settings,
    persist_directory=dir_name,
)
result = db.similarity_search_with_score(query="profit", k=4)
print(result)
Check if it works?
@chintan-donda thanks for the suggestion, although still running into the same error.
Realized I had just forgotten to include the collection name when first creating the embeddings; once I supply collection_name to the from_texts method the code works properly.
I am facing an unusual issue while using Chroma.from_documents
Up until yesterday the parquet files were being created, but now the embedding and collections parquet files aren't being created at all.
my code is as simple as -
# NOTE(review): pasted snippet; `self`, `all_docs` and OPENAI_API_KEY come from
# an enclosing scope that is not shown here.
# Split the already-loaded documents into 500-token chunks with no overlap.
text_splitter = TokenTextSplitter(chunk_size=500, chunk_overlap=0)
texts = text_splitter.split_documents(all_docs)
embeddings = OpenAIEmbeddings(openai_api_key=OPENAI_API_KEY)
# Ensure the persist directory exists before Chroma writes its index files.
os.makedirs(self.persist_dir, exist_ok=True)
# Build the collection; the return value is discarded, so persist() is never
# called on it here -- presumably relying on from_documents to flush to disk
# (TODO confirm: this is exactly the missing-parquet symptom discussed above).
Chroma.from_documents(documents=texts,
embedding=embeddings,
persist_directory=self.persist_dir,
collection_name=self.collection_id)
When i look into the file system i see this
/mnt/content/user-1e9f1de7-9a9f-4623-bcea-0a8ce2b99649,
/mnt/content/user-1e9f1de7-9a9f-4623-bcea-0a8ce2b99649/index,
/mnt/content/user-1e9f1de7-9a9f-4623-bcea-0a8ce2b99649/index/index_metadata_b27bfcbb-0b96-4f75-a575-f6382975afff.pkl,
/mnt/content/user-1e9f1de7-9a9f-4623-bcea-0a8ce2b99649/index/index_b27bfcbb-0b96-4f75-a575-f6382975afff.bin,
/mnt/content/user-1e9f1de7-9a9f-4623-bcea-0a8ce2b99649/index/uuid_to_id_b27bfcbb-0b96-4f75-a575-f6382975afff.pkl,
/mnt/content/user-1e9f1de7-9a9f-4623-bcea-0a8ce2b99649/index/id_to_uuid_b27bfcbb-0b96-4f75-a575-f6382975afff.pkl
I see that the chroma-embeddings.parquet
and chroma-collections.parquet
are missing altogether. I am a bit stumped as to why this is happening.
I used the huggingface model to create the vectorindex in google colab and stored (persisted) the vectordatabase in my drive , now I want to use this drive to get the results when i run it on my local machine
I'm using the index folder created in my Drive when I ran the Colab notebook; however, when I use it now (locally) it throws the error "NoIndexException: Index not found, please create an instance before querying"
Any suggestion?
Seems like it's working now — I had forgotten to call vectordb.persist() to make sure the vector database is saved.
If you are still facing Error: NoIndexException: Index not found
, my solution was to bump the version of the chromadb package; I believe there was a bug there that is now resolved.
pip install --upgrade chromadb
@nishanthc-nd can you please try and see if we can close this issue?
my solution was to bump the version of your chromadb package, I believe there was a bug there that is now resolved.
pip install --upgrade chromadb
I'm getting this as well. As soon as I end the python process, I can no longer read the persisted index. If I create a new chroma client while the process is going on, it seems to read from the persisted dir. But if create a completely new process, it doesn't work.
Same issue here
some one can help me my code is bellow:
# Standard library
import os
import re
import uuid
from uuid import UUID

# Third-party
import uvicorn
from fastapi import FastAPI, UploadFile
from langchain.document_loaders import TextLoader
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma
app = FastAPI()
# Directory where Chroma persists its index files.
DB_DIR = "./db"
# Sentence-embedding model shared by every collection this API builds.
embeddings = HuggingFaceEmbeddings(model_name="nghuyong/ernie-3.0-medium-zh")
def replace_newlines_and_spaces(text):
    """Collapse every run of whitespace (newlines included) into one space."""
    return re.sub(r"\s+", " ", text.replace("\n", " "))
def get_documents(file_path):
    """Load the file at *file_path* as UTF-8 text documents."""
    loader = TextLoader(file_path, encoding="utf-8")
    return loader.load()
def init_chromadb(file_path):
    """Load *file_path*, normalize and chunk its text, and persist a Chroma index.

    The collection is written under DB_DIR so later processes can reopen it.
    """
    # makedirs(..., exist_ok=True) avoids the exists()/mkdir() race and also
    # works when DB_DIR is a nested path. (Requires `import os` at file top.)
    os.makedirs(DB_DIR, exist_ok=True)
    documents = []
    for doc in get_documents(file_path):
        # Collapse newlines/whitespace so chunk boundaries are cleaner.
        doc.page_content = replace_newlines_and_spaces(doc.page_content)
        documents.append(doc)
    # Split the documents into ~400-char chunks with a 20-char overlap.
    # NOTE(review): "ć" as a separator looks intentional for this corpus —
    # TODO confirm it is not a paste artifact.
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=400, chunk_overlap=20, length_function=len,
                                                   separators=["\n\n", "\n", " ", "", "ć"])
    texts = text_splitter.split_documents(documents)
    print(len(texts))
    # Create the vector store and flush it to disk; without persist() the
    # index may never reach DB_DIR (the symptom this thread is about).
    vectorstore = Chroma.from_documents(texts, embeddings, persist_directory=DB_DIR)
    vectorstore.persist()
@app.post("/process-file")
async def process_file(file: UploadFile):
    """Accept an uploaded file, save it under a unique directory, and index it.

    Returns {"status": "success"} on success, or
    {"status": "failure", "error": <text>} if any step raises.
    """
    try:
        # uuid4 gives a random name without embedding the host MAC address
        # and timestamp the way uuid1 does.
        upload_dir = os.path.join("./upload_file", str(uuid.uuid4()))
        # makedirs with exist_ok avoids both the exists()/mkdir() race and a
        # failure when ./upload_file itself does not exist yet.
        os.makedirs(upload_dir, exist_ok=True)
        file_path = os.path.join(upload_dir, file.filename)
        with open(file_path, "wb") as buffer:
            buffer.write(file.file.read())
        init_chromadb(file_path)
        return {"status": "success"}
    except Exception as e:
        # Deliberate broad catch at the API boundary: report, don't crash.
        return {"status": "failure", "error": str(e)}
# Run the API locally with uvicorn when executed as a script.
if __name__ == "__main__":
    uvicorn.run(app, host="127.0.0.1", port=7777)
I built an API using LangChain. While the API is running, db.get() shows all the documents, but after I stop the API and run db.get() again, only the first document I uploaded remains. Why?
Thanks so very much @sergerdn It works now.
add_documents and then persist() is working. Thanks !!!
The documentation I was referring to is below
This solution seemed to work for me, I am closing this issue.
I tried the code given by @sergerdn still not working. The place where I am retrieving the persistent storage is in RetrievalQA method.
collection_name="long-docs" persist_directory="/content/sample_data/chromadb/" client_settings = Settings( chroma_db_impl="duckdb+parquet", persist_directory=persist_directory, # Optional, defaults to .chromadb/ in the current directory anonymized_telemetry=False )
vectorstore = Chroma( collection_name=collection_name, embedding_function=embeddings, client_settings=client_settings, persist_directory=persist_directory, )
Then for QA qar = RetrievalQA.from_chain_type(llm=local_llm, chain_type="stuff", retriever = vectorstore.as_retriever(), chain_type_kwargs=chain_type_kwargs,return_source_documents=True)
The vectorstore here is not accessible. I have persisted the db using persist(). Yet I see this error. Error: NoIndexException: Index not found, please create an instance before querying
Index folder structure: chromadb/, index/, chroma-collections.parquet, chroma-embeddings.parquet. Did you solve this problem? Can you share your repo?
# NOTE(review): pasted snippet; the imports (SimpleDirectoryReader, chromadb,
# ChromaVectorStore, StorageContext) are not shown, and the last line has two
# statements fused together by the paste.
documents = SimpleDirectoryReader(input_files=['uber_2021.pdf']).load_data()
db = chromadb.PersistentClient(path="./chroma_db")
chroma_collection = db.get_or_create_collection("quickstart")
vector_store = ChromaVectorStore(chroma_collection=chroma_collection) storage_context = StorageContext.from_defaults(vector_store=vector_store)
index = VectorStoreIndex.from_documents(documents, storage_context=storage_context) — it's giving me this error: RuntimeError: Chroma is running in http-only client mode, and can only be run with 'chromadb.api.fastapi.FastAPI' as the chroma_api_impl. See https://docs.trychroma.com/usage-guide?lang=py#using-the-python-http-only-client for more information. Can someone help me with a solution?
persist_directory = 'chroma_db_store/index/' or 'chroma_db_store' docsearch = Chroma(persist_directory=persist_directory, embedding_function=embeddings) query = "Hey" docs = docsearch.similarity_search(query)
NoIndexException: Index not found, please create an instance before querying
Folder structure chroma_db_store: