Closed vicmcorrea closed 1 year ago
Hello,
My first guess would be that either the index or the namespace you are using does not exist. I would try it without the namespace parameter here:
docsearch = Pinecone.from_existing_index(
index_name=index_name, embedding=embeddings, namespace=namespace
)
It would be easier to understand what the problem is if you could share the unexpected behavior you observe or an error message you get perhaps?
I did confirm that the namespace and index do exist. Originally I got an "Error 'source'", and when I tried your suggestion for the docsearch I got a "No documents found in the user session" error instead.
Okay the issue is that since your vectors are already in pinecone, you need to tell LangChain to return the documents used to generate the answer so we can then send them back to the chainlit UI.
import os
from langchain.document_loaders import PyPDFLoader, TextLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores import Pinecone
from langchain.chains import RetrievalQAWithSourcesChain
from langchain.chat_models import ChatOpenAI
import pinecone
import chainlit as cl
# Initialize the Pinecone client from environment variables.
# PINECONE_API_KEY / PINECONE_ENV must be set before this module is imported.
pinecone.init(
api_key=os.environ.get("PINECONE_API_KEY"),
environment=os.environ.get("PINECONE_ENV"),
)
# Name of the Pinecone index that already holds the vectors.
index_name = "test"
# Splitter for ingesting uploaded files (1000-char chunks, 100-char overlap).
# NOTE(review): not used anywhere in this snippet — presumably used by an
# upload path elsewhere; verify before removing.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
# OpenAI embedding wrapper used to embed queries against the existing index.
embeddings = OpenAIEmbeddings()
# NOTE(review): defined but unused below — langchain_factory hard-codes
# namespace=None; confirm whether this was meant to be passed through.
namespaces = "research"
# Greeting shown in the chat UI.
welcome_message = """Welcome to the Chainlit PDF QA demo! To get started:
Upload a PDF or text file
Ask a question about the file
"""
@cl.langchain_factory
def langchain_factory():
    """Build a QA-with-sources chain over an existing Pinecone index.

    Returns:
        A RetrievalQAWithSourcesChain backed by the "langchain-demo" index.
        return_source_documents=True makes the chain return the retrieved
        documents alongside the answer, so the postprocess hook can render
        them as Chainlit elements.
    """
    # Shadows the module-level index_name ("test") — TODO confirm which
    # index is actually intended.
    index_name = "langchain-demo"
    # None searches the default namespace; set a string to scope the search.
    namespace = None
    docsearch = Pinecone.from_existing_index(
        index_name=index_name, embedding=embeddings, namespace=namespace
    )
    chain = RetrievalQAWithSourcesChain.from_chain_type(
        ChatOpenAI(temperature=0, streaming=True),
        chain_type="stuff",
        retriever=docsearch.as_retriever(max_tokens_limit=4097),
        # Required so process_response can show the source documents.
        return_source_documents=True,
    )
    cl.send_message(content="The system is ready, you can now ask questions!")
    return chain
@cl.langchain_postprocess
def process_response(res):
    """Send the chain's answer plus its source documents to the Chainlit UI.

    Args:
        res: Chain result dict. Must contain "answer"; may contain "sources"
            (comma-separated source names as a string) and "source_documents"
            (the documents returned when the chain was created with
            return_source_documents=True).
    """
    answer = res["answer"]
    sources = res.get("sources", "").strip()  # tolerate a missing key
    source_elements = []
    docs = res.get("source_documents", None)
    if docs:
        metadatas = [doc.metadata for doc in docs]
        all_sources = [m["source"] for m in metadatas]
        # Fix: initialize before the `if sources:` guard — otherwise the
        # `if found_sources:` check below raises NameError whenever docs
        # exist but the chain reported no sources.
        found_sources = []
        if sources:
            for source_index, source in enumerate(sources.split(",")):
                # The LLM may cite sources with trailing punctuation;
                # strip dots so the lookup matches the metadata values.
                orig_source_name = source.strip().replace(".", "")
                clean_source_name = f"source {source_index}"
                try:
                    found_index = all_sources.index(orig_source_name)
                except ValueError:
                    # Cited source is not among the retrieved documents.
                    continue
                text = docs[found_index].page_content
                found_sources.append(clean_source_name)
                source_elements.append(cl.Text(text=text, name=clean_source_name))
        if found_sources:
            answer += f"\nSources: {', '.join(found_sources)}"
        else:
            answer += "\nNo sources found"
    cl.send_message(content=answer, elements=source_elements)
notice the return_source_documents=True
parameter. In this example namespace
is set to None
but you are free to change that :)
I've tried a similar solution to the one you suggested and it's still not working. With mine I get a sources error, and with yours it still doesn't retrieve anything from Pinecone (I've made sure the index works in a command-line app I have).
https://python.langchain.com/en/latest/modules/chains/index_examples/qa_with_sources.html https://community.pinecone.io/t/retrieve-embeddings-stored-in-index-name/906/2
import os
from langchain.document_loaders import PyPDFLoader, TextLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores import Pinecone
from langchain.chains import RetrievalQAWithSourcesChain
from langchain.chat_models import ChatOpenAI
import pinecone
import chainlit as cl
# Initialize the Pinecone client from environment variables.
# PINECONE_API_KEY / PINECONE_ENV must be set before this module is imported.
pinecone.init(
api_key=os.environ.get("PINECONE_API_KEY"),
environment=os.environ.get("PINECONE_ENV"),
)
# Name of the Pinecone index that already holds the vectors.
index_name = "test-ab"
# Splitter for ingesting uploaded files (1000-char chunks, 100-char overlap).
# NOTE(review): not used anywhere in this snippet — verify before removing.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
# OpenAI embedding wrapper used to embed queries against the existing index.
embeddings = OpenAIEmbeddings()
# NOTE(review): defined but unused — the factory below passes no namespace;
# confirm whether "research" was meant to be used.
namespaces = "research"
# Greeting shown in the chat UI.
welcome_message = """Welcome to the Chainlit PDF QA demo! To get started:
Upload a PDF or text file
Ask a question about the file
"""
@cl.langchain_factory
def langchain_factory():
    """Build a QA-with-sources chain over the existing "test-ab" index.

    Returns:
        A RetrievalQAWithSourcesChain whose results include the retrieved
        source documents (return_source_documents=True), which the
        postprocess hook renders in the Chainlit UI.
    """
    index_name = "test-ab"
    # No namespace argument: the default namespace of the index is searched.
    docsearch = Pinecone.from_existing_index(index_name=index_name, embedding=embeddings)
    chain = RetrievalQAWithSourcesChain.from_chain_type(
        ChatOpenAI(temperature=0, streaming=True),
        chain_type="stuff",
        retriever=docsearch.as_retriever(max_tokens_limit=4097),
        # Required so process_response can show the source documents.
        return_source_documents=True,
    )
    cl.send_message(content="The system is ready, you can now ask questions!")
    return chain
@cl.langchain_postprocess
def process_response(res):
    """Send the chain's answer plus its source documents to the Chainlit UI.

    Args:
        res: Chain result dict. Must contain "answer"; may contain "sources"
            (comma-separated source names as a string) and "source_documents"
            (the documents returned when the chain was created with
            return_source_documents=True).
    """
    answer = res["answer"]
    sources = res.get("sources", "").strip()  # tolerate a missing key
    source_elements = []
    docs = res.get("source_documents", None)
    if docs:
        metadatas = [doc.metadata for doc in docs]
        all_sources = [m["source"] for m in metadatas]
        # Fix: initialize before the `if sources:` guard — otherwise the
        # `if found_sources:` check below raises NameError whenever docs
        # exist but the chain reported no sources.
        found_sources = []
        if sources:
            for source_index, source in enumerate(sources.split(",")):
                # The LLM may cite sources with trailing punctuation;
                # strip dots so the lookup matches the metadata values.
                orig_source_name = source.strip().replace(".", "")
                clean_source_name = f"source {source_index}"
                try:
                    found_index = all_sources.index(orig_source_name)
                except ValueError:
                    # Cited source is not among the retrieved documents.
                    continue
                text = docs[found_index].page_content
                found_sources.append(clean_source_name)
                source_elements.append(cl.Text(text=text, name=clean_source_name))
        if found_sources:
            answer += f"\nSources: {', '.join(found_sources)}"
        else:
            answer += "\nNo sources found"
    cl.send_message(content=answer, elements=source_elements)
The code you sent above works with my Pinecone DB. If I understand correctly, in your case this:
docs = res.get("source_documents", None)
is equal to None
?
It would be useful to see what exactly you have inside of res
in the process_response
function.
I got it to work! It was an environment issue on my end. One quick question: do the embeddings need to have a "source" metadata field for this to work?
Awesome!
In this case the metadata will automatically be created by the LangChain chain I believe. Based on the vector it got back and the text that vector points to
Hey, I tried to make it working from an existing pinecone database but cant make it work, any suggestions?