Chainlit / cookbook

Chainlit's cookbook repo
https://github.com/Chainlit/chainlit

QA pinecone issue #1

Closed vicmcorrea closed 1 year ago

vicmcorrea commented 1 year ago

Hey, I tried to make this work from an existing Pinecone database but can't get it working. Any suggestions?


import os
from langchain.document_loaders import PyPDFLoader, TextLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores import Pinecone
from langchain.chains import RetrievalQAWithSourcesChain
from langchain.chat_models import ChatOpenAI
import pinecone
import chainlit as cl
from chainlit.types import AskFileResponse

pinecone.init(
    api_key=os.environ.get("PINECONE_API_KEY"),
    environment=os.environ.get("PINECONE_ENV"),
)

index_name = "test"
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
embeddings = OpenAIEmbeddings()

namespaces = "research"

welcome_message = """Welcome to the Chainlit PDF QA demo! To get started:
1. Upload a PDF or text file
2. Ask a question about the file
"""

@cl.langchain_factory
def langchain_factory():
    # Set a fixed namespace "research"
    namespace = "research"

    docsearch = Pinecone.from_existing_index(
        index_name=index_name, embedding=embeddings, namespace=namespace
    )

    chain = RetrievalQAWithSourcesChain.from_chain_type(
        ChatOpenAI(temperature=0, streaming=True),
        chain_type="stuff",
        retriever=docsearch.as_retriever(max_tokens_limit=4097),
    )

    cl.send_message("The system is ready, you can now ask questions!")

    return chain

@cl.langchain_postprocess
def process_response(res):
    answer = res["answer"]
    sources = res.get("sources", "").strip()  # Use the get method with a default value
    source_elements = []

    docs = cl.user_session.get("docs")

    if docs:
        metadatas = [doc.metadata for doc in docs]
        all_sources = [m["source"] for m in metadatas]

        if sources:
            found_sources = []

            for source in sources.split(","):
                source_name = source.strip().replace(".", "")

                try:
                    index = all_sources.index(source_name)
                except ValueError:
                    continue
                text = docs[index].page_content
                found_sources.append(source_name)

                source_elements.append(cl.Text(text=text, name=source_name))

            if found_sources:
                answer += f"\nSources: {', '.join(found_sources)}"
            else:
                answer += "\nNo sources found"
    else:
        answer += "\nNo documents found in the user session"

    cl.send_message(answer, elements=source_elements)

willydouhard commented 1 year ago

Hello,

My first guess would be that either the index or the namespace you use does not exist. I would try without the namespace parameter here:

docsearch = Pinecone.from_existing_index(
    index_name=index_name, embedding=embeddings, namespace=namespace
)
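
For example, dropping the parameter entirely (a minimal variant; it falls back to the default namespace):

docsearch = Pinecone.from_existing_index(
    index_name=index_name, embedding=embeddings
)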

It would be easier to understand the problem if you could share the unexpected behavior you observe, or perhaps an error message you get?

vicmcorrea commented 1 year ago

I did confirm that the namespace and index exist. Originally I got an "Error 'source'", and when I tried your solution for the docsearch I got a "No documents found in the user session" error.
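
A quick way to double-check what is actually in the index is describe_index_stats, which reports each namespace and its vector count (a sketch using the same pinecone-client API as in the snippet above):

import os
import pinecone

pinecone.init(
    api_key=os.environ.get("PINECONE_API_KEY"),
    environment=os.environ.get("PINECONE_ENV"),
)
index = pinecone.Index("test")
# Prints totals per namespace, e.g. {"namespaces": {"research": {"vector_count": 123}}, ...}
print(index.describe_index_stats())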

willydouhard commented 1 year ago

Okay, the issue is that since your vectors are already in Pinecone, you need to tell LangChain to return the documents used to generate the answer, so we can then send them back to the Chainlit UI.

import os
from langchain.document_loaders import PyPDFLoader, TextLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores import Pinecone
from langchain.chains import RetrievalQAWithSourcesChain
from langchain.chat_models import ChatOpenAI
import pinecone
import chainlit as cl

pinecone.init(
    api_key=os.environ.get("PINECONE_API_KEY"),
    environment=os.environ.get("PINECONE_ENV"),
)

index_name = "test"
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
embeddings = OpenAIEmbeddings()

namespaces = "research"

welcome_message = """Welcome to the Chainlit PDF QA demo! To get started:
1. Upload a PDF or text file
2. Ask a question about the file
"""

@cl.langchain_factory
def langchain_factory():
    index_name = "langchain-demo"
    namespace = None

    docsearch = Pinecone.from_existing_index(
        index_name=index_name, embedding=embeddings, namespace=namespace
    )

    chain = RetrievalQAWithSourcesChain.from_chain_type(
        ChatOpenAI(temperature=0, streaming=True),
        chain_type="stuff",
        retriever=docsearch.as_retriever(max_tokens_limit=4097),
        return_source_documents=True,
    )
    cl.send_message(content="The system is ready, you can now ask questions!")
    return chain

@cl.langchain_postprocess
def process_response(res):
    answer = res["answer"]
    sources = res.get("sources", "").strip()  # Use the get method with a default value
    source_elements = []
    docs = res.get("source_documents", None)

    if docs:
        metadatas = [doc.metadata for doc in docs]
        all_sources = [m["source"] for m in metadatas]

        if sources:
            found_sources = []

            for source_index, source in enumerate(sources.split(",")):
                orig_source_name = source.strip().replace(".", "")
                clean_source_name = f"source {source_index}"
                try:
                    found_index = all_sources.index(orig_source_name)
                except ValueError:
                    continue
                text = docs[found_index].page_content
                found_sources.append(clean_source_name)
                source_elements.append(cl.Text(text=text, name=clean_source_name))

            if found_sources:
                answer += f"\nSources: {', '.join(found_sources)}"
            else:
                answer += "\nNo sources found"

    cl.send_message(content=answer, elements=source_elements)

Notice the return_source_documents=True parameter. In this example namespace is set to None, but you are free to change that :)
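
With that flag set, the res dict handed to the postprocess function should contain the answer, the sources string, and the raw documents (a rough sketch with placeholder values, not captured output):

# Approximate shape of `res` when return_source_documents=True
res = {
    "answer": "The generated answer...",
    "sources": "doc1.pdf, doc2.pdf",
    "source_documents": [
        # langchain.schema.Document objects with .page_content and .metadata,
        # e.g. Document(page_content="...", metadata={"source": "doc1.pdf"})
    ],
}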

vicmcorrea commented 1 year ago

I've tried a solution similar to the one you suggested and it's still not working. With mine I get a sources error, and with yours it still doesn't retrieve anything from Pinecone (I've made sure the index works in a command-line app I have).

https://python.langchain.com/en/latest/modules/chains/index_examples/qa_with_sources.html
https://community.pinecone.io/t/retrieve-embeddings-stored-in-index-name/906/2

import os
from langchain.document_loaders import PyPDFLoader, TextLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores import Pinecone
from langchain.chains import RetrievalQAWithSourcesChain
from langchain.chat_models import ChatOpenAI
import pinecone
import chainlit as cl

pinecone.init(
    api_key=os.environ.get("PINECONE_API_KEY"),
    environment=os.environ.get("PINECONE_ENV"),
)

index_name = "test-ab"
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
embeddings = OpenAIEmbeddings()

namespaces = "research"

welcome_message = """Welcome to the Chainlit PDF QA demo! To get started:
1. Upload a PDF or text file
2. Ask a question about the file
"""

@cl.langchain_factory
def langchain_factory():
    index_name = "test-ab"

    docsearch = Pinecone.from_existing_index(index_name=index_name, embedding=embeddings)

    chain = RetrievalQAWithSourcesChain.from_chain_type(
        ChatOpenAI(temperature=0, streaming=True),
        chain_type="stuff",
        retriever=docsearch.as_retriever(max_tokens_limit=4097),
        return_source_documents=True,
    )
    cl.send_message(content="The system is ready, you can now ask questions!")
    return chain

@cl.langchain_postprocess
def process_response(res):
    answer = res["answer"]
    sources = res.get("sources", "").strip()  # Use the get method with a default value
    source_elements = []
    docs = res.get("source_documents", None)

    if docs:
        metadatas = [doc.metadata for doc in docs]
        all_sources = [m["source"] for m in metadatas]

        if sources:
            found_sources = []

            for source_index, source in enumerate(sources.split(",")):
                orig_source_name = source.strip().replace(".", "")
                clean_source_name = f"source {source_index}"
                try:
                    found_index = all_sources.index(orig_source_name)
                except ValueError:
                    continue
                text = docs[found_index].page_content
                found_sources.append(clean_source_name)
                source_elements.append(cl.Text(text=text, name=clean_source_name))

            if found_sources:
                answer += f"\nSources: {', '.join(found_sources)}"
            else:
                answer += "\nNo sources found"

    cl.send_message(content=answer, elements=source_elements)

willydouhard commented 1 year ago

The code you sent above works with my Pinecone DB. If I understand correctly, in your case this:

    docs = res.get("source_documents", None)

is equal to None?

It would be useful to see exactly what you have inside res in the process_response function.
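
Something like this at the top of process_response would show it (a throwaway debugging sketch):

@cl.langchain_postprocess
def process_response(res):
    # Temporary debugging: dump the chain's output keys and the raw documents
    print(list(res.keys()))
    print(res.get("source_documents"))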

vicmcorrea commented 1 year ago

I got it to work! It was an environment issue I had. One quick question: do the embeddings need to have a "source" metadata field for this to work?

willydouhard commented 1 year ago

Awesome!

In this case, I believe the metadata will automatically be created by the LangChain chain, based on the vector it got back and the text that vector points to.
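
For completeness, if you index the texts yourself you can also attach the "source" metadata explicitly at upsert time (a sketch against the same LangChain version as above; the file names and texts are placeholders):

from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores import Pinecone

embeddings = OpenAIEmbeddings()
texts = ["first chunk of text...", "second chunk of text..."]
# Each chunk carries a "source" entry, which RetrievalQAWithSourcesChain
# uses to build the "sources" string in its answer.
metadatas = [{"source": "paper.pdf"}, {"source": "notes.txt"}]
docsearch = Pinecone.from_texts(
    texts,
    embeddings,
    index_name="test",
    metadatas=metadatas,
    namespace="research",
)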