zilliztech / GPTCache

Semantic cache for LLMs. Fully integrated with LangChain and llama_index.
https://gptcache.readthedocs.io
MIT License

[Bug]: Using ConversationalRetrievalChain with question_generator and LLMChain in Langchain does not produce cache #481

Closed Yafaa5 closed 1 year ago

Yafaa5 commented 1 year ago

Current Behavior

This code follows the documentation steps for adding a cache to ConversationalRetrievalChain with LangChain, but it's not working properly:

from gptcache.adapter.langchain_models import LangChainChat
from langchain import ElasticVectorSearch
from langchain.embeddings import OpenAIEmbeddings
from langchain.chat_models import AzureChatOpenAI
from langchain.schema import HumanMessage, SystemMessage
from langchain.prompts import PromptTemplate
from langchain.chains import LLMChain, ConversationalRetrievalChain
from langchain.chains.question_answering import load_qa_chain
import openai
import os
import time

from dotenv import load_dotenv

from gptcache.similarity_evaluation.distance import SearchDistanceEvaluation
from gptcache.processor.pre import get_messages_last_content
from gptcache import cache
from gptcache.manager import CacheBase, VectorBase, get_data_manager
from gptcache.embedding import OpenAI

load_dotenv()

openai.api_key = os.getenv("AZURE_OPENAI_API_KEY")
openai.api_base = os.getenv("OPENAI_API_BASE")
openai.api_type = os.getenv("OPENAI_API_TYPE")
openai.api_version = os.getenv("AZURE_OPENAI_API_VERSION")

openai_client = OpenAI(model=os.getenv("EMBEDDING_MODEL_DEPLOYMENT"))

# get the content (only the question) from the prompt to cache
def get_content_func(data, **_):
    return data.get("prompt").split("Question")[-1]

cache_base = CacheBase('sqlite')
vector_base = VectorBase('faiss', dimension=openai_client.dimension, collection_name='chatbot')
data_manager = get_data_manager(cache_base, vector_base)
cache.init(
    pre_embedding_func=get_messages_last_content,
    embedding_func=openai_client.to_embeddings,
    data_manager=data_manager,
    similarity_evaluation=SearchDistanceEvaluation(),
    )
cache.set_openai_key()

gpt_client = LangChainChat(chat=AzureChatOpenAI(
        openai_api_base=os.getenv("OPENAI_API_BASE"),
        openai_api_version="2023-03-15-preview",
        deployment_name=os.getenv("CHAT_COMPLETION_DEPLOYMENT"),
        openai_api_key=os.getenv("AZURE_OPENAI_API_KEY"),
        openai_api_type="azure"
    ))

QUESTION_ANSWER_PROMPT = """
        [INSTRUCTION]: You are a helpful chatbot that has to satisfy user requests in its
        original language in the [USER REQUEST] section to the best of your capabilities.

        [SOURCES OF INFORMATION]:{context}
        [USER REQUEST]: {question}"""

question_prompt_template = PromptTemplate(template=QUESTION_ANSWER_PROMPT, input_variables=["context", "question"])

CONDENSE_PROMPT = """Given the following conversation and a follow up question, rephrase the follow up question to be a standalone question, in its original language.
        Chat History:
        {chat_history}
        Follow Up Input: {question}
        Standalone question:"""

condense_prompt_template = PromptTemplate.from_template(CONDENSE_PROMPT)

doc_chain = load_qa_chain(gpt_client, chain_type="stuff", prompt=question_prompt_template)

question_generator = LLMChain(llm=gpt_client, prompt=condense_prompt_template)

# elastic_client (Elasticsearch on Azure) is set up elsewhere; omitted here for brevity
question_answer_chain = ConversationalRetrievalChain(
    retriever=elastic_client._es_client.as_retriever(search_type="similarity", search_kwargs={"k": 12}),
    combine_docs_chain=doc_chain,
    question_generator=question_generator,
    return_source_documents=True,
    return_generated_question=True,
    verbose=True,
)

vectordbkwargs = {"search_distance": 0.7}

chat_history = ""
user_query = "Who won the competition?"
start_time = time.time()
result = question_answer_chain({"question": user_query, "chat_history": chat_history ,"vectordbkwargs": vectordbkwargs})
print("Time consuming: {:.2f}s".format(time.time() - start_time))

This returns in 2.77s; repeating the same query returns in 4.55s, so the cache is not working.

Expected Behavior

Repeating the same query should take close to zero time, since the answer will be returned from the cache.

Steps To Reproduce

Python = 3.9.7
GPTCache = latest (v0.1.35)
langchain = latest (v0.0.229)

Environment

Windows, Jupyter Notebook

Anything else?

No response

SimFG commented 1 year ago

Hi @Yafaa5, is the LangChainChat here gptcache.adapter.langchain_models.LangChainChat?

Yafaa5 commented 1 year ago

@SimFG Exactly, updated.

SimFG commented 1 year ago

@Yafaa5 This test code doesn't seem to run as posted; please check it.

Yafaa5 commented 1 year ago

@SimFG It's working on my side and doesn't show any errors. I just left out the Elasticsearch client code for simplicity, since I am using Elastic on Azure.

SimFG commented 1 year ago

Hi @Yafaa5, I ran the demo code and the cache works well. Here is my full code:

import getpass
import time

from langchain.chains import LLMChain, ConversationalRetrievalChain
from langchain.chains.question_answering import load_qa_chain
from langchain.chat_models import ChatOpenAI
from langchain.embeddings import OpenAIEmbeddings
from langchain.prompts import PromptTemplate
from langchain.vectorstores import Milvus

from gptcache import cache
from gptcache.adapter.langchain_models import LangChainChat
from gptcache.embedding import Onnx
from gptcache.manager import CacheBase, VectorBase, get_data_manager
from gptcache.processor.pre import get_messages_last_content
from gptcache.similarity_evaluation.distance import SearchDistanceEvaluation

openai_key = getpass.getpass("Enter your OpenAI key: ")

# diff 1
# openai_client = OpenAI(model=os.getenv("EMBEDDING_MODEL_DEPLOYMENT"))
openai_client = Onnx()

# get the content (only the question) from the prompt to cache
def get_content_func(data, **_):
    return data.get("prompt").split("Question")[-1]

cache_base = CacheBase('sqlite')
vector_base = VectorBase('faiss', dimension=openai_client.dimension, collection_name='chatbot')
data_manager = get_data_manager(cache_base, vector_base)
cache.init(
    pre_embedding_func=get_messages_last_content,
    embedding_func=openai_client.to_embeddings,
    data_manager=data_manager,
    similarity_evaluation=SearchDistanceEvaluation(),
)

# diff 2
gpt_client = LangChainChat(chat=ChatOpenAI(openai_api_key=openai_key))

QUESTION_ANSWER_PROMPT = """
        [INSTRUCTION]: You are a helpful chatbot that has to satisfy user requests in its
        original language in the [USER REQUEST] section to the best of your capabilities.

        [SOURCES OF INFORMATION]:{context}
        [USER REQUEST]: {question}"""

question_prompt_template = PromptTemplate(template=QUESTION_ANSWER_PROMPT, input_variables=["context", "question"])

CONDENSE_PROMPT = """Given the following conversation and a follow up question, rephrase the follow up question to be a standalone question, in its original language.
        Chat History:
        {chat_history}
        Follow Up Input: {question}
        Standalone question:"""

condense_prompt_template = PromptTemplate.from_template(CONDENSE_PROMPT)

doc_chain = load_qa_chain(gpt_client, chain_type="stuff", prompt=question_prompt_template)

question_generator = LLMChain(llm=gpt_client, prompt=condense_prompt_template)

# diff 3
vector_store = Milvus.from_texts(texts=[], embedding=OpenAIEmbeddings(openai_api_key=openai_key))
question_answer_chain = ConversationalRetrievalChain(
    retriever=vector_store.as_retriever(),
    combine_docs_chain=doc_chain, return_source_documents=True, question_generator=question_generator,
    return_generated_question=True, verbose=True)

vectordbkwargs = {"search_distance": 0.7}

chat_history = ""
user_query = "Who won the competition?"
start_time = time.time()
result = question_answer_chain({"question": user_query, "chat_history": chat_history, "vectordbkwargs": vectordbkwargs})
print("Time consuming: {:.2f}s".format(time.time() - start_time))

I have marked the differences from your code with comments: diff 1, diff 2, diff 3.

The test result: (screenshot)

So I guess it may be caused by an unstable network.

Yafaa5 commented 1 year ago

@SimFG Can you check the result output for multiple queries? I am getting the same cached response for every query I make, even though they are semantically different.

SimFG commented 1 year ago

@Yafaa5 That is because most of the content of the embedded sentence is the prompt template and the user input is only a small part of it, so every request looks very similar. The recommended approach is to strip the prompt during preprocessing.

Yafaa5 commented 1 year ago

How exactly do I do that?

SimFG commented 1 year ago

@Yafaa5 You need to customize the get_content_func function according to your needs, for example by removing the prompt content and keeping only the user input.
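
For example, a minimal sketch (not from the thread, and assuming the "[USER REQUEST]:" marker from the prompt template above is what separates the template from the user's question):

from typing import Any, Dict

# Hypothetical pre-processing function: embed only the text that follows the
# "[USER REQUEST]:" marker, so the shared prompt template does not dominate
# the similarity search.
def get_user_request(data: Dict[str, Any], **_: Any) -> str:
    content = data.get("messages")[-1].content
    marker = "[USER REQUEST]:"
    if marker in content:
        return content.split(marker, 1)[1].strip()
    return content  # fall back to the full message if the marker is missing

cache.init(
    pre_embedding_func=get_user_request,
    embedding_func=openai_client.to_embeddings,
    data_manager=data_manager,
    similarity_evaluation=SearchDistanceEvaluation(),
)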

Yafaa5 commented 1 year ago

@SimFG In my case I am not using get_content_func; get_messages_last_content from gptcache.processor.pre is used instead. If I understood correctly, that is the function that needs to be updated?

def get_messages_last_content(data: Dict[str, Any], **_: Any) -> str:
    """ get the last content of the llm request messages array

    :param data: the user llm request data
    :type data: Dict[str, Any]

    Example:
        .. code-block:: python

            from gptcache.processor.pre import get_messages_last_content

            content = get_messages_last_content({"messages": [{"content": "hello"}, {"content": "world"}]})
            # "world"
    """
    return data.get("messages")[-1].content

SimFG commented 1 year ago

@Yafaa5 Sorry, I was wrong here. Still, the pre-processing functions in the gptcache.processor.pre package can't meet your needs, which means you need to write a custom pre-processing function. This reference may help you: How to better configure your cache

Yafaa5 commented 1 year ago

@SimFG Is there a way to log this data dict to see what's inside, or to return the query that was passed, as a replacement for data.get("messages")[-1].content in def get_messages_last_content(data: Dict[str, Any], **_: Any) -> str:? Thanks a lot for your help.

SimFG commented 1 year ago

@Yafaa5 yes, you can do it like this

def custom_get_messages_last_content(data: Dict[str, Any], **params: Any) -> str:
    pprint(data)
    pprint(param)
    return data.get("messages")[-1].content

cache.init(
    pre_embedding_func=custom_get_messages_last_content,
    embedding_func=openai_client.to_embeddings,
    data_manager=data_manager,
    similarity_evaluation=SearchDistanceEvaluation(),
)

Yafaa5 commented 1 year ago

@SimFG Yes, I tried that, but I get NameError: name 'param' is not defined. Is param something that I define or pass to the function?

SimFG commented 1 year ago

@Yafaa5 Sorry, there is a typo; it should be params.

Yafaa5 commented 1 year ago

@SimFG So I created this function to extract only the query from the prompt and passed it as pre_embedding_func, but it still gives the same first cached answer for every query:

def custom_get_messages_last_content(data, **params):
    # keep only the user query that follows the prompt template
    separator = '[USER REQUEST]:'
    string = data.get("messages")[-1].content
    result = string  # fall back to the full content if the separator is missing
    if separator in string:
        result = string.split(separator, 1)[1]
    return result

SimFG commented 1 year ago

@Yafaa5 You can try printing the result value to check what it contains

Yafaa5 commented 1 year ago

@SimFG Already did, and it's only the query passed to question_answer_chain, so it should be good

SimFG commented 1 year ago

@Yafaa5 If you want a good cache, you need to spend more time learning how to use it. I can't cover every usage scenario in this library, and a good cache needs to be adjusted continuously for the usage scenario, which is time-consuming.

How to better configure your cache

Yafaa5 commented 1 year ago

@SimFG I think there is a bug somewhere when using ConversationalRetrievalChain with a question_generator. Taking a closer look at the data in the SQLite database, I can see that only the result of the first query is stored there. I used this query:

import sqlite3

# connect to the SQLite file created by CacheBase('sqlite') (sqlite.db by default)
conn = sqlite3.connect('sqlite.db')
cursor = conn.cursor()

query = 'SELECT * FROM gptcache_question;'
cursor.execute(query)
result = cursor.fetchall()
print(result)  

and this one:

query = 'SELECT * FROM gptcache_answer;'
cursor.execute(query)
result = cursor.fetchall()
print(result)

I don't have much experience with FAISS, so I don't know how to inspect the results stored there.

SimFG commented 1 year ago

I don't quite understand. When you make a request, if there is no cache hit, there should be one more row of data in SQLite; if it hits, the data in SQLite remains unchanged.
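
One way to confirm that behavior (a minimal sketch, assuming CacheBase('sqlite') created its default sqlite.db file in the working directory, and reusing the chain and variables from the snippets above):

import sqlite3

def cached_question_count(db_path="sqlite.db"):
    # count the rows stored in the cache's question table
    with sqlite3.connect(db_path) as conn:
        return conn.execute("SELECT COUNT(*) FROM gptcache_question").fetchone()[0]

before = cached_question_count()
result = question_answer_chain({"question": user_query, "chat_history": chat_history, "vectordbkwargs": vectordbkwargs})
after = cached_question_count()
# an unchanged count means the answer came from the cache; a new row means it was a miss
print("cache hit" if after == before else "cache miss (new row stored)")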

Yafaa5 commented 1 year ago

@SimFG I found the solution: set the max_distance value.

cache.init(
    ...
    similarity_evaluation=SearchDistanceEvaluation(max_distance=1.0)
)

I appreciate the help
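
For context (a sketch, not from the thread): as far as I understand SearchDistanceEvaluation, it maps the FAISS distance into a score between 0 and max_distance (larger meaning more similar), and the hit decision compares that score against the configured similarity_threshold scaled to the same range, so a smaller max_distance or a larger similarity_threshold makes matching stricter. Reusing the names from the snippets above, with illustrative values:

from gptcache import Config

# Illustrative stricter setup: tighten both the distance range and the threshold
# so that only genuinely close questions are answered from the cache.
cache.init(
    pre_embedding_func=custom_get_messages_last_content,
    embedding_func=openai_client.to_embeddings,
    data_manager=data_manager,
    similarity_evaluation=SearchDistanceEvaluation(max_distance=1.0),
    config=Config(similarity_threshold=0.9),
)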

oussamaJmaaa commented 5 months ago

The code works perfectly, but the chat_history is not being saved and the chatbot has no memory. How can I fix that, please?
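
A minimal sketch (not from the thread, assuming the demo chain above): LangChain's ConversationalRetrievalChain expects chat_history to be a list of (question, answer) pairs, so the history has to be accumulated between calls, e.g.:

chat_history = []  # list of (question, answer) tuples for ConversationalRetrievalChain

def ask(question):
    result = question_answer_chain({
        "question": question,
        "chat_history": chat_history,
        "vectordbkwargs": vectordbkwargs,
    })
    # remember this turn so the condense prompt can rephrase follow-up questions
    chat_history.append((question, result["answer"]))
    return result["answer"]

print(ask("Who won the competition?"))
print(ask("When did that happen?"))  # follow-up that relies on the stored history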
