zilliztech / GPTCache

Semantic cache for LLMs. Fully integrated with LangChain and llama_index.
https://gptcache.readthedocs.io
MIT License

[Bug]: sqlite faiss onnx cache doesn't get a cache hit when using azure open AI #575

Closed mayalinetsky-kryon closed 7 months ago

mayalinetsky-kryon commented 7 months ago

Current Behavior

I took the code from the benchmark example here, added a call to the cache.set_azure_openai_key() static method after init, and set the needed environment variables for Azure (sketched after the script). The code now looks like this:

import json
import os
import time

from gptcache.adapter import openai
from gptcache import cache, Config
from gptcache.manager import get_data_manager, CacheBase, VectorBase
from gptcache.similarity_evaluation.onnx import OnnxModelEvaluation
from gptcache.embedding import Onnx as EmbeddingOnnx
from gptcache.similarity_evaluation.distance import SearchDistanceEvaluation

def run():
    with open("mock_data.json", "r") as mock_file:
        mock_data = json.load(mock_file)

    embedding_onnx = EmbeddingOnnx()

    # if you want more accurate results, you can re-rank with the onnx
    # evaluation model (see the commented-out wrapper below); it improves
    # precision but lowers the cache hit rate

    # class WrapEvaluation(SearchDistanceEvaluation):
    #
    #     def __init__(self):
    #         super().__init__()
    #         self.evaluation_onnx = OnnxModelEvaluation()
    #
    #     def evaluation(self, src_dict, cache_dict, **kwargs):
    #         rank1 = super().evaluation(src_dict, cache_dict, **kwargs)
    #         if rank1 <= 0.5:
    #             rank2 = self.evaluation_onnx.evaluation(src_dict, cache_dict, **kwargs)
    #             return rank2 if rank2 != 0 else 1
    #         return 0
    #
    #     def range(self):
    #         return 0.0, 1.0

    # plain pass-through wrapper: rely on search distance alone
    class WrapEvaluation(SearchDistanceEvaluation):
        def evaluation(self, src_dict, cache_dict, **kwargs):
            return super().evaluation(src_dict, cache_dict, **kwargs)

        def range(self):
            return super().range()

    # reuse the persisted cache if both backing files already exist
    sqlite_file = "sqlite.db"
    faiss_file = "faiss.index"
    has_data = os.path.isfile(sqlite_file) and os.path.isfile(faiss_file)

    cache_base = CacheBase("sqlite")
    vector_base = VectorBase("faiss", dimension=embedding_onnx.dimension)
    data_manager = get_data_manager(cache_base, vector_base, max_size=100000)
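    # wire the onnx embedder, sqlite+faiss storage, and the distance-based
    # evaluator into the global cache object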
    cache.init(
        embedding_func=embedding_onnx.to_embeddings,
        data_manager=data_manager,
        similarity_evaluation=WrapEvaluation(),
        config=Config(similarity_threshold=0.95),
    )

    cache.set_azure_openai_key()

    for i, pair in enumerate(mock_data):
        pair["id"] = str(i)

    if not has_data:
        print("insert data")
        start_time = time.time()
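        # store each question with its id as the "answer"; a later response
        # equal to the id proves the text was served from the cache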
        questions, answers = map(
            list, zip(*((pair["origin"], pair["id"]) for pair in mock_data))
        )
        cache.import_data(questions=questions, answers=answers)
        print(
            "end insert data, time consuming: {:.2f}s".format(time.time() - start_time)
        )

    all_time = 0.0
    hit_cache_positive, hit_cache_negative = 0, 0
    fail_count = 0
    for pair in mock_data:
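        # ask the paraphrased "similar" question; a semantic hit should
        # return the id stored for the original "origin" question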
        mock_messages = [
            {"role": "system", "content": "You are a helpful assistant."},
            {"role": "user", "content": pair["similar"]},
        ]
        try:
            start_time = time.time()
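            # the gptcache openai adapter answers from the cache when the
            # similarity evaluation passes, and only calls Azure on a miss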
            res = openai.ChatCompletion.create(
                deployment_id="gpt-35-turbo",
                messages=mock_messages,
            )
            res_text = openai.get_message_from_openai_answer(res)
            if res_text == pair["id"]:
                hit_cache_positive += 1
                prefix = "Positive"
            else:
                hit_cache_negative += 1
                prefix = "Negative"
            consume_time = time.time() - start_time
            all_time += consume_time
            print(f"{prefix} cache hit time consuming: {consume_time:.2f}s")
        except Exception:  # count failed requests instead of aborting the run
            fail_count += 1

    print("average time: {:.2f}s".format(all_time / len(mock_data)))
    print("cache_hit_positive:", hit_cache_positive)
    print("hit_cache_negative:", hit_cache_negative)
    print("fail_count:", fail_count)
    print("average embedding time: ", cache.report.average_embedding_time())
    print("average search time: ", cache.report.average_search_time())

if __name__ == "__main__":
    run()
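
For reference, the Azure-related environment variables were set before running the script. A minimal sketch, assuming the variable names read by the OpenAI Python SDK v0.x (the endpoint, version, and key values here are placeholders, not the ones I actually used):

import os

os.environ["OPENAI_API_TYPE"] = "azure"
os.environ["OPENAI_API_BASE"] = "https://<your-resource>.openai.azure.com/"  # placeholder endpoint
os.environ["OPENAI_API_VERSION"] = "2023-05-15"  # placeholder API version
os.environ["OPENAI_API_KEY"] = "<your-azure-openai-key>"  # placeholder key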

After running the script I see no positive cache hits at all, and every res_text value is a full answer coming back from a fresh request to Azure.

Expected Behavior

Because the similarity threshold is quite high (0.95), I was expecting to get at least some hits, especially for the first question in the mock data:

"origin": "Hugging Face Hub is a platform to host Git-based models, datasets, and Spaces."
"similar": "Hugging Face Hub serves as a repository for Git-based models, datasets, and Spaces."

Even increasing the threshold to 1 doesn't get me any cache hits.

Steps To Reproduce

No response

Environment

No response

Anything else?

No response

mayalinetsky-kryon commented 7 months ago

Found the problem:

I ran the script twice, once with similarity threshold 0 and once with 0.95, without deleting the database in between. During the first run the cache stored "updated" answers for every request, so in the second run I DID get cache hits, but the answers were no longer the ids I had manually saved in the cache during the import, and the script counted them as "negative" cache hits.
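
For anyone who hits the same thing: deleting the persisted cache files between runs avoids serving stale answers. A minimal sketch, using the file names from the script above:

import os

# remove the persisted cache so answers stored during an earlier run
# cannot come back as "negative" hits in the next run
for stale_file in ("sqlite.db", "faiss.index"):
    if os.path.isfile(stale_file):
        os.remove(stale_file)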