run-llama / llama_index

LlamaIndex is a data framework for your LLM applications
https://docs.llamaindex.ai
MIT License

[Question]: Llama-index streaming evaluation #14508

Open smanna123 opened 3 months ago

smanna123 commented 3 months ago

Question

When I use streaming=True in the query engine in LlamaIndex, as shown in the snippet below, the output column is empty and the groundedness column is missing from the records DataFrame (evaluation using TruLens). Without streaming=True, everything works fine.

import logging
import pickle
import sys
import os
import warnings
import trulens_eval
from dotenv import load_dotenv
from llama_index.core.query_engine import RetrieverQueryEngine
from llama_index.core import VectorStoreIndex, StorageContext, load_index_from_storage, PromptTemplate
from llama_index.retrievers.bm25 import BM25Retriever
from custom_retriever import HybridRetriever
from utils import create_template
from trulens_eval.feedback.provider import OpenAI as TruOpenAI  # aliased: llama_index also exports an OpenAI class below
from trulens_eval import Feedback, Tru
import numpy as np
from trulens_eval.app import App
from trulens_eval import TruLlama
from llama_index.llms.openai import OpenAI

logging.basicConfig(stream=sys.stdout, level=logging.INFO)
logger = logging.getLogger(__name__)

warnings.filterwarnings('ignore', category=FutureWarning, message="The default dtype for empty Series will be 'object'")
PERSIST_DIR = "./storage"
load_dotenv()
openai_api_key = os.getenv("OPENAI_API_KEY")
tru = Tru()
llm = OpenAI(model="gpt-3.5-turbo", temperature=0.4)
provider = TruOpenAI(openai_api_key)

# settings = {
#     "model": "gpt-3.5-turbo",
#     "temperature": 0.7,
#     "max_tokens": 500,
#     "top_p": 1,
#     "frequency_penalty": 0,
#     "presence_penalty": 0,
# }

with open("doc.pickle", "rb") as f:
    nodes = pickle.load(f)

if not os.path.exists(PERSIST_DIR):
    # load the documents and create the index
    storage_context = StorageContext.from_defaults()
    storage_context.docstore.add_documents(nodes)

    index = VectorStoreIndex(nodes, storage_context=storage_context)
    # store it for later
    index.storage_context.persist(persist_dir=PERSIST_DIR)
else:
    # load the existing index
    storage_context = StorageContext.from_defaults(persist_dir=PERSIST_DIR)
    index = load_index_from_storage(storage_context)

# retrieve the top 4 most similar nodes using embeddings
vector_retriever = index.as_retriever(similarity_top_k=4)

# retrieve the top 4 most similar nodes using bm25
bm25_retriever = BM25Retriever.from_defaults(nodes=nodes, similarity_top_k=4)

hybrid_retriever = HybridRetriever(vector_retriever, bm25_retriever)

query_engine = RetrieverQueryEngine.from_args(
    retriever=hybrid_retriever,
    llm=llm,
    streaming=True,
)

query_engine.update_prompts(
    {'response_synthesizer:text_qa_template': create_template()}
)

# response = query_engine.query("What is jas?")
# print(response)

context = App.select_context(query_engine)

# Define a groundedness feedback function
f_groundedness = (
    Feedback(provider.groundedness_measure_with_cot_reasons)
    .on(context.collect())  # collect context chunks into a list
    .on_output()
)

# Question/answer relevance between overall question and answer.
f_answer_relevance = (
    Feedback(provider.relevance)
    .on_input_output()
)
# Question/statement relevance between question and each context chunk.
f_context_relevance = (
    Feedback(provider.context_relevance_with_cot_reasons)
    .on_input()
    .on(context)
    .aggregate(np.mean)
)

tru_query_engine_recorder = TruLlama(query_engine,
    app_id='LlamaIndex_App1',
    feedbacks=[f_groundedness, f_answer_relevance, f_context_relevance])

with tru_query_engine_recorder as recording:
    response = query_engine.query("jas date")

rec = recording.records
records, feedback = tru.get_records_and_feedback(app_ids=["LlamaIndex_App1"])
tru.get_leaderboard(app_ids=["LlamaIndex_App1"])

records.head()

tru.run_dashboard()

dosubot[bot] commented 3 months ago

To address the issues you're encountering with the streaming=True functionality and the missing columns in the records DataFrame, here are some steps and considerations:

  1. Ensure Proper Configuration for Streaming: Make sure that the RetrieverQueryEngine is properly configured to handle streaming responses. Set streaming=True when constructing the response synthesizer and pass that synthesizer to the RetrieverQueryEngine (a combined sketch follows this list).

    from llama_index.core import get_response_synthesizer
    from llama_index.core.query_engine import RetrieverQueryEngine
    
    synth = get_response_synthesizer(streaming=True, ...)
    query_engine = RetrieverQueryEngine(response_synthesizer=synth, ...)
  2. Handling Missing Columns in Records DataFrame: The TruLlama class uses the get_eval_results_df function to aggregate and display feedback in the records DataFrame. If columns like "groundedness" are missing, ensure that the feedback functions are correctly defined and applied.

    from trulens_eval import Feedback, TruLlama
    
    # Define feedback functions
    f_groundedness = Feedback(provider.groundedness_measure_with_cot_reasons).on(context.collect()).on_output()
    f_answer_relevance = Feedback(provider.relevance).on_input_output()
    f_context_relevance = Feedback(provider.context_relevance_with_cot_reasons).on_input().on(context).aggregate(np.mean)
    
    # Initialize TruLlama with feedbacks
    tru_query_engine_recorder = TruLlama(query_engine, app_id='LlamaIndex_App1', feedbacks=[f_groundedness, f_answer_relevance, f_context_relevance])
  3. Known Issues with Missing Columns: There is a known issue with missing columns in the get_retrieval_results_df function. If a specified metric key is not present in the results DataFrame, the function raises a ValueError. Ensure that all necessary metric keys are present in the DataFrame.

    from collections import defaultdict
    import pandas as pd

    def get_retrieval_results_df(names, results_arr, metric_keys=None):
        metric_keys = metric_keys or DEFAULT_METRIC_KEYS
        avg_metrics_dict = defaultdict(list)
        for name, eval_results in zip(names, results_arr):
            metric_dicts = [eval_result.metric_vals_dict for eval_result in eval_results]
            results_df = pd.DataFrame(metric_dicts)
            for metric_key in metric_keys:
                if metric_key not in results_df.columns:
                    raise ValueError(f"Metric key {metric_key} not in results_df")
                avg_metrics_dict[metric_key].append(results_df[metric_key].mean())
        return pd.DataFrame({"retrievers": names, **avg_metrics_dict})
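
Putting steps 1 and 2 together, here is a minimal end-to-end sketch, assuming the hybrid_retriever and the feedback functions (f_groundedness, f_answer_relevance, f_context_relevance) from your original snippet:

    from llama_index.core import get_response_synthesizer
    from llama_index.core.query_engine import RetrieverQueryEngine
    from trulens_eval import TruLlama

    # Build a synthesizer that streams tokens instead of returning the full text at once
    synth = get_response_synthesizer(streaming=True)

    # Wire the streaming synthesizer into the query engine
    query_engine = RetrieverQueryEngine(
        retriever=hybrid_retriever,  # from your original snippet
        response_synthesizer=synth,
    )

    # Record with the same feedback functions as before
    tru_query_engine_recorder = TruLlama(
        query_engine,
        app_id="LlamaIndex_App1",
        feedbacks=[f_groundedness, f_answer_relevance, f_context_relevance],
    )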

By ensuring the proper configuration for streaming and addressing the known issues with missing columns, you should be able to resolve the problems with the output column being empty and the groundedness column missing from the records DataFrame [1][2].

To continue talking to Dosu, mention @dosu.

smanna123 commented 3 months ago

I checked all of these aspects and am still facing the same issue. If streaming is True, the output in the dashboard is always None.

Screenshot from 2024-07-02 15-45-16

dosubot[bot] commented 3 months ago

To resolve the issue with the output being None when streaming=True in the query_engine, you need to ensure that the StreamingResponse object is properly consumed. A StreamingResponse requires you to iterate over the tokens as they arrive, which may not be directly compatible with how TruLens captures the output.

Here is an example of how you can handle the StreamingResponse to ensure that the output is properly captured and processed:

# Assuming you have already set up the query engine with streaming enabled
query_engine = index.as_query_engine(streaming=True, similarity_top_k=1)

# Perform the query
streaming_response = query_engine.query("What did the author do growing up?")

# Initialize an empty string to collect the response
full_response = ""

# Iterate over the streaming response to collect the full output
for text in streaming_response.response_gen:
    full_response += text

# Now you can use the full_response as needed
print(full_response)

# Ensure the full response is captured while the TruLens recorder
# is active; see the recording sketch below
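
If the output column is still empty, one likely cause is that the streaming generator is never consumed inside the recording context. Here is a minimal sketch worth trying, assuming the tru_query_engine_recorder defined in your original snippet:

with tru_query_engine_recorder as recording:
    streaming_response = query_engine.query("jas date")
    # Exhaust the generator inside the `with` block so the recorder
    # can capture the final text as the app output
    full_response = "".join(streaming_response.response_gen)

print(full_response)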

Additionally, ensure that the CondenseQuestionChatEngine is properly configured to handle streaming responses. The stream_chat method in the CondenseQuestionChatEngine class is designed to handle streaming responses and record them correctly:

@trace_method("chat")
def stream_chat(
    self, message: str, chat_history: Optional[List[ChatMessage]] = None
) -> StreamingAgentChatResponse:
    chat_history = chat_history or self._memory.get(input=message)

    # Generate standalone question from conversation context and last message
    condensed_question = self._condense_question(chat_history, message)

    log_str = f"Querying with: {condensed_question}"
    logger.info(log_str)
    if self._verbose:
        print(log_str)

    # Query with standalone question
    query_response = self._query_engine.query(condensed_question)

    tool_output = self._get_tool_output_from_response(
        condensed_question, query_response
    )

    # Record response
    if (
        isinstance(query_response, StreamingResponse)
        and query_response.response_gen is not None
    ):
        # override the generator to include writing to chat history
        self._memory.put(ChatMessage(role=MessageRole.USER, content=message))
        response = StreamingAgentChatResponse(
            chat_stream=response_gen_from_query_engine(query_response.response_gen),
            sources=[tool_output],
        )
        thread = Thread(
            target=response.write_response_to_history,
            args=(self._memory, True),
        )
        thread.start()
    else:
        raise ValueError("Streaming is not enabled. Please use chat() instead.")
    return response
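
For reference, a short usage sketch of this chat engine path; the from_defaults constructor and import path follow the llama_index core API, and the query string is purely illustrative:

from llama_index.core.chat_engine import CondenseQuestionChatEngine

# Build a chat engine on top of the streaming query engine from above
chat_engine = CondenseQuestionChatEngine.from_defaults(query_engine=query_engine)

# stream_chat returns a StreamingAgentChatResponse; consume its token generator
streaming_chat_response = chat_engine.stream_chat("jas date")
full_response = "".join(streaming_chat_response.response_gen)
print(full_response)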

By collecting the full response from the StreamingResponse object and configuring the CondenseQuestionChatEngine to handle streaming responses, the output should no longer be empty and can be properly evaluated by TruLens, which should resolve both the empty output column and the missing groundedness column [1][2][3].

To continue talking to Dosu, mention @dosu.

smanna123 commented 3 months ago

Still not working; the same issue is occurring.