truera / trulens

Evaluation and Tracking for LLM Experiments
https://www.trulens.org/
MIT License
2.05k stars 177 forks source link

Rag evaluation with TruLens and bedrock not working using query engine retriever #752

Closed danielaguilera-aily closed 8 months ago

danielaguilera-aily commented 8 months ago
def get_trulens_bedrock(region="us-east-1", profile="default"):
    """Build a TruLens Bedrock feedback provider backed by a boto3 client.

    Args:
        region: AWS region for the Bedrock runtime client.
        profile: AWS credentials profile name to authenticate with.

    Returns:
        A configured ``trulens_eval.Bedrock`` feedback provider using
        the ``anthropic.claude-v2`` model.
    """
    import os
    from trulens_eval import Bedrock
    import boto3

    # Export so downstream libraries (botocore et al.) pick up the same
    # profile/region configuration as the explicit session below.
    os.environ["AWS_PROFILE"] = profile
    os.environ["AWS_REGION"] = region

    session = boto3.Session(region_name=region)
    boto3_bedrock = session.client(service_name="bedrock-runtime")

    bedrock = Bedrock(
        credentials_profile_name=profile,
        model_id="anthropic.claude-v2",
        client=boto3_bedrock,
    )

    return bedrock
from llama_index.indices.postprocessor import SentenceTransformerRerank
from llama_index.retrievers import AutoMergingRetriever
from llama_index.query_engine import RetrieverQueryEngine

# Base retriever over the auto-merging index; fetch a generous candidate
# set so the reranker below has something to prune.
automerging_retriever = automerging_index.as_retriever(
    similarity_top_k=12,
)

# Merge retrieved leaf nodes into their parents when enough siblings match.
retriever = AutoMergingRetriever(
    automerging_retriever,
    automerging_index.storage_context,
    verbose=True,
)

# Cross-encoder reranker: keep the 6 most relevant of the 12 candidates.
rerank = SentenceTransformerRerank(top_n=6, model="BAAI/bge-reranker-base")

# Claude-style prompt (Human:/Assistant: turns). NOTE: the closing
# </context> tag was previously malformed ("</context"), which can produce
# a malformed request to the model.
prompt_temp = """

Human: Use the following pieces of context to provide a concise answer to the question at the end. If you don't know
the answer, just say that you don't know, don't try to make up an answer.
<context>
{context}
</context>

Question: {question}

Assistant:"""

from llama_index.prompts import PromptTemplate

# Map llama_index's default variable names onto the template's names.
qa_template = PromptTemplate(
    prompt_temp,
    template_var_mappings={"query_str": "question", "context_str": "context"},
)

auto_merging_engine = RetrieverQueryEngine.from_args(
    retriever=retriever,
    service_context=auto_merging_context,
    node_postprocessors=[rerank],
    text_qa_template=qa_template,
)
def get_prebuilt_trulens_recorder(query_engine, app_id):
    """Wrap a query engine in a TruLlama recorder with QA-relevance feedback.

    Args:
        query_engine: The llama_index query engine to instrument.
        app_id: Identifier under which records are logged in TruLens.

    Returns:
        A ``TruLlama`` recorder that evaluates answer relevance against
        each retrieved source node's text, averaged over all nodes.
    """
    # Import Feedback here as well: this function previously relied on a
    # module-level import that appears later in the file, which is fragile.
    from trulens_eval import Feedback, TruLlama
    import numpy as np

    bedrock = get_trulens_bedrock()

    # Relevance of the response w.r.t. each source node's text, aggregated
    # with the mean across nodes.
    qa_relevance = (
        Feedback(bedrock.relevance)
        .on_input()
        .on(TruLlama.select_source_nodes().node.text)  # See note below
        .aggregate(np.mean)
    )

    feedbacks = [qa_relevance]
    tru_recorder = TruLlama(
        query_engine,
        app_id=app_id,
        feedbacks=feedbacks,
    )
    return tru_recorder
from trulens_eval import Feedback, Tru

tru = Tru()

# Reuse the instance created above (previously a second Tru() was
# constructed just to reset the database).
tru.reset_database()

tru_recorder = get_prebuilt_trulens_recorder(
    auto_merging_engine,
    app_id="app_0",
)

# Record the query so feedback functions run against it.
with tru_recorder as recording:
    response = auto_merging_engine.query(query)

from trulens_eval.schema import FeedbackResult
from concurrent.futures import as_completed

rec = recording.get()

# Feedback results are computed asynchronously; consume them as they finish.
for feedback_future in as_completed(rec.feedback_results):
    feedback, feedback_result = feedback_future.result()

    feedback: Feedback
    feedback_result: FeedbackResult  # fix: was misspelled "feedbac_result"

    display(feedback.name, feedback_result.result)

The code that I'm sharing with you is not working with Bedrock. Could you help me?

The following error is returned:

bedrock request failed <class 'botocore.errorfactory.ValidationException'>=An error occurred (ValidationException) when calling the InvokeModel operation: Malformed input request: 4 schema violations found, please reformat your input and try again.. Retries remaining=0.

....

RuntimeError: Evaluation of relevance failed on inputs: {'prompt': ...} API bedrock request failed 4 time(s)..

joshreini1 commented 8 months ago

Thanks @danielaguilera-aily - this got fixed with this PR. Planning to release this in a patch today.