run-llama / llama_index

LlamaIndex is a data framework for your LLM applications
https://docs.llamaindex.ai
MIT License
35.54k stars 5.02k forks source link

[Question]: pydantic.v1.error_wrappers.ValidationError: 7 validation errors for QueryEndEvent #13706

Closed uniltone closed 3 weeks ago

uniltone commented 4 months ago

Question Validation

Question

The following is the code that reported the error. I want to merge the vector_tool with sql_tool.

import os,sys import time import gradio as gr

import threading

os.environ["OPENAI_API_KEY"] = "your openid api_key" os.environ["OPENAI_API_BASE"] = "your openai api base" time_start = time.time() from llama_index.core import Settings,get_response_synthesizer,SQLDatabase from llama_index.legacy import VectorStoreIndex, SimpleDirectoryReader, StorageContext, load_index_from_storage,Prompt,ServiceContext,set_global_service_context,SummaryIndex from llama_index.core.memory import ChatMemoryBuffer from llama_index.llms.openai import OpenAI from llama_index.embeddings.openai import OpenAIEmbedding from llama_index.legacy.node_parser import SentenceSplitter from llama_index.legacy.indices.struct_store.sql_query import NLSQLTableQueryEngine import llama_index.core from llama_index.core.tools import QueryEngineTool from llama_index.core.retrievers import VectorIndexAutoRetriever from llama_index.core.vector_stores import MetadataInfo, VectorStoreInfo from llama_index.core.query_engine import RetrieverQueryEngine from llama_index.core.query_engine import SQLAutoVectorQueryEngine from llama_index.core.callbacks import ( CallbackManager, LlamaDebugHandler, CBEventType, ) llama_debug = LlamaDebugHandler(print_trace_on_end=True) callback_manager = CallbackManager([llama_debug]) node_parser = SentenceSplitter() Settings.llm = OpenAI( model="gpt-3.5-turbo", temperature=0.1, max_tokens=2200, ) Settings.embed_model = OpenAIEmbedding( model="text-embedding-3-small", embed_batch_size=256 ) Settings.chunk_size = 1024

加载数据库

from sqlalchemy import ( create_engine, MetaData, Table, Column, String, Integer, select, text, ) from llama_index.core.query_engine import SQLTableRetrieverQueryEngine def load_database(): ''' 加载数据库 ''' engine = create_engine("sqlite:///database/abc.db") metadata_obj = MetaData() sql_database = SQLDatabase(engine,include_tables=["boat_info","boat_weight"],sample_rows_in_table_info=2) from llama_index.core.objects import ( SQLTableNodeMapping, ObjectIndex, SQLTableSchema, ) table_schema_objs = [ (SQLTableSchema(table_name="boat_info",context_str=("这是一张表,存储每艘船只名称,国家,尺寸,服役时间(年)"))), (SQLTableSchema(table_name="boat_weight",context_str=("这是一张表,存储每艘船只重量(吨),boat_id对应boat_info表的id"))) ] # add a SQLTableSchema for each table table_node_mapping = SQLTableNodeMapping(sql_database) obj_index = ObjectIndex.from_objects( table_schema_objs, table_node_mapping, VectorStoreIndex, ) query_engine = SQLTableRetrieverQueryEngine( sql_database, obj_index.as_retriever(similarity_top_k=3) ) return query_engine

def load_vector_index(): ''' 加载索引 ''' PERSIST_DIR = os.path.abspath(os.path.dirname(os.path.abspath(file)))+".\storage" PATH = os.path.abspath(os.path.dirname(os.path.abspath(file)))+'\data' service_content = ServiceContext.from_defaults(chunk_size=1024) vector_query_engines=[] if not os.path.exists(PERSIST_DIR): documents = SimpleDirectoryReader(input_dir=PATH+'/report').load_data(show_progress=True) index = VectorStoreIndex.from_documents(documents,service_context=service_content) vector_query_engines.append(index.as_query_engine()) index.storage_context.persist(persist_dir=PERSIST_DIR+'/report') documents = SimpleDirectoryReader(input_dir=PATH+'/rule').load_data(show_progress=True) index = VectorStoreIndex.from_documents(documents,service_context=service_content) vector_query_engines.append(index.as_query_engine()) index.storage_context.persist(persist_dir=PERSIST_DIR+'/rule') else: print("加载向量索引") storage_context = StorageContext.from_defaults(persist_dir=PERSIST_DIR+'/report') index = load_index_from_storage(storage_context,callback_manager=callback_manager,embed_model=Settings.embed_model) vector_query_engines.append(index.as_query_engine()) storage_context = StorageContext.from_defaults(persist_dir=PERSIST_DIR+'/rule') index = load_index_from_storage(storage_context,callback_manager=callback_manager,embed_model=Settings.embed_model) vector_query_engines.append(index.as_query_engine())

vector_store_info = VectorStoreInfo(

#     content_info="articles about shipping industry related information",
#     metadata_info=[MetadataInfo(name="title", type="str", description="articles about shipping industry related information"),],
# )
# vector_auto_retriever = VectorIndexAutoRetriever(
#     index, 
#     vector_store_info=vector_store_info,
#     service_context=service_content
# )
# retriever_query_engine = RetrieverQueryEngine.from_args(
#     vector_auto_retriever,
# )
print("断点1")
return vector_query_engines

def load_SQLAutoVectorQueryEngine(sql_query_engine,retriever_query_engine): sql_tool = QueryEngineTool.from_defaults( query_engine=sql_query_engine, description=( "Useful for translating a natural language query into a SQL query over" " a database containing: boat_name, country,weight,age of" " each boat" ), return_direct=True ) print("断点2")

vector_tool = QueryEngineTool.from_defaults(

#     query_engine=retriever_query_engine,
#     description=(
#         f"Useful for answering semantic questions about shipping industry related information"
#     ),
# )
boats = [
    "关于船舶行业的研究报告",
    "关于船舶行业的规范",
]
vector_tools = []
for boat, query_engine in zip(boats, retriever_query_engine):
    print("断点3")
    vector_tool = QueryEngineTool.from_defaults(
        query_engine=query_engine,
        name=boat,
        description=f"Useful for answering semantic questions about {boat}",
    )
    vector_tools.append(vector_tool)
# query_engine = SQLAutoVectorQueryEngine(
#     sql_tool, vector_tool,llm=Settings.llm,
# )

from llama_index.core.query_engine import RouterQueryEngine,SQLJoinQueryEngine
from llama_index.core.selectors import (PydanticMultiSelector,PydanticSingleSelector,LLMSingleSelector,LLMMultiSelector)
query_engine = RouterQueryEngine(
    selector=LLMSingleSelector.from_defaults(),
    query_engine_tools=([sql_tool]+vector_tools),
    verbose=True,
)
print("断点4")
return query_engine

sql_query_engine=load_database() retriever_query_engine=load_vector_index() query_engine = load_SQLAutoVectorQueryEngine(sql_query_engine,retriever_query_engine) print("断点5")

query_engine.callback_manager = callback_manager

response = query_engine.query("重大技术装备推广目录的通知?") print(response)

error: 加载向量索引 断点1 断点2 断点3 断点3 断点4 断点5 Selecting query engine 1: The question is related to a research report about the shipbuilding industry, which is more closely related to choice 2.. Selecting query engine 2: The question is related to regulations in the shipbuilding industry, which is less relevant to choice 3.. 重大技术装备推广目录的通知? The notice is about the release of the "Guidance Catalog for the Promotion and Application of Major Technical Equipment - Shipbuilding and Ocean Engineering Field (2022 Edition)" by the China Shipbuilding Industry Association. Traceback (most recent call last): File "e:\AiCompany\llama-index\llama_agent_copy_2.py", line 168, in response = query_engine.query("重大技术装备推广目录的通知?") ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ File "E:\anaconda\Lib\site-packages\llama_index\core\instrumentation\dispatcher.py", line 274, in wrapper result = func(*args, **kwargs) ^^^^^^^^^^^^^^^^^^^^^ File "E:\anaconda\Lib\site-packages\llama_index\core\base\base_query_engine.py", line 55, in query dispatch_event(QueryEndEvent(query=str_or_query_bundle, response=query_result)) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ File "E:\anaconda\Lib\site-packages\pydantic\v1\main.py", line 341, in init raise validation_error pydantic.v1.error_wrappers.ValidationError: 7 validation errors for QueryEndEvent response -> source_nodes -> 0 -> node Can't instantiate abstract class BaseNode with abstract methods get_content, get_metadata_str, get_type, hash, set_content (type=type_error) response -> source_nodes -> 1 -> node Can't instantiate abstract class BaseNode with abstract methods get_content, get_metadata_str, get_type, hash, set_content (type=type_error) response -> source_nodes -> 2 -> node Can't instantiate abstract class BaseNode with abstract methods get_content, get_metadata_str, get_type, hash, set_content (type=type_error) response -> source_nodes -> 3 -> node Can't instantiate abstract class BaseNode with abstract methods get_content, get_metadata_str, get_type, hash, set_content (type=type_error) response instance of StreamingResponse, tuple or dict expected (type=type_error.dataclass; class_name=StreamingResponse) response instance of AsyncStreamingResponse, tuple or dict expected (type=type_error.dataclass; class_name=AsyncStreamingResponse) response instance of PydanticResponse, tuple or dict expected (type=type_error.dataclass; class_name=PydanticResponse)

dosubot[bot] commented 4 months ago

To resolve the pydantic.v1.error_wrappers.ValidationError with 7 validation errors for QueryEndEvent, ensure that the query and response fields are correctly populated with instances of QueryType and RESPONSE_TYPE, respectively. Additionally, when merging a vector search query with an SQL query, use the appropriate query engines and tools provided by the LlamaIndex framework.

Here is an example of how you might instantiate and use QueryEndEvent correctly, along with setting up the SQL and vector query engines:

from llama_index.core.instrumentation.events.query import QueryEndEvent
from llama_index.core.schema import QueryType
from llama_index.core.base.response.schema import RESPONSE_TYPE
from llama_index.legacy.query_engine.sql_vector_query_engine import SQLAutoVectorQueryEngine
from llama_index.legacy.indices.struct_store.sql_query import BaseSQLTableQueryEngine
from llama_index.legacy.query_engine.retriever_query_engine import RetrieverQueryEngine
from llama_index.legacy.tools.query_engine import QueryEngineTool

# Example query and response
query_example = QueryType("SELECT * FROM table WHERE vector_search = 'example'")
response_example = RESPONSE_TYPE({"result": "some_result"})

# Setting up SQL and vector query engines
sql_query_engine = BaseSQLTableQueryEngine()  # Replace with actual initialization
vector_auto_retriever = RetrieverQueryEngine()  # Replace with actual initialization

# Creating SQLAutoVectorQueryEngine instance
sql_vector_query_engine = SQLAutoVectorQueryEngine.from_sql_and_vector_query_engines(
    sql_query_engine=sql_query_engine,
    sql_tool_name="SQL Tool",
    sql_tool_description="SQL Query Engine Tool",
    vector_auto_retriever=vector_auto_retriever,
    vector_tool_name="Vector Tool",
    vector_tool_description="Vector Query Engine Tool"
)

# Creating an instance of QueryEndEvent
try:
    event = QueryEndEvent(query=query_example, response=response_example)
    print(event)
except ValidationError as e:
    print(e.json())

Ensure that query_example and response_example are valid instances of QueryType and RESPONSE_TYPE, respectively. Additionally, make sure that the SQL and vector query engines are properly initialized and compatible with the SQLAutoVectorQueryEngine class. If they are not, you will encounter validation errors.

The SQLAutoVectorQueryEngine class ensures compatibility by validating that the sql_query_tool.query_engine is an instance of BaseSQLTableQueryEngine or NLSQLTableQueryEngine, and that the vector_query_tool.query_engine is an instance of RetrieverQueryEngine with a retriever of type VectorIndexAutoRetriever. Make sure these conditions are met to avoid validation errors.

Additionally, when working with vector search queries, ensure that the VectorSearchQueryRequest model is correctly populated. Here is an example of how to use the VectorSearchQueryRequest model:

from pydantic import ValidationError
from manager_client.models.vector_search_query_request import VectorSearchQueryRequest

# Example vector search query request
vector_search_query = VectorSearchQueryRequest(
    query_string="example query",
    similarity_top_k=5
)

try:
    print(vector_search_query.to_json())
except ValidationError as e:
    print(e.json())

Make sure that the fields in VectorSearchQueryRequest are correctly populated to avoid validation errors [1][2][3][4].

#### Details **About Dosu** This response is meant to be useful and save you time. It is **not meant to be a precise solution**, but rather a starting point for your own research. Help me be more useful! Please leave a 👍 if this is helpful and 👎 if it is irrelevant.

To continue the conversation, mention @dosu.

uniltone commented 4 months ago

How to improve the source code?@dosu

dosubot[bot] commented 4 months ago

To resolve the pydantic.v1.error_wrappers.ValidationError and correctly merge the vector search query with the SQL query using the LlamaIndex framework, you need to ensure that the types and arguments passed to the from_sql_and_vector_query_engines method are correct and consistent with the expected types. Additionally, you should ensure that the vector store is properly initialized and used.

Here is the improved source code:

  1. Ensure that the vector_auto_retriever parameter is correctly typed as VectorIndexAutoRetriever instead of RetrieverQueryEngine.
  2. Correct the docstring to match the parameter name vector_auto_retriever.
  3. Ensure that the vector store is properly initialized and used in the query.
@classmethod
def from_sql_and_vector_query_engines(
    cls,
    sql_query_engine: Union[BaseSQLTableQueryEngine, NLSQLTableQueryEngine],
    sql_tool_name: str,
    sql_tool_description: str,
    vector_auto_retriever: VectorIndexAutoRetriever,
    vector_tool_name: str,
    vector_tool_description: str,
    selector: Optional[Union[LLMSingleSelector, PydanticSingleSelector]] = None,
    **kwargs: Any,
) -> "SQLAutoVectorQueryEngine":
    """From SQL and vector query engines.

    Args:
        sql_query_engine (BaseSQLTableQueryEngine): SQL query engine.
        vector_auto_retriever (VectorIndexAutoRetriever): Vector retriever.
        selector (Optional[Union[LLMSingleSelector, PydanticSingleSelector]]):
            Selector to use.

    """
    sql_query_tool = QueryEngineTool.from_defaults(
        sql_query_engine, name=sql_tool_name, description=sql_tool_description
    )
    vector_query_tool = QueryEngineTool.from_defaults(
        vector_auto_retriever,
        name=vector_tool_name,
        description=vector_tool_description,
    )
    return cls(sql_query_tool, vector_query_tool, selector, **kwargs)

# Example usage with PGVectorStore
from llama_index.vector_stores.postgres import PGVectorStore
import os

URL = "postgresql+psycopg2://{username}:{password}@{host}:{port}/{db_name}".format(
    port=os.getenv("RELYT_PORT", "5432"),
    host=os.getenv("RELYT_HOST", "localhost"),
    username=os.getenv("RELYT_USER", "postgres"),
    password=os.getenv("RELYT_PASS", "mysecretpassword"),
    db_name=os.getenv("RELYT_NAME", "postgres"),
)

vector_store = PGVectorStore.from_params(
    host=os.getenv("RELYT_HOST", "localhost"),
    port=os.getenv("RELYT_PORT", "5432"),
    database=os.getenv("RELYT_NAME", "postgres"),
    user=os.getenv("RELYT_USER", "postgres"),
    password=os.getenv("RELYT_PASS", "mysecretpassword"),
    table_name="example",
    embed_dim=1536  # Using OpenAI’s text-embedding-ada-002
)

vector_auto_retriever = VectorIndexAutoRetriever(vector_store=vector_store)

# Now use the from_sql_and_vector_query_engines method with the correct parameters
sql_query_engine = ...  # Initialize your SQL query engine
sql_tool_name = "SQL Tool"
sql_tool_description = "Description of SQL Tool"
vector_tool_name = "Vector Tool"
vector_tool_description = "Description of Vector Tool"

query_engine = SQLAutoVectorQueryEngine.from_sql_and_vector_query_engines(
    sql_query_engine=sql_query_engine,
    sql_tool_name=sql_tool_name,
    sql_tool_description=sql_tool_description,
    vector_auto_retriever=vector_auto_retriever,
    vector_tool_name=vector_tool_name,
    vector_tool_description=vector_tool_description
)

This change ensures that the vector_auto_retriever parameter is correctly typed and documented, and that the vector store is properly initialized and used, which should help in resolving the pydantic.v1.error_wrappers.ValidationError [1][2][3].

#### Details **About Dosu** This response is meant to be useful and save you time. It is **not meant to be a precise solution**, but rather a starting point for your own research. Help me be more useful! Please leave a 👍 if this is helpful and 👎 if it is irrelevant.

To continue the conversation, mention @dosu.

uniltone commented 4 months ago

But I am using RouterQueryEngine, not SQLAutoVectorQueryEngine @dosu

dosubot[bot] commented 4 months ago

To resolve the pydantic.v1.error_wrappers.ValidationError when using RouterQueryEngine, ensure that the data being passed to the QueryEndEvent is valid according to the Pydantic model's schema. Specifically, check that the final_response conforms to the expected schema.

Here is an example of how to validate and correct the final_response and ensure the payload includes the required fields before passing it to the QueryEndEvent:

from llama_index.core.query_engine import RouterQueryEngine
from llama_index.core.callbacks import CBEventType, EventPayload
from llama_index.core.output_parsers.pydantic import PydanticOutputParser
from llama_index.core.base.response.schema import Response

# Assuming `Response` is your Pydantic model
parser = PydanticOutputParser(output_cls=Response)

def _query(self, query_bundle: QueryBundle) -> RESPONSE_TYPE:
    with self.callback_manager.event(
        CBEventType.QUERY, payload={EventPayload.QUERY_STR: query_bundle.query_str}
    ) as query_event:
        # ... (other code)
        # Parse and validate final_response
        final_response = parser.parse(final_response_json)

        query_event.on_end(payload={
            EventPayload.QUERY_STR: query_bundle.query_str,
            EventPayload.RESPONSE: final_response
        })

    return final_response

async def _aquery(self, query_bundle: QueryBundle) -> RESPONSE_TYPE:
    with self.callback_manager.event(
        CBEventType.QUERY, payload={EventPayload.QUERY_STR: query_bundle.query_str}
    ) as query_event:
        # ... (other code)
        # Parse and validate final_response
        final_response = parser.parse(final_response_json)

        query_event.on_end(payload={
            EventPayload.QUERY_STR: query_bundle.query_str,
            EventPayload.RESPONSE: final_response
        })

    return final_response

Ensure that final_response is correctly structured and of the right type, and that the payload includes both query and response fields to resolve the validation errors [1][2][3].

#### Details **About Dosu** This response is meant to be useful and save you time. It is **not meant to be a precise solution**, but rather a starting point for your own research. Help me be more useful! Please leave a 👍 if this is helpful and 👎 if it is irrelevant.

To continue the conversation, mention @dosu.

uniltone commented 4 months ago

I don’t understand, I can’t@dosu

dosubot[bot] commented 4 months ago

@logan-markewich, this is a complex issue that could benefit from your expertise.

#### Details **About Dosu** This response is meant to be useful and save you time. It is **not meant to be a precise solution**, but rather a starting point for your own research. Help me be more useful! Please leave a 👍 if this is helpful and 👎 if it is irrelevant.

To continue the conversation, mention @dosu.