wandb / weave

Weave is a toolkit for developing AI-powered applications, built by Weights & Biases.
https://wandb.me/weave
Apache License 2.0

AttributeError: 'TextNode' object has no attribute 'get_doc_id' #2116

Closed: farzad528 closed this issue 1 month ago

farzad528 commented 2 months ago
AttributeError: 'TextNode' object has no attribute 'get_doc_id'

I am trying to use Azure AI Search and LlamaIndex to run a custom RAG pipeline and view it in Weave, but I am running into this error. Note that it works fine with simple instrumentation and viewing traces, just not when building a custom pipeline.
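
For context, "simple instrumentation" here means just calling weave.init() and letting Weave's LlamaIndex integration auto-trace calls, along these lines (project name and data path match the repro below; default embeddings assumed):

import weave
from llama_index.core import VectorStoreIndex, SimpleDirectoryReader

weave.init("test-llamaindex-weave")

# This path works: from_documents() receives Document objects and traces show up in Weave
index = VectorStoreIndex.from_documents(SimpleDirectoryReader("data/pdf").load_data())
print(index.as_query_engine().query("What did the author do growing up?"))

The failing custom pipeline is below.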

import os
from enum import Enum
import weave
from llama_index.core import VectorStoreIndex, SimpleDirectoryReader, StorageContext
from llama_index.core.node_parser import SentenceSplitter
from llama_index.llms.azure_openai import AzureOpenAI
from llama_index.core import PromptTemplate
from llama_index.vector_stores.azureaisearch import (
    AzureAISearchVectorStore,
    IndexManagement,
)
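
# NOTE: index_client, INDEX_NAME, AZURE_OPENAI_CHAT_COMPLETION_DEPLOYED_MODEL_NAME,
# and llm (an AzureOpenAI client) are defined elsewhere and omitted from this repro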

# Set this to True if you want to use an existing index, False to create a new one
use_existing_index = False

# Initialize the vector store; validate an existing index or create a new one
vector_store = AzureAISearchVectorStore(
    search_or_index_client=index_client,
    index_name=INDEX_NAME,
    index_management=(
        IndexManagement.VALIDATE_INDEX
        if use_existing_index
        else IndexManagement.CREATE_IF_NOT_EXISTS
    ),
    id_field_key="id",
    chunk_field_key="text",
    embedding_field_key="embedding",
    embedding_dimensionality=3072,
    metadata_string_field_key="metadata",
    doc_id_field_key="doc_id",
    language_analyzer="en.lucene",
    vector_algorithm_type="exhaustiveKnn",
)

# Define Prompt Template
PROMPT_TEMPLATE = """
You are an intelligent assistant helping Contoso Inc employees with their healthcare plan questions and employee handbook questions. Use 'you' to refer to the individual asking the questions even if they ask with 'I'. Answer the following question using only the data provided in the sources below. For tabular information return it as an html table. Do not return markdown format. Each source has a name followed by colon and the actual information, always include the source name for each fact you use in the response. If you cannot answer using the sources below, say you don't know.

User Query: {query_str}
Context: {context_str}
Answer: 
"""

class AzsRetrievalMode(Enum):
    SPARSE = "SPARSE"
    DENSE = "DENSE"
    HYBRID = "HYBRID"
    SEMANTIC_HYBRID = "SEMANTIC_HYBRID"
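    # (defined for completeness; not referenced elsewhere in this repro)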

class SimpleRAGPipeline(weave.Model):
    chat_llm: str = AZURE_OPENAI_CHAT_COMPLETION_DEPLOYED_MODEL_NAME
    temperature: float = 0.1
    similarity_top_k: int = 3
    chunk_size: int = 512
    chunk_overlap: int = 128
    prompt_template: str = PROMPT_TEMPLATE

    def get_llm(self):
        return llm

    def get_template(self):
        return PromptTemplate(self.prompt_template)

    def load_documents_and_chunk(self, data):
        # Load documents from the specified directory
        documents = SimpleDirectoryReader(data).load_data()

        # Configure text splitter
        splitter = SentenceSplitter(
            chunk_size=self.chunk_size,
            chunk_overlap=self.chunk_overlap,
        )

        # Split documents into nodes
        nodes = splitter.get_nodes_from_documents(documents)
        return nodes

    def get_query_engine(self, data=None, use_existing_index=True):
        if use_existing_index:
            storage_context = StorageContext.from_defaults(vector_store=vector_store)
            index = VectorStoreIndex.from_documents([], storage_context=storage_context)
        else:
            nodes = self.load_documents_and_chunk(data)
            storage_context = StorageContext.from_defaults(vector_store=vector_store)

            # Ensure the index is created and populated with the nodes
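            # NOTE: from_documents() expects Document objects (which implement
            # get_doc_id); SentenceSplitter returns TextNodes, which do not, and
            # this call is the likely source of the AttributeError above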
            index = VectorStoreIndex.from_documents(
                nodes,
                storage_context=storage_context,
            )

        prompt_template = self.get_template()

        return index.as_query_engine(
            similarity_top_k=self.similarity_top_k,
            llm=llm,
            text_qa_template=prompt_template,
        )

    @weave.op()
    def predict(self, query: str):
        query_engine = self.get_query_engine(
            data="data/pdf", use_existing_index=use_existing_index
        )
        response = query_engine.query(query)
        return {"response": response.response}

# Initialize Weave and run the pipeline
weave.init("test-llamaindex-weave")

rag_pipeline = SimpleRAGPipeline()
response = rag_pipeline.predict("What did the author do growing up?")
print(response)

farzad528 commented 2 months ago

@gtarpenning can you assist?

gtarpenning commented 2 months ago

Sure, happy to help. Do you mind sharing the full stack trace? It might also be useful to look at the Weave traces, so a link to the wandb project you are working in would help. Also, have you tried running the code without Weave? I can't quite reproduce the issue from the code provided, as the constants etc. are not included. @farzad528

gtarpenning commented 2 months ago

I'm seeing this issue in the llama-index GitHub that looks related; perhaps you can follow up there? @farzad528
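
For reference: VectorStoreIndex.from_documents() expects Document objects implementing get_doc_id, while SentenceSplitter.get_nodes_from_documents() returns TextNodes. A minimal sketch of the likely fix, untested against this exact setup, is to pass the nodes straight to the VectorStoreIndex constructor:

# Sketch: index TextNodes directly instead of routing them through from_documents()
storage_context = StorageContext.from_defaults(vector_store=vector_store)
nodes = splitter.get_nodes_from_documents(documents)
index = VectorStoreIndex(nodes=nodes, storage_context=storage_context)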

gtarpenning commented 1 month ago

Closing stale issue.