run-llama / llama_index

LlamaIndex is a data framework for your LLM applications
https://docs.llamaindex.ai
MIT License

[Question]: indexing nodes with qdrant throws error “list object has no attribute _id” #16040

Closed JINO-ROHIT closed 2 months ago

JINO-ROHIT commented 2 months ago

Question Validation

Question

I'm getting an error while trying to index nodes with Qdrant, please help. Attaching my code snippet below:

class FileReader:
    def __init__(self, file_paths):
        self.file_paths = file_paths

    def read_data(self):
        all_documents = []
        logger.info("Reading data from files: %s", self.file_paths)
        for idx, _f in tqdm(enumerate(self.file_paths), total=len(self.file_paths)):
            logger.info(f"Reading from {_f}")
            extracted_text = ""
            elements = partition_pdf(_f, extract_images_in_pdf=False)
            extracted_text += "\n\n".join([str(el) for el in elements])
            all_documents.append(Document(text=extracted_text))
        return all_documents

class VectorIndexer:
    def __init__(self, nodes, vector_store, embedding_model):
        self.nodes = nodes
        self.vector_store = vector_store
        self.embedding_model = embedding_model

    def create_index(self):
        logger.info("Creating the vector index")
        storage_context = StorageContext.from_defaults(vector_store=self.vector_store)
        service_context = ServiceContext.from_defaults(embed_model=self.embedding_model, llm=None)
        return VectorStoreIndex(
            self.nodes, storage_context=storage_context, service_context=service_context
        )

def main():
    file_paths = [
        "abc.pdf"
    ]
    file_reader = FileReader(file_paths=file_paths)

    file_data = file_reader.read_data()

    embedding_component = EmbeddingComponent("local", 'avsolatorio/GIST-Embedding-v0')
    embed_model = embedding_component.embedding_model

    node_parser = SemanticSplitterNodeParser(
        buffer_size=1, breakpoint_percentile_threshold=95, embed_model=embed_model
    )

    base_nodes = node_parser.get_nodes_from_documents(file_data, show_progress=True)
    for idx, node in enumerate(base_nodes):
        node.id_ = f"node_{idx}"

    logger.info(f"Total number of nodes : {len(base_nodes)}")

    sub_chunk_sizes = [128, 256, 512]
    sub_node_parsers = [
        SentenceSplitter(chunk_size=c, chunk_overlap=20) for c in sub_chunk_sizes
    ]

    all_nodes = []
    for base_node in tqdm(base_nodes):
        for n in sub_node_parsers:
            sub_nodes = n.get_nodes_from_documents([base_node])
            sub_inodes = [
                IndexNode.from_text_node(sn, base_node.node_id) for sn in sub_nodes
            ]
            all_nodes.extend(sub_inodes)
        original_node = IndexNode.from_text_node(base_node, base_node.node_id)
        all_nodes.append(original_node)

    client = qdrant_client.QdrantClient(
        url="xxx",
    )

    vector_store = QdrantVectorStore(
        client=client,
        collection_name="cookbooks",
        batch_size=500,
    )

    storage_context = StorageContext.from_defaults(vector_store=vector_store)
    service_context = ServiceContext.from_defaults(
        embed_model=embed_model,
        llm=None,
        node_parser=node_parser,
    )

    index = VectorStoreIndex.from_vector_store(
        vector_store=vector_store, service_context=service_context
    )

    index.insert(all_nodes)

if __name__ == "__main__":
    main()

dosubot[bot] commented 2 months ago

The error "list object has no attribute _id" suggests that somewhere in your code, you are trying to access an attribute _id on a list object instead of on an individual BaseNode object. To resolve this, ensure that you are correctly accessing the attributes of the BaseNode objects in your list.

For reference, here is the _build_points method from QdrantVectorStore, which shows how the store iterates over the nodes it receives and accesses their attributes individually:

def _build_points(self, nodes: List[BaseNode], sparse_vector_name: str) -> Tuple[List[Any], List[str]]:
    from qdrant_client.http import models as rest

    ids = []
    points = []
    for node_batch in iter_batch(nodes, self.batch_size):
        node_ids = []
        vectors: List[Any] = []
        sparse_vectors: List[List[float]] = []
        sparse_indices: List[List[int]] = []
        payloads = []

        if self.enable_hybrid and self._sparse_doc_fn is not None:
            sparse_indices, sparse_vectors = self._sparse_doc_fn(
                [
                    node.get_content(metadata_mode=MetadataMode.EMBED)
                    for node in node_batch
                ],
            )

        for i, node in enumerate(node_batch):
            assert isinstance(node, BaseNode)
            node_ids.append(node.node_id)
            if self.enable_hybrid:
                if (
                    len(sparse_vectors) > 0
                    and len(sparse_indices) > 0
                    and len(sparse_vectors) == len(sparse_indices)
                ):
                    vectors.append(
                        {
                            "text-sparse": rest.SparseVector(
                                indices=sparse_indices[i],
                                values=sparse_vectors[i],
                            ),
                            "text-dense": node.get_embedding(),
                        }
                    )
                else:
                    vectors.append(
                        {
                            "text-dense": node.get_embedding(),
                        }
                    )
            else:
                vectors.append(node.get_embedding())

            metadata = node_to_metadata_dict(
                node, remove_text=False, flat_metadata=self.flat_metadata
            )

            payloads.append(metadata)

        points.extend(
            [
                rest.PointStruct(id=node_id, payload=payload, vector=vector)
                for node_id, payload, vector in zip(node_ids, payloads, vectors)
            ]
        )

        ids.extend(node_ids)

    return points, ids

Ensure that you are calling this method correctly and passing a list of BaseNode objects. The node_ids.append(node.node_id) line correctly accesses the node_id attribute of each BaseNode object.
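As a quick sanity check, you can exercise the same path through the store's public add method, which calls _build_points internally. A minimal sketch, assuming the vector_store and all_nodes from your script and that every node already has its embedding populated (node.get_embedding() raises if it does not):

# Sanity-check sketch (assumes embeddings are already set on the nodes):
# QdrantVectorStore.add expects a flat List[BaseNode], not a nested list.
from llama_index.core.schema import BaseNode  # adjust import to your version

for node in all_nodes:
    assert isinstance(node, BaseNode), f"expected a node, got {type(node)}"

inserted_ids = vector_store.add(all_nodes)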

Additionally, if you encounter an error like "AttributeError: 'ScoredPoint' object has no attribute 'get'", it might be related to how the query results are being processed. Make sure that the objects returned by the query are being handled correctly and that you are not mistakenly treating a list or another type of object as if it were a BaseNode.

For example, the parse_to_query_result method converts the vector store response to VectorStoreQueryResult and ensures that the nodes are correctly parsed:

def parse_to_query_result(self, response: List[Any]) -> VectorStoreQueryResult:
    nodes = []
    similarities = []
    ids = []

    for point in response:
        payload = cast(Payload, point.payload)
        try:
            node = metadata_dict_to_node(payload)
        except Exception:
            metadata, node_info, relationships = legacy_metadata_dict_to_node(payload)
            node = TextNode(
                id_=str(point.id),
                text=payload.get("text"),
                metadata=metadata,
                start_char_idx=node_info.get("start", None),
                end_char_idx=node_info.get("end", None),
                relationships=relationships,
            )
        nodes.append(node)
        ids.append(str(point.id))
        try:
            similarities.append(point.score)
        except AttributeError:
            similarities.append(1.0)

    return VectorStoreQueryResult(nodes=nodes, similarities=similarities, ids=ids)

Make sure that your code correctly handles the response from Qdrant and processes each BaseNode object individually [1][2][3].
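As a final check that inserts and query parsing line up end to end, you can run a retrieval against the same index. A minimal sketch, assuming the index built above and retrieval only (llm=None):

# Verification sketch: retrieve from the Qdrant-backed index so the stored
# points round-trip through parse_to_query_result back into nodes.
retriever = index.as_retriever(similarity_top_k=3)
results = retriever.retrieve("test query")
for result in results:
    print(result.node.node_id, result.score)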

To continue talking to Dosu, mention @dosu.