run-llama / llama_index

LlamaIndex is a data framework for your LLM applications
https://docs.llamaindex.ai
MIT License
35.89k stars 5.09k forks source link

[Bug]: AzureAISearchVectorStore: more than one metadata filter using FilterCondition.OR fails #13513

Closed courtneyjean closed 4 months ago

courtneyjean commented 4 months ago

Bug Description

I've created a set of documents in an AzureAISearchVectorStore, with a 'country' metadata key. I'm trying to create a filter on documents where 'country' equals 'United Kingdom' OR 'Ireland', but it's throwing an error.

Version

0.10.37

Steps to Reproduce

import os import tiktoken import llama_index from llama_index.core import PromptHelper from llama_index.llms.azure_openai import AzureOpenAI from llama_index.embeddings.azure_openai import AzureOpenAIEmbedding from llama_index.core.node_parser import SimpleNodeParser from llama_index.core import VectorStoreIndex, SimpleDirectoryReader from llama_index.core import set_global_service_context from llama_index.core import StorageContext,load_index_from_storage from llama_index.core import Settings from llama_index.vector_stores.azureaisearch import AzureAISearchVectorStore from llama_index.vector_stores.azureaisearch import ( IndexManagement, MetadataIndexFieldType, ) from azure.core.credentials import AzureKeyCredential from azure.search.documents import SearchClient from azure.search.documents.indexes import SearchIndexClient from llama_index.core.retrievers import VectorIndexRetriever from llama_index.core.retrievers import VectorIndexAutoRetriever from llama_index.core.vector_stores.types import MetadataInfo, VectorStoreInfo from llama_index.core.query_engine import RetrieverQueryEngine from llama_index.core import get_response_synthesizer

import pandas as pd

from llama_index.core.vector_stores import ( MetadataFilter, MetadataFilters, FilterOperator, FilterCondition )

azure_endpoint = "xx" api_version = "2024-02-15-preview" api_key="xxxx"

search_service_api_key = "xxxx" search_service_endpoint = "xx" search_service_api_version = "2023-11-01" credential = AzureKeyCredential(search_service_api_key)

model = "gpt-4" deployment_name = "GPT4-Turbo" embed_model = "text-embedding-ada-002" embed_deployment_name = "ada002embedding" temperature = 0 chunk_size = 1024 chunk_overlap = 20 maxWorkers = 5 sleepTimeBeforeRetry = 30

Settings.llm = AzureOpenAI( model=model, deployment_name=deployment_name, api_key=api_key, azure_endpoint=azure_endpoint, api_version=api_version, temperature = temperature )

Settings.embed_model = AzureOpenAIEmbedding( model=embed_model, deployment_name=embed_deployment_name, api_key=api_key, azure_endpoint=azure_endpoint, api_version=api_version)

Set up some example documents with some metadata

from llama_index.core import Document

documents = [ Document( text="The United Kingdom, made up of England, Scotland, Wales and Northern Ireland, is an island nation in northwestern Europe. England – birthplace of Shakespeare and The Beatles – is home to the capital, London, a globally influential centre of finance and culture.", metadata={"country" : "United Kingdom"} ), Document( text="The Republic of Ireland occupies most of the island of Ireland, off the coast of England and Wales. Its capital, Dublin, is the birthplace of writers like Oscar Wilde, and home of Guinness beer.", metadata={"country" : "Ireland"} ), Document( text="Japan is an island country in East Asia. It is in the northwest Pacific Ocean and is bordered on the west by the Sea of Japan, extending from the Sea of Okhotsk in the north toward the East China Sea, Philippine Sea, and Taiwan in the south.", metadata={"country" : "Japan"} ) ]

Create an AzureAISearch Vector Index

vector_index_name = 'testci3'

index_client = SearchIndexClient( endpoint=search_service_endpoint, index_name=vector_index_name, credential=credential)

metadata_fields = {'country' : 'country'}

AzureAISearch_vector_store = AzureAISearchVectorStore( search_or_index_client=index_client, filterable_metadata_field_keys= metadata_fields, index_name=vector_index_name, index_management=IndexManagement.CREATE_IF_NOT_EXISTS, id_field_key="id", chunk_field_key="chunk", embedding_field_key="embedding", embedding_dimensionality=1536, metadata_string_field_key="metadata", doc_id_field_key="doc_id", language_analyzer="en.lucene", vector_algorithm_type="exhaustiveKnn" )

storage_context = StorageContext.from_defaults(vector_store=AzureAISearch_vector_store) azs_index = VectorStoreIndex.from_documents(documents, storage_context=storage_context )

Demonstration of error when you apply more than one filter

# Baseline: query the Azure AI Search-backed index with no metadata filters.
azs_retriver = azs_index.as_retriever()
print("--------------------------------------------------------------")
# Query through the retriever built from azs_index on the line above.
# `basic_retriver` is the retriever of the separate in-memory index example,
# so calling it here would not exercise Azure AI Search at all.
response = azs_retriver.retrieve('What locations are celebrated for being birthplaces of famous writers?')
print("NO METADATA FILTERS")
print(response)
print("--------------------------------------------------------------")

Create some metadata filters

UK_filter = MetadataFilter(key='country', operator=FilterOperator.EQ, value='United Kingdom') Ireland_filter = MetadataFilter(key='country', operator=FilterOperator.EQ, value='Ireland')

Ask this question and filter just for Ireland

azs_retriever_ireland = azs_index.as_retriever(filters=MetadataFilters(filters=[Ireland_filter])) print("RETRIEVE IRELAND DOCs ONLY") print(azs_retriever_ireland.retrieve('What locations are celebrated for being birthplaces of famous writers?')) print("--------------------------------------------------------------")

Ask this question and filter the UK and Ireland

filter_names = [UK_filter, Ireland_filter] filters = MetadataFilters(filters=filter_names, condition=FilterCondition.OR) print("IRELAND AND UK FILTERS") print(filters) print("RETRIEVE IRELAND & UK DOCs ONLY") azs_two_filters_retriever = azs_index.as_retriever(filters=filters) print(azs_two_filters_retriever.retrieve('What locations are celebrated for being birthplaces of famous writers?')) print("--------------------------------------------------------------")

Relevant Logs/Tracebacks

HttpResponseError: () Invalid expression: Syntax error at position 62 in 'country eq 'United Kingdom' {metadata_filters.condition.value} country eq 'Ireland''.
Parameter name: $filter
Code: 
Message: Invalid expression: Syntax error at position 62 in 'country eq 'United Kingdom' {metadata_filters.condition.value} country eq 'Ireland''.
Parameter name: $filter
---------------------------------------------------------------------------
HttpResponseError                         Traceback (most recent call last)
File <command-1356225611206570>:27
     25 print("RETRIEVE IRELAND & UK DOCs ONLY")
     26 azs_two_filters_retriever = azs_index.as_retriever(filters=filters)
---> 27 print(azs_two_filters_retriever.retrieve('What locations are celebrated for being birthplaces of famous writers?'))
     28 print("--------------------------------------------------------------")

File /local_disk0/.ephemeral_nfs/cluster_libraries/python/lib/python3.9/site-packages/llama_index/core/instrumentation/dispatcher.py:274, in Dispatcher.span.<locals>.wrapper(func, instance, args, kwargs)
    270 self.span_enter(
    271     id_=id_, bound_args=bound_args, instance=instance, parent_id=parent_id
    272 )
    273 try:
--> 274     result = func(*args, **kwargs)
    275 except BaseException as e:
    276     self.event(SpanDropEvent(span_id=id_, err_str=str(e)))

File /local_disk0/.ephemeral_nfs/cluster_libraries/python/lib/python3.9/site-packages/llama_index/core/base/base_retriever.py:244, in BaseRetriever.retrieve(self, str_or_query_bundle)
    239 with self.callback_manager.as_trace("query"):
    240     with self.callback_manager.event(
    241         CBEventType.RETRIEVE,
    242         payload={EventPayload.QUERY_STR: query_bundle.query_str},
    243     ) as retrieve_event:
--> 244         nodes = self._retrieve(query_bundle)
    245         nodes = self._handle_recursive_retrieval(query_bundle, nodes)
    246         retrieve_event.on_end(
    247             payload={EventPayload.NODES: nodes},
    248         )

File /local_disk0/.ephemeral_nfs/cluster_libraries/python/lib/python3.9/site-packages/llama_index/core/instrumentation/dispatcher.py:274, in Dispatcher.span.<locals>.wrapper(func, instance, args, kwargs)
    270 self.span_enter(
    271     id_=id_, bound_args=bound_args, instance=instance, parent_id=parent_id
    272 )
    273 try:
--> 274     result = func(*args, **kwargs)
    275 except BaseException as e:
    276     self.event(SpanDropEvent(span_id=id_, err_str=str(e)))

File /local_disk0/.ephemeral_nfs/cluster_libraries/python/lib/python3.9/site-packages/llama_index/core/indices/vector_store/retrievers/retriever.py:101, in VectorIndexRetriever._retrieve(self, query_bundle)
     95     if query_bundle.embedding is None and len(query_bundle.embedding_strs) > 0:
     96         query_bundle.embedding = (
     97             self._embed_model.get_agg_embedding_from_queries(
     98                 query_bundle.embedding_strs
     99             )
    100         )
--> 101 return self._get_nodes_with_embeddings(query_bundle)

File /local_disk0/.ephemeral_nfs/cluster_libraries/python/lib/python3.9/site-packages/llama_index/core/indices/vector_store/retrievers/retriever.py:177, in VectorIndexRetriever._get_nodes_with_embeddings(self, query_bundle_with_embeddings)
    173 def _get_nodes_with_embeddings(
    174     self, query_bundle_with_embeddings: QueryBundle
    175 ) -> List[NodeWithScore]:
    176     query = self._build_vector_store_query(query_bundle_with_embeddings)
--> 177     query_result = self._vector_store.query(query, **self._kwargs)
    178     return self._build_node_list_from_query_result(query_result)

File /local_disk0/.ephemeral_nfs/cluster_libraries/python/lib/python3.9/site-packages/llama_index/vector_stores/azureaisearch/base.py:634, in AzureAISearchVectorStore.query(self, query, **kwargs)
    630 elif query.mode == VectorStoreQueryMode.SEMANTIC_HYBRID:
    631     azure_query_result_search = AzureQueryResultSearchSemanticHybrid(
    632         query, self._field_mapping, odata_filter, self._search_client
    633     )
--> 634 return azure_query_result_search.search()

File /local_disk0/.ephemeral_nfs/cluster_libraries/python/lib/python3.9/site-packages/llama_index/vector_stores/azureaisearch/base.py:720, in AzureQueryResultSearchBase.search(self)
    718 search_query = self._create_search_query()
    719 vectors = self._create_query_vector()
--> 720 return self._create_query_result(search_query, vectors)

File /local_disk0/.ephemeral_nfs/cluster_libraries/python/lib/python3.9/site-packages/llama_index/vector_stores/azureaisearch/base.py:679, in AzureQueryResultSearchBase._create_query_result(self, search_query, vectors)
    677 node_result = []
    678 score_result = []
--> 679 for result in results:
    680     node_id = result[self._field_mapping["id"]]
    681     metadata = json.loads(result[self._field_mapping["metadata"]])

File /local_disk0/.ephemeral_nfs/cluster_libraries/python/lib/python3.9/site-packages/azure/search/documents/_paging.py:54, in SearchItemPaged.__next__(self)
     52     first_iterator = self._first_iterator_instance()
     53     self._page_iterator = itertools.chain.from_iterable(first_iterator)
---> 54 return next(self._page_iterator)

File /local_disk0/.ephemeral_nfs/cluster_libraries/python/lib/python3.9/site-packages/azure/core/paging.py:75, in PageIterator.__next__(self)
     73     raise StopIteration("End of paging")
     74 try:
---> 75     self._response = self._get_next(self.continuation_token)
     76 except AzureError as error:
     77     if not error.continuation_token:

File /local_disk0/.ephemeral_nfs/cluster_libraries/python/lib/python3.9/site-packages/azure/search/documents/_paging.py:124, in SearchPageIterator._get_next_cb(self, continuation_token)
    122 def _get_next_cb(self, continuation_token):
    123     if continuation_token is None:
--> 124         return self._client.documents.search_post(search_request=self._initial_query.request, **self._kwargs)
    126     _next_link, next_page_request = unpack_continuation_token(continuation_token)
    128     return self._client.documents.search_post(search_request=next_page_request, **self._kwargs)

File /local_disk0/.ephemeral_nfs/cluster_libraries/python/lib/python3.9/site-packages/azure/core/tracing/decorator.py:78, in distributed_trace.<locals>.decorator.<locals>.wrapper_use_tracer(*args, **kwargs)
     76 span_impl_type = settings.tracing_implementation()
     77 if span_impl_type is None:
---> 78     return func(*args, **kwargs)
     80 # Merge span is parameter is set, but only if no explicit parent are passed
     81 if merge_span and not passed_in_parent:

File /local_disk0/.ephemeral_nfs/cluster_libraries/python/lib/python3.9/site-packages/azure/search/documents/_generated/operations/_documents_operations.py:756, in DocumentsOperations.search_post(self, search_request, request_options, **kwargs)
    754     map_error(status_code=response.status_code, response=response, error_map=error_map)
    755     error = self._deserialize.failsafe_deserialize(_models.SearchError, pipeline_response)
--> 756     raise HttpResponseError(response=response, model=error)
    758 deserialized = self._deserialize("SearchDocumentsResult", pipeline_response)
    760 if cls:

HttpResponseError: () Invalid expression: Syntax error at position 62 in 'country eq 'United Kingdom' {metadata_filters.condition.value} country eq 'Ireland''.
Parameter name: $filter
Code: 
Message: Invalid expression: Syntax error at position 62 in 'country eq 'United Kingdom' {metadata_filters.condition.value} country eq 'Ireland''.
Parameter name: $filter
courtneyjean commented 4 months ago

I had hoped that this update had fixed this issue, but it seems it hasn't: https://github.com/run-llama/llama_index/pull/13215

dosubot[bot] commented 4 months ago

The error you're encountering when applying more than one metadata filter using FilterCondition.OR in AzureAISearchVectorStore is likely due to the way the filter conditions are being constructed and interpreted by Azure Cognitive Search. The HttpResponseError suggests a syntax issue in the OData filter string, which is crucial for Azure Cognitive Search queries.

To address this, you'll need to ensure that your filter string correctly implements the OR condition. Azure Cognitive Search expects the OData filter syntax to be precise. For combining filters with an OR condition, you might need to manually construct a filter string that explicitly specifies this condition. Here's a simplified approach:

def construct_or_filter(field_name, values):
    """Build an OData filter clause matching ``field_name`` against any of ``values``.

    The result has the form ``(field eq 'a' or field eq 'b' or ...)`` and is
    wrapped in parentheses so it can be combined with other clauses.

    Args:
        field_name: Name of the filterable index field to compare.
        values: Iterable of candidate values; each is rendered with ``str()``
            as an OData string literal. Any number of values (one or more)
            is accepted.

    Returns:
        The parenthesised OData filter expression as a string.

    Raises:
        ValueError: If ``values`` is empty, since an empty ``or`` group is
            not a valid filter expression.
    """
    values = list(values)
    if not values:
        raise ValueError("values must contain at least one item")
    # OData string literals escape an embedded single quote by doubling it;
    # without this a value such as O'Brien yields a syntax error in $filter.
    clauses = [
        "{} eq '{}'".format(field_name, str(value).replace("'", "''"))
        for value in values
    ]
    return "(" + " or ".join(clauses) + ")"

filter_string = construct_or_filter('country', ['United Kingdom', 'Ireland'])

This filter string should then be integrated into your query mechanism, ensuring that the fields you're filtering by are marked as filterable in your Azure AI Search index schema.

The pull request you mentioned does not seem to address the handling of FilterCondition.OR directly. For your specific issue, focusing on the correct construction and application of the OData filter string is key. Ensure that your index schema in Azure AI Search is set up correctly to support filtering on the necessary fields.

#### Details **Sources** - [llama-index-legacy/llama_index/legacy/vector_stores/azureaisearch.py](https://github.com/run-llama/llama_index/blob/main/llama-index-legacy/llama_index/legacy/vector_stores/azureaisearch.py) - [llama-index-integrations/vector_stores/llama-index-vector-stores-azureaisearch/llama_index/vector_stores/azureaisearch/base.py](https://github.com/run-llama/llama_index/blob/main/llama-index-integrations/vector_stores/llama-index-vector-stores-azureaisearch/llama_index/vector_stores/azureaisearch/base.py) - [docs/docs/examples/vector_stores/chroma_metadata_filter.ipynb](https://github.com/run-llama/llama_index/blob/main/docs/docs/examples/vector_stores/chroma_metadata_filter.ipynb) - [docs/docs/examples/vector_stores/neo4j_metadata_filter.ipynb](https://github.com/run-llama/llama_index/blob/main/docs/docs/examples/vector_stores/neo4j_metadata_filter.ipynb) - [docs/docs/examples/vector_stores/WeaviateIndex_metadata_filter.ipynb](https://github.com/run-llama/llama_index/blob/main/docs/docs/examples/vector_stores/WeaviateIndex_metadata_filter.ipynb) - [docs/docs/examples/vector_stores/pinecone_metadata_filter.ipynb](https://github.com/run-llama/llama_index/blob/main/docs/docs/examples/vector_stores/pinecone_metadata_filter.ipynb)
**About Dosu** This response is meant to be useful and save you time. It is **not meant to be a precise solution**, but rather a starting point for your own research. Help me be more useful! Please leave a 👍 if this is helpful and 👎 if it is irrelevant.

To continue the conversation, mention @dosu.

courtneyjean commented 4 months ago

How can I implement this filter_string into the retrieval?

This code fails:

def construct_or_filter(field_name, values):
    """Build an OData filter clause matching ``field_name`` against any of ``values``.

    The result has the form ``(field eq 'a' or field eq 'b' or ...)`` and is
    wrapped in parentheses so it can be combined with other clauses.

    Args:
        field_name: Name of the filterable index field to compare.
        values: Iterable of candidate values; each is rendered with ``str()``
            as an OData string literal. Any number of values (one or more)
            is accepted.

    Returns:
        The parenthesised OData filter expression as a string.

    Raises:
        ValueError: If ``values`` is empty, since an empty ``or`` group is
            not a valid filter expression.
    """
    values = list(values)
    if not values:
        raise ValueError("values must contain at least one item")
    # OData string literals escape an embedded single quote by doubling it;
    # without this a value such as O'Brien yields a syntax error in $filter.
    clauses = [
        "{} eq '{}'".format(field_name, str(value).replace("'", "''"))
        for value in values
    ]
    return "(" + " or ".join(clauses) + ")"

Ask this question and filter the UK and Ireland

print("IRELAND AND UK FILTERS") filter_string = construct_or_filter('country', ['United Kingdom', 'Ireland']) print("RETRIEVE IRELAND & UK DOCs ONLY") azs_two_filters_retriever = azs_index.as_retriever(filters=filter_string) print(azs_two_filters_retriever.retrieve('What locations are celebrated for being birthplaces of famous writers?')) print("--------------------------------------------------------------")

Traceback: AttributeError: 'str' object has no attribute 'legacy_filters'

AttributeError Traceback (most recent call last) File :10 8 print("RETRIEVE IRELAND & UK DOCs ONLY") 9 azs_two_filters_retriever = azs_index.as_retriever(filters=filter_string) ---> 10 print(azs_two_filters_retriever.retrieve('What locations are celebrated for being birthplaces of famous writers?')) 11 print("--------------------------------------------------------------")

File /local_disk0/.ephemeral_nfs/cluster_libraries/python/lib/python3.9/site-packages/llama_index/core/instrumentation/dispatcher.py:274, in Dispatcher.span..wrapper(func, instance, args, kwargs) 270 self.spanenter( 271 id=id_, bound_args=bound_args, instance=instance, parent_id=parent_id 272 ) 273 try: --> 274 result = func(*args, **kwargs) 275 except BaseException as e: 276 self.event(SpanDropEvent(spanid=id, err_str=str(e)))

File /local_disk0/.ephemeral_nfs/cluster_libraries/python/lib/python3.9/site-packages/llama_index/core/base/base_retriever.py:244, in BaseRetriever.retrieve(self, str_or_query_bundle) 239 with self.callback_manager.as_trace("query"): 240 with self.callback_manager.event( 241 CBEventType.RETRIEVE, 242 payload={EventPayload.QUERY_STR: query_bundle.query_str}, 243 ) as retrieve_event: --> 244 nodes = self._retrieve(query_bundle) 245 nodes = self._handle_recursive_retrieval(query_bundle, nodes) 246 retrieve_event.on_end( 247 payload={EventPayload.NODES: nodes}, 248 )

File /local_disk0/.ephemeral_nfs/cluster_libraries/python/lib/python3.9/site-packages/llama_index/core/instrumentation/dispatcher.py:274, in Dispatcher.span..wrapper(func, instance, args, kwargs) 270 self.spanenter( 271 id=id_, bound_args=bound_args, instance=instance, parent_id=parent_id 272 ) 273 try: --> 274 result = func(*args, **kwargs) 275 except BaseException as e: 276 self.event(SpanDropEvent(spanid=id, err_str=str(e)))

File /local_disk0/.ephemeral_nfs/cluster_libraries/python/lib/python3.9/site-packages/llama_index/core/indices/vector_store/retrievers/retriever.py:101, in VectorIndexRetriever._retrieve(self, query_bundle) 95 if query_bundle.embedding is None and len(query_bundle.embedding_strs) > 0: 96 query_bundle.embedding = ( 97 self._embed_model.get_agg_embedding_from_queries( 98 query_bundle.embedding_strs 99 ) 100 ) --> 101 return self._get_nodes_with_embeddings(query_bundle)

File /local_disk0/.ephemeral_nfs/cluster_libraries/python/lib/python3.9/site-packages/llama_index/core/indices/vector_store/retrievers/retriever.py:177, in VectorIndexRetriever._get_nodes_with_embeddings(self, query_bundle_with_embeddings) 173 def _get_nodes_with_embeddings( 174 self, query_bundle_with_embeddings: QueryBundle 175 ) -> List[NodeWithScore]: 176 query = self._build_vector_store_query(query_bundle_with_embeddings) --> 177 query_result = self._vector_store.query(query, **self._kwargs) 178 return self._build_node_list_from_query_result(query_result)

File /local_disk0/.ephemeral_nfs/cluster_libraries/python/lib/python3.9/site-packages/llama_index/vector_stores/azureaisearch/base.py:616, in AzureAISearchVectorStore.query(self, query, **kwargs) 614 odata_filter = None 615 if query.filters is not None: --> 616 odata_filter = self._create_odata_filter(query.filters) 617 azure_query_result_search: AzureQueryResultSearchBase = ( 618 AzureQueryResultSearchDefault( 619 query, self._field_mapping, odata_filter, self._search_client 620 ) 621 ) 622 if query.mode == VectorStoreQueryMode.SPARSE:

File /local_disk0/.ephemeral_nfs/cluster_libraries/python/lib/python3.9/site-packages/llama_index/vector_stores/azureaisearch/base.py:580, in AzureAISearchVectorStore._create_odata_filter(self, metadata_filters) 578 """Generate an OData filter string using supplied metadata filters.""" 579 odata_filter: List[str] = [] --> 580 for f in metadata_filters.legacy_filters(): 581 if not isinstance(f, ExactMatchFilter): 582 raise NotImplementedError( 583 "Only ExactMatchFilter filters are supported" 584 )

AttributeError: 'str' object has no attribute 'legacy_filters'

RussellLuo commented 4 months ago

I had hoped that this update had fixed this issue, but it seems it hasn't: https://github.com/run-llama/llama_index/pull/13215

Hi @courtneyjean, I think the problem was just caused by that PR. You can try the latest version v0.10.37, which I guess has fixed the problem (by #13435).

courtneyjean commented 4 months ago

I had hoped that this update had fixed this issue, but it seems it hasn't: #13215

Hi @courtneyjean, I think the problem was just caused by that PR. You can try the latest version v0.10.37, which I guess has fixed the problem (by #13435).

Hi @RussellLuo, I'm using v0.10.37 already :(

RussellLuo commented 4 months ago

I cannot run your code as I have no available azure credential on hand. To see what happened, maybe you could add some debugging logs using print or set a breakpoint on this line:

https://github.com/run-llama/llama_index/blob/4c2a61cd3eeaf24079ea4a74557da0bddc53af47/llama-index-integrations/vector_stores/llama-index-vector-stores-azureaisearch/llama_index/vector_stores/azureaisearch/base.py#L600

logan-markewich commented 4 months ago

@courtneyjean v0.10.37 is the version of the llama-index/llama-index-core package. But you'll want to make sure you have the latest azure search version

pip install -U llama-index-vector-stores-azureaisearch

courtneyjean commented 4 months ago

@courtneyjean v0.10.37 is the version of the llama-index/llama-index-core package. But you'll want to make sure you have the latest azure search version

pip install -U llama-index-vector-stores-azureaisearch

Hi @logan-markewich :

Here is from my pip list. I believe I have the latest version:

llama-index 0.10.37 llama-index-agent-openai 0.2.5 llama-index-cli 0.1.12 llama-index-core 0.10.37 llama-index-embeddings-azure-openai 0.1.9 llama-index-embeddings-openai 0.1.9 llama-index-experimental 0.1.3 llama-index-indices-managed-llama-cloud 0.1.6 llama-index-legacy 0.9.48 llama-index-llms-azure-openai 0.1.8 llama-index-llms-openai 0.1.19 llama-index-multi-modal-llms-openai 0.1.6 llama-index-program-openai 0.1.6 llama-index-question-gen-openai 0.1.3 llama-index-readers-file 0.1.22 llama-index-readers-llama-parse 0.1.4 llama-index-vector-stores-azureaisearch 0.1.5 llama-index-vector-stores-postgres 0.1.7 llama-parse 0.4.3 llamaindex-py-client 0.1.19

RussellLuo commented 4 months ago

llama-index-vector-stores-azureaisearch 0.1.6 is the correct version, but seems like this version has not been published on pypi.

logan-markewich commented 4 months ago

Good catch @RussellLuo the automatic publishing must have failed. Just manually published.

@courtneyjean can you try updating one more time?

courtneyjean commented 4 months ago

Thanks both. I've tried it, and this is an improvement as it no longer throws an error. But unfortunately I'm still not getting the behaviour I expected.

From the code above, here is the new output. When I apply 'NO METADATA FILTERS', the retriever returns two documents. A single filter on Ireland works well, but applying two filters:

filters=[MetadataFilter(key='country', value='United Kingdom', operator=<FilterOperator.EQ: '=='>), MetadataFilter(key='country', value='Ireland', operator=<FilterOperator.EQ: '=='>)] condition=<FilterCondition.OR: 'or'>

Returns only documents related to the second filter.

I also tried similarity_top_k=2 to try to achieve the desired result, but it had no impact.

Here is the output I am currently getting:

NO METADATA FILTERS [NodeWithScore(node=TextNode(id_='899d3cd4-0619-4b35-9644-197aa208d1dd', embedding=None, metadata={'country': 'Ireland'}, excluded_embed_metadata_keys=[], excluded_llm_metadata_keys=[], relationships={<NodeRelationship.SOURCE: '1'>: RelatedNodeInfo(node_id='1b11291f-e1e8-4c34-92f7-51c798a86649', node_type=<ObjectType.DOCUMENT: '4'>, metadata={'country': 'Ireland'}, hash='2156acad3ceff3ff92570304d6c7aed3d123661249e1eebde150da45a391af56')}, text='The Republic of Ireland occupies most of the island of Ireland, off the coast of England and Wales. Its capital, Dublin, is the birthplace of writers like Oscar Wilde, and home of Guinness beer.', start_char_idx=0, end_char_idx=194, text_template='{metadata_str}\n\n{content}', metadata_template='{key}: {value}', metadataseperator='\n'), score=0.7949773663245251), NodeWithScore(node=TextNode(id='0229d71c-4e04-4243-b938-fa7ee8727be5', embedding=None, metadata={'country': 'United Kingdom'}, excluded_embed_metadata_keys=[], excluded_llm_metadata_keys=[], relationships={<NodeRelationship.SOURCE: '1'>: RelatedNodeInfo(node_id='6740d811-49b7-49e1-9a8f-b39ebc42f455', node_type=<ObjectType.DOCUMENT: '4'>, metadata={'country': 'United Kingdom'}, hash='ffa0a601c09be1c86666805fac5f7bdd024235d2784bcd44e711b164e188cf8a')}, text='The United Kingdom, made up of England, Scotland, Wales and Northern Ireland, is an island nation in northwestern Europe. England – birthplace of Shakespeare and The Beatles – is home to the capital, London, a globally influential centre of finance and culture.', start_char_idx=0, end_char_idx=261, text_template='{metadata_str}\n\n{content}', metadata_template='{key}: {value}', metadata_seperator='\n'), score=0.7814661419624729)]

RETRIEVE IRELAND DOCs ONLY [NodeWithScore(node=TextNode(id_='f53c324f-4ea2-41c2-92ec-7d85832947b6', embedding=None, metadata={'country': 'Ireland'}, excluded_embed_metadata_keys=[], excluded_llm_metadata_keys=[], relationships={<NodeRelationship.SOURCE: '1'>: RelatedNodeInfo(node_id='e97d3939-da45-4f9c-a826-462d79128dd5', node_type=<ObjectType.DOCUMENT: '4'>, metadata={'country': 'Ireland'}, hash='2156acad3ceff3ff92570304d6c7aed3d123661249e1eebde150da45a391af56')}, text='The Republic of Ireland occupies most of the island of Ireland, off the coast of England and Wales. Its capital, Dublin, is the birthplace of writers like Oscar Wilde, and home of Guinness beer.', start_char_idx=0, end_char_idx=194, text_template='{metadata_str}\n\n{content}', metadata_template='{key}: {value}', metadata_seperator='\n'), score=0.8298598)]

IRELAND AND UK FILTERS filters=[MetadataFilter(key='country', value='United Kingdom', operator=<FilterOperator.EQ: '=='>), MetadataFilter(key='country', value='Ireland', operator=<FilterOperator.EQ: '=='>)] condition=<FilterCondition.OR: 'or'> RETRIEVE IRELAND & UK DOCs ONLY [NodeWithScore(node=TextNode(id_='f53c324f-4ea2-41c2-92ec-7d85832947b6', embedding=None, metadata={'country': 'Ireland'}, excluded_embed_metadata_keys=[], excluded_llm_metadata_keys=[], relationships={<NodeRelationship.SOURCE: '1'>: RelatedNodeInfo(node_id='e97d3939-da45-4f9c-a826-462d79128dd5', node_type=<ObjectType.DOCUMENT: '4'>, metadata={'country': 'Ireland'}, hash='2156acad3ceff3ff92570304d6c7aed3d123661249e1eebde150da45a391af56')}, text='The Republic of Ireland occupies most of the island of Ireland, off the coast of England and Wales. Its capital, Dublin, is the birthplace of writers like Oscar Wilde, and home of Guinness beer.', start_char_idx=0, end_char_idx=194, text_template='{metadata_str}\n\n{content}', metadata_template='{key}: {value}', metadata_seperator='\n'), score=0.8298598)]

courtneyjean commented 4 months ago

Here is some code for the same process applied to a llama_index vector store, and the resulting output. In this code the metadata filter uses FilterCondition.OR and returns two documents (metadata tags country='United Kingdom' OR country='Ireland').

This code demonstrates the anticipated behaviour and output of the AzureAISearchVectorStore code above, and demonstrates that there remains an issue with the application of the metadata filter in that code.

Set up some example documents with some metadata

from llama_index.core import Document

documents = [ Document( text="The United Kingdom, made up of England, Scotland, Wales and Northern Ireland, is an island nation in northwestern Europe. England – birthplace of Shakespeare and The Beatles – is home to the capital, London, a globally influential centre of finance and culture.", metadata={"country" : "United Kingdom"} ), Document( text="The Republic of Ireland occupies most of the island of Ireland, off the coast of England and Wales. Its capital, Dublin, is the birthplace of writers like Oscar Wilde, and home of Guinness beer.", metadata={"country" : "Ireland"} ), Document( text="Japan is an island country in East Asia. It is in the northwest Pacific Ocean and is bordered on the west by the Sea of Japan, extending from the Sea of Okhotsk in the north toward the East China Sea, Philippine Sea, and Taiwan in the south.", metadata={"country" : "Japan"} ) ]

Set up a normal vector index

from llama_index.core import VectorStoreIndex

build index

index = VectorStoreIndex.from_documents(documents) index.ref_doc_info

basic_retriver = index.as_retriever() print("--------------------------------------------------------------") response = basic_retriver.retrieve('What locations are celebrated for being birthplaces of famous writers?') print("NO METADATA FILTERS") print(response) print("--------------------------------------------------------------")

Create some metadata filters

UK_filter = MetadataFilter(key='country', operator=FilterOperator.EQ, value='United Kingdom') Ireland_filter = MetadataFilter(key='country', operator=FilterOperator.EQ, value='Ireland')

Ask this question and filter just for Ireland

retriever_ireland = index.as_retriever(filters=MetadataFilters(filters=[Ireland_filter])) print("RETRIEVE IRELAND DOCs ONLY") print(retriever_ireland.retrieve('What locations are celebrated for being birthplaces of famous writers?')) print("--------------------------------------------------------------")

Ask this question and filter the UK and Ireland

filter_names = [UK_filter, Ireland_filter] filters = MetadataFilters(filters=filter_names, condition=FilterCondition.OR) print("IRELAND AND UK FILTERS") print(filters) print("RETRIEVE IRELAND & UK DOCs ONLY") two_filters_retriever = index.as_retriever(filters=filters) print(two_filters_retriever.retrieve('What locations are celebrated for being birthplaces of famous writers?')) print("--------------------------------------------------------------")

Output is as expected: NO METADATA FILTERS [NodeWithScore(node=TextNode(id_='899d3cd4-0619-4b35-9644-197aa208d1dd', embedding=None, metadata={'country': 'Ireland'}, excluded_embed_metadata_keys=[], excluded_llm_metadata_keys=[], relationships={<NodeRelationship.SOURCE: '1'>: RelatedNodeInfo(node_id='1b11291f-e1e8-4c34-92f7-51c798a86649', node_type=<ObjectType.DOCUMENT: '4'>, metadata={'country': 'Ireland'}, hash='2156acad3ceff3ff92570304d6c7aed3d123661249e1eebde150da45a391af56')}, text='The Republic of Ireland occupies most of the island of Ireland, off the coast of England and Wales. Its capital, Dublin, is the birthplace of writers like Oscar Wilde, and home of Guinness beer.', start_char_idx=0, end_char_idx=194, text_template='{metadata_str}\n\n{content}', metadata_template='{key}: {value}', metadata_seperator='\n'), score=0.7949773663245251), NodeWithScore(node=TextNode(id_='0229d71c-4e04-4243-b938-fa7ee8727be5', embedding=None, metadata={'country': 'United Kingdom'}, excluded_embed_metadata_keys=[], excluded_llm_metadata_keys=[], relationships={<NodeRelationship.SOURCE: '1'>: RelatedNodeInfo(node_id='6740d811-49b7-49e1-9a8f-b39ebc42f455', node_type=<ObjectType.DOCUMENT: '4'>, metadata={'country': 'United Kingdom'}, hash='ffa0a601c09be1c86666805fac5f7bdd024235d2784bcd44e711b164e188cf8a')}, text='The United Kingdom, made up of England, Scotland, Wales and Northern Ireland, is an island nation in northwestern Europe. England – birthplace of Shakespeare and The Beatles – is home to the capital, London, a globally influential centre of finance and culture.', start_char_idx=0, end_char_idx=261, text_template='{metadata_str}\n\n{content}', metadata_template='{key}: {value}', metadata_seperator='\n'), score=0.7814661419624729)]

RETRIEVE IRELAND DOCs ONLY [NodeWithScore(node=TextNode(id_='899d3cd4-0619-4b35-9644-197aa208d1dd', embedding=None, metadata={'country': 'Ireland'}, excluded_embed_metadata_keys=[], excluded_llm_metadata_keys=[], relationships={<NodeRelationship.SOURCE: '1'>: RelatedNodeInfo(node_id='1b11291f-e1e8-4c34-92f7-51c798a86649', node_type=<ObjectType.DOCUMENT: '4'>, metadata={'country': 'Ireland'}, hash='2156acad3ceff3ff92570304d6c7aed3d123661249e1eebde150da45a391af56')}, text='The Republic of Ireland occupies most of the island of Ireland, off the coast of England and Wales. Its capital, Dublin, is the birthplace of writers like Oscar Wilde, and home of Guinness beer.', start_char_idx=0, end_char_idx=194, text_template='{metadata_str}\n\n{content}', metadata_template='{key}: {value}', metadata_seperator='\n'), score=0.7949773663245251)]

IRELAND AND UK FILTERS filters=[MetadataFilter(key='country', value='United Kingdom', operator=<FilterOperator.EQ: '=='>), MetadataFilter(key='country', value='Ireland', operator=<FilterOperator.EQ: '=='>)] condition=<FilterCondition.OR: 'or'> RETRIEVE IRELAND & UK DOCs ONLY [NodeWithScore(node=TextNode(id_='899d3cd4-0619-4b35-9644-197aa208d1dd', embedding=None, metadata={'country': 'Ireland'}, excluded_embed_metadata_keys=[], excluded_llm_metadata_keys=[], relationships={<NodeRelationship.SOURCE: '1'>: RelatedNodeInfo(node_id='1b11291f-e1e8-4c34-92f7-51c798a86649', node_type=<ObjectType.DOCUMENT: '4'>, metadata={'country': 'Ireland'}, hash='2156acad3ceff3ff92570304d6c7aed3d123661249e1eebde150da45a391af56')}, text='The Republic of Ireland occupies most of the island of Ireland, off the coast of England and Wales. Its capital, Dublin, is the birthplace of writers like Oscar Wilde, and home of Guinness beer.', start_char_idx=0, end_char_idx=194, text_template='{metadata_str}\n\n{content}', metadata_template='{key}: {value}', metadata_seperator='\n'), score=0.7949773663245251), NodeWithScore(node=TextNode(id_='0229d71c-4e04-4243-b938-fa7ee8727be5', embedding=None, metadata={'country': 'United Kingdom'}, excluded_embed_metadata_keys=[], excluded_llm_metadata_keys=[], relationships={<NodeRelationship.SOURCE: '1'>: RelatedNodeInfo(node_id='6740d811-49b7-49e1-9a8f-b39ebc42f455', node_type=<ObjectType.DOCUMENT: '4'>, metadata={'country': 'United Kingdom'}, hash='ffa0a601c09be1c86666805fac5f7bdd024235d2784bcd44e711b164e188cf8a')}, text='The United Kingdom, made up of England, Scotland, Wales and Northern Ireland, is an island nation in northwestern Europe. England – birthplace of Shakespeare and The Beatles – is home to the capital, London, a globally influential centre of finance and culture.', start_char_idx=0, end_char_idx=261, text_template='{metadata_str}\n\n{content}', metadata_template='{key}: {value}', metadata_seperator='\n'), score=0.7814661419624729)]

courtneyjean commented 4 months ago

Hi All, the error I've shown above occurred after the index had been created and then overwritten a few times during testing. Once the index was re-created from scratch, the error no longer occurred. Thanks for your help on this, and I'm sorry the solution wasn't more satisfying.