Open psureshmagadi17 opened 3 months ago
Hey there, @psureshmagadi17! I'm here to help you out. If you need assistance with bugs, questions, or contributions, feel free to ask.
To ensure that the search_client
is correctly propagated to the VectorStoreIndex
and avoid the AttributeError: 'NoneType' object has no attribute 'search'
when using RetrieverEvaluator
, you need to make sure that the search_client
is properly initialized and assigned within the AzureAISearchVectorStore
class.
Here is the relevant part of the AzureAISearchVectorStore
class constructor that handles the initialization of the search_client
:
def __init__(
self,
search_or_index_client: Any,
id_field_key: str,
chunk_field_key: str,
embedding_field_key: str,
metadata_string_field_key: str,
doc_id_field_key: str,
filterable_metadata_field_keys: Optional[
Union[
List[str],
Dict[str, str],
Dict[str, Tuple[str, MetadataIndexFieldType]],
]
] = None,
index_name: Optional[str] = None,
index_mapping: Optional[
Callable[[Dict[str, str], Dict[str, Any]], Dict[str, str]]
] = None,
index_management: IndexManagement = IndexManagement.NO_VALIDATION,
embedding_dimensionality: int = 1536,
vector_algorithm_type: str = "exhaustiveKnn",
language_analyzer: str = "en.lucene",
**kwargs: Any,
) -> None:
import_err_msg = (
"`azure-search-documents` package not found, please run "
"`pip install azure-search-documents==11.4.0`"
)
try:
import azure.search.documents # noqa
from azure.search.documents import SearchClient
from azure.search.documents.indexes import SearchIndexClient
except ImportError:
raise ImportError(import_err_msg)
self._index_client: SearchIndexClient = cast(SearchIndexClient, None)
self._search_client: SearchClient = cast(SearchClient, None)
self.embedding_dimensionality = embedding_dimensionality
if search_or_index_client is not None:
if isinstance(search_or_index_client, SearchIndexClient):
self._index_client = cast(SearchIndexClient, search_or_index_client)
if not index_name:
raise ValueError(
"index_name must be supplied if search_or_index_client is of "
"type azure.search.documents.SearchIndexClient"
)
self._search_client = self._index_client.get_search_client(
index_name=index_name
)
elif isinstance(search_or_index_client, SearchClient):
self._search_client = cast(SearchClient, search_or_index_client)
if index_name:
raise ValueError(
"index_name cannot be supplied if search_or_index_client "
"is of type azure.search.documents.SearchClient"
)
if not self._index_client and not self._search_client:
raise ValueError(
"search_or_index_client must be of type "
"azure.search.documents.SearchClient or "
"azure.search.documents.SearchIndexClient"
)
else:
raise ValueError("search_or_index_client not specified")
if (
index_management == IndexManagement.CREATE_IF_NOT_EXISTS
and not self._index_client
):
raise ValueError(
"index_management has value of IndexManagement.CREATE_IF_NOT_EXISTS "
"but search_or_index_client is not of type "
"azure.search.documents.SearchIndexClient"
)
self._index_management = index_management
field_mapping = {
"id": id_field_key,
"chunk": chunk_field_key,
"embedding": embedding_field_key,
"metadata": metadata_string_field_key,
"doc_id": doc_id_field_key,
}
self._field_mapping = field_mapping
self._index_mapping = (
self._default_index_mapping if index_mapping is None else index_mapping
)
self._metadata_to_index_field_map = self._normalise_metadata_to_index_fields(
filterable_metadata_field_keys
)
if self._index_management == IndexManagement.CREATE_IF_NOT_EXISTS:
if index_name:
self._create_index_if_not_exists(index_name)
if self._index_management == IndexManagement.VALIDATE_INDEX:
self._validate_index(index_name)
Additionally, you can refer to the test cases provided in the llama-index-legacy/tests/vector_stores/test_azureaisearch.py
file to ensure that the search_client
is correctly mocked and tested. Here is an example of how to create a mock vector store and test the initialization:
from typing import Any, List, Optional
from unittest.mock import MagicMock
import pytest
from llama_index.legacy.schema import NodeRelationship, RelatedNodeInfo, TextNode
from llama_index.legacy.vector_stores.azureaisearch import (
AzureAISearchVectorStore,
IndexManagement,
)
try:
from azure.search.documents import SearchClient
from azure.search.documents.indexes import SearchIndexClient
azureaisearch_installed = True
except ImportError:
azureaisearch_installed = False
search_client = None # type: ignore
def create_mock_vector_store(
search_client: Any,
index_name: Optional[str] = None,
index_management: IndexManagement = IndexManagement.NO_VALIDATION,
) -> AzureAISearchVectorStore:
return AzureAISearchVectorStore(
search_or_index_client=search_client,
id_field_key="id",
chunk_field_key="content",
embedding_field_key="embedding",
metadata_string_field_key="metadata",
doc_id_field_key="doc_id",
filterable_metadata_field_keys=[], # Added to match the updated constructor
index_name=index_name,
index_management=index_management,
embedding_dimensionality=2, # Assuming a dimensionality of 2 for simplicity
)
def create_sample_documents(n: int) -> List[TextNode]:
nodes: List[TextNode] = []
for i in range(n):
nodes.append(
TextNode(
text=f"test node text {i}",
relationships={
NodeRelationship.SOURCE: RelatedNodeInfo(node_id=f"test doc id {i}")
},
embedding=[0.5, 0.5],
)
)
return nodes
@pytest.mark.skipif(
not azureaisearch_installed, reason="azure-search-documents package not installed"
)
def test_azureaisearch_add_two_batches() -> None:
search_client = MagicMock(spec=SearchClient)
vector_store = create_mock_vector_store(search_client)
nodes = create_sample_documents(11)
ids = vector_store.add(nodes)
call_count = search_client.merge_or_upload_documents.call_count
assert ids is not None
assert len(ids) == 11
assert call_count == 2
@pytest.mark.skipif(
not azureaisearch_installed, reason="azure-search-documents package not installed"
)
def test_azureaisearch_add_one_batch() -> None:
search_client = MagicMock(spec=SearchClient)
vector_store = create_mock_vector_store(search_client)
nodes = create_sample_documents(10)
ids = vector_store.add(nodes)
call_count = search_client.merge_or_upload_documents.call_count
assert ids is not None
assert len(ids) == 10
assert call_count == 1
@pytest.mark.skipif(
not azureaisearch_installed, reason="azure-search-documents package not installed"
)
def test_invalid_index_management_for_searchclient() -> None:
search_client = MagicMock(spec=SearchClient)
# No error
create_mock_vector_store(
search_client, index_management=IndexManagement.VALIDATE_INDEX
)
# Cannot supply index name
# ruff: noqa: E501
with pytest.raises(
ValueError,
match="index_name cannot be supplied if search_or_index_client is of type azure.search.documents.SearchClient",
):
create_mock_vector_store(search_client, index_name="test01")
# SearchClient cannot create an index
with pytest.raises(ValueError):
create_mock_vector_store(
search_client,
index_management=IndexManagement.CREATE_IF_NOT_EXISTS,
)
@pytest.mark.skipif(
not azureaisearch_installed, reason="azure-search-documents package not installed"
)
def test_invalid_index_management_for_searchindexclient() -> None:
search_client = MagicMock(spec=SearchIndexClient)
# Index name must be supplied
with pytest.raises(
ValueError,
match="index_name must be supplied if search_or_index_client is of type azure.search.documents.SearchIndexClient",
):
create_mock_vector_store(
search_client, index_management=IndexManagement.VALIDATE_INDEX
)
# No error when index name is supplied with SearchIndexClient
create_mock_vector_store(
search_client,
index_name="test01",
index_management=IndexManagement.CREATE_IF_NOT_EXISTS,
)
Ensure that the search_or_index_client
is correctly passed and that the search_client
is properly initialized within the constructor. This will prevent the NoneType
error when the RetrieverEvaluator
tries to use the search_client
. Additionally, make sure to handle both synchronous and asynchronous clients as shown in the updated constructor and test cases [1][2][3][4].
To continue talking to Dosu, mention @dosu.
@dosu - please see the example below for how I have instantiated AzureAISearchVectorStore
vector_store_address= <endpoint>
vector_store_password= <api key>
search_client = SearchClient(
endpoint=vector_store_address,
index_name=INDEX_NAME,
credential=AzureKeyCredential(vector_store_password)
)
# Define a query that matches all documents
query = "*"
# Set the API version
api_version = <api version>
search_mode = "all"
# Search for documents with the query
response = search_client.search(
search_text=query,
search_mode=search_mode
)
docs = [i for i in response]
AzureAISearchVectorStore
. Note that in this step, I can access the search_client
and run queries. llama_vector_store = AzureAISearchVectorStore(
search_or_index_client=search_client,
filterable_metadata_field_keys=['filename', 'a', 'b', 'c'],
id_field_key="id",
chunk_field_key="content",
embedding_field_key="content_vector",
doc_id_field_key="id",
metadata_string_field_key="metadata"
)
I cannot use llama_vector_store.as_retriever()
directly. Hence, I did step 3 below
Conver to type VectorStoreIndex
storage_context = StorageContext.from_defaults(vector_store=llama_vector_store)
index = VectorStoreIndex.from_documents(
[],
storage_context=storage_context,
embed_model=Settings.embed_model,
)
relevant_ids = [i for i in dict(qa_dataset)['relevant_docs'].keys()]
g,s,f=qa_dataset.relevant_docs[relevantids[0]][0].split('')[:3]
a_filter = MetadataFilter(key=a, operator=FilterOperator.EQUAL_TO, value=g) b_filter = MetadataFilter(key=b, operator=FilterOperator.EQUAL_TO, value=s) c_filter = MetadataFilter(key=c, operator=FilterOperator.EQUAL_TO, value=f)
filter_names = [a_filter, b_filter, c_filter] filters = MetadataFilters(filters=filter_names, condition=FilterCondition.AND)
test_llama_retriver = index.as_retriever(similarity_top_k=3, filters=filters)
4. Pass retriever to `RetriverEvaluator`
retriever_evaluator = RetrieverEvaluator.from_metric_names(
["mrr", "hit_rate", "precision", "recall"], retriever=test_llama_retriver
)
sample_id=relevant_ids[0]
sample_query = qa_dataset.queries[sample_id]
sample_expected = qa_dataset.relevant_docs[sample_id]
nest_asyncio.apply()
eval_result = retriever_evaluator.evaluate(sample_query,sample_expected)
print(eval_result)
But, the problem here is, once `llama_vector_store` is converted into an `index` of type `VectorStoreIndex` to be able to configure a `retriever`, it does not propagate the `search_client`. The `test_llama_retriver` does not have an attribute `search_client`.
To ensure that the search_client
is correctly propagated to the VectorStoreIndex
and subsequently to the retriever, you need to make sure that the AzureAISearchVectorStore
is properly initialized and that the search_client
is accessible within the VectorStoreIndex
.
Here are the steps to ensure proper propagation:
Initialize AzureAISearchVectorStore
with search_client
:
vector_store_address = "https://app00236465-ai-search-service-nc-prod.search.windows.net"
vector_store_password = decrypt(os.environ['AZURE_AI_SEARCH_KEY'])
search_client = SearchClient(
endpoint=vector_store_address,
index_name=INDEX_NAME,
credential=AzureKeyCredential(vector_store_password)
)
llama_vector_store = AzureAISearchVectorStore(
search_or_index_client=search_client,
filterable_metadata_field_keys=['filename', 'a', 'b', 'c'],
id_field_key="id",
chunk_field_key="content",
embedding_field_key="content_vector",
doc_id_field_key="id",
metadata_string_field_key="metadata"
)
Create StorageContext
and VectorStoreIndex
:
storage_context = StorageContext.from_defaults(vector_store=llama_vector_store)
index = VectorStoreIndex.from_documents(
[],
storage_context=storage_context,
embed_model=Settings.embed_model,
)
Configure Filters and Create Retriever:
relevant_ids = [i for i in dict(qa_dataset)['relevant_docs'].keys()]
g, s, f = qa_dataset.relevant_docs[relevant_ids[0]][0].split('_')[:3]
a_filter = MetadataFilter(key='a', operator=FilterOperator.EQUAL_TO, value=g)
b_filter = MetadataFilter(key='b', operator=FilterOperator.EQUAL_TO, value=s)
c_filter = MetadataFilter(key='c', operator=FilterOperator.EQUAL_TO, value=f)
filter_names = [a_filter, b_filter, c_filter]
filters = MetadataFilters(filters=filter_names, condition=FilterCondition.AND)
test_llama_retriever = index.as_retriever(similarity_top_k=3, filters=filters)
Pass Retriever to RetrieverEvaluator
:
retriever_evaluator = RetrieverEvaluator.from_metric_names(
["mrr", "hit_rate", "precision", "recall"], retriever=test_llama_retriever
)
sample_id = relevant_ids[0]
sample_query = qa_dataset.queries[sample_id]
sample_expected = qa_dataset.relevant_docs[sample_id]
nest_asyncio.apply()
eval_result = retriever_evaluator.evaluate(sample_query, sample_expected)
print(eval_result)
search_client
is correctly initialized and passed to the AzureAISearchVectorStore
.AzureAISearchVectorStore
class should have a property to access the search_client
. This is typically done using a property method:
@property
def client(self) -> Any:
"""Get client."""
return self._search_client
By following these steps, you ensure that the search_client
is correctly propagated and used within the AzureAISearchVectorStore
class, and subsequently within the VectorStoreIndex
and the retriever [1][2].
To continue talking to Dosu, mention @dosu.
@dosu - all the above steps are already tested and I confirm that I can access search_client
attribute of AzureAISearchVectorStore
llama_vector_store._search_client
prints - <SearchClient [endpoint=<index end point >, index=INDEX_NAME]>
However, I do not see any attribute called search_client
in test_llama_retriver
which is of type llama_index.core.indices.vector_store.retrievers.retriever.VectorIndexRetriever
derived from index
which is of type llama_index.core.indices.vector_store.base.VectorStoreIndex
Note that it's essential to pass test_llama_retriever
to the RetrieverEvaluator
To ensure that the search_client
is correctly propagated from the AzureAISearchVectorStore
to the VectorStoreIndex
and subsequently to the VectorIndexRetriever
, you need to verify that the VectorStoreIndex
correctly utilizes the search_client
from the AzureAISearchVectorStore
.
Here are the steps to ensure proper propagation:
Ensure AzureAISearchVectorStore
Initialization:
vector_store_address = "https://app00236465-ai-search-service-nc-prod.search.windows.net"
vector_store_password = decrypt(os.environ['AZURE_AI_SEARCH_KEY'])
search_client = SearchClient(
endpoint=vector_store_address,
index_name=INDEX_NAME,
credential=AzureKeyCredential(vector_store_password)
)
llama_vector_store = AzureAISearchVectorStore(
search_or_index_client=search_client,
filterable_metadata_field_keys=['filename', 'a', 'b', 'c'],
id_field_key="id",
chunk_field_key="content",
embedding_field_key="content_vector",
doc_id_field_key="id",
metadata_string_field_key="metadata"
)
Create StorageContext
and VectorStoreIndex
:
storage_context = StorageContext.from_defaults(vector_store=llama_vector_store)
index = VectorStoreIndex.from_documents(
[],
storage_context=storage_context,
embed_model=Settings.embed_model,
)
Configure Filters and Create Retriever:
relevant_ids = [i for i in dict(qa_dataset)['relevant_docs'].keys()]
g, s, f = qa_dataset.relevant_docs[relevant_ids[0]][0].split('_')[:3]
a_filter = MetadataFilter(key='a', operator=FilterOperator.EQUAL_TO, value=g)
b_filter = MetadataFilter(key='b', operator=FilterOperator.EQUAL_TO, value=s)
c_filter = MetadataFilter(key='c', operator=FilterOperator.EQUAL_TO, value=f)
filter_names = [a_filter, b_filter, c_filter]
filters = MetadataFilters(filters=filter_names, condition=FilterCondition.AND)
test_llama_retriever = index.as_retriever(similarity_top_k=3, filters=filters)
Pass Retriever to RetrieverEvaluator
:
retriever_evaluator = RetrieverEvaluator.from_metric_names(
["mrr", "hit_rate", "precision", "recall"], retriever=test_llama_retriever
)
sample_id = relevant_ids[0]
sample_query = qa_dataset.queries[sample_id]
sample_expected = qa_dataset.relevant_docs[sample_id]
nest_asyncio.apply()
eval_result = retriever_evaluator.evaluate(sample_query, sample_expected)
print(eval_result)
search_client
is correctly initialized and passed to the AzureAISearchVectorStore
.AzureAISearchVectorStore
class should have a property to access the search_client
. This is typically done using a property method:
@property
def client(self) -> Any:
"""Get client."""
return self._search_client
VectorStoreIndex
:Ensure that the VectorStoreIndex
correctly utilizes the search_client
from the AzureAISearchVectorStore
. The VectorStoreIndex
should have access to the search_client
through the storage_context
.
# Ensure the search client is accessible in the retriever
assert test_llama_retriever._vector_store.client is not None
print(test_llama_retriever._vector_store.client)
By following these steps, you ensure that the search_client
is correctly propagated and used within the AzureAISearchVectorStore
class, and subsequently within the VectorStoreIndex
and the retriever [1][2].
To continue talking to Dosu, mention @dosu.
@dosu
I confirm that running below code produces <SearchClient [endpoint=<index end point >, index=INDEX_NAME]>
# Ensure the search client is accessible in the retriever
assert test_llama_retriever._vector_store.client is not None
print(test_llama_retriever._vector_store.client)
I debugged the code further and noticed that when I run below code, I see that self._search_client
gets set appropriately for the llama_index.vector_stores.azureaisearch.base.AzureQueryResultSearchDefault
object in the _acreate_query_result
function (llama-index/vector_stores/azureaisearch/base.py - AzureQueryResultSearchBase /_acreate_query_result)
retriever_evaluator = RetrieverEvaluator.from_metric_names(
["mrr", "hit_rate", "precision", "recall"], retriever=test_llama_retriver
)
But when below code runs, the self._search_client
is wiped out
eval_result = retriever_evaluator.evaluate(sample_query,sample_expected)
To ensure that the search_client
remains correctly set and does not get wiped out during the evaluation process when using RetrieverEvaluator
with AzureAISearchVectorStore
and VectorStoreIndex
, you should ensure that the search_client
is properly initialized and maintained throughout the lifecycle of the AzureAISearchVectorStore
instance.
Here is the relevant part of the AzureAISearchVectorStore
class that initializes and maintains the search_client
:
def __init__(
self,
search_or_index_client: Any,
id_field_key: str,
chunk_field_key: str,
embedding_field_key: str,
metadata_string_field_key: str,
doc_id_field_key: str,
filterable_metadata_field_keys: Optional[
Union[
List[str],
Dict[str, str],
Dict[str, Tuple[str, MetadataIndexFieldType]],
]
] = None,
index_name: Optional[str] = None,
index_mapping: Optional[
Callable[[Dict[str, str], Dict[str, Any]], Dict[str, str]]
] = None,
index_management: IndexManagement = IndexManagement.NO_VALIDATION,
embedding_dimensionality: int = 1536,
vector_algorithm_type: str = "exhaustiveKnn",
language_analyzer: str = "en.lucene",
**kwargs: Any,
) -> None:
import_err_msg = (
"`azure-search-documents` package not found, please run "
"`pip install azure-search-documents==11.4.0`"
)
try:
import azure.search.documents # noqa
from azure.search.documents import SearchClient
from azure.search.documents.indexes import SearchIndexClient
except ImportError:
raise ImportError(import_err_msg)
self._index_client: SearchIndexClient = cast(SearchIndexClient, None)
self._search_client: SearchClient = cast(SearchClient, None)
self.embedding_dimensionality = embedding_dimensionality
if search_or_index_client is not None:
if isinstance(search_or_index_client, SearchIndexClient):
self._index_client = cast(SearchIndexClient, search_or_index_client)
if not index_name:
raise ValueError(
"index_name must be supplied if search_or_index_client is of "
"type azure.search.documents.SearchIndexClient"
)
self._search_client = self._index_client.get_search_client(
index_name=index_name
)
elif isinstance(search_or_index_client, SearchClient):
self._search_client = cast(SearchClient, search_or_index_client)
if index_name:
raise ValueError(
"index_name cannot be supplied if search_or_index_client "
"is of type azure.search.documents.SearchClient"
)
if not self._index_client and not self._search_client:
raise ValueError(
"search_or_index_client must be of type "
"azure.search.documents.SearchClient or "
"azure.search.documents.SearchIndexClient"
)
else:
raise ValueError("search_or_index_client not specified")
if (
index_management == IndexManagement.CREATE_IF_NOT_EXISTS
and not self._index_client
):
raise ValueError(
"index_management has value of IndexManagement.CREATE_IF_NOT_EXISTS "
"but search_or_index_client is not of type "
"azure.search.documents.SearchIndexClient"
)
self._index_management = index_management
field_mapping = {
"id": id_field_key,
"chunk": chunk_field_key,
"embedding": embedding_field_key,
"metadata": metadata_string_field_key,
"doc_id": doc_id_field_key,
}
self._field_mapping = field_mapping
self._index_mapping = (
self._default_index_mapping if index_mapping is None else index_mapping
)
self._metadata_to_index_field_map = self._normalise_metadata_to_index_fields(
filterable_metadata_field_keys
)
if self._index_management == IndexManagement.CREATE_IF_NOT_EXISTS:
if index_name:
self._create_index_if_not_exists(index_name)
if self._index_management == IndexManagement.VALIDATE_INDEX:
self._validate_index(index_name)
@property
def client(self) -> Any:
"""Get client."""
return self._search_client
Additionally, you can refer to the test cases provided in the llama-index-legacy/tests/vector_stores/test_azureaisearch.py
file to understand how to mock and test the search_client
initialization and usage:
from typing import Any, List, Optional
from unittest.mock import MagicMock
import pytest
from llama_index.legacy.schema import NodeRelationship, RelatedNodeInfo, TextNode
from llama_index.legacy.vector_stores.azureaisearch import (
AzureAISearchVectorStore,
IndexManagement,
)
try:
from azure.search.documents import SearchClient
from azure.search.documents.indexes import SearchIndexClient
azureaisearch_installed = True
except ImportError:
azureaisearch_installed = False
search_client = None # type: ignore
def create_mock_vector_store(
search_client: Any,
index_name: Optional[str] = None,
index_management: IndexManagement = IndexManagement.NO_VALIDATION,
) -> AzureAISearchVectorStore:
return AzureAISearchVectorStore(
search_or_index_client=search_client,
id_field_key="id",
chunk_field_key="content",
embedding_field_key="embedding",
metadata_string_field_key="metadata",
doc_id_field_key="doc_id",
filterable_metadata_field_keys=[], # Added to match the updated constructor
index_name=index_name,
index_management=index_management,
embedding_dimensionality=2, # Assuming a dimensionality of 2 for simplicity
)
def create_sample_documents(n: int) -> List[TextNode]:
nodes: List[TextNode] = []
for i in range(n):
nodes.append(
TextNode(
text=f"test node text {i}",
relationships={
NodeRelationship.SOURCE: RelatedNodeInfo(node_id=f"test doc id {i}")
},
embedding=[0.5, 0.5],
)
)
return nodes
@pytest.mark.skipif(
not azureaisearch_installed, reason="azure-search-documents package not installed"
)
def test_azureaisearch_add_two_batches() -> None:
search_client = MagicMock(spec=SearchClient)
vector_store = create_mock_vector_store(search_client)
nodes = create_sample_documents(11)
ids = vector_store.add(nodes)
call_count = search_client.merge_or_upload_documents.call_count
assert ids is not None
assert len(ids) == 11
assert call_count == 2
@pytest.mark.skipif(
not azureaisearch_installed, reason="azure-search-documents package not installed"
)
def test_azureaisearch_add_one_batch() -> None:
search_client = MagicMock(spec=SearchClient)
vector_store = create_mock_vector_store(search_client)
nodes = create_sample_documents(10)
ids = vector_store.add(nodes)
call_count = search_client.merge_or_upload_documents.call_count
assert ids is not None
assert len(ids) == 10
assert call_count == 1
@pytest.mark.skipif(
not azureaisearch_installed, reason="azure-search-documents package not installed"
)
def test_invalid_index_management_for_searchclient() -> None:
search_client = MagicMock(spec=SearchClient)
# No error
create_mock_vector_store(
search_client, index_management=IndexManagement.VALIDATE_INDEX
)
# Cannot supply index name
# ruff: noqa: E501
with pytest.raises(
ValueError,
match="index_name cannot be supplied if search_or_index_client is of type azure.search.documents.SearchClient",
):
create_mock_vector_store(search_client, index_name="test01")
# SearchClient cannot create an index
with pytest.raises(ValueError):
create_mock_vector_store(
search_client,
index_management=IndexManagement.CREATE_IF_NOT_EXISTS,
)
@pytest.mark.skipif(
not azureaisearch_installed, reason="azure-search-documents package not installed"
)
def test_invalid_index_management_for_searchindexclient() -> None:
search_client = MagicMock(spec=SearchIndexClient)
# Index name must be supplied
with pytest.raises(
ValueError,
match="index_name must be supplied if search_or_index_client is of type azure.search.documents.SearchIndexClient",
):
create_mock_vector_store(
search_client, index_management=IndexManagement.VALIDATE_INDEX
)
# No error when index name is supplied with SearchIndexClient
create_mock_vector_store(
search_client,
index_name="test01",
index_management=IndexManagement.CREATE_IF_NOT_EXISTS,
)
Ensure that the search_client
is properly initialized in the constructor and accessed via the client
property. This setup, along with the provided test cases, will help maintain the search_client
throughout the lifecycle of the AzureAISearchVectorStore
instance [1][2][3][4].
To continue talking to Dosu, mention @dosu.
@dosu
Apologies for the oversight. It looks like self._search_client
is set accurately when I run below code :
test_llama_retriver = index.as_retriever(similarity_top_k=3, filters=filters)
print(test_llama_retriver.retrieve('Can you get me the name?'))
but the self._search_client does not seem to be set when I setup RetrieverEvaluator
To ensure that self._search_client
remains set throughout the evaluation process when using RetrieverEvaluator
with AzureAISearchVectorStore
and VectorStoreIndex
, you need to make sure that self._search_client
is properly initialized in the AzureAISearchVectorStore
class. The initialization of self._search_client
is handled in the __init__
method of the AzureAISearchVectorStore
class.
Here is the relevant part of the __init__
method that ensures self._search_client
is set:
def __init__(
self,
search_or_index_client: Any,
id_field_key: str,
chunk_field_key: str,
embedding_field_key: str,
metadata_string_field_key: str,
doc_id_field_key: str,
filterable_metadata_field_keys: Optional[
Union[
List[str],
Dict[str, str],
Dict[str, Tuple[str, MetadataIndexFieldType]],
]
] = None,
index_name: Optional[str] = None,
index_mapping: Optional[
Callable[[Dict[str, str], Dict[str, Any]], Dict[str, str]]
] = None,
index_management: IndexManagement = IndexManagement.NO_VALIDATION,
embedding_dimensionality: int = 1536,
vector_algorithm_type: str = "exhaustiveKnn",
language_analyzer: str = "en.lucene",
**kwargs: Any,
) -> None:
import_err_msg = (
"`azure-search-documents` package not found, please run "
"`pip install azure-search-documents==11.4.0`"
)
try:
import azure.search.documents # noqa
from azure.search.documents import SearchClient
from azure.search.documents.indexes import SearchIndexClient
except ImportError:
raise ImportError(import_err_msg)
self._index_client: SearchIndexClient = cast(SearchIndexClient, None)
self._search_client: SearchClient = cast(SearchClient, None)
self.embedding_dimensionality = embedding_dimensionality
# Validate search_or_index_client
if search_or_index_client is not None:
if isinstance(search_or_index_client, SearchIndexClient):
self._index_client = cast(SearchIndexClient, search_or_index_client)
if not index_name:
raise ValueError(
"index_name must be supplied if search_or_index_client is of "
"type azure.search.documents.SearchIndexClient"
)
self._search_client = self._index_client.get_search_client(
index_name=index_name
)
elif isinstance(search_or_index_client, SearchClient):
self._search_client = cast(SearchClient, search_or_index_client)
if index_name:
raise ValueError(
"index_name cannot be supplied if search_or_index_client "
"is of type azure.search.documents.SearchClient"
)
if not self._index_client and not self._search_client:
raise ValueError(
"search_or_index_client must be of type "
"azure.search.documents.SearchClient or "
"azure.search.documents.SearchIndexClient"
)
else:
raise ValueError("search_or_index_client not specified")
if (
index_management == IndexManagement.CREATE_IF_NOT_EXISTS
and not self._index_client
):
raise ValueError(
"index_management has value of IndexManagement.CREATE_IF_NOT_EXISTS "
"but search_or_index_client is not of type "
"azure.search.documents.SearchIndexClient"
)
self._index_management = index_management
# Default field mapping
field_mapping = {
"id": id_field_key,
"chunk": chunk_field_key,
"embedding": embedding_field_key,
"metadata": metadata_string_field_key,
"doc_id": doc_id_field_key,
}
self._field_mapping = field_mapping
self._index_mapping = (
self._default_index_mapping if index_mapping is None else index_mapping
)
self._metadata_to_index_field_map = self._normalise_metadata_to_index_fields(
filterable_metadata_field_keys
)
if self._index_management == IndexManagement.CREATE_IF_NOT_EXISTS:
if index_name:
self._create_index_if_not_exists(index_name)
if self._index_management == IndexManagement.VALIDATE_INDEX:
self._validate_index(index_name)
super().__init__()
@property
def client(self) -> Any:
"""Get client."""
return self._search_client
@property
def aclient(self) -> Any:
"""Get async client."""
return self._async_search_client
This code ensures that self._search_client
is properly initialized based on the type of search_or_index_client
provided. If search_or_index_client
is of type SearchIndexClient
, it initializes self._search_client
using the get_search_client
method. If search_or_index_client
is of type SearchClient
, it directly assigns it to self._search_client
.
By ensuring that self._search_client
is correctly initialized in the constructor, it will remain set throughout the evaluation process [1][2].
To continue talking to Dosu, mention @dosu.
I have provided the trace of all the internal calls to llama_index made from my code. All the recommended steps have been followed already. Could you please check the trace / code flow and let me know what the issue could be?
Please note the package versions I am using:
Package | Version |
---|---|
llama-index | 0.10.62 |
llama-index-agent-openai | 0.2.9 |
llama-index-cli | 0.1.13 |
llama-index-core | 0.10.62 |
llama-index-embeddings-azure-openai | 0.1.11 |
llama-index-embeddings-openai | 0.1.11 |
llama-index-indices-managed-llama-cloud | 0.2.7 |
llama-index-legacy | 0.9.48 |
llama-index-llms-azure-openai | 0.1.10 |
llama-index-llms-openai | 0.1.29 |
llama-index-multi-modal-llms-openai | 0.1.9 |
llama-index-program-openai | 0.1.7 |
llama-index-question-gen-openai | 0.1.3 |
llama-index-readers-file | 0.1.32 |
llama-index-readers-llama-parse | 0.1.6 |
llama-index-vector-stores-azureaisearch | 0.1.13 |
llama_vector_store = AzureAISearchVectorStore(
search_or_index_client=search_client,
filterable_metadata_field_keys=['filename', 'a', 'b', 'c'],
id_field_key="id",
chunk_field_key="content",
embedding_field_key="content_vector",
doc_id_field_key="id",
metadata_string_field_key="metadata"
)
-> [internal call] llama_index/vector_stores/azureaisearch/base.py (AzureAISearchVectorStore/init)
-> self._search_client = cast(SearchClient, search_or_index_client)
-> test_llama_retriver = index.as_retriever(similarity_top_k=3, filters=filters)
-> print(test_llama_retriver.retrieve('Whats the name?'))
-> [internal call] llama_index/core/indices/vector_store/retrievers/retrievers.py (VectorIndexRetriever/_build_vector_store_query)
-> return VectorStoreQuery(
query_embedding=query_bundle_with_embeddings.embedding,
similarity_top_k=self._similarity_top_k,
node_ids=self._node_ids,
doc_ids=self._doc_ids,
query_str=query_bundle_with_embeddings.query_str,
mode=self._vector_store_query_mode,
alpha=self._alpha,
filters=self._filters,
sparse_top_k=self._sparse_top_k,
)
-> _build_vector_store_query
return VectorStoreQuery(
query_embedding=query_bundle_with_embeddings.embedding,
similarity_top_k=self._similarity_top_k,
node_ids=self._node_ids,
doc_ids=self._doc_ids,
query_str=query_bundle_with_embeddings.query_str,
mode=self._vector_store_query_mode,
alpha=self._alpha,
filters=self._filters,
sparse_top_k=self._sparse_top_k,
)
-> [internal call] llama_index/vector_stores/azureaisearch/base.py (AzureAISearchVectorStore/query)
-> odata_filter = None
if query.filters is not None:
odata_filter = self._create_odata_filter(query.filters)
-> azure_query_result_search: AzureQueryResultSearchBase = (
AzureQueryResultSearchDefault(
query, self._field_mapping, odata_filter, self._search_client
)
)
-> return azure_query_result_search.search()
Note that in this step, I can still access `self._search_client`
-> retriever_evaluator = RetrieverEvaluator.from_metric_names(
["mrr", "hit_rate", "precision", "recall"], retriever=test_llama_retriver
)
-> eval_result = retriever_evaluator.evaluate(sample_query,sample_expected)
-> [internal call] llama_index/embeddings/azure_openai/base.py (AzureOpenAIEmbedding/_get_aclient)
-> if not self.reuse_client:
return AsyncAzureOpenAI(**self._get_credential_kwargs(is_async=True))
if self._aclient is None:
self._aclient = AsyncAzureOpenAI(
**self._get_credential_kwargs(is_async=True)
)
return self._aclient
-> [internal call] llama_index/core/indices/vector_store/retrievers/retriever.py (VectorIndexRetriever/_aretrieve)
-> async def _aretrieve(self, query_bundle: QueryBundle) -> List[NodeWithScore]:
embedding = query_bundle.embedding
if self._vector_store.is_embedding_query:
if query_bundle.embedding is None and len(query_bundle.embedding_strs) > 0:
embed_model = self._embed_model
embedding = await embed_model.aget_agg_embedding_from_queries(
query_bundle.embedding_strs
)
return await self._aget_nodes_with_embeddings(
QueryBundle(query_str=query_bundle.query_str, embedding=embedding)
)
-> llama_index/core/indices/vector_store/retrievers/retriever.py (VectorIndexRetriever)
-> async def _aget_nodes_with_embeddings(
self, query_bundle_with_embeddings: QueryBundle
) -> List[NodeWithScore]:
query = self._build_vector_store_query(query_bundle_with_embeddings)
query_result = await self._vector_store.aquery(query, **self._kwargs)
return self._build_node_list_from_query_result(query_result)
-> def _build_vector_store_query(
self, query_bundle_with_embeddings: QueryBundle
) -> VectorStoreQuery:
return VectorStoreQuery(
query_embedding=query_bundle_with_embeddings.embedding,
similarity_top_k=self._similarity_top_k,
node_ids=self._node_ids,
doc_ids=self._doc_ids,
query_str=query_bundle_with_embeddings.query_str,
mode=self._vector_store_query_mode,
alpha=self._alpha,
filters=self._filters,
sparse_top_k=self._sparse_top_k,
)
-> async def _acreate_query_result(
self, search_query: str, vectors: Optional[List[Any]]
) -> VectorStoreQueryResult:
results = await self._search_client.search(
search_text=search_query,
vector_queries=vectors,
top=self._query.similarity_top_k,
select=self._select_fields,
filter=self._odata_filter,
)
id_result = []
node_result = []
score_result = []
async for result in results:
node_id = result[self._field_mapping["id"]]
metadata_str = result[self._field_mapping["metadata"]]
metadata = json.loads(metadata_str) if metadata_str else {}
score = result["@search.score"]
chunk = result[self._field_mapping["chunk"]]
try:
node = metadata_dict_to_node(metadata)
node.set_content(chunk)
except Exception:
# NOTE: deprecated legacy logic for backward compatibility
metadata, node_info, relationships = legacy_metadata_dict_to_node(
metadata
)
node = TextNode(
text=chunk,
id_=node_id,
metadata=metadata,
start_char_idx=node_info.get("start", None),
end_char_idx=node_info.get("end", None),
relationships=relationships,
)
logger.debug(f"Retrieved node id {node_id} with node data of {node}")
id_result.append(node_id)
node_result.append(node)
score_result.append(score)
logger.debug(
f"Search query '{search_query}' returned {len(id_result)} results."
)
return VectorStoreQueryResult(
nodes=node_result, similarities=score_result, ids=id_result
)
def search(self) -> VectorStoreQueryResult:
search_query = self._create_search_query()
vectors = self._create_query_vector()
return self._create_query_result(search_query, vectors)
async def asearch(self) -> VectorStoreQueryResult:
search_query = self._create_search_query()
vectors = self._create_query_vector()
return await self._acreate_query_result(search_query, vectors)
class AzureQueryResultSearchDefault(AzureQueryResultSearchBase):
def _create_query_vector(self) -> Optional[List[Any]]:
"""Query vector store."""
from azure.search.documents.models import VectorizedQuery
if not self._query.query_embedding:
raise ValueError("Query missing embedding")
vectorized_query = VectorizedQuery(
vector=self._query.query_embedding,
k_nearest_neighbors=self._query.similarity_top_k,
fields=self._field_mapping["embedding"],
)
vector_queries = [vectorized_query]
logger.info("Vector search with supplied embedding")
return vector_queries
class AzureQueryResultSearchSparse(AzureQueryResultSearchBase):
def _create_search_query(self) -> str:
if self._query.query_str is None:
raise ValueError("Query missing query string")
search_query = self._query.query_str
logger.info(f"Hybrid search with search text: {search_query}")
return search_query
Based on the detailed trace and the provided code flow, it appears that the self._search_client
is correctly set during the initialization and retrieval process but gets wiped out during the evaluation process with RetrieverEvaluator
.
Here are the key points and potential issues to investigate:
Initialization of AzureAISearchVectorStore
:
self._search_client
is correctly set during the initialization of AzureAISearchVectorStore
.Retrieval Process:
self._search_client
is accessible and correctly used during the retrieval process with test_llama_retriever.retrieve('Whats the name?')
.Evaluation Process:
RetrieverEvaluator.evaluate(sample_query, sample_expected)
.The self._search_client
might be getting reset or not properly passed during the asynchronous evaluation process. This could be due to the way the asynchronous methods are handling the self._search_client
.
Ensure Proper Initialization in Asynchronous Methods:
self._search_client
is correctly passed and maintained in all asynchronous methods.Check Asynchronous Method Calls:
_acreate_query_result
method in AzureQueryResultSearchBase
to ensure that self._search_client
is not getting reset or lost.self._search_client
in Asynchronous Methods:
_acreate_query_result
method to check the state of self._search_client
.async def _acreate_query_result(
self, search_query: str, vectors: Optional[List[Any]]
) -> VectorStoreQueryResult:
print(f"Debug: self._search_client before search: {self._search_client}")
results = await self._search_client.search(
search_text=search_query,
vector_queries=vectors,
top=self._query.similarity_top_k,
select=self._select_fields,
filter=self._odata_filter,
)
print(f"Debug: self._search_client after search: {self._search_client}")
id_result = []
node_result = []
score_result = []
async for result in results:
node_id = result[self._field_mapping["id"]]
metadata_str = result[self._field_mapping["metadata"]]
metadata = json.loads(metadata_str) if metadata_str else {}
score = result["@search.score"]
chunk = result[self._field_mapping["chunk"]]
try:
node = metadata_dict_to_node(metadata)
node.set_content(chunk)
except Exception:
# NOTE: deprecated legacy logic for backward compatibility
metadata, node_info, relationships = legacy_metadata_dict_to_node(
metadata
)
node = TextNode(
text=chunk,
id_=node_id,
metadata=metadata,
start_char_idx=node_info.get("start", None),
end_char_idx=node_info.get("end", None),
relationships=relationships,
)
logger.debug(f"Retrieved node id {node_id} with node data of {node}")
id_result.append(node_id)
node_result.append(node)
score_result.append(score)
logger.debug(
f"Search query '{search_query}' returned {len(id_result)} results."
)
return VectorStoreQueryResult(
nodes=node_result, similarities=score_result, ids=id_result
)
self._search_client
is Passed Correctly:
AzureQueryResultSearchBase
class and its derived classes correctly maintain the self._search_client
throughout the asynchronous calls.If the issue is due to the asynchronous context losing the self._search_client
, you might need to ensure that the self._search_client
is explicitly passed or maintained in the asynchronous context.
By following these steps, you can identify where the self._search_client
is getting lost and ensure it remains set throughout the evaluation process. This will help maintain the correct state of the self._search_client
during the evaluation with RetrieverEvaluator
[1][2][3].
To continue talking to Dosu, mention @dosu.
@dosu
Can you provide more detailed steps on how to explicitly pass the search_client to the asynchronous context (Specifically to be able to use RetrieverEvaluator
)?
Or be able to make sure AzureQueryResultSearchBase
class and its derived classes correctly maintain the self._search_client
throughout the asynchronous calls?
To ensure that self._search_client
is correctly maintained throughout the asynchronous calls, especially when using RetrieverEvaluator
, you can follow these detailed steps:
AzureAISearchVectorStore
Make sure that self._search_client
is properly initialized in the AzureAISearchVectorStore
class. This is already done in your code, but it's crucial to verify it:
class AzureAISearchVectorStore:
def __init__(self, search_or_index_client: Any, ...):
...
if isinstance(search_or_index_client, SearchClient):
self._search_client = cast(SearchClient, search_or_index_client)
...
@property
def client(self) -> Any:
"""Get client."""
return self._search_client
self._search_client
is Passed Correctly in Asynchronous MethodsAdd debug prints in the _acreate_query_result
method to check the state of self._search_client
:
async def _acreate_query_result(self, search_query: str, vectors: Optional[List[Any]]) -> VectorStoreQueryResult:
print(f"Debug: self._search_client before search: {self._search_client}")
results = await self._search_client.search(
search_text=search_query,
vector_queries=vectors,
top=self._query.similarity_top_k,
select=self._select_fields,
filter=self._odata_filter,
)
print(f"Debug: self._search_client after search: {self._search_client}")
id_result = []
node_result = []
score_result = []
async for result in results:
node_id = result[self._field_mapping["id"]]
metadata_str = result[self._field_mapping["metadata"]]
metadata = json.loads(metadata_str) if metadata_str else {}
score = result["@search.score"]
chunk = result[self._field_mapping["chunk"]]
try:
node = metadata_dict_to_node(metadata)
node.set_content(chunk)
except Exception:
metadata, node_info, relationships = legacy_metadata_dict_to_node(metadata)
node = TextNode(
text=chunk,
id_=node_id,
metadata=metadata,
start_char_idx=node_info.get("start", None),
end_char_idx=node_info.get("end", None),
relationships=relationships,
)
logger.debug(f"Retrieved node id {node_id} with node data of {node}")
id_result.append(node_id)
node_result.append(node)
score_result.append(score)
logger.debug(f"Search query '{search_query}' returned {len(id_result)} results.")
return VectorStoreQueryResult(
nodes=node_result, similarities=score_result, ids=id_result
)
self._search_client
is Maintained in AzureQueryResultSearchBase
Ensure that the AzureQueryResultSearchBase
class and its derived classes correctly maintain the self._search_client
throughout the asynchronous calls. You can do this by explicitly passing the search_client
to the asynchronous context if needed.
If the issue is due to the asynchronous context losing the self._search_client
, you might need to ensure that the self._search_client
is explicitly passed or maintained in the asynchronous context.
AzureQueryResultSearchBase
Class:class AzureQueryResultSearchBase:
def __init__(self, query, field_mapping, odata_filter, search_client):
self._query = query
self._field_mapping = field_mapping
self._odata_filter = odata_filter
self._search_client = search_client
async def _acreate_query_result(self, search_query: str, vectors: Optional[List[Any]]) -> VectorStoreQueryResult:
print(f"Debug: self._search_client before search: {self._search_client}")
results = await self._search_client.search(
search_text=search_query,
vector_queries=vectors,
top=self._query.similarity_top_k,
select=self._select_fields,
filter=self._odata_filter,
)
print(f"Debug: self._search_client after search: {self._search_client}")
id_result = []
node_result = []
score_result = []
async for result in results:
node_id = result[self._field_mapping["id"]]
metadata_str = result[self._field_mapping["metadata"]]
metadata = json.loads(metadata_str) if metadata_str else {}
score = result["@search.score"]
chunk = result[self._field_mapping["chunk"]]
try:
node = metadata_dict_to_node(metadata)
node.set_content(chunk)
except Exception:
metadata, node_info, relationships = legacy_metadata_dict_to_node(metadata)
node = TextNode(
text=chunk,
id_=node_id,
metadata=metadata,
start_char_idx=node_info.get("start", None),
end_char_idx=node_info.get("end", None),
relationships=relationships,
)
logger.debug(f"Retrieved node id {node_id} with node data of {node}")
id_result.append(node_id)
node_result.append(node)
score_result.append(score)
logger.debug(f"Search query '{search_query}' returned {len(id_result)} results.")
return VectorStoreQueryResult(
nodes=node_result, similarities=score_result, ids=id_result
)
self._search_client
is Passed Correctly:When creating instances of AzureQueryResultSearchBase
or its derived classes, ensure that self._search_client
is passed correctly:
azure_query_result_search = AzureQueryResultSearchDefault(
query, self._field_mapping, odata_filter, self._search_client
)
By following these steps, you can ensure that self._search_client
is correctly maintained throughout the asynchronous calls and does not get wiped out during the evaluation process with RetrieverEvaluator
[1][2].
To continue talking to Dosu, mention @dosu.
@dosu
I do not see any code changes in AzureQueryResultSearchBase
. What should I modify?
To ensure that self._search_client
is correctly maintained throughout the asynchronous calls, especially when using RetrieverEvaluator
, you need to modify the AzureQueryResultSearchBase
class and its derived classes to handle asynchronous operations properly. Specifically, you should ensure that the asearch
method and any other asynchronous methods correctly use the asynchronous client.
Here are the modifications required:
_create_query_result
in AzureQueryResultSearchBase
class AzureQueryResultSearchBase:
# Existing code...
async def _acreate_query_result(
self, search_query: str, vectors: Optional[List[Any]]
) -> VectorStoreQueryResult:
results = await self._search_client.search(
search_text=search_query,
vector_queries=vectors,
top=self._query.similarity_top_k,
select=self._select_fields,
filter=self._odata_filter,
)
id_result = []
node_result = []
score_result = []
for result in results:
node_id = result[self._field_mapping["id"]]
metadata_str = result[self._field_mapping["metadata"]]
metadata = json.loads(metadata_str) if metadata_str else {}
score = result["@search.score"]
chunk = result[self._field_mapping["chunk"]]
try:
node = metadata_dict_to_node(metadata)
node.set_content(chunk)
except Exception:
metadata, node_info, relationships = legacy_metadata_dict_to_node(
metadata
)
node = TextNode(
text=chunk,
id_=node_id,
metadata=metadata,
start_char_idx=node_info.get("start", None),
end_char_idx=node_info.get("end", None),
relationships=relationships,
)
id_result.append(node_id)
node_result.append(node)
score_result.append(score)
return VectorStoreQueryResult(
nodes=node_result, similarities=score_result, ids=id_result
)
_acreate_query_result
Methodclass AzureQueryResultSearchSemanticHybrid(AzureQueryResultSearchHybrid):
# Existing code...
async def asearch(self) -> VectorStoreQueryResult:
search_query = self._create_search_query()
vectors = self._create_query_vector()
return await self._acreate_query_result(search_query, vectors)
asearch
Method in AzureQueryResultSearchBase
to Call the Asynchronous _acreate_query_result
class AzureQueryResultSearchBase:
# Existing code...
async def asearch(self) -> VectorStoreQueryResult:
search_query = self._create_search_query()
vectors = self._create_query_vector()
return await self._acreate_query_result(search_query, vectors)
BaseRetriever
Additionally, you can refer to the BaseRetriever
class from the LlamaIndex framework for an example of how to handle asynchronous retrieval operations. The BaseRetriever
class uses both synchronous and asynchronous methods to perform retrieval operations. Here is a relevant snippet:
class BaseRetriever(ChainableMixin, PromptMixin, DispatcherSpanMixin):
# Initialization and other methods...
async def _aretrieve_from_object(
self,
obj: Any,
query_bundle: QueryBundle,
score: float,
) -> List[NodeWithScore]:
"""Retrieve nodes from object."""
if isinstance(obj, NodeWithScore):
return [obj]
elif isinstance(obj, BaseNode):
return [NodeWithScore(node=obj, score=score)]
elif isinstance(obj, BaseQueryEngine):
response = await obj.aquery(query_bundle)
return [NodeWithScore(node=TextNode(text=str(response)), score=score)]
elif isinstance(obj, BaseRetriever):
return await obj.aretrieve(query_bundle)
elif isinstance(obj, QueryComponent):
component_keys = obj.input_keys.required_keys
if len(component_keys) > 1:
raise ValueError(
f"QueryComponent {obj} has more than one input key: {component_keys}"
)
elif len(component_keys) == 0:
component_response = await obj.arun_component()
else:
kwargs = {next(iter(component_keys)): query_bundle.query_str}
component_response = await obj.arun_component(**kwargs)
result_output = str(next(iter(component_response.values())))
return [NodeWithScore(node=TextNode(text=result_output), score=score)]
else:
raise ValueError(f"Object {obj} is not retrievable.")
async def _ahandle_recursive_retrieval(
self, query_bundle: QueryBundle, nodes: List[NodeWithScore]
) -> List[NodeWithScore]:
retrieved_nodes: List[NodeWithScore] = []
for n in nodes:
node = n.node
score = n.score or 1.0
if isinstance(node, IndexNode):
obj = node.obj or self.object_map.get(node.index_id, None)
if obj is not None:
if self._verbose:
print_text(
f"Retrieval entering {node.index_id}: {obj.__class__.__name__}\n",
color="llama_turquoise",
)
retrieved_nodes.extend(
await self._aretrieve_from_object(
obj, query_bundle=query_bundle, score=score
)
)
else:
retrieved_nodes.append(n)
else:
retrieved_nodes.append(n)
seen = set()
return [
n
for n in retrieved_nodes
if not ((n.node.hash, n.node.ref_doc_id) in seen or seen.add((n.node.hash, n.node.ref_doc_id))) # type: ignore[func-returns-value]
]
@dispatcher.span
async def aretrieve(self, str_or_query_bundle: QueryType) -> List[NodeWithScore]:
self._check_callback_manager()
dispatcher.event(
RetrievalStartEvent(
str_or_query_bundle=str_or_query_bundle,
)
)
if isinstance(str_or_query_bundle, str):
query_bundle = QueryBundle(str_or_query_bundle)
else:
query_bundle = str_or_query_bundle
with self.callback_manager.as_trace("query"):
with self.callback_manager.event(
CBEventType.RETRIEVE,
payload={EventPayload.QUERY_STR: query_bundle.query_str},
) as retrieve_event:
nodes = await self._aretrieve(query_bundle=query_bundle)
nodes = await self._ahandle_recursive_retrieval(
query_bundle=query_bundle, nodes=nodes
)
retrieve_event.on_end(
payload={EventPayload.NODES: nodes},
)
dispatcher.event(
RetrievalEndEvent(
str_or_query_bundle=str_or_query_bundle,
nodes=nodes,
)
)
return nodes
By making these changes and following the example from BaseRetriever
, you ensure that self._search_client
is correctly maintained and used in asynchronous calls, which is crucial for the RetrieverEvaluator
and other asynchronous operations [1][2][3].
To continue talking to Dosu, mention @dosu.
@dosu that does not help at all. The async method _acreate_query_result
is already being called.
To ensure that the search_client
is correctly maintained throughout the asynchronous calls, especially when using RetrieverEvaluator
, you need to explicitly pass the search_client
to the asynchronous context or ensure that the AzureQueryResultSearchBase
class and its derived classes correctly maintain the self._search_client
throughout the asynchronous calls.
Here are the detailed steps and code modifications required:
AzureQueryResultSearchBase
Class to Include an Asynchronous Method _acreate_query_result
class AzureQueryResultSearchBase:
def __init__(
self,
query: VectorStoreQuery,
field_mapping: Dict[str, str],
odata_filter: Optional[str],
search_client: Any,
) -> None:
self._query = query
self._field_mapping = field_mapping
self._odata_filter = odata_filter
self._search_client = search_client
@property
def _select_fields(self) -> List[str]:
return [
self._field_mapping["id"],
self._field_mapping["chunk"],
self._field_mapping["metadata"],
self._field_mapping["doc_id"],
]
def _create_search_query(self) -> str:
return "*"
def _create_query_vector(self) -> Optional[List[Any]]:
return None
def _create_query_result(
self, search_query: str, vectors: Optional[List[Any]]
) -> VectorStoreQueryResult:
results = self._search_client.search(
search_text=search_query,
vector_queries=vectors,
top=self._query.similarity_top_k,
select=self._select_fields,
filter=self._odata_filter,
)
id_result = []
node_result = []
score_result = []
for result in results:
node_id = result[self._field_mapping["id"]]
metadata_str = result[self._field_mapping["metadata"]]
metadata = json.loads(metadata_str) if metadata_str else {}
score = result["@search.score"]
chunk = result[self._field_mapping["chunk"]]
try:
node = metadata_dict_to_node(metadata)
node.set_content(chunk)
except Exception:
metadata, node_info, relationships = legacy_metadata_dict_to_node(
metadata
)
node = TextNode(
text=chunk,
id_=node_id,
metadata=metadata,
start_char_idx=node_info.get("start", None),
end_char_idx=node_info.get("end", None),
relationships=relationships,
)
id_result.append(node_id)
node_result.append(node)
score_result.append(score)
return VectorStoreQueryResult(
nodes=node_result, similarities=score_result, ids=id_result
)
async def _acreate_query_result(
self, search_query: str, vectors: Optional[List[Any]]
) -> VectorStoreQueryResult:
results = await self._search_client.search(
search_text=search_query,
vector_queries=vectors,
top=self._query.similarity_top_k,
select=self._select_fields,
filter=self._odata_filter,
)
id_result = []
node_result = []
score_result = []
for result in results:
node_id = result[self._field_mapping["id"]]
metadata_str = result[self._field_mapping["metadata"]]
metadata = json.loads(metadata_str) if metadata_str else {}
score = result["@search.score"]
chunk = result[self._field_mapping["chunk"]]
try:
node = metadata_dict_to_node(metadata)
node.set_content(chunk)
except Exception:
metadata, node_info, relationships = legacy_metadata_dict_to_node(
metadata
)
node = TextNode(
text=chunk,
id_=node_id,
metadata=metadata,
start_char_idx=node_info.get("start", None),
end_char_idx=node_info.get("end", None),
relationships=relationships,
)
id_result.append(node_id)
node_result.append(node)
score_result.append(score)
return VectorStoreQueryResult(
nodes=node_result, similarities=score_result, ids=id_result
)
class AzureQueryResultSearchSemanticHybrid(AzureQueryResultSearchHybrid):
def _create_query_vector(self) -> Optional[List[Any]]:
"""Query vector store."""
from azure.search.documents.models import VectorizedQuery
if not self._query.query_embedding:
raise ValueError("Query missing embedding")
# k is set to 50 to align with the number of accept document in azure semantic reranking model.
# https://learn.microsoft.com/azure/search/semantic-search-overview
vectorized_query = VectorizedQuery(
vector=self._query.query_embedding,
k_nearest_neighbors=50,
fields=self._field_mapping["embedding"],
)
vector_queries = [vectorized_query]
logger.info("Vector search with supplied embedding")
return vector_queries
def _create_query_result(
self, search_query: str, vector_queries: Optional[List[Any]]
) -> VectorStoreQueryResult:
results = self._search_client.search(
search_text=search_query,
vector_queries=vector_queries,
top=self._query.similarity_top_k,
select=self._select_fields,
filter=self._odata_filter,
query_type="semantic",
semantic_configuration_name="mySemanticConfig",
)
id_result = []
node_result = []
score_result = []
for result in results:
node_id = result[self._field_mapping["id"]]
metadata_str = result[self._field_mapping["metadata"]]
metadata = json.loads(metadata_str) if metadata_str else {}
# use reranker_score instead of score
score = result["@search.reranker_score"]
chunk = result[self._field_mapping["chunk"]]
try:
node = metadata_dict_to_node(metadata)
node.set_content(chunk)
except Exception:
metadata, node_info, relationships = legacy_metadata_dict_to_node(
metadata
)
node = TextNode(
text=chunk,
id_=node_id,
metadata=metadata,
start_char_idx=node_info.get("start", None),
end_char_idx=node_info.get("end", None),
relationships=relationships,
)
id_result.append(node_id)
node_result.append(node)
score_result.append(score)
return VectorStoreQueryResult(
nodes=node_result, similarities=score_result, ids=id_result
)
async def _acreate_query_result(
self, search_query: str, vector_queries: Optional[List[Any]]
) -> VectorStoreQueryResult:
results = await self._search_client.search(
search_text=search_query,
vector_queries=vector_queries,
top=self._query.similarity_top_k,
select=self._select_fields,
filter=self._odata_filter,
query_type="semantic",
semantic_configuration_name="mySemanticConfig",
)
id_result = []
node_result = []
score_result = []
for result in results:
node_id = result[self._field_mapping["id"]]
metadata_str = result[self._field_mapping["metadata"]]
metadata = json.loads(metadata_str) if metadata_str else {}
# use reranker_score instead of score
score = result["@search.reranker_score"]
chunk = result[self._field_mapping["chunk"]]
try:
node = metadata_dict_to_node(metadata)
node.set_content(chunk)
except Exception:
metadata, node_info, relationships = legacy_metadata_dict_to_node(
metadata
)
node = TextNode(
text=chunk,
id_=node_id,
metadata=metadata,
start_char_idx=node_info.get("start", None),
end_char_idx=node_info.get("end", None),
relationships=relationships,
)
id_result.append(node_id)
node_result.append(node)
score_result.append(score)
return VectorStoreQueryResult(
nodes=node_result, similarities=score_result, ids=id_result
)
asearch
Method to the Derived Classesclass AzureQueryResultSearchSemanticHybrid(AzureQueryResultSearchHybrid):
# ... existing methods ...
async def asearch(self) -> VectorStoreQueryResult:
search_query = self._create_search_query()
vectors = self._create_query_vector()
return await self._acreate_query_result(search_query, vectors)
By following these steps, you ensure that the search_client
is correctly maintained and used in both synchronous and asynchronous contexts, allowing the RetrieverEvaluator
to function properly.
Additionally, you can refer to the BaseRetriever
class from the LlamaIndex framework for an example of how to handle both synchronous and asynchronous retrieval methods. The BaseRetriever
class uses _retrieve_from_object
and _aretrieve_from_object
to manage synchronous and asynchronous retrievals, respectively, and provides retrieve
and aretrieve
methods to handle retrieval in both contexts. This approach ensures that the client is correctly maintained and used throughout the retrieval process. Here is a relevant excerpt from the BaseRetriever
class:
class BaseRetriever(ChainableMixin, PromptMixin, DispatcherSpanMixin):
"""Base retriever."""
# ... other methods ...
def retrieve(self, str_or_query_bundle: QueryType) -> List[NodeWithScore]:
"""Retrieve nodes given query."""
# ... implementation ...
async def aretrieve(self, str_or_query_bundle: QueryType) -> List[NodeWithScore]:
"""Asynchronously retrieve nodes given query."""
# ... implementation ...
def _retrieve_from_object(
self,
obj: Any,
query_bundle: QueryBundle,
score: float,
) -> List[NodeWithScore]:
"""Retrieve nodes from object."""
# ... implementation ...
async def _aretrieve_from_object(
self,
obj: Any,
query_bundle: QueryBundle,
score: float,
) -> List[NodeWithScore]:
"""Asynchronously retrieve nodes from object."""
# ... implementation ...
This structure ensures that both synchronous and asynchronous retrievals are handled appropriately, maintaining the integrity of the search_client
throughout the process [1][2][3].
To continue talking to Dosu, mention @dosu.
Bug Description
vector_store_address=
vector_store_password=
search_client = SearchClient(
endpoint=vector_store_address,
index_name=INDEX_NAME,
credential=AzureKeyCredential(vector_store_password)
)
Define a query that matches all documents
query = "*"
Set the API version
api_version =
search_mode = "all"
Search for documents with the query
response = search_client.search( search_text=query, search_mode=search_mode )
docs = [i for i in response]
llama_docs = []
for doc in docs: llama_docs.append( llamadoc( id=doc['id'], text= doc['content'], metadata={
"filename": doc['filename'], "file_path": doc['file_path'], "a": doc['a'], "b": doc['b'], "c": doc['c'], "page_number": doc['page_number'], }, ) )
Settings.embed_model = AzureOpenAIEmbedding( model=conf.openai_embedding_model, deployment_name=conf.openai_embedding_model, api_key=conf.openai_api_key, api_base=conf.openai_endpoint, azure_endpoint=conf.openai_endpoint, api_version=conf.openai_version, async_http_client=httpx.AsyncClient(proxies=),
http_client=httpx.Client(proxies=),
)
Settings.llm = AzureOpenAI( engine=OPENAI_GPT_4, model=OPENAI_GPT_4, temperature=0.0, azure_endpoint=conf.openai_endpoint, api_key=conf.openai_api_key, api_version=conf.openai_version, http_client=httpx.Client(proxies=),
)
sample_llama_docs = random.sample(llama_docs, 50)
llama_vector_store = AzureAISearchVectorStore( search_or_index_client=search_client, filterable_metadata_field_keys=['filename', 'a', 'b', 'c'], id_field_key="id", chunk_field_key="content", embedding_field_key="content_vector", doc_id_field_key="id", metadata_string_field_key="metadata" )
storage_context = StorageContext.from_defaults(vector_store=llama_vector_store)
index = VectorStoreIndex.from_documents( [], storage_context=storage_context, embed_model=Settings.embed_model, )
relevant_ids = [i for i in dict(qa_dataset)['relevant_docs'].keys()]
g,s,f=qa_dataset.relevant_docs[relevantids[0]][0].split('')[:3]
a_filter = MetadataFilter(key=a, operator=FilterOperator.EQUAL_TO, value=g) b_filter = MetadataFilter(key=b, operator=FilterOperator.EQUAL_TO, value=s) c_filter = MetadataFilter(key=c, operator=FilterOperator.EQUAL_TO, value=f)
filter_names = [a_filter, b_filter, c_filter] filters = MetadataFilters(filters=filter_names, condition=FilterCondition.AND)
test_llama_retriver = index.as_retriever(similarity_top_k=3, filters=filters)
this works
print(test_llama_retriver.retrieve('Whats the name?')) print("\n\n")
retriever_evaluator = RetrieverEvaluator.from_metric_names( ["mrr", "hit_rate", "precision", "recall"], retriever=test_llama_retriver )
sample_id=relevant_ids[0] sample_query = qa_dataset.queries[sample_id] sample_expected = qa_dataset.relevant_docs[sample_id]
nest_asyncio.apply()
eval_result = retriever_evaluator.evaluate(sample_query,sample_expected)
print(eval_result)
Issue:
eval_result = retriever_evaluator.evaluate(sample_query,sample_expected)
triggers "AttributeError: 'NoneType' object has no attribute 'search'After tracing the code, the below function causes the issue -
`async def _acreate_query_result( self, search_query: str, vectors: Optional[List[Any]] ) -> VectorStoreQueryResult: results = await self._search_client.search( search_text=search_query, vector_queries=vectors, top=self._query.similarity_top_k, select=self._select_fields, filter=self._odata_filter, )
`
This is because I need to convert
AzureAISearchVectorStore
to aVectorStoreIndex
object to useas_retriever
and thesearch_client
passed to instantiateAzureAISearchVectorStore
does not get propagated to VectorStoreIndexI even tried manually overriding
index._search_client = search_client
but does not workllama-index versions:
@dosubot
Version
0.10.62
Steps to Reproduce
vector_store_address=
vector_store_password=
search_client = SearchClient(
endpoint=vector_store_address,
index_name=INDEX_NAME,
credential=AzureKeyCredential(vector_store_password)
)
Define a query that matches all documents
query = "*"
Set the API version
api_version =
search_mode = "all"
Search for documents with the query
response = search_client.search( search_text=query, search_mode=search_mode )
docs = [i for i in response]
llama_docs = []
for doc in docs: llama_docs.append( llamadoc( id=doc['id'], text= doc['content'], metadata={
"filename": doc['filename'], "file_path": doc['file_path'], "a": doc['a'], "b": doc['b'], "c": doc['c'], "page_number": doc['page_number'], }, ) )
Settings.embed_model = AzureOpenAIEmbedding( model=conf.openai_embedding_model, deployment_name=conf.openai_embedding_model, api_key=conf.openai_api_key, api_base=conf.openai_endpoint, azure_endpoint=conf.openai_endpoint, api_version=conf.openai_version, async_http_client=httpx.AsyncClient(proxies=),
http_client=httpx.Client(proxies=),
)
Settings.llm = AzureOpenAI( engine=OPENAI_GPT_4, model=OPENAI_GPT_4, temperature=0.0, azure_endpoint=conf.openai_endpoint, api_key=conf.openai_api_key, api_version=conf.openai_version, http_client=httpx.Client(proxies=),
)
sample_llama_docs = random.sample(llama_docs, 50)
llama_vector_store = AzureAISearchVectorStore( search_or_index_client=search_client, filterable_metadata_field_keys=['filename', 'a', 'b', 'c'], id_field_key="id", chunk_field_key="content", embedding_field_key="content_vector", doc_id_field_key="id", metadata_string_field_key="metadata" )
storage_context = StorageContext.from_defaults(vector_store=llama_vector_store)
index = VectorStoreIndex.from_documents( [], storage_context=storage_context, embed_model=Settings.embed_model, )
relevant_ids = [i for i in dict(qa_dataset)['relevant_docs'].keys()]
g,s,f=qa_dataset.relevant_docs[relevantids[0]][0].split('')[:3]
a_filter = MetadataFilter(key=a, operator=FilterOperator.EQUAL_TO, value=g) b_filter = MetadataFilter(key=b, operator=FilterOperator.EQUAL_TO, value=s) c_filter = MetadataFilter(key=c, operator=FilterOperator.EQUAL_TO, value=f)
filter the retriever to respond from relevant document chunks
filter_names = [a_filter, b_filter, c_filter] filters = MetadataFilters(filters=filter_names, condition=FilterCondition.AND)
test_llama_retriver = index.as_retriever(similarity_top_k=3, filters=filters)
this works
print(test_llama_retriver.retrieve('Whats the patients name?')) print("\n\n")
retriever_evaluator = RetrieverEvaluator.from_metric_names( ["mrr", "hit_rate", "precision", "recall"], retriever=test_llama_retriver )
sample_id=relevant_ids[0] sample_query = qa_dataset.queries[sample_id] sample_expected = qa_dataset.relevant_docs[sample_id]
nest_asyncio.apply()
eval_result = retriever_evaluator.evaluate(sample_query,sample_expected)
print(eval_result)
Relevant Logs/Tracbacks
No response