weaviate / weaviate-python-client

A python native client for easy interaction with a Weaviate instance.
https://weaviate.io/developers/weaviate/current/client-libraries/python.html
BSD 3-Clause "New" or "Revised" License
162 stars 75 forks source link

[Possible bug] Query with a cross-reference to MT collection and a filter throws #897

Closed databyjp closed 8 months ago

databyjp commented 8 months ago

When querying a single-tenant collection that has cross-references to a multi-tenancy (MT) collection, the query fails if a filter is specified.

For example, this throws

# Near-text search on `movies` that combines a property filter with a
# cross-reference lookup through the `hasReview` link.
# NOTE(review): `wq` and `datetime` are not defined in this snippet --
# presumably `import weaviate.classes.query as wq` and
# `from datetime import datetime`; confirm against the full script.
response = movies.query.near_text(
    query="historical drama",
    limit=2,
    return_metadata=wq.MetadataQuery(distance=True),
    # The filter argument: the report states the query works once this line is removed.
    filters=wq.Filter.by_property(name="release_date").greater_than(datetime(2010, 1, 1)),
    # Follow the `hasReview` reference and return these properties of the target objects.
    return_references=wq.QueryReference(
        link_on="hasReview",
        return_properties=["content", "author_username"],
    )
)

With:

Traceback (most recent call last):
  File "/Users/jphwang/code/weaviate-tutorials/python-int-workshop/.venv/lib/python3.10/site-packages/weaviate/collections/grpc/query.py", line 553, in __call
    res, _ = self._connection.grpc_stub.Search.with_call(
  File "/Users/jphwang/code/weaviate-tutorials/python-int-workshop/.venv/lib/python3.10/site-packages/grpc/_channel.py", line 1177, in with_call
    return _end_unary_response_blocking(state, call, True, None)
  File "/Users/jphwang/code/weaviate-tutorials/python-int-workshop/.venv/lib/python3.10/site-packages/grpc/_channel.py", line 1003, in _end_unary_response_blocking
    raise _InactiveRpcError(state)  # pytype: disable=not-instantiable
grpc._channel._InactiveRpcError: <_InactiveRpcError of RPC that terminated with:
        status = StatusCode.UNKNOWN
        details = "explorer: get class: vector search: resolve cross-refs: build reference cache: build request cache: fetch job list: index "reviewmt": class ReviewMT has multi-tenancy enabled, but request was without tenant"
        debug_error_string = "UNKNOWN:Error received from peer  {grpc_message:"explorer: get class: vector search: resolve cross-refs: build reference cache: build request cache: fetch job list: index \"reviewmt\": class ReviewMT has multi-tenancy enabled, but request was without tenant", grpc_status:2, created_time:"2024-02-20T18:21:34.50296+00:00"}"
>

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/Users/jphwang/code/weaviate-tutorials/python-int-workshop/3.3_queries.py", line 17, in <module>
    response = movies.query.near_text(
  File "/Users/jphwang/code/weaviate-tutorials/python-int-workshop/.venv/lib/python3.10/site-packages/weaviate/collections/queries/near_text/query.py", line 89, in near_text
    res = self._query().near_text(
  File "/Users/jphwang/code/weaviate-tutorials/python-int-workshop/.venv/lib/python3.10/site-packages/weaviate/collections/grpc/query.py", line 483, in near_text
    return self.__call()
  File "/Users/jphwang/code/weaviate-tutorials/python-int-workshop/.venv/lib/python3.10/site-packages/weaviate/collections/grpc/query.py", line 706, in __call
    raise WeaviateQueryError(e.details(), "GRPC search")  # pyright: ignore
weaviate.exceptions.WeaviateQueryError: Query call with protocol GRPC search failed with message explorer: get class: vector search: resolve cross-refs: build reference cache: build request cache: fetch job list: index "reviewmt": class ReviewMT has multi-tenancy enabled, but request was without tenant.

But commenting out the filters line makes it work again. (Full collection creation / import script available if needed.)

dirkkul commented 8 months ago

Querying a single-tenant collection with cross-references to a MT collection, the query fails if a filter is specified.

Can you provide a full example? AFAIK, references from a non-MT collection to an MT collection should not be possible.

databyjp commented 8 months ago

Ahhhh. It was an interesting experience trying to reproduce it.

If I'm using this correctly, when I add those incorrect refs using batch.add_reference, the client doesn't show me any errors, even though it should.

image: semitechnologies/weaviate:1.23.10 + py client 4.4.4

import weaviate
import weaviate.classes.config as wc
from weaviate.util import generate_uuid5
from weaviate.classes.tenants import Tenant
import os

# Reproduction script: creates a `ReviewMT` collection with multi-tenancy
# enabled and a `Movie` collection (created without a multi-tenancy config)
# whose `hasReview` reference targets `ReviewMT`, then tries three different
# ways of attaching Movie -> ReviewMT references.

# Connect to a local Weaviate instance
client = weaviate.connect_to_local(
    headers={
        "X-OpenAI-Api-Key": os.getenv("OPENAI_APIKEY"),
        "X-Cohere-Api-Key": os.getenv("COHERE_APIKEY"),
    }
)

# Everything after connecting runs under try/finally so the connection is
# closed even when one of the steps below raises.
try:
    # Delete existing collections if they exist
    client.collections.delete(["Movie", "ReviewMT"])

    # Create the ReviewMT collection
    reviews = client.collections.create(
        name="ReviewMT",
        properties=[
            wc.Property(name="movie_id", data_type=wc.DataType.INT),
            wc.Property(name="review_id", data_type=wc.DataType.TEXT, skip_vectorization=True),
            wc.Property(name="content", data_type=wc.DataType.TEXT),
        ],
        vectorizer_config=wc.Configure.Vectorizer.text2vec_cohere(),
        multi_tenancy_config=wc.Configure.multi_tenancy(enabled=True),
    )

    reviews.tenants.create([Tenant(name="Tenant0"), Tenant(name="Tenant1")])

    # Create the Movie collection
    movies = client.collections.create(
        name="Movie",
        properties=[
            wc.Property(name="title", data_type=wc.DataType.TEXT),
            wc.Property(name="tmdb_id", data_type=wc.DataType.INT),
        ],
        references=[
            wc.ReferenceProperty(name="hasReview", target_collection="ReviewMT")
        ],
        vectorizer_config=wc.Configure.Vectorizer.text2vec_cohere(),
    )

    # Sample movie data
    movies_data = [
        {"title": "Movie 1", "tmdb_id": 1},
        {"title": "Movie 2", "tmdb_id": 2},
    ]

    # Sample review data; movies_data[i] is paired with reviews_data[i] below.
    reviews_data = [
        {"movie_id": 1, "review_id": "review1", "content": "Great movie!", "tenant": "Tenant0"},
        {"movie_id": 2, "review_id": "review2", "content": "Not bad.", "tenant": "Tenant1"},
    ]

    # Insert every review into its own tenant, with a UUID derived from the
    # review id so the movie inserts below can recompute the same UUID.
    # NOTE(review): the whole dict, including the "tenant" key, is sent as
    # properties although "tenant" is not a declared property -- confirm
    # whether that is intended.
    for review in reviews_data:
        review_uuid = generate_uuid5(review["review_id"])
        reviews_t = reviews.with_tenant(review["tenant"])
        reviews_t.data.insert(
            properties=review,
            uuid=review_uuid,
        )

    # Adding movies

    # Insert 1: This fails (as it should)
    try:
        for movie, review in zip(movies_data, reviews_data):
            movie_uuid = generate_uuid5(str(movie["tmdb_id"]))
            movies.data.insert(
                properties=movie,
                uuid=movie_uuid,
                references={"hasReview": generate_uuid5(review["review_id"])},
            )
    except Exception as exc:
        # `Exception` instead of a bare `except:` so KeyboardInterrupt and
        # SystemExit still propagate; the error is printed rather than hidden.
        print(f"Insert 1 failed: {exc!r}")
        response = movies.aggregate.over_all(total_count=True)
        print(response.total_count)

    # Insert 2: This fails (as it should)
    with movies.batch.dynamic() as batch:
        for movie, review in zip(movies_data, reviews_data):
            movie_uuid = generate_uuid5(str(movie["tmdb_id"]))
            batch.add_object(
                properties=movie,
                uuid=movie_uuid,
                references={"hasReview": generate_uuid5(review["review_id"])},
            )

    print(len(movies.batch.failed_objects))

    # Insert 3: But this works
    # The object is added without references and the reference is added as a
    # separate batch operation.
    with movies.batch.dynamic() as batch:
        for movie, review in zip(movies_data, reviews_data):
            movie_uuid = generate_uuid5(str(movie["tmdb_id"]))
            batch.add_object(
                properties=movie,
                uuid=movie_uuid,
            )
            batch.add_reference(
                from_uuid=movie_uuid,
                from_property="hasReview",
                to=generate_uuid5(review["review_id"]),
            )

    print(len(movies.batch.failed_objects))
    print(len(movies.batch.failed_references))
finally:
    client.close()