redis / redis-vl-python

Redis Vector Library (RedisVL) interfaces with Redis' vector database for realtime semantic search, RAG, and recommendation systems.
https://www.redisvl.com/
MIT License
210 stars 32 forks source link

Issue with search dialect 3 and JSON #140

Open tylerhutcherson opened 5 months ago

tylerhutcherson commented 5 months ago

The dialect 3 api changes the outputs for JSON:

# Build a vector query on the "embedding" field that asks for three return
# fields and explicitly selects search dialect 3.
q = VectorQuery(
    vector=[0.23, 0.12, -0.03, 0.98],
    vector_field_name="embedding",
    filter_expression=filter_expression,  # defined outside this snippet
    return_fields=["name", "description", "price"],
    dialect=3
)
# Run the query against the index; the output is shown below.
index.query(q)

returns:

[{'id': 'product:36edbfd1372144759975f01fe6968bbf',
  'vector_distance': '1.08839428425',
  'name': '["Wireless earbuds"]',
  'description': '["Wireless Bluetooth in-ear headphones"]',
  'price': '[64.99]'}]

BUT dialect 2 returns:

[{'id': 'product:36edbfd1372144759975f01fe6968bbf',
  'vector_distance': '1.08839428425',
  'name': 'Wireless earbuds',
  'description': 'Wireless Bluetooth in-ear headphones',
  'price': '64.99'}]

(which is the correct and user-expected format).

Additionally FilterQuery types break when using dialect=3...

Need to fix the parsing layer in RedisVL.

tylerhutcherson commented 5 months ago

Schema YAML:

%%writefile schema.yaml

index:
    name: products
    prefix: product
    storage_type: json

fields:
    - name: name
      type: text
    - name: description
      type: text
    - name: connection_type
      path: $.connection.type  # index item from nested object
      type: tag
    - name: price
      type: numeric
    - name: stock
      type: numeric
    - name: color
      path: $.colors.*  # index array of TAGs
      type: tag
    - name: embedding
      type: vector
      attrs:
          dims: 4
          algorithm: flat
          distance_metric: cosine
    - name: embeddings
      path: $.embeddings[*]  # index array of VECTORs
      type: vector
      attrs:
          dims: 4
          algorithm: hnsw
          distance_metric: l2

Code to reproduce:

from redis import Redis

from redisvl.schema import IndexSchema
from redisvl.index import SearchIndex

# Two sample product documents. Each has text fields, a nested "connection"
# object, numeric price/stock, a list of colors, one 4-float "embedding" and
# a list of two 4-float "embeddings".
data = [
    {
        "name": "Noise-cancelling Bluetooth headphones",
        "description": "Wireless Bluetooth headphones with noise-cancelling technology",
        "connection": {"wireless": True, "type": "Bluetooth"},
        "price": 99.98,
        "stock": 25,
        "colors": ["black", "silver"],
        "embedding": [0.87, -0.15, 0.55, 0.03],
        "embeddings": [
            [0.56, -0.34, 0.69, 0.02],
            [0.94, -0.23, 0.45, 0.19],
        ],
    },
    {
        "name": "Wireless earbuds",
        "description": "Wireless Bluetooth in-ear headphones",
        "connection": {"wireless": True, "type": "Bluetooth"},
        "price": 64.99,
        "stock": 17,
        "colors": ["red", "black", "white"],
        "embedding": [-0.7, -0.51, 0.88, 0.14],
        "embeddings": [
            [0.54, -0.14, 0.79, 0.92],
            [0.94, -0.93, 0.45, 0.16],
        ],
    },
]

# Read the index definition from the schema.yaml file written above.
schema = IndexSchema.from_yaml("schema.yaml")
# Redis client for the server at localhost:6379.
client = Redis.from_url("redis://localhost:6379")
index = SearchIndex(schema, client)
# NOTE(review): overwrite=True / drop=True presumably replace an existing
# index and its data -- confirm against the RedisVL documentation.
index.create(overwrite=True, drop=True)
# Load the sample documents; the value returned by load() is kept as `keys`.
keys = index.load(data)
bsbodden commented 4 months ago

@tylerhutcherson I added a test as shown below, but it passes??? Can you take a look and see if you see anything missing?

import pytest
from redis import Redis
from redis.commands.search.query import Query

from redisvl.index import SearchIndex
from redisvl.query import VectorQuery, FilterQuery
from redisvl.query.filter import Tag
from redisvl.schema.schema import IndexSchema

@pytest.fixture
def sample_data():
    """Return the two JSON product documents loaded into the test index.

    Each document has text fields (``name``, ``description``), a nested
    ``connection`` object, numeric ``price`` and ``stock`` values, a list of
    ``colors``, one 4-float ``embedding`` and a list of two 4-float
    ``embeddings``.
    """
    headphones = {
        "name": "Noise-cancelling Bluetooth headphones",
        "description": "Wireless Bluetooth headphones with noise-cancelling technology",
        "connection": {"wireless": True, "type": "Bluetooth"},
        "price": 99.98,
        "stock": 25,
        "colors": ["black", "silver"],
        "embedding": [0.87, -0.15, 0.55, 0.03],
        "embeddings": [[0.56, -0.34, 0.69, 0.02], [0.94, -0.23, 0.45, 0.19]],
    }
    earbuds = {
        "name": "Wireless earbuds",
        "description": "Wireless Bluetooth in-ear headphones",
        "connection": {"wireless": True, "type": "Bluetooth"},
        "price": 64.99,
        "stock": 17,
        "colors": ["red", "black", "white"],
        "embedding": [-0.7, -0.51, 0.88, 0.14],
        "embeddings": [[0.54, -0.14, 0.79, 0.92], [0.94, -0.93, 0.45, 0.16]],
    }
    return [headphones, earbuds]

@pytest.fixture
def schema_dict():
    """Return the schema for the JSON-backed ``products`` index as a plain dict.

    The result has an ``index`` section (name, key prefix, storage type) and a
    ``fields`` list: two text fields, two tag fields addressed by JSON path,
    two numeric fields and two 4-dimensional vector fields.
    """
    index_settings = {
        "name": "products",
        "prefix": "product",
        "storage_type": "json",
    }
    # Single vector per document, flat algorithm, cosine distance.
    single_vector = {
        "name": "embedding",
        "type": "vector",
        "attrs": {"dims": 4, "algorithm": "flat", "distance_metric": "cosine"},
    }
    # Array of vectors addressed by JSON path, HNSW algorithm, L2 distance.
    vector_array = {
        "name": "embeddings",
        "path": "$.embeddings[*]",
        "type": "vector",
        "attrs": {"dims": 4, "algorithm": "hnsw", "distance_metric": "l2"},
    }
    field_specs = [
        {"name": "name", "type": "text"},
        {"name": "description", "type": "text"},
        {"name": "connection_type", "path": "$.connection.type", "type": "tag"},
        {"name": "price", "type": "numeric"},
        {"name": "stock", "type": "numeric"},
        {"name": "color", "path": "$.colors.*", "type": "tag"},
        single_vector,
        vector_array,
    ]
    return {"index": index_settings, "fields": field_specs}

@pytest.fixture
def index(sample_data, redis_url, schema_dict):
    """Yield a search index built from ``schema_dict`` and loaded with ``sample_data``.

    The index is created before the test body runs and deleted once the
    generator resumes after the ``yield``.
    """
    search_index = SearchIndex(
        IndexSchema.from_dict(schema_dict),
        Redis.from_url(redis_url),
    )
    search_index.create(overwrite=True, drop=True)
    search_index.load(sample_data)
    yield search_index
    # Teardown: remove the index created above.
    search_index.delete(drop=True)

def test_dialect_3_json(index, sample_data):
    """Run a VectorQuery and a FilterQuery with dialect 3 and check result fields.

    For each query the test prints the results, requires at least one hit,
    and asserts that ``name``, ``description`` and ``price`` are not lists.

    NOTE(review): the report above shows dialect 3 returning each field as a
    JSON-encoded string such as '["Wireless earbuds"]'. If so, a plain
    ``isinstance(..., list)`` check passes even on the wrapped output --
    confirm whether the values should be decoded before checking.
    """
    checked_fields = ("name", "description", "price")

    def expect_non_list_fields(rows):
        # At least one hit, and none of the checked fields may be a list.
        assert len(rows) > 0
        for row in rows:
            for field in checked_fields:
                assert not isinstance(row[field], list)

    # Vector similarity query on the "embedding" field, dialect 3.
    by_vector = VectorQuery(
        vector=[0.23, 0.12, -0.03, 0.98],
        vector_field_name="embedding",
        return_fields=list(checked_fields),
        dialect=3,
    )
    vector_hits = index.query(by_vector)
    print("VectorQuery Results:")
    print(vector_hits)
    expect_non_list_fields(vector_hits)

    # Tag filter query (color == "black"), dialect 3.
    by_color = FilterQuery(
        filter_expression=Tag("color") == "black",
        return_fields=list(checked_fields),
        dialect=3,
    )
    filter_hits = index.query(by_color)
    print("FilterQuery Results:")
    print(filter_hits)
    expect_non_list_fields(filter_hits)
rbs333 commented 4 months ago

@bsbodden it's outputting as a string, so for your test check to work the value would need to be cast first. (image)

rbs333 commented 4 months ago

@bsbodden the check can be changed to the following (or something more elegant); then the test correctly bombs:

# Decode each returned value from JSON first, then require that it is not a list.
for result in results:
    for field in ("name", "description", "price"):
        assert not isinstance(json.loads(result[field]), list)