vespa-engine / pyvespa

Python API for https://vespa.ai, the open big data serving engine
https://pyvespa.readthedocs.io/
Apache License 2.0
105 stars 34 forks source link

[Feedback wanted] Pyvespa Querybuilder API #957

Open thomasht86 opened 1 month ago

thomasht86 commented 1 month ago

Querybuilder in Pyvespa

We have been working on a draft for implementing support for building YQL queries programmatically with pyvespa. The goal is to make it as ergonomic as possible for users to build complex queries. We would like feedback on the API proposal from potential users.

Proposal

The WIP unit tests look like this as of now.

import unittest
from vespa.querybuilder import Query, Q, Queryfield, G, Condition

class TestQueryBuilder(unittest.TestCase):
    def test_dotProduct_with_annotations(self):
        condition = Q.dotProduct(
            "weightedset_field",
            {"feature1": 1, "feature2": 2},
            annotations={"label": "myDotProduct"},
        )
        q = Query(select_fields="*").from_("querybuilder").where(condition)
        expected = 'select * from querybuilder where ({label:"myDotProduct"}dotProduct(weightedset_field, {"feature1":1,"feature2":2}))'
        self.assertEqual(q, expected)
        return q

    def test_geoLocation_with_annotations(self):
        condition = Q.geoLocation(
            "location_field",
            37.7749,
            -122.4194,
            "10km",
            annotations={"targetHits": 100},
        )
        q = Query(select_fields="*").from_("querybuilder").where(condition)
        expected = 'select * from querybuilder where ({targetHits:100}geoLocation(location_field, 37.7749, -122.4194, "10km"))'
        self.assertEqual(q, expected)
        return q

    def test_select_specific_fields(self):
        f1 = Queryfield("f1")
        condition = f1.contains("v1")
        q = Query(select_fields=["f1", "f2"]).from_("sd1").where(condition)

        self.assertEqual(q, 'select f1, f2 from sd1 where f1 contains "v1"')

    def test_select_from_specific_sources(self):
        f1 = Queryfield("f1")
        condition = f1.contains("v1")
        q = Query(select_fields="*").from_("sd1").where(condition)

        self.assertEqual(q, 'select * from sd1 where f1 contains "v1"')

    def test_select_from_multiples_sources(self):
        f1 = Queryfield("f1")
        condition = f1.contains("v1")
        q = Query(select_fields="*").from_("sd1", "sd2").where(condition)

        self.assertEqual(q, 'select * from sd1, sd2 where f1 contains "v1"')

    def test_basic_and_andnot_or_offset_limit_param_order_by_and_contains(self):
        f1 = Queryfield("f1")
        f2 = Queryfield("f2")
        f3 = Queryfield("f3")
        f4 = Queryfield("f4")
        condition = ((f1.contains("v1") & f2.contains("v2")) | f3.contains("v3")) & (
            ~f4.contains("v4")
        )
        q = (
            Query(select_fields="*")
            .from_("querybuilder")
            .where(condition)
            .set_offset(1)
            .set_limit(2)
            .set_timeout(3000)
            .orderByDesc("age")
            .orderByAsc("duration")
        )

        expected = 'select * from querybuilder where ((f1 contains "v1" and f2 contains "v2") or f3 contains "v3") and !(f4 contains "v4") order by age desc, duration asc limit 2 offset 1 timeout 3000'
        self.assertEqual(q, expected)
        return q

    def test_matches(self):
        condition = (
            (Queryfield("f1").matches("v1") & Queryfield("f2").matches("v2"))
            | Queryfield("f3").matches("v3")
        ) & ~Queryfield("f4").matches("v4")
        q = Query(select_fields="*").from_("sd1").where(condition)
        expected = 'select * from sd1 where ((f1 matches "v1" and f2 matches "v2") or f3 matches "v3") and !(f4 matches "v4")'
        self.assertEqual(q, expected)

    def test_nested_queries(self):
        nested_query = (
            Queryfield("f2").contains("2") & Queryfield("f3").contains("3")
        ) | (Queryfield("f2").contains("4") & ~Queryfield("f3").contains("5"))
        condition = Queryfield("f1").contains("1") & ~nested_query
        q = Query(select_fields="*").from_("sd1").where(condition)
        expected = 'select * from sd1 where f1 contains "1" and (!((f2 contains "2" and f3 contains "3") or (f2 contains "4" and !(f3 contains "5"))))'
        self.assertEqual(q, expected)

    def test_userInput_with_and_without_defaultIndex(self):
        condition = Q.userQuery(value="value1") & Q.userQuery(
            index="index", value="value2"
        )
        q = Query(select_fields="*").from_("sd1").where(condition)
        expected = 'select * from sd1 where userQuery("value1") and ({"defaultIndex":"index"})userQuery("value2")'
        self.assertEqual(q, expected)

    def test_fields_duration(self):
        f1 = Queryfield("subject")
        f2 = Queryfield("display_date")
        f3 = Queryfield("duration")
        condition = Query(select_fields=[f1, f2]).from_("calendar").where(f3 > 0)
        expected = "select subject, display_date from calendar where duration > 0"
        self.assertEqual(condition, expected)

    def test_nearest_neighbor(self):
        condition_uq = Q.userQuery()
        condition_nn = Q.nearestNeighbor(
            field="dense_rep", query_vector="q_dense", annotations={"targetHits": 10}
        )
        q = (
            Query(select_fields=["id, text"])
            .from_("m")
            .where(condition_uq | condition_nn)
        )
        expected = "select id, text from m where userQuery() or ({targetHits:10}nearestNeighbor(dense_rep, q_dense))"
        self.assertEqual(q, expected)

    def test_build_many_nn_operators(self):
        self.maxDiff = None
        conditions = [
            Q.nearestNeighbor(
                field="colbert",
                query_vector=f"binary_vector_{i}",
                annotations={"targetHits": 100},
            )
            for i in range(32)
        ]
        # Use Condition.any to combine conditions with OR
        q = (
            Query(select_fields="*")
            .from_("doc")
            .where(condition=Condition.any(*conditions))
        )
        expected = "select * from doc where " + " or ".join(
            [
                f"({{targetHits:100}}nearestNeighbor(colbert, binary_vector_{i}))"
                for i in range(32)
            ]
        )
        self.assertEqual(q, expected)

    def test_field_comparison_operators(self):
        f1 = Queryfield("age")
        condition = (f1 > 30) & (f1 <= 50)
        q = Query(select_fields="*").from_("people").where(condition)
        expected = "select * from people where age > 30 and age <= 50"
        self.assertEqual(q, expected)

    def test_field_in_range(self):
        f1 = Queryfield("age")
        condition = f1.in_range(18, 65)
        q = Query(select_fields="*").from_("people").where(condition)
        expected = "select * from people where range(age, 18, 65)"
        self.assertEqual(q, expected)

    def test_field_annotation(self):
        f1 = Queryfield("title")
        annotations = {"highlight": True}
        annotated_field = f1.annotate(annotations)
        q = Query(select_fields="*").from_("articles").where(annotated_field)
        expected = "select * from articles where ({highlight:true})title"
        self.assertEqual(q, expected)

    def test_condition_annotation(self):
        f1 = Queryfield("title")
        condition = f1.contains("Python")
        annotated_condition = condition.annotate({"filter": True})
        q = Query(select_fields="*").from_("articles").where(annotated_condition)
        expected = 'select * from articles where ({filter:true})title contains "Python"'
        self.assertEqual(q, expected)

    def test_grouping_aggregation(self):
        grouping = G.all(G.group("category"), G.output(G.count()))
        q = Query(select_fields="*").from_("products").group(grouping)
        expected = "select * from products | all(group(category) output(count()))"
        self.assertEqual(q, expected)

    def test_add_parameter(self):
        f1 = Queryfield("title")
        condition = f1.contains("Python")
        q = (
            Query(select_fields="*")
            .from_("articles")
            .where(condition)
            .add_parameter("tracelevel", 1)
        )
        expected = 'select * from articles where title contains "Python"&tracelevel=1'
        self.assertEqual(q, expected)

    def test_custom_ranking_expression(self):
        condition = Q.rank(
            Q.userQuery(), Q.dotProduct("embedding", {"feature1": 1, "feature2": 2})
        )
        q = Query(select_fields="*").from_("documents").where(condition)
        expected = 'select * from documents where rank(userQuery(), dotProduct(embedding, {"feature1":1,"feature2":2}))'
        self.assertEqual(q, expected)

    def test_wand(self):
        condition = Q.wand("keywords", {"apple": 10, "banana": 20})
        q = Query(select_fields="*").from_("fruits").where(condition)
        expected = 'select * from fruits where wand(keywords, {"apple":10,"banana":20})'
        self.assertEqual(q, expected)

    def test_weakand(self):
        condition1 = Queryfield("title").contains("Python")
        condition2 = Queryfield("description").contains("Programming")
        condition = Q.weakAnd(
            condition1, condition2, annotations={"targetNumHits": 100}
        )
        q = Query(select_fields="*").from_("articles").where(condition)
        expected = 'select * from articles where ({"targetNumHits":100}weakAnd(title contains "Python", description contains "Programming"))'
        self.assertEqual(q, expected)

    def test_geoLocation(self):
        condition = Q.geoLocation("location_field", 37.7749, -122.4194, "10km")
        q = Query(select_fields="*").from_("places").where(condition)
        expected = 'select * from places where geoLocation(location_field, 37.7749, -122.4194, "10km")'
        self.assertEqual(q, expected)

    def test_condition_all_any(self):
        c1 = Queryfield("f1").contains("v1")
        c2 = Queryfield("f2").contains("v2")
        c3 = Queryfield("f3").contains("v3")
        condition = Condition.all(c1, c2, Condition.any(c3, ~c1))
        q = Query(select_fields="*").from_("sd1").where(condition)
        expected = 'select * from sd1 where f1 contains "v1" and f2 contains "v2" and (f3 contains "v3" or !(f1 contains "v1"))'
        self.assertEqual(q, expected)

    def test_order_by_with_annotations(self):
        f1 = "relevance"
        f2 = "price"
        annotations = {"strength": 0.5}
        q = (
            Query(select_fields="*")
            .from_("products")
            .orderByDesc(f1, annotations)
            .orderByAsc(f2)
        )
        expected = (
            'select * from products order by {"strength":0.5}relevance desc, price asc'
        )
        self.assertEqual(q, expected)

    def test_field_comparison_methods(self):
        f1 = Queryfield("age")
        condition = f1.ge(18) & f1.lt(30)
        q = Query(select_fields="*").from_("users").where(condition)
        expected = "select * from users where age >= 18 and age < 30"
        self.assertEqual(q, expected)

    def test_filter_annotation(self):
        f1 = Queryfield("title")
        condition = f1.contains("Python").annotate({"filter": True})
        q = Query(select_fields="*").from_("articles").where(condition)
        expected = 'select * from articles where ({filter:true})title contains "Python"'
        self.assertEqual(q, expected)

    def test_nonEmpty(self):
        condition = Q.nonEmpty(Queryfield("comments").eq("any_value"))
        q = Query(select_fields="*").from_("posts").where(condition)
        expected = 'select * from posts where nonEmpty(comments = "any_value")'
        self.assertEqual(q, expected)

    def test_dotProduct(self):
        condition = Q.dotProduct("vector_field", {"feature1": 1, "feature2": 2})
        q = Query(select_fields="*").from_("vectors").where(condition)
        expected = 'select * from vectors where dotProduct(vector_field, {"feature1":1,"feature2":2})'
        self.assertEqual(q, expected)

    def test_in_range_string_values(self):
        f1 = Queryfield("date")
        condition = f1.in_range("2021-01-01", "2021-12-31")
        q = Query(select_fields="*").from_("events").where(condition)
        expected = "select * from events where range(date, 2021-01-01, 2021-12-31)"
        self.assertEqual(q, expected)

    def test_condition_inversion(self):
        f1 = Queryfield("status")
        condition = ~f1.eq("inactive")
        q = Query(select_fields="*").from_("users").where(condition)
        expected = 'select * from users where !(status = "inactive")'
        self.assertEqual(q, expected)

    def test_multiple_parameters(self):
        f1 = Queryfield("title")
        condition = f1.contains("Python")
        q = (
            Query(select_fields="*")
            .from_("articles")
            .where(condition)
            .add_parameter("tracelevel", 1)
            .add_parameter("language", "en")
        )
        expected = 'select * from articles where title contains "Python"&tracelevel=1&language=en'
        self.assertEqual(q, expected)

    def test_multiple_groupings(self):
        grouping = G.all(
            G.group("category"),
            G.maxRtn(10),
            G.output(G.count()),
            G.each(G.group("subcategory"), G.output(G.summary())),
        )
        q = Query(select_fields="*").from_("products").group(grouping)
        expected = "select * from products | all(group(category) max(10) output(count()) each(group(subcategory) output(summary())))"
        self.assertEqual(q, expected)

    def test_default_index_annotation(self):
        condition = Q.userQuery("search terms", index="default_field")
        q = Query(select_fields="*").from_("documents").where(condition)
        expected = 'select * from documents where ({"defaultIndex":"default_field"})userQuery("search terms")'
        self.assertEqual(q, expected)

    def test_Q_p_function(self):
        condition = Q.p(
            Queryfield("f1").contains("v1"),
            Queryfield("f2").contains("v2"),
            Queryfield("f3").contains("v3"),
        )
        q = Query(select_fields="*").from_("sd1").where(condition)
        expected = 'select * from sd1 where f1 contains "v1" and f2 contains "v2" and f3 contains "v3"'
        self.assertEqual(q, expected)

    def test_rank_multiple_conditions(self):
        condition = Q.rank(
            Q.userQuery(),
            Q.dotProduct("embedding", {"feature1": 1}),
            Q.weightedSet("tags", {"tag1": 2}),
        )
        q = Query(select_fields="*").from_("documents").where(condition)
        expected = 'select * from documents where rank(userQuery(), dotProduct(embedding, {"feature1":1}), weightedSet(tags, {"tag1":2}))'
        self.assertEqual(q, expected)

    def test_nonEmpty_with_annotations(self):
        annotated_field = Queryfield("comments").annotate({"filter": True})
        condition = Q.nonEmpty(annotated_field)
        q = Query(select_fields="*").from_("posts").where(condition)
        expected = "select * from posts where nonEmpty(({filter:true})comments)"
        self.assertEqual(q, expected)

    def test_weight_annotation(self):
        condition = Queryfield("title").contains("heads", annotations={"weight": 200})
        q = Query(select_fields="*").from_("s1").where(condition)
        expected = 'select * from s1 where title contains({weight:200}"heads")'
        self.assertEqual(q, expected)

    def test_nearest_neighbor_annotations(self):
        condition = Q.nearestNeighbor(
            field="dense_rep", query_vector="q_dense", annotations={"targetHits": 10}
        )
        q = Query(select_fields=["id, text"]).from_("m").where(condition)
        expected = "select id, text from m where ({targetHits:10}nearestNeighbor(dense_rep, q_dense))"
        self.assertEqual(q, expected)

    def test_phrase(self):
        text = Queryfield("text")
        condition = text.contains(Q.phrase("st", "louis", "blues"))
        query = Q.select("*").where(condition)
        expected = 'select * from * where text contains phrase("st", "louis", "blues")'
        self.assertEqual(query, expected)

    def test_near(self):
        title = Queryfield("title")
        condition = title.contains(Q.near("madonna", "saint"))
        query = Q.select("*").where(condition)
        expected = 'select * from * where title contains near("madonna", "saint")'
        self.assertEqual(query, expected)

    def test_onear(self):
        title = Queryfield("title")
        condition = title.contains(Q.onear("madonna", "saint"))
        query = Q.select("*").where(condition)
        expected = 'select * from * where title contains onear("madonna", "saint")'
        self.assertEqual(query, expected)

    def test_sameElement(self):
        persons = Queryfield("persons")
        first_name = Queryfield("first_name")
        last_name = Queryfield("last_name")
        year_of_birth = Queryfield("year_of_birth")
        condition = persons.contains(
            Q.sameElement(
                first_name.contains("Joe"),
                last_name.contains("Smith"),
                year_of_birth < 1940,
            )
        )
        query = Q.select("*").where(condition)
        expected = 'select * from * where persons contains sameElement(first_name contains "Joe", last_name contains "Smith", year_of_birth < 1940)'
        self.assertEqual(query, expected)

    def test_equiv(self):
        fieldName = Queryfield("fieldName")
        condition = fieldName.contains(Q.equiv("A", "B"))
        query = Q.select("*").where(condition)
        expected = 'select * from * where fieldName contains equiv("A", "B")'
        self.assertEqual(query, expected)

    def test_uri(self):
        myUrlField = Queryfield("myUrlField")
        condition = myUrlField.contains(Q.uri("vespa.ai/foo"))
        query = Q.select("*").where(condition)
        expected = 'select * from * where myUrlField contains uri("vespa.ai/foo")'
        self.assertEqual(query, expected)

    def test_fuzzy(self):
        myStringAttribute = Queryfield("myStringAttribute")
        annotations = {"prefixLength": 1, "maxEditDistance": 2}
        condition = myStringAttribute.contains(
            Q.fuzzy("parantesis", annotations=annotations)
        )
        query = Q.select("*").where(condition)
        expected = 'select * from * where myStringAttribute contains ({prefixLength:1,maxEditDistance:2}fuzzy("parantesis"))'
        self.assertEqual(query, expected)

    def test_userInput(self):
        condition = Q.userInput("@animal")
        query = Q.select("*").where(condition).param("animal", "panda")
        expected = "select * from * where userInput(@animal)&animal=panda"
        self.assertEqual(query, expected)

    def test_in_operator(self):
        integer_field = Queryfield("integer_field")
        condition = integer_field.in_(10, 20, 30)
        query = Q.select("*").where(condition)
        expected = "select * from * where integer_field in (10, 20, 30)"
        self.assertEqual(query, expected)

    def test_predicate(self):
        condition = Q.predicate(
            "predicate_field",
            attributes={"gender": "Female"},
            range_attributes={"age": "20L"},
        )
        query = Q.select("*").where(condition)
        expected = 'select * from * where predicate(predicate_field,{"gender":"Female"},{"age":20L})'
        self.assertEqual(query, expected)

    def test_true(self):
        condition = Q.true()
        query = Q.select("*").where(condition)
        expected = "select * from * where true"
        self.assertEqual(query, expected)

    def test_false(self):
        condition = Q.false()
        query = Q.select("*").where(condition)
        expected = "select * from * where false"
        self.assertEqual(query, expected)

# Run the tests with unittest's command-line runner when this file is
# executed directly as a script.
if __name__ == "__main__":
    unittest.main()

Feedback

Please post any feedback and/or modification suggestions to this below. Now is the time to speak up! 😄

marcoaleixo commented 1 month ago

Nice! Right now we have our own "vespa_services" where we try to have some common interfaces to our devs, having this in pyvespa itself will be awesome.

Do you plan to add support for pagination? Maybe this is not related to the scope of the Querybuilder. 🤔

thomasht86 commented 1 month ago

Thanks for the input.

The intention is to support setting limit and offset (see link). Also, continuations in grouping should be supported.

.set_limit(2)
.set_offset(5)

Visiting is not in the scope of this, but that also supports continuation.

Is that what you were thinking?

marcoaleixo commented 1 month ago

Hmmm, cool! - That should cover my use case!

So let's say my user deletes a document from a folder, so we normally use offset to go over all documents for this specific file_id that exists in a streaming group (by folder_id) and call the delete operation for each document found.

Another use case is that we have a low topK (5) and we have an LLM tool that decides - are those sources enough to answer the question? If not, bring the next 5.

Still not sure if that's the best way, but it works. :)