vespa-engine / vespa

AI + Data, online. https://vespa.ai
https://vespa.ai
Apache License 2.0
5.47k stars 584 forks source link

special-tokens not applied for document, only query #30491

Open jobergum opened 4 months ago

jobergum commented 4 months ago

When using special-tokens, they only seem to be applied to the query, not to the document text.

<config name="vespa.configdefinition.specialtokens">
        <tokenlist>
            <item>
                <name>default</name>
                <tokens>
                    <item>
                        <token>s.a</token>
                    </item>
                    <item>
                        <token>.net</token>
                    </item>
                </tokens>
            </item>
        </tokenlist>
        </config>

schema:

schema doc {

    document doc {

        field text_simple type string {
            indexing: index | summary
            index: enable-bm25
                }
        field text type array<string> {
            indexing: index | summary
            index: enable-bm25
        }
    }
    fieldset default {
        fields: text
    }

    rank-profile default inherits default {
        first-phase {
            expression: nativeRank(text) 
        }
    }

   document-summary my-debug-summary {
        summary text_tokens type array<string> {
            source:text 
            tokens
    }

    summary text_simple_tokens type array<string> {
            source:text_simple
            tokens
        }
        from-disk
    }

}

feed

{ "put": "id:doc:doc::1", "fields": {"text_simple": "Corp s.a", "text": ["Corporation s.a"] } }

Query

vespa query 'query=s a' 'summary=my-debug-summary' 'trace.level=2'
{
    "trace": {
        "children": [
            {
                "message": "No query profile is used"
            },
            {
                "message": "Invoking chain 'vespa' [com.yahoo.prelude.statistics.StatisticsSearcher@native -> com.yahoo.prelude.querytransform.PhrasingSearcher@vespa -> ... -> federation@native]"
            },
            {
                "children": [
                    {
                        "message": "Federating to [text]"
                    },
                    {
                        "children": [
                            {
                                "message": "Stemming: [select * from sources * where weakAnd(default contains ({stem: false}\"s\"), default contains ({stem: false}\"a\")) timeout 9999]"
                            },
                            {
                                "message": "Lowercasing: [select * from sources * where weakAnd(default contains ({stem: false, normalizeCase: false}\"s\"), default contains ({stem: false, normalizeCase: false}\"a\")) timeout 9999]"
                            },
                            {
                                "message": "sc0.num0 search to dispatch: query=[WEAKAND(100) s a] timeout=9999ms offset=0 hits=10 groupingSessionCache=true sessionId=26d90702-9dde-411a-afc9-66e098e21dd6.1709648169518.19.default grouping=0 :  restrict=[doc]"
                            },
                            {
                                "message": "Current state of query tree: WEAKAND[N=100]{\n  WORD[fromSegmented=false index=\"\" origin=\"(0 1)\" segmentIndex=0 stemmed=true uniqueID=1 words=true]{\n    \"s\"\n  }\n  WORD[fromSegmented=false index=\"\" origin=\"(2 3)\" segmentIndex=0 stemmed=true uniqueID=2 words=true]{\n    \"a\"\n  }\n}\n"
                            },
                            {
                                "message": "Dispatching to search node in cluster = dispatcher.text key = 0 hostname = vespa-container path = 0 in group 0 statusIsKnown = true working = true activeDocs = 1 targetActiveDocs = 1"
                            },
                            {
                                "message": "sc0.num0 dispatch response: Result (1 of total 1 hits)"
                            },
                            {
                                "message": "sc0.num0 fill to dispatch: query=[WEAKAND(100) s a] timeout=9999ms offset=0 hits=10 groupingSessionCache=true sessionId=26d90702-9dde-411a-afc9-66e098e21dd6.1709648169518.19.default grouping=0 :  restrict=[doc] summary='my-debug-summary'"
                            },
                            {
                                "message": "Current state of query tree: WEAKAND[N=100]{\n  WORD[fromSegmented=false index=\"\" origin=\"(0 1)\" segmentIndex=0 stemmed=true uniqueID=1 words=true]{\n    \"s\"\n  }\n  WORD[fromSegmented=false index=\"\" origin=\"(2 3)\" segmentIndex=0 stemmed=true uniqueID=2 words=true]{\n    \"a\"\n  }\n}\n"
                            }
                        ]
                    },
                    {
                        "message": "Query parsed to: select * from sources * where weakAnd(default contains \"s\", default contains \"a\") timeout 10000"
                    }
                ]
            }
        ]
    },
    "root": {
        "id": "toplevel",
        "relevance": 1.0,
        "fields": {
            "totalCount": 1
        },
        "coverage": {
            "coverage": 100,
            "documents": 1,
            "full": true,
            "nodes": 1,
            "results": 1,
            "resultsFull": 1
        },
        "children": [
            {
                "id": "index:text/0/c4ca42388ce70a10b392b401",
                "relevance": 0.2233206906812349,
                "source": "text",
                "fields": {
                    "sddocname": "doc",
                    "text_tokens": [
                        [
                            "corporation",
                            "s",
                            "a"
                        ]
                    ],
                    "text_simple_tokens": [
                        "corp",
                        "s",
                        "a"
                    ]
                }
            }
        ]
    }
}

As seen above, s and a are indexed as separate tokens. But if we pass query='s.a', the query term is left untouched as the single token s.a, which then fails to recall the document.

vespa query 'query=s.a' 'summary=my-debug-summary' 'trace.level=2' 
{
    "trace": {
        "children": [
            {
                "message": "No query profile is used"
            },
            {
                "message": "Invoking chain 'vespa' [com.yahoo.prelude.statistics.StatisticsSearcher@native -> com.yahoo.prelude.querytransform.PhrasingSearcher@vespa -> ... -> federation@native]"
            },
            {
                "children": [
                    {
                        "message": "Federating to [text]"
                    },
                    {
                        "children": [
                            {
                                "message": "Stemming: [select * from sources * where weakAnd(default contains \"s.a\") timeout 9999]"
                            },
                            {
                                "message": "Lowercasing: [select * from sources * where weakAnd(default contains ({normalizeCase: false}\"s.a\")) timeout 9999]"
                            },
                            {
                                "message": "sc0.num0 search to dispatch: query=[WEAKAND(100) s.a] timeout=9999ms offset=0 hits=10 groupingSessionCache=true sessionId=26d90702-9dde-411a-afc9-66e098e21dd6.1709648230613.21.default grouping=0 :  restrict=[doc]"
                            },
                            {
                                "message": "Current state of query tree: WEAKAND[N=100]{\n  WORD[fromSegmented=false index=\"\" origin=\"(0 3)\" segmentIndex=0 stemmed=false uniqueID=1 words=false]{\n    \"s.a\"\n  }\n}\n"
                            },
                            {
                                "message": "Dispatching to search node in cluster = dispatcher.text key = 0 hostname = vespa-container path = 0 in group 0 statusIsKnown = true working = true activeDocs = 1 targetActiveDocs = 1"
                            },
                            {
                                "message": "sc0.num0 dispatch response: Result (0 of total 0 hits)"
                            }
                        ]
                    },
                    {
                        "message": "Query parsed to: select * from sources * where weakAnd(default contains \"s.a\") timeout 10000"
                    }
                ]
            }
        ]
    },
    "root": {
        "id": "toplevel",
        "relevance": 1.0,
        "fields": {
            "totalCount": 0
        },
        "coverage": {
            "coverage": 100,
            "documents": 1,
            "full": true,
            "nodes": 1,
            "results": 1,
            "resultsFull": 1
        }
    }
}
bratseth commented 4 months ago

Yes, applying special tokens to document text is not implemented for the current default linguistics module.