groonga / groonga

An embeddable fulltext search engine. Groonga is the successor project to Senna.
https://groonga.org/
GNU Lesser General Public License v2.1
792 stars 116 forks source link

ONPP operator does not produce expected results #1628

Open yssrku opened 10 months ago

yssrku commented 10 months ago

What happened?

I'm trying to search for IPv4 address that might be defanged.

Currently I'm using *ONP5"a b c d", which works well enough, but I thought the *ONPP operator might be able to help me make the search more accurate.

However, *ONPP does not match as expected, and I don't see any mention of restrictions in the docs.

How to reproduce it

table_create Reports TABLE_HASH_KEY ShortText
table_create ReportsIndex TABLE_PAT_KEY ShortText --default_tokenizer TokenBigram --normalizer NormalizerAuto

column_create Reports text COLUMN_SCALAR|COMPRESS_ZSTD LongText

column_create ReportsIndex index_content COLUMN_INDEX|WITH_POSITION|WITH_SECTION --type Reports --source text

load --table Reports
[
    {"_key": "a", "text": "a.b.c.d"}
]

# This produces the expected match
select --table Reports --query '*ONPP "(a) (.) (b)"' --match_columns text

# This can also match
select --table Reports --query '*ONPP "(.) (b)"' --match_columns text

# This does NOT match
select --table Reports --query '*ONPP "(.) (b) (.)"' --match_columns text

# This is what I really want to do: match 4 known tokens, each pair separated by one of two possible separator tokens. This does NOT match
select --table Reports --query '*ONPP "(a) (. [.]) (b) (. [.]) (c) (. [.]) (d)"' --match_columns text

Expected behavior

All select commands in the reproduction script should match.

Environment

Additional context

No response

kou commented 10 months ago

Oh, this is difficult to fix...

WIP patch:

diff --git a/lib/ii.c b/lib/ii.c
index 75849fb7c..134525515 100644
--- a/lib/ii.c
+++ b/lib/ii.c
@@ -13533,19 +13533,25 @@ grn_ii_select_cursor_next_find_near(grn_ctx *ctx,
   bool need_check = true;
   if (data->mode == GRN_OP_NEAR_PHRASE ||
       data->mode == GRN_OP_ORDERED_NEAR_PHRASE) {
+    int32_t start_pos = data->pos;
     uint32_t phrase_id;
     for (phrase_id = 0; phrase_id < data->n_phrases; phrase_id++) {
-      if (!grn_ii_select_data_find_phrase(ctx, data, phrase_id, data->pos, 0)) {
+      if (!grn_ii_select_data_find_phrase(ctx, data, phrase_id, start_pos, 0)) {
         need_check = false;
         break;
       }
-      /* TODO: Can we update data->pos to reduce needless search? */
       bt_push(data->bt, data->token_info);
+      if (data->mode == GRN_OP_ORDERED_NEAR_PHRASE &&
+          data->token_info->pos > start_pos) {
+        start_pos = data->token_info->pos;
+      }
+      /* TODO: Can we update data->pos to reduce needless search? */
     }
   } else if (data->mode == GRN_OP_NEAR_PHRASE_PRODUCT ||
              data->mode == GRN_OP_ORDERED_NEAR_PHRASE_PRODUCT) {
     uint32_t i;
     uint32_t phrase_id = 0;
+    int32_t start_pos = data->pos;
     for (i = 0; i < data->n_phrase_groups; i++) {
       phrase_group *group = &(data->phrase_groups[i]);
       bt_zap(group->btree);
@@ -13555,10 +13561,14 @@ grn_ii_select_cursor_next_find_near(grn_ctx *ctx,
         if (grn_ii_select_data_find_phrase(ctx,
                                            data,
                                            phrase_id,
-                                           data->pos,
+                                           start_pos,
                                            0)) {
           have_phrase = true;
           bt_push(group->btree, data->token_info);
+          if (data->mode == GRN_OP_ORDERED_NEAR_PHRASE_PRODUCT &&
+              data->token_info->pos > start_pos) {
+            start_pos = data->token_info->pos;
+          }
           /* TODO: Can we update data->pos to reduce needless search? */
         }
       }
diff --git a/test/command/suite/select/query/ordered_near_phrase_product/same_phrases.test b/test/command/suite/select/query/ordered_near_phrase_product/same_phrases.test
new file mode 100644
index 000000000..2ecd7e858
--- /dev/null
+++ b/test/command/suite/select/query/ordered_near_phrase_product/same_phrases.test
@@ -0,0 +1,19 @@
+table_create Entries TABLE_NO_KEY
+column_create Entries content COLUMN_SCALAR Text
+
+table_create Terms TABLE_PAT_KEY ShortText \
+  --default_tokenizer TokenNgram \
+  --normalizer NormalizerNFKC150
+column_create Terms entries_content COLUMN_INDEX|WITH_POSITION Entries content
+
+load --table Entries
+[
+{"content": "a.b.c.d"},
+{"content": "a[.]b.c[.]d"},
+{"content": "x.y.z"}
+]
+
+select Entries \
+  --match_columns content \
+  --query '*ONPP10"(a) (. [.]) (b) (. [.]) (c) (. [.]) (d)"' \
+  --output_columns '_score, content'