opendistro-for-elasticsearch / k-NN

🆕 A machine learning plugin which supports an approximate k-NN search algorithm for Open Distro.
https://opendistro.github.io/
Apache License 2.0
277 stars 55 forks source link

KNN Script fails when the filtered document does not contain the vector. #230

Closed vamshin closed 4 years ago

vamshin commented 4 years ago

The KNN script corrupts the results when a document matched by the filter part of the query does not contain the vector field.

Steps to reproduce:

curl -X PUT "localhost:9200/my_index?pretty" -H 'Content-Type: application/json' -d'
{
"settings" : {
  "number_of_shards" :   1,
  "number_of_replicas" : 0
  },
  "mappings": {
    "properties": {
      "my_dense_vector": {
        "type": "knn_vector",
        "dimension": 3
      },
      "status" : {
        "type" : "keyword"
      }
    }
  }
}
'

curl -X PUT "localhost:9200/my_index/_doc/1?pretty" -H 'Content-Type: application/json' -d'
{
  "my_dense_vector": [0.5, 10, 6],
  "status" : "published"
}
'
curl -X PUT "localhost:9200/my_index/_doc/2?pretty" -H 'Content-Type: application/json' -d'
{
  "my_dense_vector": [-0.5, 10, 10],
  "status" : "published"
}
'

curl -X PUT "localhost:9200/my_index/_doc/3?pretty" -H 'Content-Type: application/json' -d'
{
  "status" : "published"
}
'
curl -X PUT "localhost:9200/my_index/_doc/4?pretty" -H 'Content-Type: application/json' -d'
{
  "status" : "abc"
}
'

 curl -XPOST localhost:9200/my_index/_forcemerge?max_num_segments=1

curl -X GET "localhost:9200/my_index/_search?pretty" -H 'Content-Type: application/json' -d'
{
"query": {
"script_score": {
"query" : {
"bool" : {
"filter" : {
"term" : {
"status" : "published"
}
}
}
},
"script": {
"lang": "knn",
"source": "knn_score",
"params": {
"field": "my_dense_vector",
"vector": [2.0, 2.0, 2.0],
"space": "l2"
}
}
}
}
}
'

Result: The score for doc id 3 should be Float.MIN_VALUE, because that document does not contain the vector. Instead, the score of the previously scored document (doc id 2) is carried over:

{
  "took" : 209,
  "timed_out" : false,
  "_shards" : {
    "total" : 1,
    "successful" : 1,
    "skipped" : 0,
    "failed" : 0
  },
  "hits" : {
    "total" : {
      "value" : 3,
      "relation" : "eq"
    },
    "max_score" : 0.012012012,
    "hits" : [
      {
        "_index" : "my_index",
        "_type" : "_doc",
        "_id" : "1",
        "_score" : 0.012012012,
        "_source" : {
          "my_dense_vector" : [
            0.5,
            10,
            6
          ],
          "status" : "published"
        }
      },
      {
        "_index" : "my_index",
        "_type" : "_doc",
        "_id" : "2",
        "_score" : 0.0073937154,
        "_source" : {
          "my_dense_vector" : [
            -0.5,
            10,
            10
          ],
          "status" : "published"
        }
      },
      {
        "_index" : "my_index",
        "_type" : "_doc",
        "_id" : "3",
        "_score" : 0.0073937154,  ----> This is the corrupted score
        "_source" : {
          "status" : "published"
        }
      }
    ]
  }
}
vamshin commented 4 years ago

After the fix, the same query returns:

{
  "took" : 158,
  "timed_out" : false,
  "_shards" : {
    "total" : 1,
    "successful" : 1,
    "skipped" : 0,
    "failed" : 0
  },
  "hits" : {
    "total" : {
      "value" : 3,
      "relation" : "eq"
    },
    "max_score" : 0.012012012,
    "hits" : [
      {
        "_index" : "my_index",
        "_type" : "_doc",
        "_id" : "1",
        "_score" : 0.012012012,
        "_source" : {
          "my_dense_vector" : [
            0.5,
            10,
            6
          ],
          "status" : "published"
        }
      },
      {
        "_index" : "my_index",
        "_type" : "_doc",
        "_id" : "2",
        "_score" : 0.0073937154,
        "_source" : {
          "my_dense_vector" : [
            -0.5,
            10,
            10
          ],
          "status" : "published"
        }
      },
      {
        "_index" : "my_index",
        "_type" : "_doc",
        "_id" : "3",
        "_score" : 1.4E-45,   // No vector in this doc, so assigned Float.MIN
        "_source" : {
          "status" : "published"
        }
      }
    ]
  }
}