lior-k / fast-elasticsearch-vector-scoring

Score documents using embedding-vectors dot-product or cosine-similarity with ES Lucene engine
Apache License 2.0
395 stars 112 forks source link

Search results are always in order of their id number and not similarity #25

Closed RobRoyce closed 4 years ago

RobRoyce commented 5 years ago

When performing the query as outlined in the README, I get back the documents within my index, but they are always returned in numerical order (by id) and not by similarity. What am I doing wrong?

lior-k commented 5 years ago

The test under fast-elasticsearch-vector-scoring/src/test/java/com/liorkn/elasticsearch/PluginTest.java works fine with the query provided. Can you share your query?

RobRoyce commented 5 years ago

Using 7.1.0, made sure I was on es-7.1 branch for zipping/installing the plugin.

Two things to point out. I tried the following in Python which resulted in a TransportError: TransportError(500, 'search_phase_execution_exception', None) from ES.

# Index mapping: "embedding_vector" is declared as a binary field with
# doc_values enabled.
body = {
    "mappings": {
        "properties": {
            "embedding_vector": {
                'type': 'binary',
                'doc_values': True
            }
        }
    }
}
def index_data(data):
    """Index every item of ``data`` as its own document.

    Each document gets a 1-based sequential ``id`` field and the item,
    passed through ``encode_array``, in ``embedding_vector``.
    """
    for doc_number, embedding in enumerate(data, start=1):
        document = {
            "id": doc_number,
            "embedding_vector": encode_array(embedding),
        }
        es.index(index=es_index, body=document)
# Ten random samples; every item handed to index_data is a 14x14 nested list.
data = np.random.rand(10,14,14).tolist()
index_data(data)
# function_score query that runs the "binary_vector_score" script (lang "knn")
# with boost_mode "replace".
search = {
    "query": {
        "function_score": {
            "boost_mode": "replace",
            "script_score": {
                "script": {
                    "source": "binary_vector_score",
                    "lang": "knn",
                    "params": {
                        "cosine": False,
                        # NOTE(review): index_data writes the field as
                        # "embedding_vector" (singular); this name differs.
                        "field": "embedding_vectors",
                        # NOTE(review): 64 values here, while each indexed item
                        # is 14x14 - presumably the lengths must match; confirm.
                        "vector": [-0.09217305481433868, 0.010635560378432274, -0.02878434956073761, 0.06988169997930527, 0.1273992955684662, -0.023723633959889412, 0.05490724742412567, -0.12124507874250412, -0.023694118484854698, 0.014595639891922474, 0.1471538096666336, 0.044936809688806534, -0.02795785665512085, -0.05665992572903633, -0.2441125512123108, 0.2755320072174072, 0.11451690644025803, 0.20242854952812195, -0.1387604922056198, 0.05219579488039017, 0.1145530641078949, 0.09967200458049774, 0.2161576747894287, 0.06157230958342552, 0.10350126028060913, 0.20387393236160278, 0.1367097795009613, 0.02070528082549572, 0.19238869845867157, 0.059613026678562164, 0.014012521132826805, 0.16701748967170715, 0.04985826835036278, -0.10990987718105316, -0.12032567709684372, -0.1450948715209961, 0.13585780560970306, 0.037511035799980164, 0.04251480475068092, 0.10693439096212387, -0.08861573040485382, -0.07457160204648972, 0.0549330934882164, 0.19136285781860352, 0.03346432000398636, -0.03652812913060188, -0.1902569830417633, 0.03250952064990997, -0.3061246871948242, 0.05219300463795662, -0.07879918068647385, 0.1403723508119583, -0.08893408626317978, -0.24330253899097443, -0.07105310261249542, -0.18161986768245697, 0.15501035749912262, -0.216160386800766, -0.06377710402011871, -0.07671763002872467, 0.05360138416290283, -0.052845533937215805, -0.02905619889497757, 0.08279753476381302]
                    }
                }
            }
        }
    },
    "size": 100
}
es.search(es_index, search)

I tried the same query from within Kibana's console after my vectors had been stored in ES as base64 strings:

POST es_image_search/_search
{
  "query": {
    "function_score": {
      "boost_mode": "replace",
      "script_score": {
        "script": {
          "source": "binary_vector_score",
          "lang": "knn",
          "params": {
            "cosine": false,
            "field": "embedding_vector",
            "vector": [
               -0.09217305481433868, 0.010635560378432274, -0.02878434956073761, 0.06988169997930527, 0.1273992955684662, -0.023723633959889412, 0.05490724742412567, -0.12124507874250412, -0.023694118484854698, 0.014595639891922474, 0.1471538096666336, 0.044936809688806534, -0.02795785665512085, -0.05665992572903633, -0.2441125512123108, 0.2755320072174072, 0.11451690644025803, 0.20242854952812195, -0.1387604922056198, 0.05219579488039017, 0.1145530641078949, 0.09967200458049774, 0.2161576747894287, 0.06157230958342552, 0.10350126028060913, 0.20387393236160278, 0.1367097795009613, 0.02070528082549572, 0.19238869845867157, 0.059613026678562164, 0.014012521132826805, 0.16701748967170715, 0.04985826835036278, -0.10990987718105316, -0.12032567709684372, -0.1450948715209961, 0.13585780560970306, 0.037511035799980164, 0.04251480475068092, 0.10693439096212387, -0.08861573040485382, -0.07457160204648972, 0.0549330934882164, 0.19136285781860352, 0.03346432000398636, -0.03652812913060188, -0.1902569830417633, 0.03250952064990997, -0.3061246871948242, 0.05219300463795662, -0.07879918068647385, 0.1403723508119583, -0.08893408626317978, -0.24330253899097443, -0.07105310261249542, -0.18161986768245697, 0.15501035749912262, -0.216160386800766, -0.06377710402011871, -0.07671763002872467, 0.05360138416290283, -0.052845533937215805, -0.02905619889497757, 0.08279753476381302
             ]
          }
        }
      }
    }
  },
  "size": 100
}

which returns:

{
  "took" : 9,
  "timed_out" : false,
  "_shards" : {
    "total" : 1,
    "successful" : 1,
    "skipped" : 0,
    "failed" : 0
  },
  "hits" : {
    "total" : {
      "value" : 20,
      "relation" : "eq"
    },
    "max_score" : 0.0,
    "hits" : [
      {
        "_index" : "es_image_search",
        "_type" : "_doc",
        "_id" : "lJMzDmwBSBtPpRXcd5rF",
        "_score" : 0.0,
        "_source" : {
          "id" : 1,
          "embedding_vector" : "P0vIWT8Avvo+gD+XPs/SrD2ImaY+X0ltPzWsDjye1Gg/PeZ7PsGPXj9jICQ+rOJmPyHxTD9Bn7s+TixhPu4ftz8ZFuM9jBEHP3JZ7D8cyzw+IMyQP0SAnz8g3kk/dM6QP1so8j2at0I/LDmiP2xIRj8We/8+soX2PYF/mj0TPhw+nToVP0kuwz99zNo+BUMHPnmJ4T9iOrg+HcwhP2XQ6z3Olgk/aJosPzaZXj2zgNc/YXxyP2p8+z9wFQc+I0KQPDaofz8eoGE/VeXuPdgTGz6wVQQ+rhCSPtgckjzrIQA+xpweP0OXRT8BhdI+WSJzPxYFrj9Tzn8/Cyw8P0ER4T4zjyg+QKzwPyPdhT9YUm8+ZkPlPwA7Mj058T4/Tt1bP3GvNj3mQRI+4Gt9PxNHND8hmJQ/KqSlP3YEwz6b3/U/A+kwPyxBZT9smK89e96wPymYzz82UKI+frY2PjUcmz9QeOU96VDKPZdlmz9mnvc/GWG8PzO9mD6vWlA/PTRoP0AR7T6HSO0/ACoVPkXAlD8kl74+EM82PxOxHz554eE/W5GNP2zbiT91YG8+gsF+P1KW1T0fNAk+iuHGP2yUNj4mpNI+7a8iPwaKZT8Uy2c+YS6kPzo+7D3Q4Lc/YqTzPuqXYT96S3I/ATtQPSlQbz7Bcww+pRCyPeUfFz8/4Zg91dbUPxHebD2xfgc9dArQPkBhQT9m4jE/KXJyP3TaMT5Mgv4/XqX8P2X4jz8HLtc/fW6xPrtygj9JunQ+84j8PzoSqT7Ur7M+zHZDPpqEkj69ZA882D2ZP0McJj3EAro+9hw1P2BpUz8WFHU/aO3GPxJ1xz8oAeM/YVUHPoWqNT8QTFY/c8f4PtwDyT8yIiQ/buUDPkHifz8ZbWE+oViQP1KlCT9l98c9pGfqP1lsuD3iDSA/L38EPn5mMj5AjpM/exWdPvQLAj8nrCo/BkQLPwDuAD8tiPg/MrZfP2TlYj8Tw7I+HB8oPuzrXD1GzY8+qM5KPrcpUD93Fjs+8KMoPwCq1z6biaI+gtomPz6wHw=="
        }
      },
      {
        "_index" : "es_image_search",
        "_type" : "_doc",
        "_id" : "lZMzDmwBSBtPpRXcd5r_",
        "_score" : 0.0,
        "_source" : {
          "id" : 2,
          "embedding_vector" : "Poza7j0kGCc/A+ZZPq6xmT98klU/SnnLPuuE6T8szq0+oWOWPkvFzj7H7p8/Pqx8PwdYZT78Q4k/cQLlPldNpz7dkhY/HT0yPzoKnz8n1C8+ycX7Py/4fD8jAz8+i6ojP1HJvT86ctM9acpvPw4ZOD882yk9tNvePymPwD8qV0g/TiINPYo+KT5lX+c/Kb88PzXMZz5++08+qtfUP0bFZz9K5VA/VXE/PrQShj6gRPs+/4NWPBGJNT9KlcE/MmlJP0qp3j3LBm0+ygndPoNbQT2W5MY/FOcUPtRXzT6cFjM/akuJPpNGvD8//RE/SAthP2FC1z9v5Ug/WlI9Prym5T64z80+7eo3PxdSKT8u4tc/C42QP1Utcj9ZsK88RHImP0g6sj80hpc9CylyPxY6XT7pS2U+nAaiPxvdmD9PiSY+0tVqP0/96j994s0/eHjjPgUZpz8JbrI+3rrZPxvwXT9262U+RTCQPvEzhz9zdqU+WyLRP2X6wT8W+aM/DDzWP0jFyj92I/Q/GcuCPyj23D6cehE+bgI5PzYZHD8iP+w9Ur0xPtajAz7VAu4+3T8RPzVzbD6N554+uKe7PZkTTz8Y+rg8pX4gPzkX7T6xv9o+IeSyP1fR+j5HZxE/dQjnPumXxj8Q3Lg/OZXLPucbgD5eh2E+pNJuPupTbD8bcfQ8s8bOPxhznz5N1Rk/dOlaP1KbCT9VMAY+oJsmP1bO3z8Ne6Y+LT8fPksjSz6TPQI/XzvRPqP+Xz1hOzg+gtVaPki6Mz9nbnA+aP6QPJXYQj9AqBU/bihOP1PW/z9sYWg/MhRRPtexTj7fttc7hVKAPviJeD8jlDo/eTSYPwfXbD9+7dU+8VHvPzon6j8kC4E+kM83P2kMRT96GEI/QvzjP2Jelz8xRTk/QQoXPpibqT8bUg4/LjlAP3qPMD9wmZQ+dzNbPxkl9D7saqk+MO+bP2XmQT9AlXY/LC8hPlfirj9wuss/WzLmPvlwLz94dIY+/BubPZkiHz7yjXQ/YbwoPJgaWz9ikoY+yYkCPz/vFw=="
        }
      },
      {
        "_index" : "es_image_search",
        "_type" : "_doc",
        "_id" : "lpMzDmwBSBtPpRXceJoM",
        "_score" : 0.0,
        "_source" : {
          "id" : 3,
          "embedding_vector" : "PzpKqz9peu4+2H1GP3wKjz7+tAs+h3fgP1pFBz98Bjs+RY0DPnArQj22TTY+0XmtPvR0HT9vcjU8NO5fPx5G5z8PoGY+nDDHPz/2Tz9JKUg+uOZ+PQZjaT93whE/VVltPrw8lDwp5cw+tMfDP2krnj9r/BI9z3ntPg3Dxz9vVe8+wV+JP39XxD5SOG4/Ww2RPp8Rkj4+WhE+kF+IPvBcCD7mktk+3KrDPjkgjD8Qm/s/agfLP0u30z4U9NM9h1MDPcO0ej7kzqI/UFSwPoqGcT6dEoQ+sZlgPvz4qD8L7u4/dr+eP2KHZT9joUI9wTK2Pw88ez9FTzg+ZeX4Py6oMD8ekPI97JaWPzV+xD5hhwM+mFF4Pw2FEz7upyE+0MmRPl9Ktz5pJMU/XBgXPp2gLT8DznQ/cE5+PkjnZD90/qc+4ERWPbYt/T7qKIA/DfpYP3GjSD4x7jo9GGtmPiVtcj9R7uc+EclyPyfslj7KI2Y+U8WuPTMYuT8b8RE+ubu1Peetgj9Dp3g+lt4qPrQZ5z7+0mo/diZ3PxeYlz6u49s+5d7ePquJlT9I5BU+iAiHP38fYT95to8/SL3HPuscpj8O51U+aJGjPbbzUj9oNQ4+9NKmPpXb/T85nEY9+9ePP0oSJD47uXE/D0XnPz2xSDyf9Co/TM+aPw1D1T2VYCI/Eoq9P2OJOj8GY24/C2CXPzxKvj9O3Oc8gmCKP1XuVD5jgPE/epWNPzUnuz8jDpE9jjVuP1iidj83CdY8sNMhPhCxcz8kQ80/EWh5PqAihj8qqqQ/JthQPvDGmD8qcsE/JlZuPeC0Vj4+1bY/MXfRP2ZIAz31HwA/EOEJP3K1/j8u8q099wO0PzVkjj5DpSo/Dc9IPSInIj9iC/I9h1DxPyUBlzyk060+ORSwPxFmEj6SvPY/AYG8P3+80z9BFPw/Cv4VPw7MeT6zxU8/Yz+HPy0G8j8puww9TZ9SPxHpej62Xso/CGt5Pt5j/j8X1ng/am5pP0nwsz5HEiw/ehQNPRldzT4b/w8/Y0uTPwkZuQ=="
        }
      },
.
.
.

As you can see, they are returned in numerical order of their ids. I get the same thing returned for every vector I try in my query.

So, maybe the query is wrong and ES just defaults to giving me results in order, whereas Python raises an exception?

lior-k commented 5 years ago

Hi Rob, it took a while but I figured it out: the dimensions of the search vector do not match the dimensions of your indexed vectors.

in that case, the plugin returns a score of 0.

when I changed the query to a vector of 196 the result came back just fine.

Here's a Python script I wrote, based on your use case, which succeeds:

from elasticsearch import Elasticsearch, NotFoundError
import base64
import numpy as np

# Vectors are stored as big-endian 32-bit floats.
_float32_dtype = np.dtype('>f4')

def decode_float_list(base64_string):
    """Decode a base64 string into a flat list of Python floats.

    The decoded bytes are read as big-endian float32 values.
    """
    return np.frombuffer(
        base64.b64decode(base64_string), dtype=_float32_dtype
    ).tolist()

def encode_array(arr):
    """Encode ``arr`` as a base64 string of big-endian float32 values.

    ``arr`` is converted to a NumPy array first, so nested (multi-dimensional)
    input is accepted; its values end up in the output in row-major order.
    """
    as_float32 = np.array(arr).astype(_float32_dtype)
    encoded_bytes = base64.b64encode(as_float32)
    return encoded_bytes.decode("utf-8")

# Client for the node at localhost:9200; retries requests that time out.
# NOTE(review): timeout=5000 is presumably interpreted in seconds by the
# client, which would be a very long timeout - confirm the intended unit.
es = Elasticsearch("localhost:9200", send_get_body_as='POST', retry_on_timeout=True, timeout=5000)
# Name of the index used by the functions in this script.
INDEX_NAME = "testindex2"

def create_index(name):
    """Create index ``name`` with a binary ``embedding_vector`` field.

    The index is created with 2 shards, 1 replica and a mapping that stores
    ``embedding_vector`` as a binary field with doc_values enabled.

    If an index called ``name`` already exists it is left untouched, so the
    script can be run more than once without failing on index creation.
    """
    request_body = '''{
        "settings": {
            "number_of_shards": 2,
            "number_of_replicas": 1
        },

        "mappings": {
                "properties": {
                    "embedding_vector": {
                        "type": "binary",
                        "doc_values": true
                    }
                }
        }
    }'''
    # Creating an index that already exists is rejected by Elasticsearch;
    # check first so repeated runs are harmless.
    if es.indices.exists(index=name):
        print(f"index {name} already exists, skipping creation")
        return
    print(f"creating {name} index... {request_body}")
    es.indices.create(index=name, body=request_body)

def index_data(data):
    """Index every item of ``data`` into ``INDEX_NAME`` as its own document.

    Each document gets a 1-based sequential ``id`` field and the item,
    passed through ``encode_array``, in ``embedding_vector``.
    """
    for doc_number, embedding in enumerate(data, start=1):
        document = {
            "id": doc_number,
            "embedding_vector": encode_array(embedding),
        }
        es.index(index=INDEX_NAME, body=document)

def search():
    """Run the ``binary_vector_score`` query against ``INDEX_NAME``.

    Builds a ``function_score`` query whose score comes from the plugin's
    ``binary_vector_score`` script (dot product, since ``cosine`` is False),
    prints the query and returns the client's response for the top 5 hits.
    """
    query = {
        "query": {
            "function_score": {
                "boost_mode": "replace",
                "script_score": {
                    "script": {
                        "source": "binary_vector_score",
                        "lang": "knn",
                        "params": {
                            "cosine": False,
                            "field": "embedding_vector",
                            # Must have as many values as the indexed vectors:
                            # on a length mismatch the plugin scores every
                            # document 0.
                            "vector": [ 0.6172189116477966, 0.4812350273132324, 0.2395150065422058, 0.41844668984413147, 0.8617216944694519, 0.12854498624801636, 0.2627895176410675, 0.22640013694763184, 0.5444879531860352, 0.52374267578125, 0.7576023936271667, 0.25305455923080444, 0.5308356285095215, 0.6852802038192749, 0.4624062180519104, 0.1816617250442505, 0.2958976626396179, 0.025580303743481636, 0.16926740109920502, 0.7047653198242188, 0.6931900978088379, 0.04226350784301758, 0.9671088457107544, 0.47195401787757874, 0.2582820653915405, 0.11039293557405472, 0.6919737458229065, 0.5618643760681152, 0.6426474452018738, 0.6258983612060547, 0.8140584826469421, 0.2586701810359955, 0.2690378725528717, 0.9467039704322815, 0.474464476108551, 0.7006123661994934, 0.3056519627571106, 0.934620201587677, 0.33563244342803955, 0.38651159405708313, 0.3424995541572571, 0.23031608760356903, 0.641241729259491, 0.01252000406384468, 0.5705199837684631, 0.24167191982269287, 0.4995182156562805, 0.9633683562278748, 0.618108868598938, 0.9971736669540405, 0.24285273253917694, 0.4431900978088379, 0.67298823595047, 0.5439957976341248, 0.5564237833023071, 0.2304188311100006, 0.4888533055782318, 0.4624284505844116, 0.788846492767334, 0.44891494512557983, 0.9873254299163818, 0.8286163806915283, 0.7455354332923889, 0.8039408326148987, 0.5274253487586975, 0.4829685688018799, 0.6627996563911438, 0.3408285975456238, 0.5105639100074768, 0.066745325922966, 0.13178864121437073, 0.35720911622047424, 0.1358930915594101, 0.5904856324195862, 0.12224390357732773, 0.7346777319908142, 0.9671003222465515, 0.48915180563926697, 0.7750203013420105, 0.14900848269462585, 0.6375364661216736, 0.21111196279525757, 0.8424895405769348, 0.13458995521068573, 0.5942713618278503, 0.6773364543914795, 0.8135702610015869, 0.33085259795188904, 0.3377285897731781, 0.9505098462104797, 0.5543105006217957, 0.9818258285522461, 0.297512948513031, 0.4442136883735657, 0.9673498868942261, 0.7054122090339661, 0.724175751209259,
0.6931982636451721, 0.8991569876670837, 0.01580190286040306, 0.11919090896844864, 0.38001662492752075, 0.5516496300697327, 0.8624045848846436, 0.13067130744457245, 0.12067067623138428, 0.642181932926178, 0.32152852416038513, 0.9839213490486145, 0.6214938759803772, 0.8877131342887878, 0.6137049198150635, 0.14480671286582947, 0.5091487169265747, 0.8738197088241577, 0.6978392004966736, 0.8988777995109558, 0.10804525017738342, 0.7366241216659546, 0.7556180357933044, 0.22851991653442383, 0.1791202872991562, 0.11619532108306885, 0.04393879696726799, 0.7954261898994446, 0.8965669870376587, 0.7234428524971008, 0.23360027372837067, 0.9665877223014832, 0.14681114256381989, 0.9289661645889282, 0.9380605816841125, 0.4196012616157532, 0.4730188846588135, 0.514502227306366, 0.5517736673355103, 0.6869121193885803, 0.8567425608634949, 0.7314034700393677, 0.9989842772483826, 0.3868770897388458, 0.9380677342414856, 0.4927084743976593, 0.7979277968406677, 0.45593059062957764, 0.0170291718095541, 0.6517185568809509, 0.5005806684494019, 0.8620452880859375, 0.5568361282348633, 0.07004088908433914, 0.5770776271820068, 0.8143753409385681, 0.8382748961448669, 0.0996832400560379, 0.5101017355918884, 0.4771038293838501, 0.9274903535842896, 0.22478686273097992, 0.9320020079612732, 0.05571257323026657, 0.6283928155899048, 0.6742311120033264, 0.0424797385931015, 0.7878830432891846, 0.5152276158332825, 0.16908106207847595, 0.5440091490745544, 0.7015048861503601, 0.25502151250839233, 0.40467849373817444, 0.432849258184433, 0.7071661353111267, 0.14723558723926544, 0.38334646821022034, 0.9520816802978516, 0.8364397287368774, 0.8559724688529968, 0.008303776383399963, 0.9050803184509277, 0.32011473178863525, 0.4527781903743744, 0.7674447298049927, 0.4480983316898346, 0.1805608868598938, 0.4140874147415161, 0.27097389101982117, 0.8837590217590332, 0.7211946845054626, 0.34096693992614746, 0.4692194163799286, 0.29635292291641235, 0.272903710603714, 0.00385366752743721, 0.17514188587665558,
0.6346434950828552]
                        }
                    }
                }
            }
        },
        "size": 5
    }
    print(query)
    # Keyword arguments: the positional order of index/body has not been the
    # same across elasticsearch-py releases, so never rely on it.
    return es.search(index=INDEX_NAME, body=query)

if __name__ == '__main__':
    # End-to-end demo: create the index, fill it with random vectors, then
    # query it and print the response.
    create_index(INDEX_NAME)

    # Ten random samples; every item handed to index_data is a 14x14 nested list.
    data = np.random.rand(10, 14, 14).tolist()

    print(data)
    index_data(data)

    # Newly indexed documents only become searchable after a refresh; force
    # one so the search below sees everything that was just indexed.
    es.indices.refresh(index=INDEX_NAME)

    res = search()
    print(res)
lior-k commented 4 years ago

See the answer for this problem in issue #43