microsoft / SPTAG

A distributed approximate nearest neighborhood search (ANN) library which provides a high quality vector index build, search and distributed online serving toolkits for large scale vector search scenario.
MIT License
4.79k stars 582 forks source link

Long build and search time with 100k vectors #128

Open chikubee opened 4 years ago

chikubee commented 4 years ago

SPTAG Performance stats with Docker on MACOS Dimensions = 768 Number of vectors = 100k Number of test queries = 3 Distance Metric: L2/Cosine (Time in secs)

Build time : 378.7892005443573 BKT Search time : 0.2576255798339844 BKT (3 queries) Addition time : 185.24826049804688 BKT Deletion Time : 1.4446532726287842 BKT

Build time : 323.30899143218994 KDT Search time : 0.3578498363494873 KDT (n=3) Addition time : 426.7439365386963 KDT Deletion Time : 1.5057392120361328 KDT

The Build and vector Addition time can still be ignored, but the search time is enormous and can't be compromised. The search time is not so high with 10 dim 100k vectors.

This does not seem to be the expected behavior as it claims to be scalable for high dimensional datasets.

I compared it with FAISS on the same data, FAISS [IVFFlatIP] Dimensions = 768 Number of vectors = 100k Number of test queries = 3 Build time : 0.41316819190979004 Search Time: 0.0009050369262695312 (3 queries)

Please use the below script to reproduce the code.

import SPTAG
import numpy as np
import time
n = 100000
k = 3
r = 3

def testBuild(algo, distmethod, x, out):
   i = SPTAG.AnnIndex(algo, 'Float', x.shape[1])
   i.SetBuildParam("NumberOfThreads", '6')
   i.SetBuildParam("DistCalcMethod", distmethod)
   if i.Build(x, x.shape[0]):
       i.Save(out)

def testBuildWithMetaData(algo, distmethod, x, s, out):
   i = SPTAG.AnnIndex(algo, 'Float', x.shape[1])
   i.SetBuildParam("NumberOfThreads", '6')
   i.SetBuildParam("DistCalcMethod", distmethod)
   if i.BuildWithMetaData(x, s, x.shape[0], False):
       i.Save(out)

def testSearch(index, q, k):
   j = SPTAG.AnnIndex.Load(index)
   for t in range(q.shape[0]):
       result = j.Search(q[t], k)
       print (result[0]) # ids
       print (result[1]) # distances

def testSearchWithMetaData(index, q, k):
   j = SPTAG.AnnIndex.Load(index)
   j.SetSearchParam("MaxCheck", '1024')
   for t in range(q.shape[0]):
       result = j.SearchWithMetaData(q[t], k)
       print (result[0]) # ids
       print (result[1]) # distances
       print (result[2]) # metadata

def testAdd(index, x, out, algo, distmethod):
   if index != None:
       i = SPTAG.AnnIndex.Load(index)
   else:
       i = SPTAG.AnnIndex(algo, 'Float', x.shape[1])
   i.SetBuildParam("NumberOfThreads", '6')
   i.SetBuildParam("DistCalcMethod", distmethod)
   if i.Add(x, x.shape[0]):
       i.Save(out)

def testAddWithMetaData(index, x, s, out, algo, distmethod):
   if index != None:
       i = SPTAG.AnnIndex.Load(index)
   else:
       i = SPTAG.AnnIndex(algo, 'Float', x.shape[1])
   i.SetBuildParam("NumberOfThreads", '6')
   i.SetBuildParam("DistCalcMethod", distmethod)
   if i.AddWithMetaData(x, s, x.shape[0]):
       i.Save(out)

def testDelete(index, x, out):
   i = SPTAG.AnnIndex.Load(index)
   ret = i.Delete(x, x.shape[0])
   print (ret)
   i.Save(out)

def Test(algo, distmethod):
   vector_dimension = 768
   x = np.ones((n, vector_dimension), dtype=np.float32) * np.reshape(np.arange(n, dtype=np.float32), (n, 1))
   q = np.ones((r, vector_dimension), dtype=np.float32) * np.reshape(np.arange(r, dtype=np.float32), (r, 1)) * 2
   m = ''
   for i in range(n):
       m += str(i) + '\n'

   m = m.encode()

   index = algo
   print ("Build.............................")
   st = time.time()
   testBuild(algo, distmethod, x, 'testindices')
   et = time.time()
   print("Build time : ", et - st, index)
   st = time.time()
   testSearch('testindices', q, k)
   et = time.time()
   print("Search time : ", et - st, index)
   print ("Add.............................")
   st = time.time()
   testAdd('testindices', x, 'testindices', algo, distmethod)
   et = time.time()
   print("Addition time : ", et - st, index)
   testSearch('testindices', q, k)
   print ("Delete.............................")
   st = time.time()
   testDelete('testindices', q, 'testindices')
   et = time.time()
   print("Deletion Time : ", et - st, index)
   testSearch('testindices', q, k)

   print ("AddWithMetaData.............................")
   testAddWithMetaData(None, x, m, 'testindices', algo, distmethod)
   testSearchWithMetaData('testindices', q, k)
   print ("Delete.............................")
   testDelete('testindices', q, 'testindices')
   testSearchWithMetaData('testindices', q, k)

if __name__ == '__main__':
   Test('BKT', 'L2')
   # Test('KDT', 'L2')

Code to reproduce the results with FAISS: Installation: pip install faiss-cpu --no-cache

import faiss
import numpy as np
import time
n = 100000
k = 3
r = 3
d = 768
nlist = 10
vector_dimension = 768
x = np.ones((n, vector_dimension), dtype=np.float32) * np.reshape(np.arange(n, dtype=np.float32), (n, 1))
q = np.ones((r, vector_dimension), dtype=np.float32) * np.reshape(np.arange(r, dtype=np.float32), (r, 1)) * 2
st = time.time()
quantizer = faiss.IndexFlatIP(d)
index = faiss.IndexIVFFlat(quantizer, d, nlist, faiss.METRIC_INNER_PRODUCT)
index.train(x)
index.add(q)
et = time.time()
print("Build Time:  ", et-st)
st = time.time()
D, I = index.search(q, k=3)
print("Search Time: ", time.time()-st)

Is there something I am doing wrong or is this the expected behavior? Are there some parameters that can be tuned to help scale search. Thanks in advance.

wilfoderek commented 3 years ago

Did you solve it?