microsoft / SPTAG

A distributed approximate nearest neighbor (ANN) search library that provides high-quality vector index build, search, and distributed online serving toolkits for large-scale vector search scenarios.
MIT License
4.83k stars 580 forks source link

The maxcheck parameter has no effect when searching. #342

Open LLLjun opened 2 years ago

LLLjun commented 2 years ago

I ran the code following the example, but while searching I found that the MaxCheck parameter doesn't adjust the recall as described.

[1] [query]   [maxcheck]  [avg]   [99%]   [95%]   [recall]  [qps]      [mem]
[1] 0-10000   16384       0.0028  0.0090  0.0064  0.8103    2886.7361  0GB
[1] 0-10000   8192        0.0024  0.0086  0.0056  0.8103    3368.8591  0GB
[1] 0-10000   4096        0.0015  0.0058  0.0033  0.8103    5320.0259  0GB
[1] 0-10000   2048        0.0015  0.0060  0.0035  0.8103    5267.8604  0GB
[1] 0-10000   1024        0.0016  0.0055  0.0036  0.8103    5104.4990  0GB
[1] 0-10000   512         0.0014  0.0050  0.0032  0.8103    5527.0239  0GB
[1] 0-10000   256         0.0016  0.0054  0.0037  0.8103    4964.4180  0GB

marxqiu commented 2 years ago

I also encountered the same issue: QPS and recall didn't change as expected. Here is the code I tested, using the default configuration for the SIFT1M dataset:

# Size of the synthetic base set: number of vectors and their dimensionality.
vector_number = 100000
vector_dimension = 1000

# Base vectors: uniform random values in [0, 1), converted to float32.
x = np.random.rand(vector_number, vector_dimension).astype(np.float32) 
# 1000 query vectors generated the same way, with the same dimensionality as x.
q = np.random.rand(1000, vector_dimension).astype(np.float32)

# Metadata blob: one newline-terminated decimal id per vector ("0\n1\n2\n...").
# Assembled with a single join instead of repeated `+=`, which can degrade to
# quadratic time as the number of vectors grows.
m = ''.join(f"{i}\n" for i in range(vector_number))

# Create a SPANN index over float vectors of the benchmark dimensionality.
index = SPTAG.AnnIndex('SPANN', 'Float', vector_dimension)

# Build configuration as (section, [(parameter, value), ...]) entries.
# Sections and parameters are applied strictly in the order listed here,
# and every value is passed to the index as a string.
build_params = [
    ("Base", [
        ("IndexAlgoType", "BKT"),
        ("IndexDirectory", "spann_index"),
        ("DistCalcMethod", "L2"),
    ]),
    ("SelectHead", [
        ("isExecute", "true"),
        ("NumberOfThreads", '64'),
        # ("Count", "200") was left commented out in favor of "Ratio".
        ("Ratio", "0.16"),
        ("TreeNumber", "1"),
        ("BKTKmeansK", "32"),
        ("BKTLeafSize", "8"),
        ("SaveBKT", "false"),
        ("SplitFactor", "6"),
        ("SplitThreshold", "100"),
        ("BKTLambdaFactor", "-1"),
        ("SamplesNumber", "1000"),
        ("SelectThreshold", "50"),
    ]),
    ("BuildHead", [
        ("isExecute", "true"),
        ("NeighborhoodSize", "32"),
        ("TPTNumber", "32"),
        ("TPTLeafSize", "2000"),
        ("MaxCheck", "8192"),
        ("MaxCheckForRefineGraph", "8192"),
        ("RefineIterations", "3"),
        ("NumberOfThreads", "64"),
        ("BKTLambdaFactor", "-1"),
    ]),
    ("BuildSSDIndex", [
        ("isExecute", "true"),
        ("BuildSsdIndex", "true"),
        ("InternalResultNum", "64"),
        ("ReplicaCount", "8"),
        ("PostingPageLimit", "12"),
        ("NumberOfThreads", "64"),
        ("MaxCheck", "8192"),
    ]),
]

for section, settings in build_params:
    for name, value in settings:
        index.SetBuildParam(name, value, section)

# Remove any index directory left over from a previous run; "spann_index" is
# the same path given as "IndexDirectory" in the build parameters.
if os.path.exists("spann_index"):
    shutil.rmtree("spann_index")

print("Build.............................")
build_started = time.time()
# NOTE(review): the two trailing False flags are presumably `withMetaIndex`
# and `normalized` -- confirm against the SPTAG Python wrapper.
index.BuildWithMetaData(x, m, vector_number, False, False)
build_time = time.time() - build_started
print("Build time : ", build_time)

# Sweep every (MaxCheck, SearchPostingPageLimit) combination and report the
# wall-clock time needed to run the whole query set for each one.
maxcheck = [100, 200, 400, 1000, 2000]
searchPostingPageLimit = [1, 5, 10, 40, 100]

# The loop variables are deliberately not called `m`: that name already holds
# the metadata string handed to BuildWithMetaData and must not be overwritten.
for max_check in maxcheck:
    for page_limit in searchPostingPageLimit:
        # Search parameters are re-applied for every combination, in this
        # exact order; only MaxCheck and SearchPostingPageLimit vary.
        search_params = (
            ("isExecute", "true"),
            ("BuildSsdIndex", "false"),
            ("InternalResultNum", "32"),
            ("NumberOfThreads", "4"),
            ("HashTableExponent", "4"),
            ("ResultNum", "10"),
            ("MaxCheck", str(max_check)),
            ("MaxDistRatio", "10000"),
            ("SearchPostingPageLimit", str(page_limit)),
        )
        for name, value in search_params:
            index.SetSearchParam(name, value, "SearchSSDIndex")

        st = time.time()
        for query in tqdm(q):
            # Only latency is measured here; `result` ends up holding the
            # answer for the last query and is not inspected.
            result = index.SearchWithMetaData(query, 3)  # Search k=3 nearest vectors for the query vector
        et = time.time()
        search_time = et - st
        print(f"{max_check}/{page_limit}   Search time : ", search_time)
JingyuanHe1222 commented 1 year ago

Hi, I encountered the same issue. Have you figured out the reason? Thanks in advance!