zilliztech / pyglass

Graph Library for Approximate Similarity Search
MIT License
96 stars 20 forks source link

Recall and qps do not match the results of glasspy installed using pip3 #14

Closed YinshiSanchez closed 2 days ago

YinshiSanchez commented 6 days ago

I tested the performance of Glass using the C++ interface, but the recall rate and QPS differ significantly from the results of glasspy installed using pip3. The recall rate and search performance of glasspy installed using pip3 basically match the results on ann-benchmarks. However, both the recall rate and search QPS tested with the C++ interface are very low. Furthermore, I noticed that the GlassPy searcher can use level=3, whereas the latest code in the main branch does not support level=3. Is the implementation of GlassPy inconsistent with the current open-source implementation? Are there any further open-source plans for the future? This is the information about the CPU of the machine I ran the test on. image

Here is my C++ code:

#include "H5Cpp.h"
#include "glass/builder.hpp"
#include "glass/hnsw/hnsw.hpp"
#include "glass/nsg/nsg.hpp"
#include "glass/searcher.hpp"
#include <algorithm>
#include <chrono>
#include <cstddef>
#include <cstring>
#include <fstream>
#include <iomanip>
#include <iostream>
#include <memory>
#include <string>
#include <unordered_set>
#include <vector>

/// Computes recall@k for a single query.
///
/// The first `k` entries of `groundtruth[query_id]` (or the whole row when it
/// holds fewer than `k` ids) are treated as the set of true neighbours. Every
/// true neighbour that appears among the first `k` entries of `predictions`
/// counts as one hit; a neighbour is counted at most once, so duplicate ids in
/// `predictions` cannot inflate the score.
///
/// @param groundtruth  Per-query lists of true neighbour ids.
/// @param predictions  Array of at least `k` predicted neighbour ids.
/// @param query_id     Row of `groundtruth` to compare against.
/// @param k            Number of neighbours to evaluate.
/// @return hits / k, or 0 when `k <= 0`, `predictions` is null, or `query_id`
///         does not index a row of `groundtruth`.
float compute_recall(const std::vector<std::vector<int>> &groundtruth,
                     const int *predictions, int query_id, int k) {
  // Reject inputs that would otherwise divide by zero or index out of range.
  if (k <= 0 || predictions == nullptr || query_id < 0 ||
      static_cast<std::size_t>(query_id) >= groundtruth.size()) {
    return 0.0f;
  }

  const std::vector<int> &truth = groundtruth[query_id];

  // Never step past the end of a ground-truth row shorter than k.
  const std::size_t truth_count =
      std::min(truth.size(), static_cast<std::size_t>(k));
  std::unordered_set<int> remaining(
      truth.begin(), truth.begin() + static_cast<std::ptrdiff_t>(truth_count));

  int hits = 0;
  for (int j = 0; j < k; ++j) {
    // erase() returns 1 only the first time an id is matched, so a repeated
    // prediction of the same neighbour is not counted twice.
    hits += static_cast<int>(remaining.erase(predictions[j]));
  }

  return static_cast<float>(hits) / static_cast<float>(k);
}

int main() {
  int dim = 128;
  int max_elements = 1000000;
  int query_elements = 10000;
  int max_degree = 32;
  int ef_construction = 200;
  int ef_search = 200;
  int k = 10;
  std::string file_name = "/data/dataset_diskann/sift/sift-128-euclidean.hdf5";

  std::cout << "dim: " << dim << std::endl;
  std::cout << "max_elements(M): " << max_elements << std::endl;
  std::cout << "query_elements: " << query_elements << std::endl;
  std::cout << "max_degree: " << max_degree << std::endl;
  std::cout << "ef_construction: " << ef_construction << std::endl;
  std::cout << "ef_search: " << ef_search << std::endl;
  std::cout << "k: " << k << std::endl;
  std::cout << "file_name: " << file_name << std::endl;

  H5::H5File hdf5_file(file_name.c_str(), H5F_ACC_RDONLY);

  int dimension = 0;

  if (hdf5_file.attrExists("dimension")) {
    H5::Attribute attr = hdf5_file.openAttribute("dimension");
    attr.read(H5::PredType::NATIVE_INT, &dimension);
  } else {
    H5::DataSet train_dataset = hdf5_file.openDataSet("train");
    H5::DataSpace dataspace = train_dataset.getSpace();

    hsize_t dims[2]; 
    dataspace.getSimpleExtentDims(dims, nullptr);

    dimension = static_cast<int>(dims[1]); 
  }

  std::cout << "Dimension: " << dimension << std::endl;

  H5::DataSet train_dataset = hdf5_file.openDataSet("train");
  H5::DataSpace train_dataspace = train_dataset.getSpace();

  hsize_t train_dims[2];
  train_dataspace.getSimpleExtentDims(train_dims, nullptr);

  std::vector<float> X_train(train_dims[0] * train_dims[1]);
  train_dataset.read(X_train.data(), H5::PredType::NATIVE_FLOAT);

  std::cout << "Train Data Loaded: " << train_dims[0] << " samples, "
            << train_dims[1] << " features." << std::endl;

  H5::DataSet test_dataset = hdf5_file.openDataSet("test");
  H5::DataSpace test_dataspace = test_dataset.getSpace();

  hsize_t test_dims[2];
  test_dataspace.getSimpleExtentDims(test_dims, nullptr);

  std::vector<float> X_test(train_dims[0] * train_dims[1]);
  test_dataset.read(X_test.data(), H5::PredType::NATIVE_FLOAT);

  std::cout << "Test Data Loaded: " << test_dims[0] << " samples, "
            << test_dims[1] << " features." << std::endl;

  H5::DataSet ground_truth_dataset = hdf5_file.openDataSet("neighbors");
  H5::DataSpace ground_truth_dataspace = ground_truth_dataset.getSpace();

  hsize_t ground_truth_dims[2];
  ground_truth_dataspace.getSimpleExtentDims(ground_truth_dims, nullptr);

  std::vector<int> linear_data(ground_truth_dims[0] * ground_truth_dims[1]);

  ground_truth_dataset.read(linear_data.data(), H5::PredType::NATIVE_INT);

  std::vector<std::vector<int>> ground_truth(
      ground_truth_dims[0], std::vector<int>(ground_truth_dims[1]));
  for (size_t i = 0; i < ground_truth_dims[0]; ++i) {
    for (size_t j = 0; j < ground_truth_dims[1]; ++j) {
      ground_truth[i][j] = linear_data[i * ground_truth_dims[1] + j];
    }
  }

  for (size_t i = 0; i < 5; ++i) {
    std::cout << "Query " << i << ": ";
    for (size_t j = 0; j < 5; ++j) {
      std::cout << ground_truth[i][j] << " ";
    }
    std::cout << std::endl;
  }

  std::cout << "Ground Truth Data Loaded: " << ground_truth_dims[0]
            << " samples, " << ground_truth_dims[1] << " features."
            << std::endl;

  auto index = std::unique_ptr<glass::Builder>(
      (glass::Builder *)new glass::HNSW(dim, "l2", max_degree * 2, ef_search));

  index->Build(X_train.data(), max_elements);

  auto searcher = glass::create_searcher(index->GetGraph(), "l2", 2);
  searcher->SetData(X_train.data(), max_elements, dim);
  searcher->Optimize(1);

  float correct = 0;
  int *dst;
  dst = new int[k];
  double total_time = 0;
  searcher->SetEf(ef_search);
  for (int i = 0; i < query_elements; ++i) {
    auto start = std::chrono::system_clock::now();
    searcher->Search(X_test.data() + i * dim, k, dst);
    auto end = std::chrono::system_clock::now();
    std::chrono::duration<double> gap = end - start;
    total_time += gap.count();
    correct += compute_recall(ground_truth, dst, i, k);
  }

  delete[] dst;
  std::cout << std::fixed << std::setprecision(6) << "\n";
  std::cout << "Correct: " << correct << std::endl;
  std::cout << "elements: " << query_elements << std::endl;
  std::cout << "Recall: " << correct / query_elements << std::endl;
  std::cout << "Total Query Time (s): " << total_time << std::endl;
  std::cout << "Average Query Time (s): " << total_time / query_elements
            << std::endl;
  std::cout << "QPS: " << query_elements / total_time << std::endl;
  return 0;
}
YinshiSanchez commented 2 days ago

Sorry, I set the wrong search parameters, which caused this issue.