opensearch-project / k-NN

🆕 Find the k-nearest neighbors (k-NN) for your vector data
https://opensearch.org/docs/latest/search-plugins/knn/index/
Apache License 2.0
154 stars 113 forks source link

[BUG] KNN stats empty on cluster #1279

Open juntezhang opened 11 months ago

juntezhang commented 11 months ago

What is the bug? The KNN stats are empty on a cluster. It seems no data has been populated.

How can one reproduce the bug? Steps to reproduce the behavior:

Update: This also happens on a single-node cluster running locally. I do see the stats when I use faiss instead of lucene.

  1. Create a cluster with 3 cluster manager nodes, 2 data nodes, 2 ml nodes.
  2. Set the following KNN configuration on the cluster manager nodes:
        - name: "plugins.ml_commons.task_dispatch_policy"
          value: "round_robin"
        - name: "plugins.ml_commons.model_auto_redeploy.enable"
          value: "true"
        - name: "plugins.ml_commons.max_ml_task_per_node"
          value: "512"
        - name: "knn.algo_param.index_thread_qty"
          value: "3"
  3. Configure a field in the mapping to use KNN. This is mine:
    "field_vectorized": {
    "type": "nested",
    "properties": {
    "knn": {
      "type": "knn_vector",
      "dimension": 768,
      "method": {
        "name": "hnsw",
        "space_type": "l2",
        "engine": "lucene",
        "parameters": {
          "ef_construction": 128,
          "m": 8
        }
      }
    }
    }
    }
  4. Load any model in the cluster.
  5. Log on to a node and access the KNN stats with http://localhost:9200/_plugins/_knn/stats?pretty
  6. See that all the stats are empty:
    {
    "_nodes" : {
    "total" : 7,
    "successful" : 7,
    "failed" : 0
    },
    "cluster_name" : "my-cluster",
    "circuit_breaker_triggered" : false,
    "model_index_status" : null,
    "nodes" : {
    "mpReBrBORta1b0iOezJ8cA" : {
      "graph_memory_usage_percentage" : 0.0,
      "graph_query_requests" : 0,
      "graph_memory_usage" : 0,
      "cache_capacity_reached" : false,
      "load_success_count" : 0,
      "training_memory_usage" : 0,
      "indices_in_cache" : { },
      "script_query_errors" : 0,
      "hit_count" : 0,
      "knn_query_requests" : 0,
      "total_load_time" : 0,
      "miss_count" : 0,
      "knn_query_with_filter_requests" : 0,
      "training_memory_usage_percentage" : 0.0,
      "lucene_initialized" : false,
      "graph_index_requests" : 0,
      "faiss_initialized" : false,
      "load_exception_count" : 0,
      "training_errors" : 0,
      "eviction_count" : 0,
      "nmslib_initialized" : false,
      "script_compilations" : 0,
      "script_query_requests" : 0,
      "graph_stats" : {
        "refresh" : {
          "total_time_in_millis" : 0,
          "total" : 0
        },
        "merge" : {
          "current" : 0,
          "total" : 0,
          "total_time_in_millis" : 0,
          "current_docs" : 0,
          "total_docs" : 0,
          "total_size_in_bytes" : 0,
          "current_size_in_bytes" : 0
        }
      },
      "graph_query_errors" : 0,
      "indexing_from_model_degraded" : false,
      "graph_index_errors" : 0,
      "training_requests" : 0,
      "script_compilation_errors" : 0
    },
    "9_7RWD9jQN-gCTQODbg9AQ" : {
      "graph_memory_usage_percentage" : 0.0,
      "graph_query_requests" : 0,
      "graph_memory_usage" : 0,
      "cache_capacity_reached" : false,
      "load_success_count" : 0,
      "training_memory_usage" : 0,
      "indices_in_cache" : { },
      "script_query_errors" : 0,
      "hit_count" : 0,
      "knn_query_requests" : 0,
      "total_load_time" : 0,
      "miss_count" : 0,
      "knn_query_with_filter_requests" : 0,
      "training_memory_usage_percentage" : 0.0,
      "lucene_initialized" : true,
      "graph_index_requests" : 0,
      "faiss_initialized" : false,
      "load_exception_count" : 0,
      "training_errors" : 0,
      "eviction_count" : 0,
      "nmslib_initialized" : false,
      "script_compilations" : 0,
      "script_query_requests" : 0,
      "graph_stats" : {
        "refresh" : {
          "total_time_in_millis" : 0,
          "total" : 0
        },
        "merge" : {
          "current" : 0,
          "total" : 0,
          "total_time_in_millis" : 0,
          "current_docs" : 0,
          "total_docs" : 0,
          "total_size_in_bytes" : 0,
          "current_size_in_bytes" : 0
        }
      },
      "graph_query_errors" : 0,
      "indexing_from_model_degraded" : false,
      "graph_index_errors" : 0,
      "training_requests" : 0,
      "script_compilation_errors" : 0
    },
    "p0WJkA02SPW7_A6f--koMg" : {
      "graph_memory_usage_percentage" : 0.0,
      "graph_query_requests" : 0,
      "graph_memory_usage" : 0,
      "cache_capacity_reached" : false,
      "load_success_count" : 0,
      "training_memory_usage" : 0,
      "indices_in_cache" : { },
      "script_query_errors" : 0,
      "hit_count" : 0,
      "knn_query_requests" : 0,
      "total_load_time" : 0,
      "miss_count" : 0,
      "knn_query_with_filter_requests" : 0,
      "training_memory_usage_percentage" : 0.0,
      "lucene_initialized" : true,
      "graph_index_requests" : 0,
      "faiss_initialized" : false,
      "load_exception_count" : 0,
      "training_errors" : 0,
      "eviction_count" : 0,
      "nmslib_initialized" : false,
      "script_compilations" : 0,
      "script_query_requests" : 0,
      "graph_stats" : {
        "refresh" : {
          "total_time_in_millis" : 0,
          "total" : 0
        },
        "merge" : {
          "current" : 0,
          "total" : 0,
          "total_time_in_millis" : 0,
          "current_docs" : 0,
          "total_docs" : 0,
          "total_size_in_bytes" : 0,
          "current_size_in_bytes" : 0
        }
      },
      "graph_query_errors" : 0,
      "indexing_from_model_degraded" : false,
      "graph_index_errors" : 0,
      "training_requests" : 0,
      "script_compilation_errors" : 0
    },
    "2fl_okzPTnapO3w-t8XJ4A" : {
      "graph_memory_usage_percentage" : 0.0,
      "graph_query_requests" : 0,
      "graph_memory_usage" : 0,
      "cache_capacity_reached" : false,
      "load_success_count" : 0,
      "training_memory_usage" : 0,
      "indices_in_cache" : { },
      "script_query_errors" : 0,
      "hit_count" : 0,
      "knn_query_requests" : 0,
      "total_load_time" : 0,
      "miss_count" : 0,
      "knn_query_with_filter_requests" : 0,
      "training_memory_usage_percentage" : 0.0,
      "lucene_initialized" : false,
      "graph_index_requests" : 0,
      "faiss_initialized" : false,
      "load_exception_count" : 0,
      "training_errors" : 0,
      "eviction_count" : 0,
      "nmslib_initialized" : false,
      "script_compilations" : 0,
      "script_query_requests" : 0,
      "graph_stats" : {
        "refresh" : {
          "total_time_in_millis" : 0,
          "total" : 0
        },
        "merge" : {
          "current" : 0,
          "total" : 0,
          "total_time_in_millis" : 0,
          "current_docs" : 0,
          "total_docs" : 0,
          "total_size_in_bytes" : 0,
          "current_size_in_bytes" : 0
        }
      },
      "graph_query_errors" : 0,
      "indexing_from_model_degraded" : false,
      "graph_index_errors" : 0,
      "training_requests" : 0,
      "script_compilation_errors" : 0
    },
    "Gr2bYy46RTCEOcVfDKulug" : {
      "graph_memory_usage_percentage" : 0.0,
      "graph_query_requests" : 0,
      "graph_memory_usage" : 0,
      "cache_capacity_reached" : false,
      "load_success_count" : 0,
      "training_memory_usage" : 0,
      "indices_in_cache" : { },
      "script_query_errors" : 0,
      "hit_count" : 0,
      "knn_query_requests" : 0,
      "total_load_time" : 0,
      "miss_count" : 0,
      "knn_query_with_filter_requests" : 0,
      "training_memory_usage_percentage" : 0.0,
      "lucene_initialized" : true,
      "graph_index_requests" : 0,
      "faiss_initialized" : false,
      "load_exception_count" : 0,
      "training_errors" : 0,
      "eviction_count" : 0,
      "nmslib_initialized" : false,
      "script_compilations" : 0,
      "script_query_requests" : 0,
      "graph_stats" : {
        "refresh" : {
          "total_time_in_millis" : 0,
          "total" : 0
        },
        "merge" : {
          "current" : 0,
          "total" : 0,
          "total_time_in_millis" : 0,
          "current_docs" : 0,
          "total_docs" : 0,
          "total_size_in_bytes" : 0,
          "current_size_in_bytes" : 0
        }
      },
      "graph_query_errors" : 0,
      "indexing_from_model_degraded" : false,
      "graph_index_errors" : 0,
      "training_requests" : 0,
      "script_compilation_errors" : 0
    },
    "0ENLV3O3ThGPWCIW2d5PBg" : {
      "graph_memory_usage_percentage" : 0.0,
      "graph_query_requests" : 0,
      "graph_memory_usage" : 0,
      "cache_capacity_reached" : false,
      "load_success_count" : 0,
      "training_memory_usage" : 0,
      "indices_in_cache" : { },
      "script_query_errors" : 0,
      "hit_count" : 0,
      "knn_query_requests" : 0,
      "total_load_time" : 0,
      "miss_count" : 0,
      "knn_query_with_filter_requests" : 0,
      "training_memory_usage_percentage" : 0.0,
      "lucene_initialized" : true,
      "graph_index_requests" : 0,
      "faiss_initialized" : false,
      "load_exception_count" : 0,
      "training_errors" : 0,
      "eviction_count" : 0,
      "nmslib_initialized" : false,
      "script_compilations" : 0,
      "script_query_requests" : 0,
      "graph_stats" : {
        "merge" : {
          "current" : 0,
          "total" : 0,
          "total_time_in_millis" : 0,
          "current_docs" : 0,
          "total_docs" : 0,
          "total_size_in_bytes" : 0,
          "current_size_in_bytes" : 0
        },
        "refresh" : {
          "total" : 0,
          "total_time_in_millis" : 0
        }
      },
      "graph_query_errors" : 0,
      "indexing_from_model_degraded" : false,
      "graph_index_errors" : 0,
      "training_requests" : 0,
      "script_compilation_errors" : 0
    },
    "mgTsAWTuSB2TViG5OTl1NA" : {
      "graph_memory_usage_percentage" : 0.0,
      "graph_query_requests" : 0,
      "graph_memory_usage" : 0,
      "cache_capacity_reached" : false,
      "load_success_count" : 0,
      "training_memory_usage" : 0,
      "indices_in_cache" : { },
      "script_query_errors" : 0,
      "hit_count" : 0,
      "knn_query_requests" : 0,
      "total_load_time" : 0,
      "miss_count" : 0,
      "knn_query_with_filter_requests" : 0,
      "training_memory_usage_percentage" : 0.0,
      "lucene_initialized" : true,
      "graph_index_requests" : 0,
      "faiss_initialized" : false,
      "load_exception_count" : 0,
      "training_errors" : 0,
      "eviction_count" : 0,
      "nmslib_initialized" : false,
      "script_compilations" : 0,
      "script_query_requests" : 0,
      "graph_stats" : {
        "refresh" : {
          "total_time_in_millis" : 0,
          "total" : 0
        },
        "merge" : {
          "current" : 0,
          "total" : 0,
          "total_time_in_millis" : 0,
          "current_docs" : 0,
          "total_docs" : 0,
          "total_size_in_bytes" : 0,
          "current_size_in_bytes" : 0
        }
      },
      "graph_query_errors" : 0,
      "indexing_from_model_degraded" : false,
      "graph_index_errors" : 0,
      "training_requests" : 0,
      "script_compilation_errors" : 0
    }
    }
    }

What is the expected behavior? Expected behavior is that the KNN stats are populated.

What is your host/environment?

OpenSearch 2.11 on Docker

Do you have any screenshots? n/a

Do you have any additional context? n/a

heemin32 commented 11 months ago

@ryanbogan?

juntezhang commented 11 months ago

I just updated the description to say that I see stats when I use faiss instead of lucene, and that it is reproducible on a local computer with a single node.

vamshin commented 11 months ago

@juntezhang Yes, you are right. Currently, graph usage stats are available only for nmslib and faiss, not for lucene. We need to figure out a way to have lucene stats. We will consider this a feature request.

jmazanec15 commented 11 months ago

Yes, we need to re-examine this API in general. Lucene isn't included because it maps graphs from disk into memory. Because this leverages the page cache, it is difficult to tell how much memory lucene is using in this case.

pasumarthisunil commented 3 months ago

I was wondering if there have been any updates on this. Is it possible to get these metrics in any other way?