elastic / elasticsearch

Free and Open Source, Distributed, RESTful Search Engine
https://www.elastic.co/products/elasticsearch
Other
69.97k stars 24.75k forks source link

Inference Endpoint becomes unavailable #115692

Open jeffvestal opened 2 hours ago

jeffvestal commented 2 hours ago

Elasticsearch Version

serverless

Installed Plugins

No response

Java Version

bundled

OS Version

serverless

Problem Description

I have an Inference endpoint up and running. It had been working fine for several days. The cluster suddenly started responding with a 404 error when trying to call the inference endpoint.

Steps to Reproduce

GET grocery_items/_search
   {
        "retriever": {
                        "standard": {
                            "query": {
                                "nested": {
                                    "path": "Product Description_semantic.inference.chunks",
                                    "query": {
                                        "sparse_vector": {
                                            "inference_id": "elser-endpoint",
                                            "field": "Product Description_semantic.inference.chunks.embeddings",
                                            "query": "find me lunch meat"
                                        }
                                    },
                                    "inner_hits": {
                                        "size": 2,
                                        "name": "grocery_items.Product Description_semantic",
                                        "_source": [
                                            "Product Description_semantic.inference.chunks.text"
                                        ]
                                    }
                                }
                            }
                        }
        }
   }

response

{
  "error": {
    "root_cause": [
      {
        "type": "status_exception",
        "reason": "Unable to find model deployment task [elser-endpoint] please stop and start the deployment or try again momentarily"
      }
    ],
    "type": "status_exception",
    "reason": "Unable to find model deployment task [elser-endpoint] please stop and start the deployment or try again momentarily"
  },
  "status": 404
}

GET _inference/elser-endpoint response

{
  "endpoints": [
    {
      "inference_id": "elser-endpoint",
      "task_type": "sparse_embedding",
      "service": "elasticsearch",
      "service_settings": {
        "num_allocations": 10,
        "num_threads": 1,
        "model_id": ".elser_model_2_linux-x86_64",
        "adaptive_allocations": {
          "enabled": true,
          "min_number_of_allocations": 10,
          "max_number_of_allocations": 100
        }
      }
    }
  ]
}

get _ml/trained_models/.elser_model_2_linux-x86_64/_stats response

{
  "count": 1,
  "trained_model_stats": [
    {
      "model_id": ".elser_model_2_linux-x86_64",
      "model_size_stats": {
        "model_size_bytes": 274756282,
        "required_native_memory_bytes": 2101346304
      },
      "pipeline_count": 1,
      "ingest": {
        "total": {
          "count": 0,
          "time_in_millis": 0,
          "current": 0,
          "failed": 0
        },
        "pipelines": {
          ".kibana-elastic-ai-assistant-ingest-pipeline-knowledge-base": {
            "count": 0,
            "time_in_millis": 0,
            "current": 0,
            "failed": 0,
            "ingested_as_first_pipeline_in_bytes": 0,
            "produced_as_first_pipeline_in_bytes": 0,
            "processors": [
              {
                "inference": {
                  "type": "inference",
                  "stats": {
                    "count": 0,
                    "time_in_millis": 0,
                    "current": 0,
                    "failed": 0
                  }
                }
              }
            ]
          }
        }
      },
      "inference_stats": {
        "failure_count": 0,
        "inference_count": 2,
        "cache_miss_count": 0,
        "missing_all_fields_count": 0,
        "timestamp": 1729883203847
      },
      "deployment_stats": {
        "deployment_id": "elser-endpoint",
        "model_id": ".elser_model_2_linux-x86_64",
        "threads_per_allocation": 1,
        "number_of_allocations": 10,
        "adaptive_allocations": {
          "enabled": true,
          "min_number_of_allocations": 10,
          "max_number_of_allocations": 100
        },
        "queue_capacity": 1024,
        "cache_size": "262mb",
        "priority": "normal",
        "start_time": 1728864116915,
        "inference_count": 2,
        "peak_throughput_per_minute": 1,
        "nodes": [
          {
            "node": {
              "serverless": {
                "name": "serverless",
                "ephemeral_id": "serverless",
                "transport_address": "0.0.0.0:0",
                "external_id": "serverless",
                "attributes": {},
                "roles": [
                  "ml"
                ],
                "version": "9.0.0",
                "min_index_version": 8000099,
                "max_index_version": 9000000
              }
            },
            "routing_state": {
              "routing_state": "started"
            },
            "inference_count": 2,
            "average_inference_time_ms": 84,
            "average_inference_time_ms_excluding_cache_hits": 84,
            "inference_cache_hit_count": 0,
            "last_access": 1729882326131,
            "number_of_pending_requests": 0,
            "start_time": 1729875144036,
            "threads_per_allocation": 1,
            "number_of_allocations": 1,
            "peak_throughput_per_minute": 1,
            "throughput_last_minute": 0,
            "inference_cache_hit_count_last_minute": 0
          }
        ]
      }
    }
  ]
}

Though what's weird is the error I got when I was trying to deploy a backup ELSER model:

PUT _inference/sparse_embedding/elser-endpoint-backup
{
  "service": "elser",
  "service_settings": {
    "adaptive_allocations": {
      "enabled": true,
      "min_number_of_allocations": 10,
      "max_number_of_allocations": 100
    },
    "num_threads": 1
  }
}

response

{
  "error": {
    "root_cause": [
      {
        "type": "illegal_argument_exception",
        "reason": "not enough memory on node [gIs_zXQYT3SMtEozmJJg0w] to assign model [my-e5-endpoint]"
      }
    ],
    "type": "illegal_argument_exception",
    "reason": "not enough memory on node [gIs_zXQYT3SMtEozmJJg0w] to assign model [my-e5-endpoint]"
  },
  "status": 400
}

serverless prod - fbe378fd1dab4affa7c981c90a03f440

Logs (if relevant)

app log last success (central us time)

2024-10-25 13:51:57,332 - INFO - _async_transport.py - perform_request - line 271 - POST [https://grocery-store-fbe378.es.us-east-1.aws.elastic.cloud:443/users/_search](https://grocery-store-fbe378.es.us-east-1.aws.elastic.cloud/users/_search) [status:200 duration:0.045s]

then on next call

2024-10-25 13:52:04,257 - INFO - _async_transport.py - perform_request - line 271 - POST [https://grocery-store-fbe378.es.us-east-1.aws.elastic.cloud:443/grocery_items/_search](https://grocery-store-fbe378.es.us-east-1.aws.elastic.cloud/grocery_items/_search) [status:404 duration:0.045s]
jeffvestal commented 2 hours ago

GET _ml/trained_models/_stats

response

{
  "count": 4,
  "trained_model_stats": [
    {
      "model_id": ".elser_model_2_linux-x86_64",
      "model_size_stats": {
        "model_size_bytes": 274756282,
        "required_native_memory_bytes": 2101346304
      },
      "pipeline_count": 1,
      "ingest": {
        "total": {
          "count": 0,
          "time_in_millis": 0,
          "current": 0,
          "failed": 0
        },
        "pipelines": {
          ".kibana-elastic-ai-assistant-ingest-pipeline-knowledge-base": {
            "count": 0,
            "time_in_millis": 0,
            "current": 0,
            "failed": 0,
            "ingested_as_first_pipeline_in_bytes": 0,
            "produced_as_first_pipeline_in_bytes": 0,
            "processors": [
              {
                "inference": {
                  "type": "inference",
                  "stats": {
                    "count": 0,
                    "time_in_millis": 0,
                    "current": 0,
                    "failed": 0
                  }
                }
              }
            ]
          }
        }
      },
      "inference_stats": {
        "failure_count": 0,
        "inference_count": 2,
        "cache_miss_count": 0,
        "missing_all_fields_count": 0,
        "timestamp": 1729885638611
      },
      "deployment_stats": {
        "deployment_id": "elser-endpoint",
        "model_id": ".elser_model_2_linux-x86_64",
        "threads_per_allocation": 1,
        "number_of_allocations": 10,
        "adaptive_allocations": {
          "enabled": true,
          "min_number_of_allocations": 10,
          "max_number_of_allocations": 100
        },
        "queue_capacity": 1024,
        "cache_size": "262mb",
        "priority": "normal",
        "start_time": 1728864116915,
        "inference_count": 2,
        "peak_throughput_per_minute": 1,
        "nodes": [
          {
            "node": {
              "serverless": {
                "name": "serverless",
                "ephemeral_id": "serverless",
                "transport_address": "0.0.0.0:0",
                "external_id": "serverless",
                "attributes": {},
                "roles": [
                  "ml"
                ],
                "version": "9.0.0",
                "min_index_version": 8000099,
                "max_index_version": 9000000
              }
            },
            "routing_state": {
              "routing_state": "started"
            },
            "inference_count": 2,
            "average_inference_time_ms": 84,
            "average_inference_time_ms_excluding_cache_hits": 84,
            "inference_cache_hit_count": 0,
            "last_access": 1729882326131,
            "number_of_pending_requests": 0,
            "start_time": 1729875144036,
            "threads_per_allocation": 1,
            "number_of_allocations": 1,
            "peak_throughput_per_minute": 1,
            "throughput_last_minute": 0,
            "inference_cache_hit_count_last_minute": 0
          }
        ]
      }
    },
    {
      "model_id": ".multilingual-e5-small",
      "model_size_stats": {
        "model_size_bytes": 470097544,
        "required_native_memory_bytes": 7633552488
      },
      "pipeline_count": 0,
      "inference_stats": {
        "failure_count": 0,
        "inference_count": 0,
        "cache_miss_count": 0,
        "missing_all_fields_count": 0,
        "timestamp": 1729885638611
      },
      "deployment_stats": {
        "deployment_id": "my-e5-endpoint",
        "model_id": ".multilingual-e5-small",
        "threads_per_allocation": 1,
        "number_of_allocations": 1,
        "queue_capacity": 1024,
        "cache_size": "448.3mb",
        "priority": "normal",
        "start_time": 1729870121989,
        "peak_throughput_per_minute": 0,
        "nodes": [
          {
            "node": {
              "serverless": {
                "name": "serverless",
                "ephemeral_id": "serverless",
                "transport_address": "0.0.0.0:0",
                "external_id": "serverless",
                "attributes": {},
                "roles": [
                  "ml"
                ],
                "version": "9.0.0",
                "min_index_version": 8000099,
                "max_index_version": 9000000
              }
            },
            "routing_state": {
              "routing_state": "started"
            },
            "inference_count": 0,
            "inference_cache_hit_count": 0,
            "number_of_pending_requests": 0,
            "start_time": 1729875160928,
            "threads_per_allocation": 1,
            "number_of_allocations": 1,
            "peak_throughput_per_minute": 0,
            "throughput_last_minute": 0,
            "inference_cache_hit_count_last_minute": 0
          }
        ]
      }
    },
    {
      "model_id": ".multilingual-e5-small_linux-x86_64",
      "model_size_stats": {
        "model_size_bytes": 411956170,
        "required_native_memory_bytes": 1075570580
      },
      "pipeline_count": 1,
      "ingest": {
        "total": {
          "count": 0,
          "time_in_millis": 0,
          "current": 0,
          "failed": 0
        },
        "pipelines": {
          "my-e5-inference": {
            "count": 0,
            "time_in_millis": 0,
            "current": 0,
            "failed": 0,
            "ingested_as_first_pipeline_in_bytes": 0,
            "produced_as_first_pipeline_in_bytes": 0,
            "processors": [
              {
                "inference": {
                  "type": "inference",
                  "stats": {
                    "count": 0,
                    "time_in_millis": 0,
                    "current": 0,
                    "failed": 0
                  }
                }
              }
            ]
          }
        }
      },
      "inference_stats": {
        "failure_count": 0,
        "inference_count": 0,
        "cache_miss_count": 0,
        "missing_all_fields_count": 0,
        "timestamp": 1729885638611
      },
      "deployment_stats": {
        "deployment_id": "e5-ingest",
        "model_id": ".multilingual-e5-small_linux-x86_64",
        "threads_per_allocation": 1,
        "number_of_allocations": 1,
        "adaptive_allocations": {
          "enabled": true,
          "min_number_of_allocations": 1,
          "max_number_of_allocations": 32
        },
        "queue_capacity": 1024,
        "state": "started",
        "allocation_status": {
          "allocation_count": 1,
          "target_allocation_count": 1,
          "state": "fully_allocated"
        },
        "cache_size": "0b",
        "priority": "normal",
        "start_time": 1729809703261,
        "peak_throughput_per_minute": 0,
        "nodes": []
      }
    },
    {
      "model_id": "lang_ident_model_1",
      "model_size_stats": {
        "model_size_bytes": 1053992,
        "required_native_memory_bytes": 0
      },
      "pipeline_count": 0
    }
  ]
}
jeffvestal commented 2 hours ago

Stopping the elser-endpoint deployment and re-deploying it brings it back into service.

jeffvestal commented 2 hours ago

But I'm unable to create a new endpoint after restarting the elser one

PUT _inference/sparse_embedding/elser-endpoint-new-structure
{
  "service": "elasticsearch",
  "service_settings": {
    "adaptive_allocations": {
      "enabled": true,
      "min_number_of_allocations": 10,
      "max_number_of_allocations": 100
    },
    "num_threads": 1,
        "model_id": ".elser_model_2_linux-x86_64"
  }
}

response

{
  "error": {
    "root_cause": [
      {
        "type": "illegal_argument_exception",
        "reason": "not enough memory on node [gIs_zXQYT3SMtEozmJJg0w] to assign [1] allocations to deployment [elser-endpoint-new-structure]"
      }
    ],
    "type": "illegal_argument_exception",
    "reason": "not enough memory on node [gIs_zXQYT3SMtEozmJJg0w] to assign [1] allocations to deployment [elser-endpoint-new-structure]"
  },
  "status": 400
}
jeffvestal commented 1 hour ago

@wwang500 was able to reproduce

PUT _inference/text_embedding/e5-ingest { "service": "elasticsearch", "service_settings": { "num_threads": 1, "model_id": ".multilingual-e5-small_linux-x86_64", "adaptive_allocations": { "enabled": true, "min_number_of_allocations": 1, "max_number_of_allocations": 32 } } } PUT _inference/text_embedding/e5-ingest { "service": "elasticsearch", "service_settings": { "num_threads": 1, "model_id": ".multilingual-e5-small_linux-x86_64", "adaptive_allocations": { "enabled": true, "min_number_of_allocations": 1, "max_number_of_allocations": 32 } } }

PUT _inference/sparse_embedding/elser-endpoint-backup-2 { "service": "elser", "service_settings": { "adaptive_allocations": { "enabled": true, "min_number_of_allocations": 10, "max_number_of_allocations": 100 }, "num_threads": 1 } }