Open jeffvestal opened 2 hours ago
GET _ml/trained_models/_stats
response
{
"count": 4,
"trained_model_stats": [
{
"model_id": ".elser_model_2_linux-x86_64",
"model_size_stats": {
"model_size_bytes": 274756282,
"required_native_memory_bytes": 2101346304
},
"pipeline_count": 1,
"ingest": {
"total": {
"count": 0,
"time_in_millis": 0,
"current": 0,
"failed": 0
},
"pipelines": {
".kibana-elastic-ai-assistant-ingest-pipeline-knowledge-base": {
"count": 0,
"time_in_millis": 0,
"current": 0,
"failed": 0,
"ingested_as_first_pipeline_in_bytes": 0,
"produced_as_first_pipeline_in_bytes": 0,
"processors": [
{
"inference": {
"type": "inference",
"stats": {
"count": 0,
"time_in_millis": 0,
"current": 0,
"failed": 0
}
}
}
]
}
}
},
"inference_stats": {
"failure_count": 0,
"inference_count": 2,
"cache_miss_count": 0,
"missing_all_fields_count": 0,
"timestamp": 1729885638611
},
"deployment_stats": {
"deployment_id": "elser-endpoint",
"model_id": ".elser_model_2_linux-x86_64",
"threads_per_allocation": 1,
"number_of_allocations": 10,
"adaptive_allocations": {
"enabled": true,
"min_number_of_allocations": 10,
"max_number_of_allocations": 100
},
"queue_capacity": 1024,
"cache_size": "262mb",
"priority": "normal",
"start_time": 1728864116915,
"inference_count": 2,
"peak_throughput_per_minute": 1,
"nodes": [
{
"node": {
"serverless": {
"name": "serverless",
"ephemeral_id": "serverless",
"transport_address": "0.0.0.0:0",
"external_id": "serverless",
"attributes": {},
"roles": [
"ml"
],
"version": "9.0.0",
"min_index_version": 8000099,
"max_index_version": 9000000
}
},
"routing_state": {
"routing_state": "started"
},
"inference_count": 2,
"average_inference_time_ms": 84,
"average_inference_time_ms_excluding_cache_hits": 84,
"inference_cache_hit_count": 0,
"last_access": 1729882326131,
"number_of_pending_requests": 0,
"start_time": 1729875144036,
"threads_per_allocation": 1,
"number_of_allocations": 1,
"peak_throughput_per_minute": 1,
"throughput_last_minute": 0,
"inference_cache_hit_count_last_minute": 0
}
]
}
},
{
"model_id": ".multilingual-e5-small",
"model_size_stats": {
"model_size_bytes": 470097544,
"required_native_memory_bytes": 7633552488
},
"pipeline_count": 0,
"inference_stats": {
"failure_count": 0,
"inference_count": 0,
"cache_miss_count": 0,
"missing_all_fields_count": 0,
"timestamp": 1729885638611
},
"deployment_stats": {
"deployment_id": "my-e5-endpoint",
"model_id": ".multilingual-e5-small",
"threads_per_allocation": 1,
"number_of_allocations": 1,
"queue_capacity": 1024,
"cache_size": "448.3mb",
"priority": "normal",
"start_time": 1729870121989,
"peak_throughput_per_minute": 0,
"nodes": [
{
"node": {
"serverless": {
"name": "serverless",
"ephemeral_id": "serverless",
"transport_address": "0.0.0.0:0",
"external_id": "serverless",
"attributes": {},
"roles": [
"ml"
],
"version": "9.0.0",
"min_index_version": 8000099,
"max_index_version": 9000000
}
},
"routing_state": {
"routing_state": "started"
},
"inference_count": 0,
"inference_cache_hit_count": 0,
"number_of_pending_requests": 0,
"start_time": 1729875160928,
"threads_per_allocation": 1,
"number_of_allocations": 1,
"peak_throughput_per_minute": 0,
"throughput_last_minute": 0,
"inference_cache_hit_count_last_minute": 0
}
]
}
},
{
"model_id": ".multilingual-e5-small_linux-x86_64",
"model_size_stats": {
"model_size_bytes": 411956170,
"required_native_memory_bytes": 1075570580
},
"pipeline_count": 1,
"ingest": {
"total": {
"count": 0,
"time_in_millis": 0,
"current": 0,
"failed": 0
},
"pipelines": {
"my-e5-inference": {
"count": 0,
"time_in_millis": 0,
"current": 0,
"failed": 0,
"ingested_as_first_pipeline_in_bytes": 0,
"produced_as_first_pipeline_in_bytes": 0,
"processors": [
{
"inference": {
"type": "inference",
"stats": {
"count": 0,
"time_in_millis": 0,
"current": 0,
"failed": 0
}
}
}
]
}
}
},
"inference_stats": {
"failure_count": 0,
"inference_count": 0,
"cache_miss_count": 0,
"missing_all_fields_count": 0,
"timestamp": 1729885638611
},
"deployment_stats": {
"deployment_id": "e5-ingest",
"model_id": ".multilingual-e5-small_linux-x86_64",
"threads_per_allocation": 1,
"number_of_allocations": 1,
"adaptive_allocations": {
"enabled": true,
"min_number_of_allocations": 1,
"max_number_of_allocations": 32
},
"queue_capacity": 1024,
"state": "started",
"allocation_status": {
"allocation_count": 1,
"target_allocation_count": 1,
"state": "fully_allocated"
},
"cache_size": "0b",
"priority": "normal",
"start_time": 1729809703261,
"peak_throughput_per_minute": 0,
"nodes": []
}
},
{
"model_id": "lang_ident_model_1",
"model_size_stats": {
"model_size_bytes": 1053992,
"required_native_memory_bytes": 0
},
"pipeline_count": 0
}
]
}
Stopping the `elser-endpoint`
deployment and re-deploying it brings it back into service.
But I'm unable to create a new endpoint after restarting the elser one
PUT _inference/sparse_embedding/elser-endpoint-new-structure
{
"service": "elasticsearch",
"service_settings": {
"adaptive_allocations": {
"enabled": true,
"min_number_of_allocations": 10,
"max_number_of_allocations": 100
},
"num_threads": 1,
"model_id": ".elser_model_2_linux-x86_64"
}
}
response
{
"error": {
"root_cause": [
{
"type": "illegal_argument_exception",
"reason": "not enough memory on node [gIs_zXQYT3SMtEozmJJg0w] to assign [1] allocations to deployment [elser-endpoint-new-structure]"
}
],
"type": "illegal_argument_exception",
"reason": "not enough memory on node [gIs_zXQYT3SMtEozmJJg0w] to assign [1] allocations to deployment [elser-endpoint-new-structure]"
},
"status": 400
}
@wwang500 was able to reproduce
PUT _inference/text_embedding/my-e5-model
{
"service": "elasticsearch",
"service_settings": {
"num_allocations": 1,
"num_threads": 1,
"model_id": ".multilingual-e5-small"
}
}
PUT _inference/text_embedding/e5-ingest { "service": "elasticsearch", "service_settings": { "num_threads": 1, "model_id": ".multilingual-e5-small_linux-x86_64", "adaptive_allocations": { "enabled": true, "min_number_of_allocations": 1, "max_number_of_allocations": 32 } } } PUT _inference/text_embedding/e5-ingest { "service": "elasticsearch", "service_settings": { "num_threads": 1, "model_id": ".multilingual-e5-small_linux-x86_64", "adaptive_allocations": { "enabled": true, "min_number_of_allocations": 1, "max_number_of_allocations": 32 } } }
PUT _inference/sparse_embedding/elser-endpoint-backup-2 { "service": "elser", "service_settings": { "adaptive_allocations": { "enabled": true, "min_number_of_allocations": 10, "max_number_of_allocations": 100 }, "num_threads": 1 } }
Elasticsearch Version
serverless
Installed Plugins
No response
Java Version
bundled
OS Version
serverless
Problem Description
I have an Inference endpoint up and running. It has been working fine for several days. The cluster suddenly started responding with
404
error when trying to call the inference endpoint.
Steps to Reproduce
response
GET _inference/elser-endpoint
response
GET _ml/trained_models/.elser_model_2_linux-x86_64/_stats
response
Though what's weird is I was trying to deploy an ELSER model
response
serverless prod -
fbe378fd1dab4affa7c981c90a03f440
Logs (if relevant)
app log last success (central us time)
then on next call