opensearch-project / ml-commons

ml-commons provides a set of common machine learning algorithms, e.g. k-means, or linear regression, to help developers build ML related features within OpenSearch.
Apache License 2.0
90 stars 128 forks source link

[BUG] Neural search: ArrayIndexOutOfBoundsException: Index 495884 out of bounds for length 1 #2292

Open lihuimingxs opened 5 months ago

lihuimingxs commented 5 months ago

What is the bug? A clear and concise description of the bug.

In Opensearch 2.12.0:

When using a GPU and issuing concurrent neural search requests, an ArrayIndexOutOfBoundsException is thrown.

I'm not sure if it's a concurrency issue, but what I do know is that a single request succeeds, and the exception only occurs when there are concurrent requests.

Number of concurrent requests: more than 5

Request:

GET irp_index_vec/_search
{
      "size": 100, 
      "query": {
          "bool": {
              "filter": [
                  {
                      "bool": {
                          "must": [
                              {
                                  "terms": {
                                      "stat": [
                                          1
                                      ]
                                  }
                              }
                          ]
                      }
                  }
              ],
              "must": [
                  {
                      "neural": {
                          "embeddingCnVector1": {
                              "query_text": "some content",
                              "k": 100
                          }
                      }
                  }
              ]
          }
      }
}

Exception:

[2024-03-28T19:30:27,355][WARN ][r.suppressed             ] [opensearch-cluster_manager] path: /irp_index_vec/_search, params: {typed_keys=true, index=irp_index_vec}
org.opensearch.action.search.SearchPhaseExecutionException: all shards failed
        at org.opensearch.action.search.AbstractSearchAsyncAction.onPhaseFailure(AbstractSearchAsyncAction.java:722) [opensearch-2.12.0.jar:2.12.0]
        at org.opensearch.action.search.AbstractSearchAsyncAction.executeNextPhase(AbstractSearchAsyncAction.java:379) [opensearch-2.12.0.jar:2.12.0]
        at org.opensearch.action.search.FetchSearchPhase.moveToNextPhase(FetchSearchPhase.java:298) [opensearch-2.12.0.jar:2.12.0]
        at org.opensearch.action.search.FetchSearchPhase.lambda$innerRun$1(FetchSearchPhase.java:138) [opensearch-2.12.0.jar:2.12.0]
        at org.opensearch.action.search.CountedCollector.countDown(CountedCollector.java:66) [opensearch-2.12.0.jar:2.12.0]
        at org.opensearch.action.search.CountedCollector.onFailure(CountedCollector.java:85) [opensearch-2.12.0.jar:2.12.0]
        at org.opensearch.action.search.FetchSearchPhase$2.onFailure(FetchSearchPhase.java:257) [opensearch-2.12.0.jar:2.12.0]
        at org.opensearch.action.ActionListenerResponseHandler.handleException(ActionListenerResponseHandler.java:75) [opensearch-2.12.0.jar:2.12.0]
        at org.opensearch.action.search.SearchTransportService$ConnectionCountingHandler.handleException(SearchTransportService.java:766) [opensearch-2.12.0.jar:2.12.0]
        at org.opensearch.transport.TransportService$9.handleException(TransportService.java:1725) [opensearch-2.12.0.jar:2.12.0]
        at org.opensearch.security.transport.SecurityInterceptor$RestoringTransportResponseHandler.handleException(SecurityInterceptor.java:404) [opensearch-security-2.12.0.0.jar:2.12.0.0]
        at org.opensearch.transport.TransportService$ContextRestoreResponseHandler.handleException(TransportService.java:1511) [opensearch-2.12.0.jar:2.12.0]
        at org.opensearch.transport.InboundHandler.lambda$handleException$5(InboundHandler.java:447) [opensearch-2.12.0.jar:2.12.0]
        at org.opensearch.common.util.concurrent.OpenSearchExecutors$DirectExecutorService.execute(OpenSearchExecutors.java:343) [opensearch-2.12.0.jar:2.12.0]
        at org.opensearch.transport.InboundHandler.handleException(InboundHandler.java:445) [opensearch-2.12.0.jar:2.12.0]
        at org.opensearch.transport.InboundHandler.handlerResponseError(InboundHandler.java:437) [opensearch-2.12.0.jar:2.12.0]
        at org.opensearch.transport.InboundHandler.messageReceived(InboundHandler.java:170) [opensearch-2.12.0.jar:2.12.0]
        at org.opensearch.transport.InboundHandler.inboundMessage(InboundHandler.java:127) [opensearch-2.12.0.jar:2.12.0]
        at org.opensearch.transport.TcpTransport.inboundMessage(TcpTransport.java:770) [opensearch-2.12.0.jar:2.12.0]
        at org.opensearch.transport.InboundPipeline.forwardFragments(InboundPipeline.java:175) [opensearch-2.12.0.jar:2.12.0]
        at org.opensearch.transport.InboundPipeline.doHandleBytes(InboundPipeline.java:150) [opensearch-2.12.0.jar:2.12.0]
        at org.opensearch.transport.InboundPipeline.handleBytes(InboundPipeline.java:115) [opensearch-2.12.0.jar:2.12.0]
        at org.opensearch.transport.netty4.Netty4MessageChannelHandler.channelRead(Netty4MessageChannelHandler.java:95) [transport-netty4-client-2.12.0.jar:2.12.0]
        at io.netty.channel.AbstractChannelHandlerContext.invokeChannelRead(AbstractChannelHandlerContext.java:442) [netty-transport-4.1.106.Final.jar:4.1.106.Final]
        at io.netty.channel.AbstractChannelHandlerContext.invokeChannelRead(AbstractChannelHandlerContext.java:420) [netty-transport-4.1.106.Final.jar:4.1.106.Final]
        at io.netty.channel.AbstractChannelHandlerContext.fireChannelRead(AbstractChannelHandlerContext.java:412) [netty-transport-4.1.106.Final.jar:4.1.106.Final]
        at io.netty.handler.logging.LoggingHandler.channelRead(LoggingHandler.java:280) [netty-handler-4.1.106.Final.jar:4.1.106.Final]
        at io.netty.channel.AbstractChannelHandlerContext.invokeChannelRead(AbstractChannelHandlerContext.java:442) [netty-transport-4.1.106.Final.jar:4.1.106.Final]
        at io.netty.channel.AbstractChannelHandlerContext.invokeChannelRead(AbstractChannelHandlerContext.java:420) [netty-transport-4.1.106.Final.jar:4.1.106.Final]
        at io.netty.channel.AbstractChannelHandlerContext.fireChannelRead(AbstractChannelHandlerContext.java:412) [netty-transport-4.1.106.Final.jar:4.1.106.Final]
        at io.netty.handler.ssl.SslHandler.unwrap(SslHandler.java:1475) [netty-handler-4.1.106.Final.jar:4.1.106.Final]
        at io.netty.handler.ssl.SslHandler.decodeJdkCompatible(SslHandler.java:1338) [netty-handler-4.1.106.Final.jar:4.1.106.Final]
        at io.netty.handler.ssl.SslHandler.decode(SslHandler.java:1387) [netty-handler-4.1.106.Final.jar:4.1.106.Final]
        at io.netty.handler.codec.ByteToMessageDecoder.decodeRemovalReentryProtection(ByteToMessageDecoder.java:529) [netty-codec-4.1.106.Final.jar:4.1.106.Final]
        at io.netty.handler.codec.ByteToMessageDecoder.callDecode(ByteToMessageDecoder.java:468) [netty-codec-4.1.106.Final.jar:4.1.106.Final]
        at io.netty.handler.codec.ByteToMessageDecoder.channelRead(ByteToMessageDecoder.java:290) [netty-codec-4.1.106.Final.jar:4.1.106.Final]
        at io.netty.channel.AbstractChannelHandlerContext.invokeChannelRead(AbstractChannelHandlerContext.java:444) [netty-transport-4.1.106.Final.jar:4.1.106.Final]
        at io.netty.channel.AbstractChannelHandlerContext.invokeChannelRead(AbstractChannelHandlerContext.java:420) [netty-transport-4.1.106.Final.jar:4.1.106.Final]
        at io.netty.channel.AbstractChannelHandlerContext.fireChannelRead(AbstractChannelHandlerContext.java:412) [netty-transport-4.1.106.Final.jar:4.1.106.Final]
        at io.netty.channel.DefaultChannelPipeline$HeadContext.channelRead(DefaultChannelPipeline.java:1410) [netty-transport-4.1.106.Final.jar:4.1.106.Final]
        at io.netty.channel.AbstractChannelHandlerContext.invokeChannelRead(AbstractChannelHandlerContext.java:440) [netty-transport-4.1.106.Final.jar:4.1.106.Final]
        at io.netty.channel.AbstractChannelHandlerContext.invokeChannelRead(AbstractChannelHandlerContext.java:420) [netty-transport-4.1.106.Final.jar:4.1.106.Final]
        at io.netty.channel.DefaultChannelPipeline.fireChannelRead(DefaultChannelPipeline.java:919) [netty-transport-4.1.106.Final.jar:4.1.106.Final]
        at io.netty.channel.nio.AbstractNioByteChannel$NioByteUnsafe.read(AbstractNioByteChannel.java:166) [netty-transport-4.1.106.Final.jar:4.1.106.Final]
        at io.netty.channel.nio.NioEventLoop.processSelectedKey(NioEventLoop.java:788) [netty-transport-4.1.106.Final.jar:4.1.106.Final]
        at io.netty.channel.nio.NioEventLoop.processSelectedKeysPlain(NioEventLoop.java:689) [netty-transport-4.1.106.Final.jar:4.1.106.Final]
        at io.netty.channel.nio.NioEventLoop.processSelectedKeys(NioEventLoop.java:652) [netty-transport-4.1.106.Final.jar:4.1.106.Final]
        at io.netty.channel.nio.NioEventLoop.run(NioEventLoop.java:562) [netty-transport-4.1.106.Final.jar:4.1.106.Final]
        at io.netty.util.concurrent.SingleThreadEventExecutor$4.run(SingleThreadEventExecutor.java:997) [netty-common-4.1.106.Final.jar:4.1.106.Final]
        at java.base/java.lang.Thread.run(Thread.java:1583) [?:?]
Caused by: org.opensearch.OpenSearchException$3: Index 495884 out of bounds for length 1
        at org.opensearch.OpenSearchException.guessRootCauses(OpenSearchException.java:708) ~[opensearch-core-2.12.0.jar:2.12.0]
        at org.opensearch.action.search.AbstractSearchAsyncAction.executeNextPhase(AbstractSearchAsyncAction.java:377) [opensearch-2.12.0.jar:2.12.0]
        ... 49 more
Caused by: java.lang.ArrayIndexOutOfBoundsException: Index 495884 out of bounds for length 1
        at org.apache.lucene.util.SparseFixedBitSet.get(SparseFixedBitSet.java:129) ~[lucene-core-9.9.2.jar:9.9.2 a2939784c4ca60bc28bf488b5479c02fc2e5e22c - 2024-01-25 09:51:09]
        at org.opensearch.search.fetch.FetchPhase.findRootDocumentIfNested(FetchPhase.java:283) ~[opensearch-2.12.0.jar:2.12.0]
        at org.opensearch.search.fetch.FetchPhase.prepareHitContext(FetchPhase.java:299) ~[opensearch-2.12.0.jar:2.12.0]
        at org.opensearch.search.fetch.FetchPhase.execute(FetchPhase.java:172) ~[opensearch-2.12.0.jar:2.12.0]
        at org.opensearch.search.SearchService.lambda$executeFetchPhase$3(SearchService.java:782) ~[opensearch-2.12.0.jar:2.12.0]
        at org.opensearch.action.ActionRunnable.lambda$supply$0(ActionRunnable.java:74) ~[opensearch-2.12.0.jar:2.12.0]
        at org.opensearch.action.ActionRunnable$2.doRun(ActionRunnable.java:89) ~[opensearch-2.12.0.jar:2.12.0]
        at org.opensearch.common.util.concurrent.AbstractRunnable.run(AbstractRunnable.java:52) ~[opensearch-2.12.0.jar:2.12.0]
        at org.opensearch.threadpool.TaskAwareRunnable.doRun(TaskAwareRunnable.java:78) ~[opensearch-2.12.0.jar:2.12.0]
        at org.opensearch.common.util.concurrent.AbstractRunnable.run(AbstractRunnable.java:52) ~[opensearch-2.12.0.jar:2.12.0]
        at org.opensearch.common.util.concurrent.TimedRunnable.doRun(TimedRunnable.java:59) ~[opensearch-2.12.0.jar:2.12.0]
        at org.opensearch.common.util.concurrent.ThreadContext$ContextPreservingAbstractRunnable.doRun(ThreadContext.java:913) ~[opensearch-2.12.0.jar:2.12.0]
        at org.opensearch.common.util.concurrent.AbstractRunnable.run(AbstractRunnable.java:52) ~[opensearch-2.12.0.jar:2.12.0]
        at java.base/java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1144) ~[?:?]
        at java.base/java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:642) ~[?:?]
        ... 1 more

How can one reproduce the bug? Steps to reproduce the behavior:

  1. Create neural retrieval concurrent requests
  2. View cluster startup logs
  3. See the error logs

What is the expected behavior? A clear and concise description of what you expected to happen.

Neural search should work correctly under concurrent requests when using a GPU.

What is your host/environment?

Do you have any screenshots? If applicable, add screenshots to help explain your problem.

Do you have any additional context? Add any other context about the problem.

lihuimingxs commented 5 months ago

I'm not sure if it's a client issue, because the OpenSearch Java client I used is version 2.11.1, but I guess it's not.

austintlee commented 5 months ago

Does this not happen if you don't use a GPU? It doesn't seem related to GPU usage.

lihuimingxs commented 5 months ago

> Does this not happen if you don't use a GPU? It doesn't seem related to GPU usage.

Yes, I have not encountered this exception while using the CPU.

ylwu-amzn commented 4 months ago

@lihuimingxs Can you provide more details? What GPU are you using? What's your cluster setting like how many data nodes, ML nodes, are you using GPU instance as data nodes ?

lihuimingxs commented 4 months ago

> @lihuimingxs Can you provide more details? What GPU are you using? What's your cluster setting like how many data nodes, ML nodes, are you using GPU instance as data nodes ?

@ylwu-amzn Sure! I have 6 data nodes (including 1 master node) and 2 ML nodes (GPUs). The ML nodes are solely utilized for vector computations and do not store any data.

Note: Later, due to business requirements, I expanded the number of data nodes to 8 and ML nodes to 4, yet the issue persisted. Hence, I suspect the issue may not be closely related to the number of nodes. I hope my additional explanation provides helpful information.

ML nodes environment:

index:

PUT irp_cre_vec_20240425
{
  "aliases": {
    "irp_cre_vec": {}
  },
  "mappings": {
    "_source": {
      "excludes": [
        "embeddingCnContent1",
        "embeddingCnContent2"
      ]
    },
    "properties": {
      "embeddingCnContent1": {
        "type": "text",
        "index": false
      },
      "embeddingCnContent2": {
        "type": "text",
        "index": false
      },
      "embeddingCnVector1": {
        "type": "knn_vector",
        "dimension": 1024,
        "method": {
          "engine": "faiss",
          "space_type": "l2",
          "name": "hnsw",
          "parameters": {}
        }
      },
      "embeddingCnVector2": {
        "type": "knn_vector",
        "dimension": 1024,
        "method": {
          "engine": "faiss",
          "space_type": "l2",
          "name": "hnsw",
          "parameters": {}
        }
      },
      "id": {
        "type": "keyword"
      }
    }
  },
  "settings": {
    "index": {
      "replication": {
        "type": "DOCUMENT"
      },
      "mapping": {
        "total_fields": {
          "limit": "1000"
        }
      },
      "search": {
        "default_pipeline": "cre_v2_search_model_pipeline"
      },
      "number_of_shards": "1",
      "max_result_window": "10000",
      "default_pipeline": "cre_v2_convert_pipeline",
      "knn": "true",
      "number_of_replicas": "0"
    }
  }
}

opensearch.yml: Note: The Opensearch version is 2.12.0, and the /home/opensearch/opensearch-2.9.0 path in the configuration file indicates only the opensearch installation path, not the Opensearch version.

# ======================== OpenSearch Configuration =========================
#
# NOTE: OpenSearch comes with reasonable defaults for most settings.
#       Before you set out to tweak and tune the configuration, make sure you
#       understand what you are trying to accomplish and the consequences.
#
# The primary way of configuring a node is via this file. This template lists
# the most important settings you may want to configure for a production cluster.
#
# Please consult the documentation for further information on configuration options:
# https://www.opensearch.org
#
# ---------------------------------- Cluster -----------------------------------
#
# Use a descriptive name for your cluster:
#
#cluster.name: my-application
cluster.name: opensearch-cluster
#
# ------------------------------------ Node ------------------------------------
#
# Use a descriptive name for the node:
#
#node.name: node-1
node.name: opensearch-cluster_manager
node.roles: [ cluster_manager, data, ingest ]
#
# Add custom attributes to the node:
#
#node.attr.rack: r1
node.attr.crmTag: "vecPosition"
node.attr.ctsvec: "ctsvec"
#
# ----------------------------------- Paths ------------------------------------
#
# Path to directory where to store the data (separate multiple locations by comma):
#
#path.data: /path/to/data
#
# Path to log files:
#
#path.logs: /path/to/logs
#
# Path to snapshot files:
path.repo: ["/mnt/sfs_turbo"]
# ----------------------------------- Memory -----------------------------------
#
# Lock the memory on startup:
#
#bootstrap.memory_lock: true
#
# Make sure that the heap size is set to about half the memory available
# on the system and that the owner of the process is allowed to use this
# limit.
#
# OpenSearch performs poorly when the system is swapping the memory.
#
# ---------------------------------- Network -----------------------------------
#
# Set the bind address to a specific IP (IPv4 or IPv6):
#
#network.host: 192.168.0.1
network.host: 0.0.0.0
#
# Set a custom port for HTTP:
#
http.port: 9200
#
# For more information, consult the network module documentation.
#
# --------------------------------- Discovery ----------------------------------
#
# Pass an initial list of hosts to perform discovery when this node is started:
# The default list of hosts is ["127.0.0.1", "[::1]"]
#
#discovery.seed_hosts: ["host1", "host2"]
discovery.seed_hosts: ["192.168.1.6", "192.168.1.7", "192.168.1.8", "192.168.1.9", "192.168.1.10", "192.168.1.11"]
#
# Bootstrap the cluster using an initial set of cluster-manager-eligible nodes:
#
#cluster.initial_cluster_manager_nodes: ["node-1", "node-2"]
cluster.initial_cluster_manager_nodes: ["opensearch-cluster_manager"]
#
# For more information, consult the discovery and cluster formation module documentation.
#
# ---------------------------------- Gateway -----------------------------------
#
# Block initial recovery after a full cluster restart until N nodes are started:
#
#gateway.recover_after_nodes: 3
#
# For more information, consult the gateway module documentation.
#
# ---------------------------------- Various -----------------------------------
#
# Require explicit names when deleting indices:
#
#action.destructive_requires_name: true
#
# ---------------------------------- Remote Store -----------------------------------
# Controls whether cluster imposes index creation only with remote store enabled
# cluster.remote_store.enabled: true
#
# Repository to use for segment upload while enforcing remote store for an index
# cluster.remote_store.repository: my-repo-1
#
# Controls whether cluster imposes index creation only with translog remote store enabled
# cluster.remote_store.translog.enabled: true
#
# Repository to use for translog upload while enforcing remote store for an index
# cluster.remote_store.translog.repository: my-repo-1
#
# ---------------------------------- Experimental Features -----------------------------------
#
# Gates the visibility of the experimental segment replication features until they are production ready.
#
#opensearch.experimental.feature.segment_replication_experimental.enabled: false
#
#
# Gates the visibility of the index setting that allows persisting data to remote store along with local disk.
# Once the feature is ready for production release, this feature flag can be removed.
#
#opensearch.experimental.feature.remote_store.enabled: false
#
#
# Gates the functionality of a new parameter to the snapshot restore API
# that allows for creation of a new index type that searches a snapshot
# directly in a remote repository without restoring all index data to disk
# ahead of time.
#
#opensearch.experimental.feature.searchable_snapshot.enabled: false
#
#
# Gates the functionality of enabling extensions to work with OpenSearch.
# This feature enables applications to extend features of OpenSearch outside of
# the core.
#
#opensearch.experimental.feature.extensions.enabled: false
#
#
# Gates the concurrent segment search feature. This feature enables concurrent segment search in a separate
# index searcher threadpool.
#
#opensearch.experimental.feature.concurrent_segment_search.enabled: false

######## Start OpenSearch Security Demo Configuration ########
# WARNING: revise all the lines below before you go into production
#plugins.security.ssl.transport.pemcert_filepath: esnode.pem
#plugins.security.ssl.transport.pemkey_filepath: esnode-key.pem
#plugins.security.ssl.transport.pemtrustedcas_filepath: root-ca.pem
#plugins.security.ssl.transport.enforce_hostname_verification: false
#plugins.security.ssl.http.enabled: false
#plugins.security.ssl.http.pemcert_filepath: esnode.pem
#plugins.security.ssl.http.pemkey_filepath: esnode-key.pem
#plugins.security.ssl.http.pemtrustedcas_filepath: root-ca.pem
#plugins.security.allow_unsafe_democertificates: true
#plugins.security.allow_default_init_securityindex: true
#plugins.security.authcz.admin_dn:
#  - CN=,OU=client,O=client,L=test, C=de

plugins.security.disabled: false
plugins.security.ssl.transport.pemcert_filepath: /home/opensearch/opensearch-2.9.0/config/node1.pem
plugins.security.ssl.transport.pemkey_filepath: /home/opensearch/opensearch-2.9.0/config/node1-key.pem
plugins.security.ssl.transport.pemtrustedcas_filepath: /home/opensearch/opensearch-2.9.0/config/root-ca.pem
plugins.security.ssl.transport.enforce_hostname_verification: false
#plugins.security.ssl.http.enabled: true
plugins.security.ssl.http.enabled: false
plugins.security.ssl.http.pemcert_filepath: /home/opensearch/opensearch-2.9.0/config/node1.pem
plugins.security.ssl.http.pemkey_filepath: /home/opensearch/opensearch-2.9.0/config/node1-key.pem
plugins.security.ssl.http.pemtrustedcas_filepath: /home/opensearch/opensearch-2.9.0/config/root-ca.pem
plugins.security.allow_unsafe_democertificates: true
plugins.security.allow_default_init_securityindex: true
plugins.security.authcz.admin_dn:
  - CN=admin,OU=Taa,O=Carrer,L=Beijing,ST=Beijing,C=CN
plugins.security.nodes_dn:
  - CN=100.125.1.250,OU=Taa,O=Carrer,L=Beijing,ST=Beijing,C=CN

plugins.security.audit.type: internal_opensearch
plugins.security.enable_snapshot_restore_privilege: true
plugins.security.check_snapshot_restore_write_privileges: true
plugins.security.restapi.roles_enabled: ["all_access", "security_rest_api_access"]
plugins.security.system_indices.enabled: true
plugins.security.system_indices.indices: [".plugins-ml-config", ".plugins-ml-connector", ".plugins-ml-model-group", ".plugins-ml-model", ".plugins-ml-task", ".opendistro-alerting-config", ".opendistro-alerting-alert*", ".opendistro-anomaly-results*", ".opendistro-anomaly-detector*", ".opendistro-anomaly-checkpoints", ".opendistro-anomaly-detection-state", ".opendistro-reports-*", ".opensearch-notifications-*", ".opensearch-notebooks", ".opensearch-observability", ".ql-datasources", ".opendistro-asynchronous-search-response*", ".replication-metadata-store", ".opensearch-knn-models"]
node.max_local_storage_nodes: 3
######## End OpenSearch Security Demo Configuration ########
dblock commented 3 months ago

Catch All Triage - 1 2 3 4 5 6