elastic / logstash

Logstash - transport and process your logs, events, or other data
https://www.elastic.co/products/logstash
Other
72 stars 3.5k forks source link

Persistent queue corruption in 6.2.4 and 6.3.2 #9961

Open mattfield opened 6 years ago

mattfield commented 6 years ago

It looks like we might have been bitten by https://github.com/elastic/logstash/issues/9483 in our running of Logstash ingest Filebeat data.

We noticed that all ingestion into one of our aggregation clusters had ceased yesterday evening. Looking first at our fleet of Beats, all that we checked were showing the same error:

2018-09-05T16:26:16Z ERR Connecting error publishing events (retrying): EOF
2018-09-05T16:26:44Z INFO Non-zero metrics in the last 30s: libbeat.logstash.publish.read_errors=1 libbeat.logstash.publish.write_bytes=168
2018-09-05T16:27:14Z INFO No non-zero metrics in the last 30s

After verifying that no config changes had been made to either Beats or Logstash, we checked the events and queue details of one of our Logstash boxes, which had been active up to a certain point, but had no events in their PQ:

"events" : {
    "in" : 829256604,
    "filtered" : 829256604,
    "out" : 829256604,
    "duration_in_millis" : 4274874923,
    "queue_push_duration_in_millis" : 493867391
  }
      "queue" : {
        "events" : 0,
        "type" : "persisted",
        "capacity" : {
          "queue_size_in_bytes" : 19237016,
          "page_capacity_in_bytes" : 67108864,
          "max_queue_size_in_bytes" : 4294967296,
          "max_unread_events" : 0
        }

Their destination ES cluster had been throwing the following, though when checked all it's thread pools were empty:

{"level":"INFO","loggerName":"logstash.outputs.elasticsearch","timeMillis":1536165589715,"thread":"Ruby-0-Thread-38@[main]>worker3: :1","logEvent":{"message":"retrying failed action with response code: 429 ({\"type\"=>\"es_rejected_execution_exception\", \"reason\"=>\"rejected execution of org.elasticsearch.transport.TransportService$7@4098e471 on EsThreadPoolExecutor[bulk, queue capacity = 200, org.elasticsearch.common.util.concurrent.EsThreadPoolExecutor@6c35853e[Running, pool size = 16, active threads = 16, queued tasks = 475, completed tasks = 944956903]]\"})"}}

When we restarted one of the Logstash processes, we saw the following error:

{
  "level": "ERROR",
  "loggerName": "logstash.pipeline",
  "timeMillis": 1536166083887,
  "thread": "Ruby-0-Thread-1: /app/logstash/lib/bootstrap/environment.rb:6",
  "logEvent": {
    "message": "Logstash failed to create queue",
    "pipeline_id": "main",
    "exception": "java.io.EOFException: null",
    "backtrace": [
      "org/logstash/common/io/InputStreamStreamInput.java:19:in `readByte'",
      "org/logstash/common/io/BufferedChecksumStreamInput.java:28:in `readByte'",
      "org/logstash/common/io/StreamInput.java:42:in `readShort'",
      "org/logstash/ackedqueue/io/FileCheckpointIO.java:104:in `read'",
      "org/logstash/ackedqueue/io/FileCheckpointIO.java:53:in `read'",
      "org/logstash/ackedqueue/Queue.java:179:in `open'",
      "org/logstash/ackedqueue/ext/AbstractJRubyQueue.java:88:in `open'",
      "org/logstash/ackedqueue/ext/AbstractJRubyQueue$INVOKER$i$0$0$ruby_open.gen:-1:in `call'",
      "/app/logstash/logstash-core/lib/logstash/util/wrapped_acked_queue.rb:39:in `with_queue'",
      "app/logstash/logstash_minus_core/lib/logstash/util//app/logstash/logstash-core/lib/logstash/util/wrapped_acked_queue.rb:-1:in `RUBY$method$with_queue$0$__VARARGS__'",
      "/app/logstash/logstash-core/lib/logstash/util/wrapped_acked_queue.rb:28:in `create_file_based'",
      "app/logstash/logstash_minus_core/lib/logstash/util//app/logstash/logstash-core/lib/logstash/util/wrapped_acked_queue.rb:-1:in `RUBY$method$create_file_based$0$__VARARGS__'",
      "/app/logstash/logstash-core/lib/logstash/queue_factory.rb:29:in `create'",
      "app/logstash/logstash_minus_core/lib/logstash//app/logstash/logstash-core/lib/logstash/queue_factory.rb:-1:in `RUBY$method$create$0$__VARARGS__'",
      "/app/logstash/logstash-core/lib/logstash/pipeline.rb:172:in `initialize'",
      "org/jruby/RubyClass.java:1022:in `newInstance'",
      "org/jruby/RubyClass$INVOKER$i$newInstance.gen:-1:in `call'",
      "/app/logstash/logstash-core/lib/logstash/pipeline_action/create.rb:40:in `execute'",
      "app/logstash/logstash_minus_core/lib/logstash/pipeline_action//app/logstash/logstash-core/lib/logstash/pipeline_action/create.rb:-1:in `RUBY$method$execute$0$__VARARGS__'",
      "/app/logstash/logstash-core/lib/logstash/agent.rb:315:in `block in converge_state'",
      "/app/logstash/logstash-core/lib/logstash/agent.rb:141:in `with_pipelines'",
      "app/logstash/logstash_minus_core/lib/logstash//app/logstash/logstash-core/lib/logstash/agent.rb:-1:in `RUBY$method$with_pipelines$0$__VARARGS__'",
      "/app/logstash/logstash-core/lib/logstash/agent.rb:312:in `block in converge_state'",
      "org/jruby/RubyArray.java:1734:in `each'",
      "org/jruby/RubyArray$INVOKER$i$0$0$each.gen:-1:in `call'",
      "/app/logstash/logstash-core/lib/logstash/agent.rb:299:in `converge_state'",
      "app/logstash/logstash_minus_core/lib/logstash//app/logstash/logstash-core/lib/logstash/agent.rb:-1:in `RUBY$method$converge_state$0$__VARARGS__'",
      "/app/logstash/logstash-core/lib/logstash/agent.rb:166:in `block in converge_state_and_update'",
      "/app/logstash/logstash-core/lib/logstash/agent.rb:141:in `with_pipelines'",
      "app/logstash/logstash_minus_core/lib/logstash//app/logstash/logstash-core/lib/logstash/agent.rb:-1:in `RUBY$method$with_pipelines$0$__VARARGS__'",
      "/app/logstash/logstash-core/lib/logstash/agent.rb:164:in `converge_state_and_update'",
      "app/logstash/logstash_minus_core/lib/logstash//app/logstash/logstash-core/lib/logstash/agent.rb:-1:in `RUBY$method$converge_state_and_update$0$__VARARGS__'",
      "/app/logstash/logstash-core/lib/logstash/agent.rb:90:in `execute'",
      "app/logstash/logstash_minus_core/lib/logstash//app/logstash/logstash-core/lib/logstash/agent.rb:-1:in `RUBY$method$execute$0$__VARARGS__'",
      "/app/logstash/logstash-core/lib/logstash/runner.rb:348:in `block in execute'",
      "org/jruby/RubyProc.java:289:in `call'",
      "org/jruby/RubyProc.java:273:in `call19'",
      "org/jruby/RubyProc$INVOKER$i$0$0$call19.gen:-1:in `call'",
      "/app/logstash/vendor/bundle/jruby/2.3.0/gems/stud-0.0.23/lib/stud/task.rb:24:in `block in initialize'",
      "org/jruby/RubyProc.java:289:in `call'",
      "org/jruby/RubyProc.java:246:in `call'",
      "java/lang/Thread.java:748:in `run'"
    ]
  }
}

which led us to https://github.com/elastic/logstash/issues/9483 and https://github.com/elastic/logstash/issues/9220. Taking advice from the former, we tried emptying out the contents of our queue directories, which seemed to at least get the processes back to functional and stable. However, when restarting multiple other boxes in the fleet, they went into restart loops, throwing the following:

{
    "level": "ERROR",
    "loggerName": "logstash.agent",
    "timeMillis": 1536169004246,
    "thread": "Ruby-0-Thread-1: /app/logstash/lib/bootstrap/environment.rb:6",
    "logEvent": {
        "message": "Failed to execute action",
        "action": {
            "metaClass": {
                "metaClass": {
                    "metaClass": {
                        "action": "LogStash::PipelineAction::Create/pipeline_id:main",
                        "exception": "IOError",
                        "message": "java.io.IOException: Page file size is too small to hold elements",
                        "backtrace": ["org/logstash/ackedqueue/io/MmapPageIO.java:57:in `mapFile'", "org/logstash/ackedqueue/io/MmapPageIO.java:41:in `recover'", "org/logstash/ackedqueue/Queue.java:205:in `open'", "org/logstash/ackedqueue/ext/AbstractJRubyQueue.java:88:in `open'", "org/logstash/ackedqueue/ext/AbstractJRubyQueue$INVOKER$i$0$0$ruby_open.gen:-1:in `call'", "/app/logstash/logstash-core/lib/logstash/util/wrapped_acked_queue.rb:39:in `with_queue'", "app/logstash/logstash_minus_core/lib/logstash/util//app/logstash/logstash-core/lib/logstash/util/wrapped_acked_queue.rb:-1:in `RUBY$method$with_queue$0$__VARARGS__'", "/app/logstash/logstash-core/lib/logstash/util/wrapped_acked_queue.rb:28:in `create_file_based'", "app/logstash/logstash_minus_core/lib/logstash/util//app/logstash/logstash-core/lib/logstash/util/wrapped_acked_queue.rb:-1:in `RUBY$method$create_file_based$0$__VARARGS__'", "/app/logstash/logstash-core/lib/logstash/queue_factory.rb:29:in `create'", "app/logstash/logstash_minus_core/lib/logstash//app/logstash/logstash-core/lib/logstash/queue_factory.rb:-1:in `RUBY$method$create$0$__VARARGS__'", "/app/logstash/logstash-core/lib/logstash/pipeline.rb:172:in `initialize'", "org/jruby/RubyClass.java:1022:in `newInstance'", "org/jruby/RubyClass$INVOKER$i$newInstance.gen:-1:in `call'", "/app/logstash/logstash-core/lib/logstash/pipeline_action/create.rb:40:in `execute'", "app/logstash/logstash_minus_core/lib/logstash/pipeline_action//app/logstash/logstash-core/lib/logstash/pipeline_action/create.rb:-1:in `RUBY$method$execute$0$__VARARGS__'", "/app/logstash/logstash-core/lib/logstash/agent.rb:315:in `block in converge_state'", "/app/logstash/logstash-core/lib/logstash/agent.rb:141:in `with_pipelines'", "app/logstash/logstash_minus_core/lib/logstash//app/logstash/logstash-core/lib/logstash/agent.rb:-1:in `RUBY$method$with_pipelines$0$__VARARGS__'", "/app/logstash/logstash-core/lib/logstash/agent.rb:312:in `block in converge_state'", "org/jruby/RubyArray.java:1734:in `each'", "org/jruby/RubyArray$INVOKER$i$0$0$each.gen:-1:in `call'", "/app/logstash/logstash-core/lib/logstash/agent.rb:299:in `converge_state'", "app/logstash/logstash_minus_core/lib/logstash//app/logstash/logstash-core/lib/logstash/agent.rb:-1:in `RUBY$method$converge_state$0$__VARARGS__'", "/app/logstash/logstash-core/lib/logstash/agent.rb:166:in `block in converge_state_and_update'", "/app/logstash/logstash-core/lib/logstash/agent.rb:141:in `with_pipelines'", "app/logstash/logstash_minus_core/lib/logstash//app/logstash/logstash-core/lib/logstash/agent.rb:-1:in `RUBY$method$with_pipelines$0$__VARARGS__'", "/app/logstash/logstash-core/lib/logstash/agent.rb:164:in `converge_state_and_update'", "app/logstash/logstash_minus_core/lib/logstash//app/logstash/logstash-core/lib/logstash/agent.rb:-1:in `RUBY$method$converge_state_and_update$0$__VARARGS__'", "/app/logstash/logstash-core/lib/logstash/agent.rb:90:in `execute'", "app/logstash/logstash_minus_core/lib/logstash//app/logstash/logstash-core/lib/logstash/agent.rb:-1:in `RUBY$method$execute$0$__VARARGS__'", "/app/logstash/logstash-core/lib/logstash/runner.rb:348:in `block in execute'", "org/jruby/RubyProc.java:289:in `call'", "org/jruby/RubyProc.java:273:in `call19'", "org/jruby/RubyProc$INVOKER$i$0$0$call19.gen:-1:in `call'", "/app/logstash/vendor/bundle/jruby/2.3.0/gems/stud-0.0.23/lib/stud/task.rb:24:in `block in initialize'", "org/jruby/RubyProc.java:289:in `call'", "org/jruby/RubyProc.java:246:in `call'", "java/lang/Thread.java:748:in `run'"]
                    }
                }
            }
        }
    }
}

This then led us to https://github.com/elastic/logstash/issues/8098. In the end, we had to terminate and rebuild our EC2 boxes to get the fleet back to operational.

Any additional insight would be great here.

danhermann commented 6 years ago

@mattfield, sorry for the problems you've experienced here. A significant number of PQ bugs were fixed for the 6.3.0 release and we've had no reports of PQ problems on that or later releases so I'd strongly recommend the 6.3.0 or later release if you're using PQ.

gservat commented 5 years ago

Hi @danhermann. We're running 6.3.2 and I just tried recreating our logstash containers which resulted in the same Page file size is too small to hold elements error message. Any thoughts?

evanfreed commented 5 years ago

Saw that same message again on our 6.3.2 staging logstash servers 😞

henrikno commented 5 years ago

This also happened in staging which is running LS & ES 6.6.2:

{"level":"ERROR","loggerName":"org.logstash.execution.AbstractPipelineExt","timeMillis":1557186424796,"thread":"Converge PipelineAction::Create<main>","logEvent":{"message":"Logstash failed to create queue."}}
{"level":"ERROR","loggerName":"logstash.agent","timeMillis":1557186424816,"thread":"Converge PipelineAction::Create<main>","logEvent":{"message":"Failed to execute action","action":{"metaClass":{"metaClass":{"metaClass":{"action":"PipelineAction::Create<main>","exception":"Java::JavaLang::IllegalStateException","message":"java.io.IOException: Page file size is too small to hold elements","backtrace":["org.logstash.execution.AbstractPipelineExt.openQueue(AbstractPipelineExt.java:170)","org.logstash.execution.AbstractPipelineExt$INVOKER$i$0$0$openQueue.call(AbstractPipelineExt$INVOKER$i$0$0$openQueue.gen)","org.jruby.internal.runtime.methods.JavaMethod$JavaMethodN.call(JavaMethod.java:737)","org.jruby.ir.targets.InvokeSite.invoke(InvokeSite.java:145)","app.logstash.logstash_minus_core.lib.logstash.pipeline.RUBY$method$initialize$0(/app/logstash/logstash-core/lib/logstash/pipeline.rb:91)","org.jruby.internal.runtime.methods.CompiledIRMethod.call(CompiledIRMethod.java:77)","org.jruby.internal.runtime.methods.MixedModeIRMethod.call(MixedModeIRMethod.java:93)","org.jruby.runtime.callsite.CachingCallSite.cacheAndCall(CachingCallSite.java:298)","org.jruby.runtime.callsite.CachingCallSite.callBlock(CachingCallSite.java:79)","org.jruby.runtime.callsite.CachingCallSite.call(CachingCallSite.java:83)","org.jruby.RubyClass.newInstance(RubyClass.java:1022)","org.jruby.RubyClass$INVOKER$i$newInstance.call(RubyClass$INVOKER$i$newInstance.gen)","org.jruby.ir.targets.InvokeSite.invoke(InvokeSite.java:145)","app.logstash.logstash_minus_core.lib.logstash.pipeline_action.create.RUBY$block$execute$1(/app/logstash/logstash-core/lib/logstash/pipeline_action/create.rb:43)","org.jruby.runtime.CompiledIRBlockBody.callDirect(CompiledIRBlockBody.java:145)","org.jruby.runtime.IRBlockBody.call(IRBlockBody.java:71)","org.jruby.runtime.Block.call(Block.java:124)","org.jruby.RubyProc.call(RubyProc.java:289)","org.jruby.RubyProc.call19(RubyProc.java:273)","org.jruby.RubyProc$INVOKER$i$0$0$call19.call(RubyProc$INVOKER$i$0$0$call19.gen)","org.jruby.ir.targets.InvokeSite.invoke(InvokeSite.java:145)","app.logstash.logstash_minus_core.lib.logstash.agent.RUBY$block$exclusive$1(/app/logstash/logstash-core/lib/logstash/agent.rb:94)","org.jruby.runtime.CompiledIRBlockBody.yieldDirect(CompiledIRBlockBody.java:156)","org.jruby.runtime.IRBlockBody.yieldSpecific(IRBlockBody.java:80)","org.jruby.runtime.Block.yieldSpecific(Block.java:134)","org.jruby.ext.thread.Mutex.synchronize(Mutex.java:148)","org.jruby.ext.thread.Mutex$INVOKER$i$0$0$synchronize.call(Mutex$INVOKER$i$0$0$synchronize.gen)","org.jruby.internal.runtime.methods.JavaMethod$JavaMethodZeroBlock.call(JavaMethod.java:498)","org.jruby.ir.targets.InvokeSite.invoke(InvokeSite.java:145)","app.logstash.logstash_minus_core.lib.logstash.agent.RUBY$method$exclusive$0(/app/logstash/logstash-core/lib/logstash/agent.rb:94)","app.logstash.logstash_minus_core.lib.logstash.agent.RUBY$method$exclusive$0$__VARARGS__(/app/logstash/logstash-core/lib/logstash/agent.rb)","org.jruby.internal.runtime.methods.CompiledIRMethod.call(CompiledIRMethod.java:77)","org.jruby.internal.runtime.methods.MixedModeIRMethod.call(MixedModeIRMethod.java:93)","org.jruby.ir.targets.InvokeSite.invoke(InvokeSite.java:145)","app.logstash.logstash_minus_core.lib.logstash.pipeline_action.create.RUBY$method$execute$0(/app/logstash/logstash-core/lib/logstash/pipeline_action/create.rb:39)","app.logstash.logstash_minus_core.lib.logstash.pipeline_action.create.RUBY$method$execute$0$__VARARGS__(/app/logstash/logstash-core/lib/logstash/pipeline_action/create.rb)","org.jruby.internal.runtime.methods.CompiledIRMethod.call(CompiledIRMethod.java:77)","org.jruby.internal.runtime.methods.MixedModeIRMethod.call(MixedModeIRMethod.java:93)","org.jruby.ir.targets.InvokeSite.invoke(InvokeSite.java:145)","app.logstash.logstash_minus_core.lib.logstash.agent.RUBY$block$converge_state$2(/app/logstash/logstash-core/lib/logstash/agent.rb:327)","org.jruby.runtime.CompiledIRBlockBody.callDirect(CompiledIRBlockBody.java:145)","org.jruby.runtime.IRBlockBody.call(IRBlockBody.java:71)","org.jruby.runtime.Block.call(Block.java:124)","org.jruby.RubyProc.call(RubyProc.java:289)","org.jruby.RubyProc.call(RubyProc.java:246)","org.jruby.internal.runtime.RubyRunnable.run(RubyRunnable.java:104)","java.lang.Thread.run(Thread.java:748)"]}}}}}}
{"level":"ERROR","loggerName":"logstash.agent","timeMillis":1557186424846,"thread":"LogStash::Runner","logEvent":{"message":"An exception happened when converging configuration","exception":{"metaClass":{"metaClass":{"exception":"LogStash::Error","message":"Don't know how to handle `Java::JavaLang::IllegalStateException` for `PipelineAction::Create<main>`","backtrace":["org/logstash/execution/ConvergeResultExt.java:103:in `create'","org/logstash/execution/ConvergeResultExt.java:34:in `add'","/app/logstash/logstash-core/lib/logstash/agent.rb:340:in `block in converge_state'"]}}}}}
henrikno commented 5 years ago

Related issue: https://github.com/elastic/logstash/issues/8480