Open mashhurs opened 3 months ago
Tried to reproduce the case (setup Azure VM with mounted disk, disk will get full) but I always get No space left on device
error at the first and no checkpoint.head corrupted.
[2024-08-20T21:40:25,667][ERROR][logstash.javapipeline ][main] Pipeline worker error, the pipeline will be stopped {:pipeline_id=>"main", :error=>"No space left on device", :exception=>Java::JavaIo::IOException,
:backtrace=>
["java.base/sun.nio.ch.FileDispatcherImpl.write0(Native Method)", "java.base/sun.nio.ch.FileDispatcherImpl.write(FileDispatcherImpl.java:62)", "java.base/sun.nio.ch.IOUtil.writeFromNativeBuffer(IOUtil.java:132)", "java.base/sun.nio.ch.IOUtil.write(IOUtil.java:76)", "java.base/sun.nio.ch.IOUtil.write(IOUtil.java:67)", "java.base/sun.nio.ch.FileChannelImpl.write(FileChannelImpl.java:288)", "org.logstash.ackedqueue.io.FileCheckpointIO.write(FileCheckpointIO.java:106)", "org.logstash.ackedqueue.io.FileCheckpointIO.write(FileCheckpointIO.java:96)", "org.logstash.ackedqueue.Page.tailPageCheckpoint(Page.java:213)", "org.logstash.ackedqueue.Page.checkpoint(Page.java:186)", "org.logstash.ackedqueue.Page.ack(Page.java:164)", "org.logstash.ackedqueue.Queue.ack(Queue.java:727)", "org.logstash.ackedqueue.Batch.close(Batch.java:60)", "org.logstash.ackedqueue.AckedBatch.close(AckedBatch.java:56)", "org.logstash.ackedqueue.AckedReadBatch.close(AckedReadBatch.java:92)", "org.logstash.execution.QueueReadClientBase.closeBatch(QueueReadClientBase.java:133)", "org.logstash.execution.WorkerLoop.run(WorkerLoop.java:89)", "java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke0(Native Method)", "java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:77)", "java.base/jdk.internal.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)", "java.base/java.lang.reflect.Method.invoke(Method.java:568)", "org.jruby.javasupport.JavaMethod.invokeDirectWithExceptionHandling(JavaMethod.java:300)", "org.jruby.javasupport.JavaMethod.invokeDirect(JavaMethod.java:164)", "org.jruby.java.invokers.InstanceMethodInvoker.call(InstanceMethodInvoker.java:32)", "org.jruby.runtime.callsite.CachingCallSite.call(CachingCallSite.java:193)", "org.jruby.ir.interpreter.InterpreterEngine.processCall(InterpreterEngine.java:346)", "org.jruby.ir.interpreter.StartupInterpreterEngine.interpret(StartupInterpreterEngine.java:66)", "org.jruby.ir.interpreter.Interpreter.INTERPRET_BLOCK(Interpreter.java:118)", "org.jruby.runtime.MixedModeIRBlockBody.commonYieldPath(MixedModeIRBlockBody.java:136)", "org.jruby.runtime.IRBlockBody.call(IRBlockBody.java:66)", "org.jruby.runtime.IRBlockBody.call(IRBlockBody.java:58)", "org.jruby.runtime.Block.call(Block.java:144)", "org.jruby.RubyProc.call(RubyProc.java:354)", "org.jruby.internal.runtime.RubyRunnable.run(RubyRunnable.java:111)", "java.base/java.lang.Thread.run(Thread.java:840)"],
:thread=>"#<Thread:0x7c210b15 /home/azureuser/logstash-8.14.1/logstash-core/lib/logstash/java_pipeline.rb:134 sleep>"}
[2024-08-20T21:40:26,485][WARN ][logstash.javapipeline ][main] Waiting for input plugin to close {:pipeline_id=>"main", :thread=>"#<Thread:0x7c210b15 /home/azureuser/logstash-8.14.1/logstash-core/lib/logstash/java_pipeline.rb:134 run>"}
[2024-08-20T21:40:27,486][WARN ][logstash.javapipeline ][main] Waiting for input plugin to close {:pipeline_id=>"main", :thread=>"#<Thread:0x7c210b15 /home/azureuser/logstash-8.14.1/logstash-core/lib/logstash/java_pipeline.rb:134 run>"}
[2024-08-20T21:40:28,487][WARN ][logstash.javapipeline ][main] Waiting for input plugin to close {:pipeline_id=>"main", :thread=>"#<Thread:0x7c210b15 /home/azureuser/logstash-8.14.1/logstash-core/lib/logstash/java_pipeline.rb:134 run>"}
[2024-08-20T21:40:29,488][WARN ][logstash.javapipeline ][main] Waiting for input plugin to close {:pipeline_id=>"main", :thread=>"#<Thread:0x7c210b15 /home/azureuser/logstash-8.14.1/logstash-core/lib/logstash/java_pipeline.rb:134 run>"}
[2024-08-20T21:40:30,489][WARN ][logstash.javapipeline ][main] Waiting for input plugin to close {:pipeline_id=>"main", :thread=>"#<Thread:0x7c210b15 /home/azureuser/logstash-8.14.1/logstash-core/lib/logstash/java_pipeline.rb:134 run>"}
[2024-08-20T21:40:31,490][WARN ][logstash.javapipeline ][main] Waiting for input plugin to close {:pipeline_id=>"main", :thread=>"#<Thread:0x7c210b15 /home/azureuser/logstash-8.14.1/logstash-core/lib/logstash/java_pipeline.rb:134 run>"}
[2024-08-20T21:40:32,492][WARN ][logstash.javapipeline ][main] Waiting for input plugin to close {:pipeline_id=>"main", :thread=>"#<Thread:0x7c210b15 /home/azureuser/logstash-8.14.1/logstash-core/lib/logstash/java_pipeline.rb:134 run>"}
[2024-08-20T21:40:33,493][WARN ][logstash.javapipeline ][main] Waiting for input plugin to close {:pipeline_id=>"main", :thread=>"#<Thread:0x7c210b15 /home/azureuser/logstash-8.14.1/logstash-core/lib/logstash/java_pipeline.rb:134 run>"}
[2024-08-20T21:40:34,496][ERROR][logstash.javapipeline ][main] Pipeline error {:pipeline_id=>"main", :exception=>#<Java::JavaIo::IOException: No space left on device>, :backtrace=>["java.base/sun.nio.ch.FileDispatcherImpl.write0(Native Method)", "java.base/sun.nio.ch.FileDispatcherImpl.write(FileDispatcherImpl.java:62)", "java.base/sun.nio.ch.IOUtil.writeFromNativeBuffer(IOUtil.java:132)", "java.base/sun.nio.ch.IOUtil.write(IOUtil.java:76)", "java.base/sun.nio.ch.IOUtil.write(IOUtil.java:67)", "java.base/sun.nio.ch.FileChannelImpl.write(FileChannelImpl.java:288)", "org.logstash.ackedqueue.io.FileCheckpointIO.write(FileCheckpointIO.java:106)", "org.logstash.ackedqueue.Page.forceCheckpoint(Page.java:230)", "org.logstash.ackedqueue.Page.headPageCheckpoint(Page.java:195)", "org.logstash.ackedqueue.Page.checkpoint(Page.java:184)", "org.logstash.ackedqueue.Page.ensurePersistedUpto(Page.java:223)", "org.logstash.ackedqueue.Queue.ensurePersistedUpto(Queue.java:579)", "org.logstash.ackedqueue.Queue.close(Queue.java:764)", "org.logstash.ackedqueue.ext.JRubyAckedQueueExt.close(JRubyAckedQueueExt.java:172)", "org.logstash.ext.JrubyAckedReadClientExt.close(JrubyAckedReadClientExt.java:72)", "org.logstash.execution.AbstractPipelineExt.close(AbstractPipelineExt.java:730)", "org.logstash.execution.AbstractPipelineExt$INVOKER$i$0$0$close.call(AbstractPipelineExt$INVOKER$i$0$0$close.gen)", "org.jruby.runtime.callsite.CachingCallSite.cacheAndCall(CachingCallSite.java:456)", "org.jruby.runtime.callsite.CachingCallSite.call(CachingCallSite.java:195)", "org.jruby.ir.interpreter.InterpreterEngine.processCall(InterpreterEngine.java:346)", "org.jruby.ir.interpreter.StartupInterpreterEngine.interpret(StartupInterpreterEngine.java:66)", "org.jruby.ir.interpreter.InterpreterEngine.interpret(InterpreterEngine.java:76)", "org.jruby.internal.runtime.methods.MixedModeIRMethod.INTERPRET_METHOD(MixedModeIRMethod.java:164)", "org.jruby.internal.runtime.methods.MixedModeIRMethod.call(MixedModeIRMethod.java:151)", "org.jruby.internal.runtime.methods.DynamicMethod.call(DynamicMethod.java:212)", "org.jruby.runtime.callsite.CachingCallSite.cacheAndCall(CachingCallSite.java:456)", "org.jruby.runtime.callsite.CachingCallSite.call(CachingCallSite.java:195)", "org.jruby.ir.interpreter.InterpreterEngine.processCall(InterpreterEngine.java:346)", "org.jruby.ir.interpreter.StartupInterpreterEngine.interpret(StartupInterpreterEngine.java:66)", "org.jruby.ir.interpreter.Interpreter.INTERPRET_BLOCK(Interpreter.java:118)", "org.jruby.runtime.MixedModeIRBlockBody.commonYieldPath(MixedModeIRBlockBody.java:136)", "org.jruby.runtime.IRBlockBody.call(IRBlockBody.java:66)", "org.jruby.runtime.IRBlockBody.call(IRBlockBody.java:58)", "org.jruby.runtime.Block.call(Block.java:144)", "org.jruby.RubyProc.call(RubyProc.java:354)", "org.jruby.internal.runtime.RubyRunnable.run(RubyRunnable.java:111)", "java.base/java.lang.Thread.run(Thread.java:840)"], "pipeline.sources"=>["/home/azureuser/logstash-8.14.1/config/pq-pipeline.conf"], :thread=>"#<Thread:0x7c210b15 /home/azureuser/logstash-8.14.1/logstash-core/lib/logstash/java_pipeline.rb:134 run>"}
[2024-08-20T21:40:34,497][INFO ][logstash.javapipeline ][main] Pipeline terminated {"pipeline.id"=>"main"}
[2024-08-20T21:40:36,236][WARN ][logstash.persistedqueueconfigvalidator] Pipeline main current queue size (26214400) is greater than 'queue.max_bytes' (25165824).
[2024-08-20T21:40:39,228][WARN ][logstash.persistedqueueconfigvalidator] Pipeline main current queue size (26214400) is greater than 'queue.max_bytes' (25165824).
I have set it blocked since we have no clear observations when checkpoint.head
is corrupted. LS has atomic file move operation and File.move
with StandardCopyOption.ATOMIC_MOVE
option depends on filesystem behavior. For such operation failures LS needs to receive AtomicMoveNotSupportedException
exception which we need to validate first. Waiting more logs from users, the logs need to indicate the reason.
// Windows can have problem doing file move See: https://github.com/elastic/logstash/issues/12345
// retry a couple of times to make it works. The first two runs has no break. The rest of reties are exponential backoff.
final Path path = dirPath.resolve(fileName);
try {
Files.move(tmpPath, path, StandardCopyOption.ATOMIC_MOVE);
} catch (IOException ex) {
if (retry) {
try {
logger.debug("CheckpointIO retry moving '{}' to '{}'", tmpPath, path);
backoff.retryable(() -> Files.move(tmpPath, path, StandardCopyOption.ATOMIC_MOVE));
} catch (ExponentialBackoff.RetryException re) {
throw new IOException("Error writing checkpoint", re);
}
} else {
logger.error("Error writing checkpoint without retry: " + ex);
throw ex;
}
}
}
Logstash information: 8.14.1
Please include the following information:
The root cause for this exception is
checkpoint.head
file somehow corrupted. The quick remediation is to overwritecheckpoint.head
withcheckpoint.head.tmp
(if available).bin/logstash --version
)Plugins installed: (
bin/logstash-plugin list --verbose
) defaultJVM (e.g.
java -version
): bundledIf the affected version of Logstash is 7.9 (or earlier), or if it is NOT using the bundled JDK or using the 'no-jdk' version in 7.10 (or higher), please provide the following information:
java -version
)LS_JAVA_HOME
environment variable if set.OS version (
uname -a
if on a Unix-like system):Description of the problem including expected versus actual behavior:
Steps to reproduce:
I failed to reproduce the issue. From the error we can see the disk is full but when disk is full we get
No space left on device
when LS is starting: https://github.com/elastic/logstash/issues/16389#issuecomment-2302656261Provide logs (if relevant):
Could not index event to Elasticsearch. Could not index event to Elasticsearch. Could not index event to Elasticsearch. Could not index event to Elasticsearch.
Starting Logstash ... Logstash failed to create queue. -----> java.io.IOException: Checkpoint checksum mismatch, expected: 278197, actual: 0
{"level":"INFO","loggerName":"logstash.runner","timeMillis":1721353466915,"thread":"main","logEvent":{"message":"Starting Logstash","logstash.version":"8.14.1","jruby.version":"jruby 9.4.7.0 (3.1.4) 2024-04-29 597ff08ac1 OpenJDK 64-Bit Server VM 17.0.11+9 on 17.0.11+9 +indy +jit [x86_64-linux]"}} { "level": "ERROR", "loggerName": "logstash.agent", "timeMillis": 1722887281351, "thread": "Converge PipelineAction::Create",
"logEvent": {
"message": "java.io.IOException: Checkpoint checksum mismatch, expected: 278197, actual: 0",
"action": "LogStash::PipelineAction::Create/pipeline_id:logging",
"exception": "Java::JavaLang::IllegalStateException",
"backtrace": [
"org.logstash.execution.AbstractPipelineExt.openQueue(AbstractPipelineExt.java:260)",
"org.logstash.execution.AbstractPipelineExt$INVOKER$i$0$0$openQueue.call(AbstractPipelineExt$INVOKER$i$0$0$openQueue.gen)",
"org.jruby.runtime.callsite.CachingCallSite.cacheAndCall(CachingCallSite.java:456)",
"org.jruby.runtime.callsite.CachingCallSite.call(CachingCallSite.java:195)",
"org.jruby.ir.interpreter.InterpreterEngine.processCall(InterpreterEngine.java:346)",
"org.jruby.ir.interpreter.StartupInterpreterEngine.interpret(StartupInterpreterEngine.java:66)",
"org.jruby.ir.interpreter.InterpreterEngine.interpret(InterpreterEngine.java:94)",
"org.jruby.internal.runtime.methods.MixedModeIRMethod.INTERPRET_METHOD(MixedModeIRMethod.java:275)",
"org.jruby.internal.runtime.methods.MixedModeIRMethod.call(MixedModeIRMethod.java:262)",
"org.jruby.internal.runtime.methods.DynamicMethod.call(DynamicMethod.java:236)",
"usr.share.logstash.logstash_minus_core.lib.logstash.pipeline_action.create.RUBY$method$execute$0(/usr/share/logstash/logstash-core/lib/logstash/pipeline_action/create.rb:49)",
"usr.share.logstash.logstash_minus_core.lib.logstash.agent.RUBY$block$converge_state$2(/usr/share/logstash/logstash-core/lib/logstash/agent.rb:386)",
"org.jruby.runtime.CompiledIRBlockBody.callDirect(CompiledIRBlockBody.java:141)",
"org.jruby.runtime.IRBlockBody.call(IRBlockBody.java:64)",
"org.jruby.runtime.IRBlockBody.call(IRBlockBody.java:58)",
"org.jruby.runtime.Block.call(Block.java:144)",
"org.jruby.RubyProc.call(RubyProc.java:354)",
"org.jruby.internal.runtime.RubyRunnable.run(RubyRunnable.java:111)",
"java.base/java.lang.Thread.run(Thread.java:840)"
]
}
}