apache / hudi

Upserts, Deletes And Incremental Processing on Big Data.
https://hudi.apache.org/
Apache License 2.0

Exception org.apache.hudi.exception.HoodieIOException: Could not read commit details #6143

Closed: XuQianJin-Stars closed this issue 1 year ago

XuQianJin-Stars commented 2 years ago
org.apache.hudi.exception.HoodieIOException: Could not read commit details from hdfs://XXXXXX/.hoodie/20220719053423274.deltacommit
    at org.apache.hudi.common.table.timeline.HoodieActiveTimeline.readDataFromPath(HoodieActiveTimeline.java:763)
    at org.apache.hudi.common.table.timeline.HoodieActiveTimeline.getInstantDetails(HoodieActiveTimeline.java:264)
    at org.apache.hudi.common.table.timeline.HoodieDefaultTimeline.getInstantDetails(HoodieDefaultTimeline.java:372)
    at org.apache.hudi.hadoop.utils.HoodieInputFormatUtils.getCommitMetadata(HoodieInputFormatUtils.java:511)
    at org.apache.hudi.sink.partitioner.profile.WriteProfiles.getCommitMetadata(WriteProfiles.java:194)
    at org.apache.hudi.source.IncrementalInputSplits.lambda$inputSplits$71(IncrementalInputSplits.java:183)
    at java.util.stream.ReferencePipeline$3$1.accept(ReferencePipeline.java:193)
    at java.util.ArrayList$ArrayListSpliterator.forEachRemaining(ArrayList.java:1384)
    at java.util.stream.AbstractPipeline.copyInto(AbstractPipeline.java:482)
    at java.util.stream.AbstractPipeline.wrapAndCopyInto(AbstractPipeline.java:472)
    at java.util.stream.ReduceOps$ReduceOp.evaluateSequential(ReduceOps.java:708)
    at java.util.stream.AbstractPipeline.evaluate(AbstractPipeline.java:234)
    at java.util.stream.ReferencePipeline.collect(ReferencePipeline.java:566)
    at org.apache.hudi.source.IncrementalInputSplits.inputSplits(IncrementalInputSplits.java:183)
    at org.apache.hudi.source.StreamReadMonitoringFunction.monitorDirAndForwardSplits(StreamReadMonitoringFunction.java:196)
    at org.apache.hudi.source.StreamReadMonitoringFunction.run(StreamReadMonitoringFunction.java:169)
    at org.apache.flink.streaming.api.operators.StreamSource.run(StreamSource.java:110)
    at org.apache.flink.streaming.api.operators.StreamSource.run(StreamSource.java:66)
    at org.apache.flink.streaming.runtime.tasks.SourceStreamTask$LegacySourceFunctionThread.run(SourceStreamTask.java:269)
Caused by: java.io.FileNotFoundException: File does not exist: /XXXXXX.hoodie/20220719053423274.deltacommit
    at org.apache.hadoop.hdfs.server.namenode.INodeFile.valueOf(INodeFile.java:86)
    at org.apache.hadoop.hdfs.server.namenode.INodeFile.valueOf(INodeFile.java:76)
    at org.apache.hadoop.hdfs.server.namenode.FSDirStatAndListingOp.getBlockLocations(FSDirStatAndListingOp.java:156)
    at org.apache.hadoop.hdfs.server.namenode.FSNamesystem.getBlockLocations(FSNamesystem.java:1995)
    at org.apache.hadoop.hdfs.server.namenode.NameNodeRpcServer.getBlockLocations(NameNodeRpcServer.java:797)
    at org.apache.hadoop.hdfs.protocolPB.ClientNamenodeProtocolServerSideTranslatorPB.getBlockLocations(ClientNamenodeProtocolServerSideTranslatorPB.java:290)
    at org.apache.hadoop.hdfs.protocol.proto.ClientNamenodeProtocolProtos$ClientNamenodeProtocol$2.callBlockingMethod(ClientNamenodeProtocolProtos.java)
    at org.apache.hadoop.ipc.ProtobufRpcEngine$Server$ProtoBufRpcInvoker.call(ProtobufRpcEngine.java:529)
    at org.apache.hadoop.ipc.RPC$Server.call(RPC.java:1073)
    at org.apache.hadoop.ipc.Server$RpcCall.run(Server.java:1039)
    at org.apache.hadoop.ipc.Server$RpcCall.run(Server.java:963)
    at java.base/java.security.AccessController.doPrivileged(Native Method)
    at java.base/javax.security.auth.Subject.doAs(Subject.java:423)
    at org.apache.hadoop.security.UserGroupInformation.doAs(UserGroupInformation.java:2034)
    at org.apache.hadoop.ipc.Server$Handler.run(Server.java:3047)

    at sun.reflect.NativeConstructorAccessorImpl.newInstance0(Native Method)
    at sun.reflect.NativeConstructorAccessorImpl.newInstance(NativeConstructorAccessorImpl.java:62)
    at sun.reflect.DelegatingConstructorAccessorImpl.newInstance(DelegatingConstructorAccessorImpl.java:45)
    at java.lang.reflect.Constructor.newInstance(Constructor.java:423)
    at org.apache.hadoop.ipc.RemoteException.instantiateException(RemoteException.java:121)
    at org.apache.hadoop.ipc.RemoteException.unwrapRemoteException(RemoteException.java:88)
    at org.apache.hadoop.hdfs.DFSClient.callGetBlockLocations(DFSClient.java:1015)
    at org.apache.hadoop.hdfs.DFSClient.getLocatedBlocks(DFSClient.java:1002)
    at org.apache.hadoop.hdfs.DFSClient.getLocatedBlocks(DFSClient.java:991)
    at org.apache.hadoop.hdfs.DFSClient.open(DFSClient.java:1229)
    at org.apache.hadoop.hdfs.DistributedFileSystem$4.doCall(DistributedFileSystem.java:350)
    at org.apache.hadoop.hdfs.DistributedFileSystem$4.doCall(DistributedFileSystem.java:346)
    at org.apache.hadoop.fs.FileSystemLinkResolver.resolve(FileSystemLinkResolver.java:81)
    at org.apache.hadoop.hdfs.DistributedFileSystem.open(DistributedFileSystem.java:358)
    at org.apache.hadoop.fs.FileSystem.open(FileSystem.java:811)
    at org.apache.hudi.common.fs.HoodieWrapperFileSystem.open(HoodieWrapperFileSystem.java:460)
    at org.apache.hudi.common.table.timeline.HoodieActiveTimeline.readDataFromPath(HoodieActiveTimeline.java:760)
    ... 18 more
Caused by: org.apache.hadoop.ipc.RemoteException(java.io.FileNotFoundException): File does not exist: /user/tdw/warehouse/csig_billing_rt_ods.db/ods_fee_center_tfee_7_9_ri/.hoodie/20220719053423274.deltacommit
    at org.apache.hadoop.hdfs.server.namenode.INodeFile.valueOf(INodeFile.java:86)
    at org.apache.hadoop.hdfs.server.namenode.INodeFile.valueOf(INodeFile.java:76)
    at org.apache.hadoop.hdfs.server.namenode.FSDirStatAndListingOp.getBlockLocations(FSDirStatAndListingOp.java:156)
    at org.apache.hadoop.hdfs.server.namenode.FSNamesystem.getBlockLocations(FSNamesystem.java:1995)
    at org.apache.hadoop.hdfs.server.namenode.NameNodeRpcServer.getBlockLocations(NameNodeRpcServer.java:797)
    at org.apache.hadoop.hdfs.protocolPB.ClientNamenodeProtocolServerSideTranslatorPB.getBlockLocations(ClientNamenodeProtocolServerSideTranslatorPB.java:290)
    at org.apache.hadoop.hdfs.protocol.proto.ClientNamenodeProtocolProtos$ClientNamenodeProtocol$2.callBlockingMethod(ClientNamenodeProtocolProtos.java)
    at org.apache.hadoop.ipc.ProtobufRpcEngine$Server$ProtoBufRpcInvoker.call(ProtobufRpcEngine.java:529)
    at org.apache.hadoop.ipc.RPC$Server.call(RPC.java:1073)
    at org.apache.hadoop.ipc.Server$RpcCall.run(Server.java:1039)
    at org.apache.hadoop.ipc.Server$RpcCall.run(Server.java:963)
    at java.base/java.security.AccessController.doPrivileged(Native Method)
    at java.base/javax.security.auth.Subject.doAs(Subject.java:423)
    at org.apache.hadoop.security.UserGroupInformation.doAs(UserGroupInformation.java:2034)
    at org.apache.hadoop.ipc.Server$Handler.run(Server.java:3047)

    at org.apache.hadoop.ipc.Client.getRpcResponse(Client.java:1559)
    at org.apache.hadoop.ipc.Client.call(Client.java:1505)
    at org.apache.hadoop.ipc.Client.call(Client.java:1412)
    at org.apache.hadoop.ipc.ProtobufRpcEngine$Invoker.invoke(ProtobufRpcEngine.java:232)
    at org.apache.hadoop.ipc.ProtobufRpcEngine$Invoker.invoke(ProtobufRpcEngine.java:118)
    at com.sun.proxy.$Proxy10.getBlockLocations(Unknown Source)
    at org.apache.hadoop.hdfs.protocolPB.ClientNamenodeProtocolTranslatorPB.getBlockLocations(ClientNamenodeProtocolTranslatorPB.java:293)
    at sun.reflect.GeneratedMethodAccessor38.invoke(Unknown Source)
    at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
    at java.lang.reflect.Method.invoke(Method.java:498)
    at org.apache.hadoop.io.retry.RetryInvocationHandler.invokeMethod(RetryInvocationHandler.java:411)
    at org.apache.hadoop.io.retry.RetryInvocationHandler$Call.invokeMethod(RetryInvocationHandler.java:163)
    at org.apache.hadoop.io.retry.RetryInvocationHandler$Call.invoke(RetryInvocationHandler.java:155)
    at org.apache.hadoop.io.retry.RetryInvocationHandler$Call.invokeOnce(RetryInvocationHandler.java:95)
    at org.apache.hadoop.io.retry.RetryInvocationHandler.invoke(RetryInvocationHandler.java:346)
    at com.sun.proxy.$Proxy11.getBlockLocations(Unknown Source)
    at org.apache.hadoop.hdfs.DFSClient.callGetBlockLocations(DFSClient.java:1013)
    ... 28 more
rmahindra123 commented 2 years ago

@XuQianJin-Stars Can you elaborate on the setup, environment, and steps to reproduce?

fengjian428 commented 2 years ago

also met this error today.

Caused by: org.apache.hudi.exception.HoodieIOException: Could not read commit details from hdfs://xxx/.hoodie/20220720100959953.compaction.requested
    at org.apache.hudi.common.table.timeline.HoodieActiveTimeline.readDataFromPath(HoodieActiveTimeline.java:624)
    at org.apache.hudi.common.table.timeline.HoodieActiveTimeline.readCompactionPlanAsBytes(HoodieActiveTimeline.java:266)
    at org.apache.hudi.common.util.CompactionUtils.getCompactionPlan(CompactionUtils.java:148)
    at org.apache.hudi.sink.compact.CompactionCommitSink.lambda$commitIfNecessary$1(CompactionCommitSink.java:116)
    at java.util.HashMap.computeIfAbsent(HashMap.java:1127)
    at org.apache.hudi.sink.compact.CompactionCommitSink.commitIfNecessary(CompactionCommitSink.java:114)
    at org.apache.hudi.sink.compact.CompactionCommitSink.invoke(CompactionCommitSink.java:103)
    at org.apache.hudi.sink.compact.CompactionCommitSink.invoke(CompactionCommitSink.java:54)
    at org.apache.flink.streaming.api.operators.StreamSink.processElement(StreamSink.java:54)
    at org.apache.flink.streaming.runtime.tasks.OneInputStreamTask$StreamTaskNetworkOutput.emitRecord(OneInputStreamTask.java:205)
    at org.apache.flink.streaming.runtime.io.AbstractStreamTaskNetworkInput.processElement(AbstractStreamTaskNetworkInput.java:134)
    at org.apache.flink.streaming.runtime.io.AbstractStreamTaskNetworkInput.emitNext(AbstractStreamTaskNetworkInput.java:105)
    at org.apache.flink.streaming.runtime.io.StreamOneInputProcessor.processInput(StreamOneInputProcessor.java:66)
    at org.apache.flink.streaming.runtime.tasks.StreamTask.processInput(StreamTask.java:423)
    at org.apache.flink.streaming.runtime.tasks.mailbox.MailboxProcessor.runMailboxLoop(MailboxProcessor.java:204)
    at org.apache.flink.streaming.runtime.tasks.StreamTask.runMailboxLoop(StreamTask.java:684)
    at org.apache.flink.streaming.runtime.tasks.S

nsivabalan commented 2 years ago

@XuQianJin-Stars: gentle reminder to respond to the request/questions above.

nsivabalan commented 2 years ago

@XuQianJin-Stars @fengjian428: did you folks inspect the .hoodie folder? By any chance, is the delta commit or commit file of interest seen in the stack trace a 0-byte file? Does the error go away if you restart the pipeline, or is it just stuck? If you can share the contents of .hoodie, we can dig in to see what's happening.
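
As a quick way to check for the 0-byte instant files mentioned above, here is a minimal sketch using the Hadoop FileSystem API; the base path below is a placeholder, not the reporter's actual table path:

    import java.net.URI;
    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.fs.FileStatus;
    import org.apache.hadoop.fs.FileSystem;
    import org.apache.hadoop.fs.Path;

    public class HoodieTimelineCheck {
        public static void main(String[] args) throws Exception {
            // Placeholder table base path; substitute the real one from the stack trace.
            String basePath = "hdfs://namenode:8020/warehouse/my_table";
            FileSystem fs = FileSystem.get(URI.create(basePath), new Configuration());
            // List every instant file in the timeline and flag empty (0-byte) ones,
            // which would make reading commit details fail.
            for (FileStatus status : fs.listStatus(new Path(basePath, ".hoodie"))) {
                if (status.isFile() && status.getLen() == 0) {
                    System.out.println("Empty instant file: " + status.getPath());
                }
            }
        }
    }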

xushiyan commented 1 year ago

Closed as discussed offline; this issue is no longer recurring.

Jason-liujc commented 3 months ago

This happened for us as well. The root cause was concurrent Spark jobs writing to the same insert_overwrite table (different partitions) without proper concurrency-locking configuration. After adding DynamoDB-based locks for these insert_overwrite write tasks, we stopped seeing the issue.
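
For readers hitting the same multi-writer scenario, a minimal sketch of a Spark write using Hudi's optimistic concurrency control with the DynamoDB-based lock provider (option keys follow the Hudi concurrency-control docs; the table name, DynamoDB table, region, and paths are placeholder assumptions, and the usual record-key/precombine options are omitted for brevity):

    import org.apache.spark.sql.Dataset;
    import org.apache.spark.sql.Row;
    import org.apache.spark.sql.SaveMode;
    import org.apache.spark.sql.SparkSession;

    public class ConcurrentHudiWrite {
        public static void main(String[] args) {
            SparkSession spark = SparkSession.builder().appName("hudi-multi-writer").getOrCreate();
            Dataset<Row> df = spark.read().parquet("s3://my-bucket/staging/"); // placeholder input

            df.write().format("hudi")
                .option("hoodie.table.name", "my_table") // placeholder
                .option("hoodie.datasource.write.operation", "insert_overwrite")
                // Enable optimistic concurrency control so concurrent writers coordinate.
                .option("hoodie.write.concurrency.mode", "optimistic_concurrency_control")
                .option("hoodie.cleaner.policy.failed.writes", "LAZY")
                // DynamoDB-based lock provider (requires the hudi-aws module on the classpath).
                .option("hoodie.write.lock.provider",
                        "org.apache.hudi.aws.transaction.lock.DynamoDBBasedLockProvider")
                .option("hoodie.write.lock.dynamodb.table", "hudi-locks")       // placeholder
                .option("hoodie.write.lock.dynamodb.partition_key", "my_table") // placeholder
                .option("hoodie.write.lock.dynamodb.region", "us-east-1")       // placeholder
                .mode(SaveMode.Append)
                .save("s3://my-bucket/warehouse/my_table"); // placeholder base path
        }
    }

With a lock provider in place, each writer acquires the table lock before committing to the timeline, which prevents one job from cleaning or overwriting instant files that another job is still reading.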