apache / gravitino

World's most powerful open data catalog for building a high-performance, geo-distributed and federated metadata lake.
https://gravitino.apache.org
Apache License 2.0
1.1k stars 344 forks source link

[Bug report] Invalid token error occurs in GVFS during long-running Spark jobs #5596

Closed xloya closed 6 days ago

xloya commented 6 days ago

Version

main branch

Describe what's wrong

In Spark, the delegation tokens of a FileSystem are refreshed at a regular interval. Currently, GVFS does not override the `addDelegationTokens` method, so the tokens required by the underlying file systems (such as HDFS) cannot be refreshed. We should override the `addDelegationTokens` method in GVFS.

Error message and/or stacktrace

error type: Py4JJavaError
stack:
Traceback (most recent call last):
File "/home/work/hdd4/yarn/cluster/nodemanager/usercache/s_workspace_1_krb/appcache/application_1700812749231_15808832/container_e1419_1700812749231_15808832_01_000001/pyspark.zip/pyspark/sql/dataframe.py", line 804, in count
return int(self._jdf.count())
File "/home/work/hdd4/yarn/cluster/nodemanager/usercache/s_workspace_1_krb/appcache/application_1700812749231_15808832/container_e1419_1700812749231_15808832_01_000001/py4j-0.10.9.5-src.zip/py4j/java_gateway.py", line 1321, in __call__
return_value = get_return_value(
File "/home/work/hdd4/yarn/cluster/nodemanager/usercache/s_workspace_1_krb/appcache/application_1700812749231_15808832/container_e1419_1700812749231_15808832_01_000001/pyspark.zip/pyspark/sql/utils.py", line 190, in deco
return f(*a, **kw)
File "/home/work/hdd4/yarn/cluster/nodemanager/usercache/s_workspace_1_krb/appcache/application_1700812749231_15808832/container_e1419_1700812749231_15808832_01_000001/py4j-0.10.9.5-src.zip/py4j/protocol.py", line 326, in get_return_value
raise Py4JJavaError(
py4j.protocol.Py4JJavaError: An error occurred while calling o1297.count.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 17 in stage 181.0 failed 4 times, most recent failure: Lost task 17.3 in stage 181.0 (TID 344) (cluster-prc-st3040.bj executor 6): javax.security.sasl.SaslException: DIGEST-MD5: IO error acquiring password [Caused by com.xiaomi.fs.common.exception.InvalidTokenException: token FS_DELEGATION_TOKEN, owner=s_workspace_1_krb@XIAOMI.HADOOP, renewer=yarn_prc, realUser=, lifeTime=1727587928420, sequenceNum=1 can't be found in cache.]
    at com.sun.security.sasl.digest.DigestMD5Server.validateClientResponse(DigestMD5Server.java:596)
    at com.sun.security.sasl.digest.DigestMD5Server.evaluateResponse(DigestMD5Server.java:247)
    at com.xiaomi.fs.common.auth.SaslServerHandler.processSaslToken(SaslServerHandler.java:119)
    at com.xiaomi.fs.common.auth.SaslServerHandler.handleMessage(SaslServerHandler.java:95)
    at com.xiaomi.fs.common.auth.AuthenticatedServerObserver.onNext(AuthenticatedServerObserver.java:50)
    at com.xiaomi.fs.common.auth.AuthenticatedServerObserver.onNext(AuthenticatedServerObserver.java:12)
    at io.grpc.stub.ServerCalls$StreamingServerCallHandler$StreamingServerCallListener.onMessage(ServerCalls.java:262)
    at io.grpc.ForwardingServerCallListener.onMessage(ForwardingServerCallListener.java:33)
    at io.grpc.ForwardingServerCallListener.onMessage(ForwardingServerCallListener.java:33)
    at io.grpc.internal.ServerCallImpl$ServerStreamListenerImpl.messagesAvailableInternal(ServerCallImpl.java:329)
    at io.grpc.internal.ServerCallImpl$ServerStreamListenerImpl.messagesAvailable(ServerCallImpl.java:314)
    at io.grpc.internal.ServerImpl$JumpToApplicationThreadServerStreamListener$1MessagesAvailable.runInContext(ServerImpl.java:833)
    at io.grpc.internal.ContextRunnable.run(ContextRunnable.java:37)
    at io.grpc.internal.SerializingExecutor.run(SerializingExecutor.java:133)
    at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1136)
    at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:635)
    at java.lang.Thread.run(Thread.java:840)
Caused by: com.xiaomi.fs.common.exception.InvalidTokenException: token FS_DELEGATION_TOKEN, owner=s_workspace_1_krb@XIAOMI.HADOOP, renewer=yarn_prc, realUser=, lifeTime=1727587928420, sequenceNum=1 can't be found in cache.
    at com.xiaomi.fs.common.auth.token.AbstractSecretManager.checkToken(AbstractSecretManager.java:219)
    at com.xiaomi.fs.common.auth.token.AbstractSecretManager.retrievePassword(AbstractSecretManager.java:212)
    at com.xiaomi.fs.common.auth.token.AbstractSecretManager.retriableRetrievePassword(AbstractSecretManager.java:207)
    at com.xiaomi.fs.common.auth.SaslRpcServer$SaslDigestCallbackHandler.getPassword(SaslRpcServer.java:255)
    at com.xiaomi.fs.common.auth.SaslRpcServer$SaslDigestCallbackHandler.handle(SaslRpcServer.java:279)
    at com.sun.security.sasl.digest.DigestMD5Server.validateClientResponse(DigestMD5Server.java:587)
    ... 16 more

Driver stacktrace:
    at org.apache.spark.scheduler.DAGScheduler.failJobAndIndependentStages(DAGScheduler.scala:2721)
    at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2(DAGScheduler.scala:2657)
    at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2$adapted(DAGScheduler.scala:2656)
    at scala.collection.mutable.ResizableArray.foreach(ResizableArray.scala:62)
    at scala.collection.mutable.ResizableArray.foreach$(ResizableArray.scala:55)
    at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:49)
    at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:2656)
    at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1(DAGScheduler.scala:1188)
    at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1$adapted(DAGScheduler.scala:1188)
    at scala.Option.foreach(Option.scala:407)
    at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:1188)
    at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:2917)
    at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2859)
    at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2848)
    at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:49)
Caused by: javax.security.sasl.SaslException: DIGEST-MD5: IO error acquiring password [Caused by com.xiaomi.fs.common.exception.InvalidTokenException: token FS_DELEGATION_TOKEN, owner=s_workspace_1_krb@XIAOMI.HADOOP, renewer=yarn_prc, realUser=, lifeTime=1727587928420, sequenceNum=1 can't be found in cache.]
    at com.sun.security.sasl.digest.DigestMD5Server.validateClientResponse(DigestMD5Server.java:596)
    at com.sun.security.sasl.digest.DigestMD5Server.evaluateResponse(DigestMD5Server.java:247)
    at com.xiaomi.fs.common.auth.SaslServerHandler.processSaslToken(SaslServerHandler.java:119)
    at com.xiaomi.fs.common.auth.SaslServerHandler.handleMessage(SaslServerHandler.java:95)
    at com.xiaomi.fs.common.auth.AuthenticatedServerObserver.onNext(AuthenticatedServerObserver.java:50)
    at com.xiaomi.fs.common.auth.AuthenticatedServerObserver.onNext(AuthenticatedServerObserver.java:12)
    at io.grpc.stub.ServerCalls$StreamingServerCallHandler$StreamingServerCallListener.onMessage(ServerCalls.java:262)
    at io.grpc.ForwardingServerCallListener.onMessage(ForwardingServerCallListener.java:33)
    at io.grpc.ForwardingServerCallListener.onMessage(ForwardingServerCallListener.java:33)
    at io.grpc.internal.ServerCallImpl$ServerStreamListenerImpl.messagesAvailableInternal(ServerCallImpl.java:329)
    at io.grpc.internal.ServerCallImpl$ServerStreamListenerImpl.messagesAvailable(ServerCallImpl.java:314)
    at io.grpc.internal.ServerImpl$JumpToApplicationThreadServerStreamListener$1MessagesAvailable.runInContext(ServerImpl.java:833)
    at io.grpc.internal.ContextRunnable.run(ContextRunnable.java:37)
    at io.grpc.internal.SerializingExecutor.run(SerializingExecutor.java:133)
    at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1136)
    at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:635)
    at java.lang.Thread.run(Thread.java:840)
Caused by: com.xiaomi.fs.common.exception.InvalidTokenException: token FS_DELEGATION_TOKEN, owner=s_workspace_1_krb@XIAOMI.HADOOP, renewer=yarn_prc, realUser=, lifeTime=1727587928420, sequenceNum=1 can't be found in cache.
    at com.xiaomi.fs.common.auth.token.AbstractSecretManager.checkToken(AbstractSecretManager.java:219)
    at com.xiaomi.fs.common.auth.token.AbstractSecretManager.retrievePassword(AbstractSecretManager.java:212)
    at com.xiaomi.fs.common.auth.token.AbstractSecretManager.retriableRetrievePassword(AbstractSecretManager.java:207)
    at com.xiaomi.fs.common.auth.SaslRpcServer$SaslDigestCallbackHandler.getPassword(SaslRpcServer.java:255)
    at com.xiaomi.fs.common.auth.SaslRpcServer$SaslDigestCallbackHandler.handle(SaslRpcServer.java:279)
    at com.sun.security.sasl.digest.DigestMD5Server.validateClientResponse(DigestMD5Server.java:587)

How to reproduce

Run a long-running Spark job that accesses a fileset stored on HDFS.

Additional context

No response