In Spark, the delegation tokens of a FileSystem are refreshed at a certain time interval. Currently, GVFS does not override the addDelegationTokens method, so the tokens required by the underlying file systems (such as HDFS) cannot be updated. We should override the addDelegationTokens method.
Error message and/or stacktrace
error type: Py4JJavaError
stack:
Traceback (most recent call last):
File "/home/work/hdd4/yarn/cluster/nodemanager/usercache/s_workspace_1_krb/appcache/application_1700812749231_15808832/container_e1419_1700812749231_15808832_01_000001/pyspark.zip/pyspark/sql/dataframe.py", line 804, in count
return int(self._jdf.count())
File "/home/work/hdd4/yarn/cluster/nodemanager/usercache/s_workspace_1_krb/appcache/application_1700812749231_15808832/container_e1419_1700812749231_15808832_01_000001/py4j-0.10.9.5-src.zip/py4j/java_gateway.py", line 1321, in __call__
return_value = get_return_value(
File "/home/work/hdd4/yarn/cluster/nodemanager/usercache/s_workspace_1_krb/appcache/application_1700812749231_15808832/container_e1419_1700812749231_15808832_01_000001/pyspark.zip/pyspark/sql/utils.py", line 190, in deco
return f(*a, **kw)
File "/home/work/hdd4/yarn/cluster/nodemanager/usercache/s_workspace_1_krb/appcache/application_1700812749231_15808832/container_e1419_1700812749231_15808832_01_000001/py4j-0.10.9.5-src.zip/py4j/protocol.py", line 326, in get_return_value
raise Py4JJavaError(
py4j.protocol.Py4JJavaError: An error occurred while calling o1297.count.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 17 in stage 181.0 failed 4 times, most recent failure: Lost task 17.3 in stage 181.0 (TID 344) (cluster-prc-st3040.bj executor 6): javax.security.sasl.SaslException: DIGEST-MD5: IO error acquiring password [Caused by com.xiaomi.fs.common.exception.InvalidTokenException: token FS_DELEGATION_TOKEN, owner=s_workspace_1_krb@XIAOMI.HADOOP, renewer=yarn_prc, realUser=, lifeTime=1727587928420, sequenceNum=1 can't be found in cache.]
at com.sun.security.sasl.digest.DigestMD5Server.validateClientResponse(DigestMD5Server.java:596)
at com.sun.security.sasl.digest.DigestMD5Server.evaluateResponse(DigestMD5Server.java:247)
at com.xiaomi.fs.common.auth.SaslServerHandler.processSaslToken(SaslServerHandler.java:119)
at com.xiaomi.fs.common.auth.SaslServerHandler.handleMessage(SaslServerHandler.java:95)
at com.xiaomi.fs.common.auth.AuthenticatedServerObserver.onNext(AuthenticatedServerObserver.java:50)
at com.xiaomi.fs.common.auth.AuthenticatedServerObserver.onNext(AuthenticatedServerObserver.java:12)
at io.grpc.stub.ServerCalls$StreamingServerCallHandler$StreamingServerCallListener.onMessage(ServerCalls.java:262)
at io.grpc.ForwardingServerCallListener.onMessage(ForwardingServerCallListener.java:33)
at io.grpc.ForwardingServerCallListener.onMessage(ForwardingServerCallListener.java:33)
at io.grpc.internal.ServerCallImpl$ServerStreamListenerImpl.messagesAvailableInternal(ServerCallImpl.java:329)
at io.grpc.internal.ServerCallImpl$ServerStreamListenerImpl.messagesAvailable(ServerCallImpl.java:314)
at io.grpc.internal.ServerImpl$JumpToApplicationThreadServerStreamListener$1MessagesAvailable.runInContext(ServerImpl.java:833)
at io.grpc.internal.ContextRunnable.run(ContextRunnable.java:37)
at io.grpc.internal.SerializingExecutor.run(SerializingExecutor.java:133)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1136)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:635)
at java.lang.Thread.run(Thread.java:840)
Caused by: com.xiaomi.fs.common.exception.InvalidTokenException: token FS_DELEGATION_TOKEN, owner=s_workspace_1_krb@XIAOMI.HADOOP, renewer=yarn_prc, realUser=, lifeTime=1727587928420, sequenceNum=1 can't be found in cache.
at com.xiaomi.fs.common.auth.token.AbstractSecretManager.checkToken(AbstractSecretManager.java:219)
at com.xiaomi.fs.common.auth.token.AbstractSecretManager.retrievePassword(AbstractSecretManager.java:212)
at com.xiaomi.fs.common.auth.token.AbstractSecretManager.retriableRetrievePassword(AbstractSecretManager.java:207)
at com.xiaomi.fs.common.auth.SaslRpcServer$SaslDigestCallbackHandler.getPassword(SaslRpcServer.java:255)
at com.xiaomi.fs.common.auth.SaslRpcServer$SaslDigestCallbackHandler.handle(SaslRpcServer.java:279)
at com.sun.security.sasl.digest.DigestMD5Server.validateClientResponse(DigestMD5Server.java:587)
... 16 more
Driver stacktrace:
at org.apache.spark.scheduler.DAGScheduler.failJobAndIndependentStages(DAGScheduler.scala:2721)
at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2(DAGScheduler.scala:2657)
at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2$adapted(DAGScheduler.scala:2656)
at scala.collection.mutable.ResizableArray.foreach(ResizableArray.scala:62)
at scala.collection.mutable.ResizableArray.foreach$(ResizableArray.scala:55)
at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:49)
at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:2656)
at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1(DAGScheduler.scala:1188)
at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1$adapted(DAGScheduler.scala:1188)
at scala.Option.foreach(Option.scala:407)
at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:1188)
at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:2917)
at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2859)
at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2848)
at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:49)
Caused by: javax.security.sasl.SaslException: DIGEST-MD5: IO error acquiring password [Caused by com.xiaomi.fs.common.exception.InvalidTokenException: token FS_DELEGATION_TOKEN, owner=s_workspace_1_krb@XIAOMI.HADOOP, renewer=yarn_prc, realUser=, lifeTime=1727587928420, sequenceNum=1 can't be found in cache.]
at com.sun.security.sasl.digest.DigestMD5Server.validateClientResponse(DigestMD5Server.java:596)
at com.sun.security.sasl.digest.DigestMD5Server.evaluateResponse(DigestMD5Server.java:247)
at com.xiaomi.fs.common.auth.SaslServerHandler.processSaslToken(SaslServerHandler.java:119)
at com.xiaomi.fs.common.auth.SaslServerHandler.handleMessage(SaslServerHandler.java:95)
at com.xiaomi.fs.common.auth.AuthenticatedServerObserver.onNext(AuthenticatedServerObserver.java:50)
at com.xiaomi.fs.common.auth.AuthenticatedServerObserver.onNext(AuthenticatedServerObserver.java:12)
at io.grpc.stub.ServerCalls$StreamingServerCallHandler$StreamingServerCallListener.onMessage(ServerCalls.java:262)
at io.grpc.ForwardingServerCallListener.onMessage(ForwardingServerCallListener.java:33)
at io.grpc.ForwardingServerCallListener.onMessage(ForwardingServerCallListener.java:33)
at io.grpc.internal.ServerCallImpl$ServerStreamListenerImpl.messagesAvailableInternal(ServerCallImpl.java:329)
at io.grpc.internal.ServerCallImpl$ServerStreamListenerImpl.messagesAvailable(ServerCallImpl.java:314)
at io.grpc.internal.ServerImpl$JumpToApplicationThreadServerStreamListener$1MessagesAvailable.runInContext(ServerImpl.java:833)
at io.grpc.internal.ContextRunnable.run(ContextRunnable.java:37)
at io.grpc.internal.SerializingExecutor.run(SerializingExecutor.java:133)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1136)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:635)
at java.lang.Thread.run(Thread.java:840)
Caused by: com.xiaomi.fs.common.exception.InvalidTokenException: token FS_DELEGATION_TOKEN, owner=s_workspace_1_krb@XIAOMI.HADOOP, renewer=yarn_prc, realUser=, lifeTime=1727587928420, sequenceNum=1 can't be found in cache.
at com.xiaomi.fs.common.auth.token.AbstractSecretManager.checkToken(AbstractSecretManager.java:219)
at com.xiaomi.fs.common.auth.token.AbstractSecretManager.retrievePassword(AbstractSecretManager.java:212)
at com.xiaomi.fs.common.auth.token.AbstractSecretManager.retriableRetrievePassword(AbstractSecretManager.java:207)
at com.xiaomi.fs.common.auth.SaslRpcServer$SaslDigestCallbackHandler.getPassword(SaslRpcServer.java:255)
at com.xiaomi.fs.common.auth.SaslRpcServer$SaslDigestCallbackHandler.handle(SaslRpcServer.java:279)
at com.sun.security.sasl.digest.DigestMD5Server.validateClientResponse(DigestMD5Server.java:587)
How to reproduce
A long-running Spark Job accessing Fileset on HDFS.
Version
main branch
Describe what's wrong
In Spark, the delegation tokens of a FileSystem are refreshed at a certain time interval. Currently, GVFS does not override the addDelegationTokens method, so the tokens required by the underlying file systems (such as HDFS) cannot be updated. We should override the addDelegationTokens method.
Error message and/or stacktrace
How to reproduce
A long-running Spark Job accessing Fileset on HDFS.
Additional context
No response