apache / incubator-gluten

Gluten is a middle layer responsible for offloading JVM-based SQL engines' execution to native engines.
https://gluten.apache.org/
Apache License 2.0

[VL] Insert Table Error #2033

Closed tianyouyangying closed 1 year ago

tianyouyangying commented 1 year ago

Environment: Hadoop CDP 7.x. Error info:

Error: Error operating EXECUTE_STATEMENT: org.apache.spark.SparkException: Job aborted.
    at org.apache.spark.sql.errors.QueryExecutionErrors$.jobAbortedError(QueryExecutionErrors.scala:651)
    at org.apache.spark.sql.execution.datasources.FileFormatWriter$.write(FileFormatWriter.scala:278)
    at org.apache.spark.sql.execution.datasources.InsertIntoHadoopFsRelationCommand.run(InsertIntoHadoopFsRelationCommand.scala:186)
    at org.apache.spark.sql.execution.command.DataWritingCommandExec.sideEffectResult$lzycompute(commands.scala:113)
    at org.apache.spark.sql.execution.command.DataWritingCommandExec.sideEffectResult(commands.scala:111)
    at org.apache.spark.sql.execution.command.DataWritingCommandExec.executeCollect(commands.scala:125)
    at org.apache.spark.sql.execution.QueryExecution$$anonfun$eagerlyExecuteCommands$1.$anonfun$applyOrElse$1(QueryExecution.scala:98)
    at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$6(SQLExecution.scala:109)
    at org.apache.spark.sql.execution.SQLExecution$.withSQLConfPropagated(SQLExecution.scala:169)
    at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$1(SQLExecution.scala:95)
    at org.apache.spark.sql.SparkSession.withActive(SparkSession.scala:779)
    at org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:64)
    at org.apache.spark.sql.execution.QueryExecution$$anonfun$eagerlyExecuteCommands$1.applyOrElse(QueryExecution.scala:98)
    at org.apache.spark.sql.execution.QueryExecution$$anonfun$eagerlyExecuteCommands$1.applyOrElse(QueryExecution.scala:94)
    at org.apache.spark.sql.catalyst.trees.TreeNode.$anonfun$transformDownWithPruning$1(TreeNode.scala:584)
    at org.apache.spark.sql.catalyst.trees.CurrentOrigin$.withOrigin(TreeNode.scala:176)
    at org.apache.spark.sql.catalyst.trees.TreeNode.transformDownWithPruning(TreeNode.scala:584)
    at org.apache.spark.sql.catalyst.plans.logical.LogicalPlan.org$apache$spark$sql$catalyst$plans$logical$AnalysisHelper$$super$transformDownWithPruning(LogicalPlan.scala:30)
    at org.apache.spark.sql.catalyst.plans.logical.AnalysisHelper.transformDownWithPruning(AnalysisHelper.scala:267)
    at org.apache.spark.sql.catalyst.plans.logical.AnalysisHelper.transformDownWithPruning$(AnalysisHelper.scala:263)
    at org.apache.spark.sql.catalyst.plans.logical.LogicalPlan.transformDownWithPruning(LogicalPlan.scala:30)
    at org.apache.spark.sql.catalyst.plans.logical.LogicalPlan.transformDownWithPruning(LogicalPlan.scala:30)
    at org.apache.spark.sql.catalyst.trees.TreeNode.transformDown(TreeNode.scala:560)
    at org.apache.spark.sql.execution.QueryExecution.eagerlyExecuteCommands(QueryExecution.scala:94)
    at org.apache.spark.sql.execution.QueryExecution.commandExecuted$lzycompute(QueryExecution.scala:81)
    at org.apache.spark.sql.execution.QueryExecution.commandExecuted(QueryExecution.scala:79)
    at org.apache.spark.sql.Dataset.<init>(Dataset.scala:220)
    at org.apache.spark.sql.Dataset$.$anonfun$ofRows$2(Dataset.scala:100)
    at org.apache.spark.sql.SparkSession.withActive(SparkSession.scala:779)
    at org.apache.spark.sql.Dataset$.ofRows(Dataset.scala:97)
    at org.apache.spark.sql.SparkSession.$anonfun$sql$1(SparkSession.scala:622)
    at org.apache.spark.sql.SparkSession.withActive(SparkSession.scala:779)
    at org.apache.spark.sql.SparkSession.sql(SparkSession.scala:617)
    at org.apache.kyuubi.engine.spark.operation.ExecuteStatement.$anonfun$executeStatement$1(ExecuteStatement.scala:80)
    at scala.runtime.java8.JFunction0$mcV$sp.apply(JFunction0$mcV$sp.java:23)
    at org.apache.kyuubi.engine.spark.operation.SparkOperation.withLocalProperties(SparkOperation.scala:88)
    at org.apache.kyuubi.engine.spark.operation.ExecuteStatement.org$apache$kyuubi$engine$spark$operation$ExecuteStatement$$executeStatement(ExecuteStatement.scala:74)
    at org.apache.kyuubi.engine.spark.operation.ExecuteStatement$$anon$1.run(ExecuteStatement.scala:106)
    at java.util.concurrent.Executors$RunnableAdapter.call(Executors.java:511)
    at java.util.concurrent.FutureTask.run(FutureTask.java:266)
    at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
    at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
    at java.lang.Thread.run(Thread.java:748)
Caused by: org.apache.spark.SparkException: Job aborted due to stage failure: Task 33 in stage 354.0 failed 4 times, most recent failure: Lost task 33.3 in stage 354.0 (TID 37135) (dbdz84 executor 42): java.lang.RuntimeException: Exception: VeloxRuntimeError
Error Source: RUNTIME
Error Code: INVALID_STATE
Reason: Do not call allocate() when a write is in progress
Retriable: False
Expression: !currentHeader_
Function: allocate
File: ../.././velox/common/memory/HashStringAllocator.h
Line: 184
Stack trace:
# 0  _ZN8facebook5velox7process10StackTraceC1Ei
# 1  _ZN8facebook5velox14VeloxExceptionC1EPKcmS3_St17basic_string_viewIcSt11char_traitsIcEES7_S7_S7_bNS1_4TypeES7_
# 2  _ZN8facebook5velox6detail14veloxCheckFailINS0_17VeloxRuntimeErrorEPKcEEvRKNS1_18VeloxCheckFailArgsET0_
# 3  _ZN8facebook5velox9aggregate22SingleValueAccumulator5writeEPKNS0_10BaseVectorEiPNS0_19HashStringAllocatorE
# 4  _ZN8facebook5velox9aggregate9prestosql12_GLOBAL__N_122NonNumericMaxAggregate11addRawInputEPPcRKNS0_17SelectivityVectorERKSt6vectorISt10shared_ptrINS0_10BaseVectorEESaISD_EEb
# 5  _ZN8facebook5velox4exec11GroupingSet21addInputForActiveRowsERKSt10shared_ptrINS0_9RowVectorEEb
# 6  _ZN8facebook5velox4exec11GroupingSet21addInputForActiveRowsERKSt10shared_ptrINS0_9RowVectorEEb
# 7  _ZN8facebook5velox4exec11GroupingSet8addInputERKSt10shared_ptrINS0_9RowVectorEEb
# 8  _ZN8facebook5velox4exec15HashAggregation8addInputESt10shared_ptrINS0_9RowVectorEE
# 9  _ZN8facebook5velox4exec6Driver11runInternalERSt10shared_ptrIS2_ERS3_INS1_13BlockingStateEERS3_INS0_9RowVectorEE
# 10 _ZN8facebook5velox4exec6Driver4nextERSt10shared_ptrINS1_13BlockingStateEE
# 11 _ZN8facebook5velox4exec4Task4nextEPN5folly10SemiFutureINS3_4UnitEEE
# 12 _ZN6gluten24WholeStageResultIterator4nextEv
# 13 Java_io_glutenproject_vectorized_ColumnarBatchOutIterator_nativeHasNext
# 14 0x00007f4e2a25eb68

    at io.glutenproject.vectorized.ColumnarBatchOutIterator.nativeHasNext(Native Method)
    at io.glutenproject.vectorized.ColumnarBatchOutIterator.hasNextInternal(ColumnarBatchOutIterator.java:45)
    at io.glutenproject.vectorized.GeneralOutIterator.hasNext(GeneralOutIterator.java:37)
    at io.glutenproject.backendsapi.velox.IteratorHandler$$anon$3.hasNext(IteratorHandler.scala:298)
    at io.glutenproject.vectorized.CloseableColumnBatchIterator.hasNext(CloseableColumnBatchIterator.scala:41)
    at scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:460)
    at org.apache.spark.shuffle.ColumnarShuffleWriter.internalWrite(ColumnarShuffleWriter.scala:103)
    at org.apache.spark.shuffle.ColumnarShuffleWriter.write(ColumnarShuffleWriter.scala:203)
    at org.apache.spark.shuffle.ShuffleWriteProcessor.write(ShuffleWriteProcessor.scala:59)
    at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:99)
    at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:52)
    at org.apache.spark.scheduler.Task.run(Task.scala:136)
    at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$3(Executor.scala:548)
    at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1504)
    at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:551)
    at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
    at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
    at java.lang.Thread.run(Thread.java:748)

Driver stacktrace:
    at org.apache.spark.scheduler.DAGScheduler.failJobAndIndependentStages(DAGScheduler.scala:2672)
    at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2(DAGScheduler.scala:2608)
    at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2$adapted(DAGScheduler.scala:2607)
    at scala.collection.mutable.ResizableArray.foreach(ResizableArray.scala:62)
    at scala.collection.mutable.ResizableArray.foreach$(ResizableArray.scala:55)
    at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:49)
    at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:2607)
    at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1(DAGScheduler.scala:1182)
    at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1$adapted(DAGScheduler.scala:1182)
    at scala.Option.foreach(Option.scala:407)
    at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:1182)
    at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:2860)
    at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2802)
    at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2791)
    at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:49)
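
For anyone digging into the failed check itself: the trace points at a state invariant in Velox's HashStringAllocator, the arena that hash aggregation uses for variable-width accumulator data. The arena has two entry points, a one-shot allocate() and a streaming write that stays open until it is explicitly finished, and allocate() asserts that no streaming write is open (the "Expression: !currentHeader_" in the message). Below is a minimal sketch of that invariant; it is a simplified model only, and ArenaAllocator with its method names is illustrative, not the real Velox API.

```cpp
// Minimal sketch of the invariant behind "Do not call allocate() when a
// write is in progress". Illustrative only, not the Velox HashStringAllocator.
#include <cassert>
#include <cstddef>
#include <vector>

class ArenaAllocator {
 public:
  // One-shot allocation. Mirrors the failing check: the arena head must
  // not be claimed by an open streaming write.
  char* allocate(std::size_t bytes) {
    assert(currentHeader_ == nullptr &&
           "Do not call allocate() when a write is in progress");
    return allocateUnchecked(bytes);
  }

  // Streaming write: claims the arena head until finishWrite() is called.
  // Aggregates that serialize variable-width values (e.g. string max/min)
  // use this path.
  char* startWrite(std::size_t bytes) {
    assert(currentHeader_ == nullptr && "a write is already in progress");
    currentHeader_ = allocateUnchecked(bytes);
    return currentHeader_;
  }

  void finishWrite() { currentHeader_ = nullptr; }

 private:
  char* allocateUnchecked(std::size_t bytes) {
    blocks_.emplace_back(bytes);
    return blocks_.back().data();
  }

  char* currentHeader_ = nullptr;          // non-null while a write is open
  std::vector<std::vector<char>> blocks_;  // stand-in for arena pages
};

int main() {
  ArenaAllocator arena;
  arena.startWrite(16);
  // arena.allocate(8);  // would fire the same assertion as in the trace
  arena.finishWrite();
  arena.allocate(8);     // fine once the streaming write is closed
}
```

Read this way, the report says some code path opened a streaming write into the accumulator arena and re-entered allocate() before closing it, which suggests a Velox-side bug rather than a configuration problem on the cluster.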

ulysses-you commented 1 year ago

@tianyouyangying can you provide a reproducible case? Thank you.

tianyouyangying commented 1 year ago

I cannot provide a reproducible case. Running it again gives the following error:

Error: Error operating EXECUTE_STATEMENT: org.apache.spark.SparkException: Job aborted.
    at org.apache.spark.sql.errors.QueryExecutionErrors$.jobAbortedError(QueryExecutionErrors.scala:651)
    at org.apache.spark.sql.execution.datasources.FileFormatWriter$.write(FileFormatWriter.scala:278)
    at org.apache.spark.sql.execution.datasources.InsertIntoHadoopFsRelationCommand.run(InsertIntoHadoopFsRelationCommand.scala:186)
    at org.apache.spark.sql.execution.command.DataWritingCommandExec.sideEffectResult$lzycompute(commands.scala:113)
    at org.apache.spark.sql.execution.command.DataWritingCommandExec.sideEffectResult(commands.scala:111)
    at org.apache.spark.sql.execution.command.DataWritingCommandExec.executeCollect(commands.scala:125)
    at org.apache.spark.sql.execution.QueryExecution$$anonfun$eagerlyExecuteCommands$1.$anonfun$applyOrElse$1(QueryExecution.scala:98)
    at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$6(SQLExecution.scala:109)
    at org.apache.spark.sql.execution.SQLExecution$.withSQLConfPropagated(SQLExecution.scala:169)
    at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$1(SQLExecution.scala:95)
    at org.apache.spark.sql.SparkSession.withActive(SparkSession.scala:779)
    at org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:64)
    at org.apache.spark.sql.execution.QueryExecution$$anonfun$eagerlyExecuteCommands$1.applyOrElse(QueryExecution.scala:98)
    at org.apache.spark.sql.execution.QueryExecution$$anonfun$eagerlyExecuteCommands$1.applyOrElse(QueryExecution.scala:94)
    at org.apache.spark.sql.catalyst.trees.TreeNode.$anonfun$transformDownWithPruning$1(TreeNode.scala:584)
    at org.apache.spark.sql.catalyst.trees.CurrentOrigin$.withOrigin(TreeNode.scala:176)
    at org.apache.spark.sql.catalyst.trees.TreeNode.transformDownWithPruning(TreeNode.scala:584)
    at org.apache.spark.sql.catalyst.plans.logical.LogicalPlan.org$apache$spark$sql$catalyst$plans$logical$AnalysisHelper$$super$transformDownWithPruning(LogicalPlan.scala:30)
    at org.apache.spark.sql.catalyst.plans.logical.AnalysisHelper.transformDownWithPruning(AnalysisHelper.scala:267)
    at org.apache.spark.sql.catalyst.plans.logical.AnalysisHelper.transformDownWithPruning$(AnalysisHelper.scala:263)
    at org.apache.spark.sql.catalyst.plans.logical.LogicalPlan.transformDownWithPruning(LogicalPlan.scala:30)
    at org.apache.spark.sql.catalyst.plans.logical.LogicalPlan.transformDownWithPruning(LogicalPlan.scala:30)
    at org.apache.spark.sql.catalyst.trees.TreeNode.transformDown(TreeNode.scala:560)
    at org.apache.spark.sql.execution.QueryExecution.eagerlyExecuteCommands(QueryExecution.scala:94)
    at org.apache.spark.sql.execution.QueryExecution.commandExecuted$lzycompute(QueryExecution.scala:81)
    at org.apache.spark.sql.execution.QueryExecution.commandExecuted(QueryExecution.scala:79)
    at org.apache.spark.sql.Dataset.<init>(Dataset.scala:220)
    at org.apache.spark.sql.Dataset$.$anonfun$ofRows$2(Dataset.scala:100)
    at org.apache.spark.sql.SparkSession.withActive(SparkSession.scala:779)
    at org.apache.spark.sql.Dataset$.ofRows(Dataset.scala:97)
    at org.apache.spark.sql.SparkSession.$anonfun$sql$1(SparkSession.scala:622)
    at org.apache.spark.sql.SparkSession.withActive(SparkSession.scala:779)
    at org.apache.spark.sql.SparkSession.sql(SparkSession.scala:617)
    at org.apache.kyuubi.engine.spark.operation.ExecuteStatement.$anonfun$executeStatement$1(ExecuteStatement.scala:80)
    at scala.runtime.java8.JFunction0$mcV$sp.apply(JFunction0$mcV$sp.java:23)
    at org.apache.kyuubi.engine.spark.operation.SparkOperation.withLocalProperties(SparkOperation.scala:88)
    at org.apache.kyuubi.engine.spark.operation.ExecuteStatement.org$apache$kyuubi$engine$spark$operation$ExecuteStatement$$executeStatement(ExecuteStatement.scala:74)
    at org.apache.kyuubi.engine.spark.operation.ExecuteStatement$$anon$1.run(ExecuteStatement.scala:106)
    at java.util.concurrent.Executors$RunnableAdapter.call(Executors.java:511)
    at java.util.concurrent.FutureTask.run(FutureTask.java:266)
    at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
    at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
    at java.lang.Thread.run(Thread.java:748)
Caused by: org.apache.spark.SparkException: Job aborted due to stage failure: Task 5 in stage 18.0 failed 4 times, most recent failure: Lost task 5.3 in stage 18.0 (TID 3032) (dbdz68 executor 26): ExecutorLostFailure (executor 26 exited caused by one of the running tasks) Reason: Container from a bad node: container_e05_1684488042345_3070_01_000034 on host: dbdz68. Exit status: 134. Diagnostics: [2023-06-21 11:21:07.682]Exception from container-launch.
Container id: container_e05_1684488042345_3070_01_000034
Exit code: 134
Exception message: Launch container failed
Shell output: main : command provided 1
main : run as user is hive
main : requested yarn user is hive
Getting exit code file...
Creating script paths...
Writing pid file...
Writing to tmp file /data3/yarn2/nm/nmPrivate/application_1684488042345_3070/container_e05_1684488042345_3070_01_000034/container_e05_1684488042345_3070_01_000034.pid.tmp
Writing to cgroup task files...
Creating local dirs...
Launching container...

[2023-06-21 11:21:07.684]Container exited with a non-zero exit code 134. Error file: prelaunch.err.
Last 4096 bytes of prelaunch.err :
/bin/bash: line 1: 25051 Aborted                 /usr/java/jdk1.8.0_232-cloudera/bin/java -server -Xmx4096m '-XX:+IgnoreUnrecognizedVMOptions' '--add-opens=java.base/java.lang=ALL-UNNAMED' '--add-opens=java.base/java.lang.invoke=ALL-UNNAMED' '--add-opens=java.base/java.lang.reflect=ALL-UNNAMED' '--add-opens=java.base/java.io=ALL-UNNAMED' '--add-opens=java.base/java.net=ALL-UNNAMED' '--add-opens=java.base/java.nio=ALL-UNNAMED' '--add-opens=java.base/java.util=ALL-UNNAMED' '--add-opens=java.base/java.util.concurrent=ALL-UNNAMED' '--add-opens=java.base/java.util.concurrent.atomic=ALL-UNNAMED' '--add-opens=java.base/sun.nio.ch=ALL-UNNAMED' '--add-opens=java.base/sun.nio.cs=ALL-UNNAMED' '--add-opens=java.base/sun.security.action=ALL-UNNAMED' '--add-opens=java.base/sun.util.calendar=ALL-UNNAMED' '--add-opens=java.security.jgss/sun.security.krb5=ALL-UNNAMED' -Djava.io.tmpdir=/data2/yarn2/nm/usercache/hive/appcache/application_1684488042345_3070/container_e05_1684488042345_3070_01_000034/tmp '-Dspark.network.timeout=600000' '-Dspark.rpc.askTimeout=1000' '-Dspark.authenticate=false' '-Dspark.network.crypto.enabled=false' '-Dspark.driver.port=35562' '-Dspark.shuffle.service.port=7337' '-Dspark.ui.port=0' -Dspark.yarn.app.container.log.dir=/data3/yarn2/container-logs/application_1684488042345_3070/container_e05_1684488042345_3070_01_000034 -XX:OnOutOfMemoryError='kill %p' org.apache.spark.executor.YarnCoarseGrainedExecutorBackend --driver-url spark://CoarseGrainedScheduler@dbdz68:35562 --executor-id 26 --hostname dbdz68 --cores 8 --app-id application_1684488042345_3070 --resourceProfileId 0 > /data3/yarn2/container-logs/application_1684488042345_3070/container_e05_1684488042345_3070_01_000034/stdout 2> /data3/yarn2/container-logs/application_1684488042345_3070/container_e05_1684488042345_3070_01_000034/stderr
Last 4096 bytes of stderr :
Using Spark's default log4j profile: org/apache/spark/log4j2-defaults.properties
OpenJDK 64-Bit Server VM warning: You have loaded library /data3/yarn2/nm/usercache/hive/appcache/application_1684488042345_3070/gluten-7eabdaeb-81cc-4668-8833-ff9abca4e26b/jni/87b7d890-af1a-4cc8-8716-8d5959e1723c/gluten-3672577289053489304/libhdfs3.so.1 which might have disabled stack guard. The VM will try to fix the stack guard now.
It's highly recommended that you fix the library with 'execstack -c <libfile>', or link it with '-z noexecstack'.
I20230621 11:21:04.075465 25792 VeloxInitializer.cc:91] STARTUP: VeloxInitializer conf = {
 {spark.hadoop.input.connect.timeout, 180000}
 {spark.gluten.sql.columnar.backend.velox.IOThreads, 0}
 {spark.hadoop.fs.s3a.endpoint, localhost:9000}
 {spark.gluten.sql.columnar.backend.velox.SplitPreloadPerDriver, 2}
 {spark.hadoop.fs.s3a.access.key, }
 {spark.gluten.memory.task.offHeap.size.in.bytes, 1342177280}
 {spark.hadoop.input.write.timeout, 180000}
 {spark.hadoop.fs.s3a.path.style.access, true}
 {spark.hadoop.fs.s3a.use.instance.credentials, false}
 {spark.hadoop.fs.s3a.connection.ssl.enabled, false}
 {spark.memory.offHeap.enabled, true}
 {spark.hadoop.fs.s3a.secret.key, }
 {spark.hadoop.input.read.timeout, 180000}
 {spark.hadoop.dfs.client.log.severity, INFO}
 {spark.gluten.memory.offHeap.size.in.bytes, 10737418240}
}
memPoolOptions = { alignment:64, capacity:7680M, trackUsage:1 }
spillThreshold = 4608M
E20230621 11:21:06.801776 25790 Exceptions.h:68] Line: ../.././velox/common/memory/HashStringAllocator.h:184, Function:allocate, Expression: !currentHeader_ Do not call allocate() when a write is in progress, Source: RUNTIME, ErrorCode: INVALID_STATE
E20230621 11:21:07.282449 25791 Exceptions.h:68] Line: ../.././velox/common/memory/HashStringAllocator.h:184, Function:allocate, Expression: !currentHeader_ Do not call allocate() when a write is in progress, Source: RUNTIME, ErrorCode: INVALID_STATE
E20230621 11:21:07.311520 25793 Exceptions.h:68] Line: ../.././velox/common/memory/HashStringAllocator.h:184, Function:allocate, Expression: !currentHeader_ Do not call allocate() when a write is in progress, Source: RUNTIME, ErrorCode: INVALID_STATE

.
Driver stacktrace:
    at org.apache.spark.scheduler.DAGScheduler.failJobAndIndependentStages(DAGScheduler.scala:2672)
    at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2(DAGScheduler.scala:2608)
    at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2$adapted(DAGScheduler.scala:2607)
    at scala.collection.mutable.ResizableArray.foreach(ResizableArray.scala:62)
    at scala.collection.mutable.ResizableArray.foreach$(ResizableArray.scala:55)
    at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:49)
    at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:2607)
    at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1(DAGScheduler.scala:1182)
    at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1$adapted(DAGScheduler.scala:1182)
    at scala.Option.foreach(Option.scala:407)
    at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:1182)
    at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:2860)
    at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2802)
    at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2791)
    at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:49) (state=,code=0)
Closing: 0: jdbc:hive2://dbdz68:10099/prestat;password=root;principal=hive/dbdz68@CLOUDERA;

[INFO] 2023-06-21 11:21:08.200 +0800 - FINALIZE_SESSION
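
Two hedged notes on this second failure, inferred from the log rather than confirmed: exit code 134 is 128 + 6 (SIGABRT), matching the "/bin/bash: line 1: 25051 Aborted" line, so the executor died from a native abort following the HashStringAllocator errors, not from a YARN memory kill (those typically surface as 137/143). Separately, the numbers VeloxInitializer prints are internally consistent with the Spark settings; the quick check below reproduces them from the logged values (the fractions are read off the output, and whether Gluten actually derives capacity and spillThreshold this way is an assumption):

```cpp
// Cross-checking the memory figures printed by VeloxInitializer above.
#include <cstdint>
#include <cstdio>

int main() {
  const int64_t offHeapBytes = 10737418240LL;  // spark.gluten.memory.offHeap.size.in.bytes (10 GiB)
  const int64_t taskBytes = 1342177280LL;      // spark.gluten.memory.task.offHeap.size.in.bytes
  const int64_t cores = 8;                     // "--cores 8" in the executor launch command

  const int64_t offHeapMiB = offHeapBytes >> 20;  // 10240 MiB
  std::printf("per-task slice = offHeap / cores? %s\n",
              offHeapBytes / cores == taskBytes ? "yes" : "no");  // yes
  std::printf("capacity: 7680M == 0.75 * %lld MiB\n",
              (long long)offHeapMiB);   // 0.75 * 10240 = 7680
  std::printf("spillThreshold: 4608M == 0.45 * %lld MiB\n",
              (long long)offHeapMiB);   // 0.45 * 10240 = 4608
}
```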

loukey-lj commented 1 year ago

Has this problem been solved? I get an error every time I execute this SQL. @tianyouyangying

select
    bdpscheduletaskid,
    cast(avg(unix_timestamp(finishtimestr) - unix_timestamp(jobstarttimestr)) as string) as timecost,
    cast(avg(mem_value) as string) as men_vale,
    cast(avg(cpu_value) as string) as cpu_value,
    max(jobstarttimestr) as jobstarttimestr,
    max(finishtimestr) as finishtimestr
from ods_jobdiagnose.ods_task_consumption_list
where ymd >= '20230710'
    and ymd <= '20230720'
    and finalstatus = 'SUCCEEDED'
group by bdpscheduletaskid
having count(distinct ymd) = 11
    and count(ymd) = 11
    and min(substr(jobstarttimestr, 0, 10)) = '2023-07-10'
    and max(substr(finishtimestr, 0, 10)) = '2023-07-20'
    and min(apptype) = 'SPARK'
    and max(apptype) = 'SPARK';
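
An observation from the traces, not a confirmed diagnosis: frame #4 in both native stacks is facebook::velox::aggregate::prestosql::NonNumericMaxAggregate::addRawInput, reached through SingleValueAccumulator::write, i.e. max()/min() over non-numeric values, and this query's max(jobstarttimestr), max(finishtimestr), min(apptype) and max(apptype) on string columns are exactly that shape. The sketch below shows, in simplified form, why such an aggregate writes into the string arena at all; the class is an illustrative stand-in, not the real Velox SingleValueAccumulator. Unlike numeric max, each new winning value must be copied into accumulator-owned memory.

```cpp
// Illustrative stand-in for a string max accumulator (not the Velox class).
#include <string>
#include <string_view>

class SingleValueAccumulator {
 public:
  // Keep the larger of the stored value and the candidate. Keeping a string
  // means copying it into accumulator-owned storage; in Velox this copy goes
  // through HashStringAllocator, which is where the failing write happens.
  void updateMax(std::string_view candidate) {
    if (!hasValue_ || candidate > std::string_view(value_)) {
      value_.assign(candidate.data(), candidate.size());  // the "write"
      hasValue_ = true;
    }
  }

  std::string_view value() const { return value_; }

 private:
  std::string value_;     // stands in for arena-allocated bytes
  bool hasValue_ = false;
};

int main() {
  SingleValueAccumulator acc;
  acc.updateMax("HIVE");
  acc.updateMax("SPARK");  // "SPARK" > "HIVE", so the stored value is replaced
  return acc.value() == "SPARK" ? 0 : 1;  // exits 0
}
```

If an interim workaround is needed, rewriting the string aggregations numerically (for example max(unix_timestamp(finishtimestr)) instead of max(finishtimestr)) or disabling Gluten for the session via spark.gluten.enabled=false may avoid this code path; neither is verified against this exact bug.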


SLF4J: See https://www.slf4j.org/codes.html#ignoredBindings for an explanation.
E20230728 16:42:42.203538 147019 Exceptions.h:68] Line: ../.././velox/common/memory/HashStringAllocator.h:184, Function:allocate, Expression: !currentHeader_ Do not call allocate() when a write is in progress, Source: RUNTIME, ErrorCode: INVALID_STATE
java.lang.RuntimeException: Exception: VeloxRuntimeError
Error Source: RUNTIME
Error Code: INVALID_STATE
Reason: Do not call allocate() when a write is in progress
Retriable: False
Expression: !currentHeader_
Function: allocate
File: ../.././velox/common/memory/HashStringAllocator.h
Line: 184
Stack trace:
# 0  _ZN8facebook5velox7process10StackTraceC1Ei
# 1  _ZN8facebook5velox14VeloxExceptionC1EPKcmS3_St17basic_string_viewIcSt11char_traitsIcEES7_S7_S7_bNS1_4TypeES7_
# 2  _ZN8facebook5velox6detail14veloxCheckFailINS0_17VeloxRuntimeErrorEPKcEEvRKNS1_18VeloxCheckFailArgsET0_
# 3  _ZN8facebook5velox9aggregate22SingleValueAccumulator5writeEPKNS0_10BaseVectorEiPNS0_19HashStringAllocatorE
# 4  _ZN8facebook5velox9aggregate9prestosql12_GLOBAL__N_122NonNumericMaxAggregate11addRawInputEPPcRKNS0_17SelectivityVectorERKSt6vectorISt10shared_ptrINS0_10BaseVectorEESaISD_EEb
# 5  _ZN8facebook5velox4exec11GroupingSet21addInputForActiveRowsERKSt10shared_ptrINS0_9RowVectorEEb
# 6  _ZN8facebook5velox4exec11GroupingSet21addInputForActiveRowsERKSt10shared_ptrINS0_9RowVectorEEb
# 7  _ZN8facebook5velox4exec11GroupingSet8addInputERKSt10shared_ptrINS0_9RowVectorEEb
# 8  _ZN8facebook5velox4exec15HashAggregation8addInputESt10shared_ptrINS0_9RowVectorEE
# 9  _ZN8facebook5velox4exec6Driver11runInternalERSt10shared_ptrIS2_ERS3_INS1_13BlockingStateEERS3_INS0_9RowVectorEE
# 10 _ZN8facebook5velox4exec6Driver4nextERSt10shared_ptrINS1_13BlockingStateEE
# 11 _ZN8facebook5velox4exec4Task4nextEPN5folly10SemiFutureINS3_4UnitEEE
# 12 _ZN6gluten24WholeStageResultIterator4nextEv
# 13 Java_io_glutenproject_vectorized_ColumnarBatchOutIterator_nativeHasNext
# 14 0x00007fc0e1017a14

    at io.glutenproject.vectorized.ColumnarBatchOutIterator.nativeHasNext(Native Method)
    at io.glutenproject.vectorized.ColumnarBatchOutIterator.hasNextInternal(ColumnarBatchOutIterator.java:47)
    at io.glutenproject.vectorized.GeneralOutIterator.hasNext(GeneralOutIterator.java:37)
    at io.glutenproject.backendsapi.velox.IteratorHandler$$anon$3.hasNext(IteratorHandler.scala:298)
    at io.glutenproject.vectorized.CloseableColumnBatchIterator.hasNext(CloseableColumnBatchIterator.scala:41)
    at scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:460)
    at org.apache.spark.writer.gluten.uniffle.writer.NativeShuffleWriter.writeImpl(NativeShuffleWriter.scala:98)
    at org.apache.spark.writer.gluten.uniffle.writer.NativeShuffleWriter.write(NativeShuffleWriter.scala:80)
    at org.apache.spark.shuffle.ShuffleWriteProcessor.write(ShuffleWriteProcessor.scala:59)
    at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:99)
    at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:52)
    at org.apache.spark.scheduler.Task.run(Task.scala:131)
    at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$3(Executor.scala:506)
    at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1462)
    at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:509)
    at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142)
    at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617)
    at java.lang.Thread.run(Thread.java:745)