apache / hudi

Upserts, Deletes And Incremental Processing on Big Data.
https://hudi.apache.org/
Apache License 2.0

[SUPPORT] Alter table change column type, then querying the table throws NullPointerException #12267

Open GaussEAcc opened 5 days ago

GaussEAcc commented 5 days ago

To Reproduce

Execute the SQL commands below in spark-sql:

CREATE TABLE hudi_table1 (
    ts BIGINT,
    uuid STRING,
    rider STRING,
    driver STRING,
    fare DOUBLE,
    city STRING
) 
USING HUDI
OPTIONS (
    type = 'cow',
    primaryKey = 'ts',
    hoodie.datasource.write.recordkey.field = 'ts',
    hoodie.bucket.index.hash.field = 'ts',
    hoodie.bucket.index.num.buckets = 10,
    hoodie.index.bucket.engine = 'SIMPLE',
    hoodie.index.type = 'BUCKET',
    hoodie.clustering.plan.strategy.sort.columns = 'ts'
)
PARTITIONED BY (city);
INSERT INTO hudi_table1
VALUES
(1695159649087,'334e26e9-8355-45cc-97c6-c31daf0df330','rider-A','driver-K',19.10,'san_francisco');

set hoodie.schema.on.read.enable=true;
set spark.sql.catalog.spark_catalog=org.apache.spark.sql.hudi.catalog.HoodieCatalog;
ALTER TABLE hudi_table1 CHANGE COLUMN fare fare decimal;

SELECT * FROM hudi_table1;
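
For what it's worth, the stack trace below fails inside Spark's vectorized Parquet reader (WritableColumnVector.getDecimal calling OnHeapColumnVector.getLong), so one triage step that may help narrow it down, purely a suggestion and not verified against this table, is to re-run the query with the vectorized reader turned off:

-- Hypothetical triage step: spark.sql.parquet.enableVectorizedReader is a
-- standard Spark SQL flag (not Hudi-specific); if the query succeeds with it
-- disabled, the problem is likely confined to the vectorized read path.
set spark.sql.parquet.enableVectorizedReader=false;
SELECT * FROM hudi_table1;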

Expected behavior

The query should succeed and return the inserted row.
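
As a rough sketch of what success would look like (assuming the bare DECIMAL in the ALTER statement defaults to DECIMAL(10,0) in Spark SQL, so the stored 19.10 would surface as 19; Hudi meta columns omitted):

-- Hypothetical expected output after the type change (result row is an assumption):
SELECT ts, uuid, rider, driver, fare, city FROM hudi_table1;
-- 1695159649087  334e26e9-8355-45cc-97c6-c31daf0df330  rider-A  driver-K  19  san_francisco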

Environment Description

Stacktrace

24/11/16 17:19:59 ERROR Executor: Exception in task 0.0 in stage 40.0 (TID 80)
java.lang.NullPointerException
    at org.apache.spark.sql.execution.vectorized.OnHeapColumnVector.getLong(OnHeapColumnVector.java:380)
    at org.apache.spark.sql.execution.vectorized.WritableColumnVector.getDecimal(WritableColumnVector.java:396)
    at org.apache.spark.sql.vectorized.ColumnarBatchRow.getDecimal(ColumnarBatchRow.java:121)
    at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage1.processNext(Unknown Source)
    at org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43)
    at org.apache.spark.sql.execution.WholeStageCodegenExec$$anon$1.hasNext(WholeStageCodegenExec.scala:760)
    at org.apache.spark.sql.execution.SparkPlan.$anonfun$getByteArrayRdd$1(SparkPlan.scala:364)
    at org.apache.spark.rdd.RDD.$anonfun$mapPartitionsInternal$2(RDD.scala:889)
    at org.apache.spark.rdd.RDD.$anonfun$mapPartitionsInternal$2$adapted(RDD.scala:889)
    at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
    at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:365)
    at org.apache.spark.rdd.RDD.iterator(RDD.scala:329)
    at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:90)
    at org.apache.spark.scheduler.Task.run(Task.scala:136)
    at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$5(Executor.scala:568)
    at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1645)
    at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:571)
    at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
    at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
    at java.lang.Thread.run(Thread.java:750)
24/11/16 17:19:59 WARN TaskSetManager: Lost task 0.0 in stage 40.0 (TID 80) (xxx executor driver): java.lang.NullPointerException
    at org.apache.spark.sql.execution.vectorized.OnHeapColumnVector.getLong(OnHeapColumnVector.java:380)
    at org.apache.spark.sql.execution.vectorized.WritableColumnVector.getDecimal(WritableColumnVector.java:396)
    at org.apache.spark.sql.vectorized.ColumnarBatchRow.getDecimal(ColumnarBatchRow.java:121)
    at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage1.processNext(Unknown Source)
    at org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43)
    at org.apache.spark.sql.execution.WholeStageCodegenExec$$anon$1.hasNext(WholeStageCodegenExec.scala:760)
    at org.apache.spark.sql.execution.SparkPlan.$anonfun$getByteArrayRdd$1(SparkPlan.scala:364)
    at org.apache.spark.rdd.RDD.$anonfun$mapPartitionsInternal$2(RDD.scala:889)
    at org.apache.spark.rdd.RDD.$anonfun$mapPartitionsInternal$2$adapted(RDD.scala:889)
    at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
    at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:365)
    at org.apache.spark.rdd.RDD.iterator(RDD.scala:329)
    at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:90)
    at org.apache.spark.scheduler.Task.run(Task.scala:136)
    at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$5(Executor.scala:568)
    at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1645)
    at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:571)
    at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
    at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
    at java.lang.Thread.run(Thread.java:750)

24/11/16 17:19:59 ERROR TaskSetManager: Task 0 in stage 40.0 failed 1 times; aborting job
24/11/16 17:19:59 INFO TaskSchedulerImpl: Removed TaskSet 40.0, whose tasks have all completed, from pool
24/11/16 17:19:59 INFO TaskSchedulerImpl: Cancelling stage 40
24/11/16 17:19:59 INFO TaskSchedulerImpl: Killing all running tasks in stage 40: Stage cancelled
24/11/16 17:19:59 INFO DAGScheduler: ResultStage 40 (collect at SparkPlan.scala:424) failed in 0.064 s due to Job aborted due to stage failure: Task 0 in stage 40.0 failed 1 times, most recent failure: Lost task 0.0 in stage 40.0 (TID 80) (xxxx executor driver): java.lang.NullPointerException
    at org.apache.spark.sql.execution.vectorized.OnHeapColumnVector.getLong(OnHeapColumnVector.java:380)
    at org.apache.spark.sql.execution.vectorized.WritableColumnVector.getDecimal(WritableColumnVector.java:396)
    at org.apache.spark.sql.vectorized.ColumnarBatchRow.getDecimal(ColumnarBatchRow.java:121)
    at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage1.processNext(Unknown Source)
    at org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43)
    at org.apache.spark.sql.execution.WholeStageCodegenExec$$anon$1.hasNext(WholeStageCodegenExec.scala:760)
    at org.apache.spark.sql.execution.SparkPlan.$anonfun$getByteArrayRdd$1(SparkPlan.scala:364)
    at org.apache.spark.rdd.RDD.$anonfun$mapPartitionsInternal$2(RDD.scala:889)
    at org.apache.spark.rdd.RDD.$anonfun$mapPartitionsInternal$2$adapted(RDD.scala:889)
    at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
    at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:365)
    at org.apache.spark.rdd.RDD.iterator(RDD.scala:329)
    at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:90)
    at org.apache.spark.scheduler.Task.run(Task.scala:136)
    at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$5(Executor.scala:568)
    at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1645)
    at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:571)
    at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
    at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
    at java.lang.Thread.run(Thread.java:750)

Driver stacktrace:
24/11/16 17:19:59 INFO AppStatusListener: total cpuCost for the job 33 is 53
24/11/16 17:19:59 INFO DAGScheduler: Job 33 failed: collect at SparkPlan.scala:424, took 0.069865 s
24/11/16 17:19:59 INFO SQLDefense: {"engine":"SPARK","instanceId":"c0e4827d-4e0e-4fe1-aae7-f5d97771d885","queryId":38,"querySql":"select * from hudi_table1","status":"Failed","executionTime":"106","queuedTime":"","startTime":"2024-11-16 17:19:59","endTime":"2024-11-16 17:19:59","userName":"xxxxx","userIP":"xxx","inputRow":0,"inputData":0,"writtenRow":0,"writtenData":0,"resultRow":0,"resultData":"","assignedMemory":"","totalMemory":"","cpuTime":101,"scanPartitions":"List()","scanFiles":"List()","splits":"","tasks":"","SQLDefense":[]}

org.apache.spark.SparkException: Job aborted due to stage failure: Task 0 in stage 40.0 failed 1 times, most recent failure: Lost task 0.0 in stage 40.0 (TID 80) (76.77.7.42 executor driver): java.lang.NullPointerException
    at org.apache.spark.sql.execution.vectorized.OnHeapColumnVector.getLong(OnHeapColumnVector.java:380)
    at org.apache.spark.sql.execution.vectorized.WritableColumnVector.getDecimal(WritableColumnVector.java:396)
    at org.apache.spark.sql.vectorized.ColumnarBatchRow.getDecimal(ColumnarBatchRow.java:121)
    at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage1.processNext(Unknown Source)
    at org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43)
    at org.apache.spark.sql.execution.WholeStageCodegenExec$$anon$1.hasNext(WholeStageCodegenExec.scala:760)
    at org.apache.spark.sql.execution.SparkPlan.$anonfun$getByteArrayRdd$1(SparkPlan.scala:364)
    at org.apache.spark.rdd.RDD.$anonfun$mapPartitionsInternal$2(RDD.scala:889)
    at org.apache.spark.rdd.RDD.$anonfun$mapPartitionsInternal$2$adapted(RDD.scala:889)
    at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
    at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:365)
    at org.apache.spark.rdd.RDD.iterator(RDD.scala:329)
    at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:90)
    at org.apache.spark.scheduler.Task.run(Task.scala:136)
    at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$5(Executor.scala:568)
    at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1645)
    at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:571)
    at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
    at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
    at java.lang.Thread.run(Thread.java:750)

Driver stacktrace:
    at org.apache.spark.scheduler.DAGScheduler.failJobAndIndependentStages(DAGScheduler.scala:2793)
    at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2(DAGScheduler.scala:2729)
    at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2$adapted(DAGScheduler.scala:2728)
    at scala.collection.mutable.ResizableArray.foreach(ResizableArray.scala:62)
    at scala.collection.mutable.ResizableArray.foreach$(ResizableArray.scala:55)
    at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:49)
    at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:2728)
    at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1(DAGScheduler.scala:1247)
    at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1$adapted(DAGScheduler.scala:1247)
    at scala.Option.foreach(Option.scala:407)
    at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:1247)
    at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:2987)
    at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2923)
    at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2912)
    at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:49)
    at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:1015)
    at org.apache.spark.SparkContext.runJob(SparkContext.scala:2432)
    at org.apache.spark.SparkContext.runJob(SparkContext.scala:2453)
    at org.apache.spark.SparkContext.runJob(SparkContext.scala:2472)
    at org.apache.spark.SparkContext.runJob(SparkContext.scala:2497)
    at org.apache.spark.rdd.RDD.$anonfun$collect$1(RDD.scala:1020)
    at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
    at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
    at org.apache.spark.rdd.RDD.withScope(RDD.scala:405)
    at org.apache.spark.rdd.RDD.collect(RDD.scala:1019)
    at org.apache.spark.sql.execution.SparkPlan.executeCollect(SparkPlan.scala:424)
    at org.apache.spark.sql.execution.SparkPlan.executeCollectPublic(SparkPlan.scala:451)
    at org.apache.spark.sql.execution.HiveResult$.hiveResultString(HiveResult.scala:76)
    at org.apache.spark.sql.hive.thriftserver.SparkSQLDriver.$anonfun$run$2(SparkSQLDriver.scala:69)
    at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$6(SQLExecution.scala:125)
    at org.apache.spark.sql.execution.SQLExecution$.withSQLConfPropagated(SQLExecution.scala:187)
    at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$1(SQLExecution.scala:107)
    at org.apache.spark.sql.SparkSession.withActive(SparkSession.scala:958)
    at org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:69)
    at org.apache.spark.sql.hive.thriftserver.SparkSQLDriver.run(SparkSQLDriver.scala:69)
    at org.apache.spark.sql.hive.thriftserver.SparkSQLCLIDriver.processCmd(SparkSQLCLIDriver.scala:417)
    at org.apache.spark.sql.hive.thriftserver.SparkSQLCLIDriver.$anonfun$processLine$1(SparkSQLCLIDriver.scala:566)
    at org.apache.spark.sql.hive.thriftserver.SparkSQLCLIDriver.$anonfun$processLine$1$adapted(SparkSQLCLIDriver.scala:560)
    at scala.collection.Iterator.foreach(Iterator.scala:943)
    at scala.collection.Iterator.foreach$(Iterator.scala:943)
    at scala.collection.AbstractIterator.foreach(Iterator.scala:1431)
    at scala.collection.IterableLike.foreach(IterableLike.scala:74)
    at scala.collection.IterableLike.foreach$(IterableLike.scala:73)
    at scala.collection.AbstractIterable.foreach(Iterable.scala:56)
    at org.apache.spark.sql.hive.thriftserver.SparkSQLCLIDriver.processLine(SparkSQLCLIDriver.scala:560)
    at org.apache.spark.sql.hive.thriftserver.SparkSQLCLIDriver$.main(SparkSQLCLIDriver.scala:293)
    at org.apache.spark.sql.hive.thriftserver.SparkSQLCLIDriver.main(SparkSQLCLIDriver.scala)
    at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
    at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
    at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
    at java.lang.reflect.Method.invoke(Method.java:498)
    at org.apache.spark.deploy.JavaMainApplication.start(SparkApplication.scala:52)
    at org.apache.spark.deploy.SparkSubmit.org$apache$spark$deploy$SparkSubmit$$runMain(SparkSubmit.scala:1013)
    at org.apache.spark.deploy.SparkSubmit.doRunMain$1(SparkSubmit.scala:180)
    at org.apache.spark.deploy.SparkSubmit.submit(SparkSubmit.scala:203)
    at org.apache.spark.deploy.SparkSubmit.doSubmit(SparkSubmit.scala:90)
    at org.apache.spark.deploy.SparkSubmit$$anon$2.doSubmit(SparkSubmit.scala:1101)
    at org.apache.spark.deploy.SparkSubmit$.main(SparkSubmit.scala:1110)
    at org.apache.spark.deploy.SparkSubmit.main(SparkSubmit.scala)
Caused by: java.lang.NullPointerException
    at org.apache.spark.sql.execution.vectorized.OnHeapColumnVector.getLong(OnHeapColumnVector.java:380)
    at org.apache.spark.sql.execution.vectorized.WritableColumnVector.getDecimal(WritableColumnVector.java:396)
    at org.apache.spark.sql.vectorized.ColumnarBatchRow.getDecimal(ColumnarBatchRow.java:121)
    at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage1.processNext(Unknown Source)
    at org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43)
    at org.apache.spark.sql.execution.WholeStageCodegenExec$$anon$1.hasNext(WholeStageCodegenExec.scala:760)
    at org.apache.spark.sql.execution.SparkPlan.$anonfun$getByteArrayRdd$1(SparkPlan.scala:364)
    at org.apache.spark.rdd.RDD.$anonfun$mapPartitionsInternal$2(RDD.scala:889)
    at org.apache.spark.rdd.RDD.$anonfun$mapPartitionsInternal$2$adapted(RDD.scala:889)
    at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
    at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:365)
    at org.apache.spark.rdd.RDD.iterator(RDD.scala:329)
    at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:90)
    at org.apache.spark.scheduler.Task.run(Task.scala:136)
    at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$5(Executor.scala:568)
    at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1645)
    at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:571)
    at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
    at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
    at java.lang.Thread.run(Thread.java:750)
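
If useful for triage, the following standard Spark SQL commands (a suggestion only, nothing Hudi-specific) would confirm what type the table reports for fare after the ALTER, which should help tell whether the problem is in the evolved schema itself or only in the Parquet read path:

-- Plain Spark SQL; with hoodie.schema.on.read.enable=true the fare column is
-- expected (assumption) to show up as decimal(10,0) here after the type change.
DESCRIBE TABLE hudi_table1;
SHOW CREATE TABLE hudi_table1;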