Open vinay-kl opened 4 years ago
Hi @vinay-kl, it looks like you are using the old JAR. There has been some major refactoring since then. Can you build the latest master and use its JAR? Let me know if you need any help with that.
@amoghmargoor sure, I am trying that; will post an update soon on this.
@amoghmargoor the issue persists with the latest JAR as well; however, if I change to the JobConf constructor with no arguments, there seems to be no issue.
Are there any repercussions if it's done this way? In this line:
https://github.com/qubole/spark-acid/blob/2e66f76b3a3c974a73f03d9e80d9c66a0475a552/src/main/scala/com/qubole/spark/hiveacid/rdd/HiveAcidRDD.scala#L174
Hey @vinay-kl, I guess it is because the conf being passed as an argument to JobConf is null. That means the conf being passed at this location will be lost: https://github.com/qubole/spark-acid/blob/2e66f76b3a3c974a73f03d9e80d9c66a0475a552/src/main/scala/com/qubole/spark/hiveacid/reader/hive/HiveAcidReader.scala#L93 which is not desirable. Can you check whether readerOptions.hadoopConf is actually null at that HiveAcidReader location? Also, what's the stack trace now?
Hey @amoghmargoor, readerOptions.hadoopConf is not null; the same conf that is broadcast is also non-null.
The conf printed are core-default.xml, core-site.xml, mapred-default.xml, mapred-site.xml, yarn-default.xml, yarn-site.xml, hdfs-default.xml, hdfs-site.xml, __spark_hadoop_conf__.xml, file:/etc/spark2/3.1.2.2-1/0/hive-site.xml
I'm able to use the API properly only if the no-args constructor is used instead at src/main/scala/com/qubole/spark/hiveacid/rdd/HiveAcidRDD.scala line 174:
val newJobConf = new JobConf()
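A minimal sketch of the guard being discussed here: fall back to the no-args constructor only when the incoming conf is null, so a valid conf is never silently dropped. `safeJobConf`, `Conf`, and `JConf` are hypothetical stand-ins for illustration, not spark-acid or Hadoop code.

```scala
// Toy stand-ins for Hadoop's Configuration/JobConf, to keep this self-contained.
class Conf(val resources: List[String])

class JConf(val resources: List[String]) {
  def this() = this(Nil)                        // like new JobConf()
  def this(other: Conf) = this(other.resources) // like new JobConf(conf); NPEs if other is null
}

// Hypothetical helper: copy the conf when present, otherwise start fresh,
// instead of passing a possibly-null conf straight to the copy constructor.
def safeJobConf(conf: Conf): JConf =
  Option(conf).map(new JConf(_)).getOrElse(new JConf())
```

This keeps the workaround (`new JobConf()`) only for the null case, rather than unconditionally discarding the broadcast conf.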
@vinay-kl The original issue which you created said that this value is null:
The value is coming as null in this line---> https://github.com/qubole/spark-acid/blob/f445eeef4416ee27192905e0e69a43076db7b2b1/src/main/scala/com/qubole/spark/datasources/hiveacid/rdd/Hive3Rdd.scala#L138
So I assume this problem is solved after the Jar upgrade according to your last comment.
Now if broadcasted conf is not null, what error do you get now (with stack trace) ?
@amoghmargoor my bad for not being clear on that. The issue still persists even after the JAR upgrade; the stack trace remains the same. I was only able to bypass it by using the no-args constructor, as mentioned above.
Warnings of this sort are also thrown:
20/01/30 07:08:01 WARN HiveConf: HiveConf of name hive.enforce.sorting does not exist
20/01/30 07:08:01 WARN HiveConf: HiveConf of name hive.hook.proto.base-directory does not exist
20/01/30 07:08:01 WARN HiveConf: HiveConf of name hive.strict.managed.tables does not exist
20/01/30 07:08:01 WARN HiveConf: HiveConf of name hive.stats.fetch.partition.stats does not exist
20/01/30 07:08:01 WARN HiveConf: HiveConf of name hive.heapsize does not exist
20/01/30 07:08:01 WARN HiveConf: HiveConf of name hive.enforce.bucketing does not exist
All these config keys are present in hive-site.xml
But you just said the conf is not null, so it cannot be the same issue. Can you please paste the current stack trace, even if you think it is the same?
Print statements:
conf::::Configuration: core-default.xml, core-site.xml, mapred-default.xml, mapred-site.xml, yarn-default.xml, yarn-site.xml, hdfs-default.xml, hdfs-site.xml, __spark_hadoop_conf__.xml, file:/etc/spark2/3.1.2.2-1/0/hive-site.xml false
conf:::Configuration: core-default.xml, core-site.xml, mapred-default.xml, mapred-site.xml, yarn-default.xml, yarn-site.xml, hdfs-default.xml, hdfs-site.xml, __spark_hadoop_conf__.xml, file:/etc/spark2/3.1.2.2-1/0/hive-site.xml false
20/01/30 15:05:55 ERROR AcidUtils: Failed to get files with ID; using regular API: Only supported for DFS; got class org.apache.hadoop.fs.azure.NativeAzureFileSystem
conf::::Configuration: core-default.xml, core-site.xml, mapred-default.xml, mapred-site.xml, yarn-default.xml, yarn-site.xml, hdfs-default.xml, hdfs-site.xml, __spark_hadoop_conf__.xml, file:/etc/spark2/3.1.2.2-1/0/hive-site.xml false
conf:::Configuration: core-default.xml, core-site.xml, mapred-default.xml, mapred-site.xml, yarn-default.xml, yarn-site.xml, hdfs-default.xml, hdfs-site.xml, __spark_hadoop_conf__.xml, file:/etc/spark2/3.1.2.2-1/0/hive-site.xml false
20/01/30 15:05:55 ERROR AcidUtils: Failed to get files with ID; using regular API: Only supported for DFS; got class org.apache.hadoop.fs.azure.NativeAzureFileSystem
conf::::Configuration: core-default.xml, core-site.xml, mapred-default.xml, mapred-site.xml, yarn-default.xml, yarn-site.xml, hdfs-default.xml, hdfs-site.xml, __spark_hadoop_conf__.xml, file:/etc/spark2/3.1.2.2-1/0/hive-site.xml false
conf:::Configuration: core-default.xml, core-site.xml, mapred-default.xml, mapred-site.xml, yarn-default.xml, yarn-site.xml, hdfs-default.xml, hdfs-site.xml, __spark_hadoop_conf__.xml, file:/etc/spark2/3.1.2.2-1/0/hive-site.xml false
WARN TaskSetManager: Lost task 3.0 in stage 0.0 (TID 3, wn1-sp-hiv.3e3rqto3nr5evmsjbqz0pkrj4g.tx.internal.cloudapp.net, executor 2): java.lang.NullPointerException
    at org.apache.hadoop.conf.Configuration.<init>(Configuration.java:820)
    at org.apache.hadoop.mapred.JobConf.<init>(JobConf.java:440)
    at com.qubole.spark.hiveacid.rdd.HiveAcidRDD.getJobConf(HiveAcidRDD.scala:168)
    at com.qubole.spark.hiveacid.rdd.HiveAcidRDD$$anon$1.<init>(HiveAcidRDD.scala:262)
    at com.qubole.spark.hiveacid.rdd.HiveAcidRDD.compute(HiveAcidRDD.scala:258)
    at com.qubole.spark.hiveacid.rdd.HiveAcidRDD.compute(HiveAcidRDD.scala:84)
    at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:324)
    at org.apache.spark.rdd.RDD.iterator(RDD.scala:288)
    at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
    at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:324)
    at org.apache.spark.rdd.RDD.iterator(RDD.scala:288)
    at org.apache.spark.rdd.UnionRDD.compute(UnionRDD.scala:105)
    at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:324)
    at org.apache.spark.rdd.RDD.iterator(RDD.scala:288)
    at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
    at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:324)
    at org.apache.spark.rdd.RDD.iterator(RDD.scala:288)
    at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
    at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:324)
    at org.apache.spark.rdd.RDD.iterator(RDD.scala:288)
    at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:99)
    at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:55)
    at org.apache.spark.scheduler.Task.run(Task.scala:121)
    at org.apache.spark.executor.Executor$TaskRunner$$anonfun$10.apply(Executor.scala:402)
    at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1360)
    at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:408)
    at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
    at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
    at java.lang.Thread.run(Thread.java:748)
[Stage 0:> (0 + 6) / 7]
20/01/30 15:05:57 ERROR TaskSetManager: Task 1 in stage 0.0 failed 4 times; aborting job
org.apache.spark.SparkException: Job aborted due to stage failure: Task 1 in stage 0.0 failed 4 times, most recent failure: Lost task 1.3 in stage 0.0 (TID 18, wn4-sp-hiv.3e3rqto3nr5evmsjbqz0pkrj4g.tx.internal.cloudapp.net, executor 3): java.lang.NullPointerException
    at org.apache.hadoop.conf.Configuration.<init>(Configuration.java:820)
    at org.apache.hadoop.mapred.JobConf.<init>(JobConf.java:440)
    at com.qubole.spark.hiveacid.rdd.HiveAcidRDD.getJobConf(HiveAcidRDD.scala:168)
    at com.qubole.spark.hiveacid.rdd.HiveAcidRDD$$anon$1.<init>(HiveAcidRDD.scala:262)
    at com.qubole.spark.hiveacid.rdd.HiveAcidRDD.compute(HiveAcidRDD.scala:258)
    at com.qubole.spark.hiveacid.rdd.HiveAcidRDD.compute(HiveAcidRDD.scala:84)
    at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:324)
    at org.apache.spark.rdd.RDD.iterator(RDD.scala:288)
    at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
    at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:324)
    at org.apache.spark.rdd.RDD.iterator(RDD.scala:288)
    at org.apache.spark.rdd.UnionRDD.compute(UnionRDD.scala:105)
    at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:324)
    at org.apache.spark.rdd.RDD.iterator(RDD.scala:288)
    at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
    at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:324)
    at org.apache.spark.rdd.RDD.iterator(RDD.scala:288)
    at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
    at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:324)
    at org.apache.spark.rdd.RDD.iterator(RDD.scala:288)
    at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:99)
    at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:55)
    at org.apache.spark.scheduler.Task.run(Task.scala:121)
    at org.apache.spark.executor.Executor$TaskRunner$$anonfun$10.apply(Executor.scala:402)
    at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1360)
    at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:408)
    at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
    at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
    at java.lang.Thread.run(Thread.java:748)
Driver stacktrace:
    at org.apache.spark.scheduler.DAGScheduler.org$apache$spark$scheduler$DAGScheduler$$failJobAndIndependentStages(DAGScheduler.scala:1889)
    at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1877)
    at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1876)
    at scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59)
    at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:48)
    at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:1876)
    at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:926)
    at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:926)
    at scala.Option.foreach(Option.scala:257)
    at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:926)
    at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:2110)
    at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2059)
    at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2048)
    at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:49)
    at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:737)
    at org.apache.spark.SparkContext.runJob(SparkContext.scala:2061)
    at org.apache.spark.SparkContext.runJob(SparkContext.scala:2082)
    at org.apache.spark.SparkContext.runJob(SparkContext.scala:2101)
    at org.apache.spark.SparkContext.runJob(SparkContext.scala:2126)
    at org.apache.spark.rdd.RDD$$anonfun$collect$1.apply(RDD.scala:945)
    at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
    at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
    at org.apache.spark.rdd.RDD.withScope(RDD.scala:363)
    at org.apache.spark.rdd.RDD.collect(RDD.scala:944)
    at org.apache.spark.sql.execution.SparkPlan.executeCollect(SparkPlan.scala:299)
    at org.apache.spark.sql.Dataset$$anonfun$count$1.apply(Dataset.scala:2830)
    at org.apache.spark.sql.Dataset$$anonfun$count$1.apply(Dataset.scala:2829)
    at org.apache.spark.sql.Dataset$$anonfun$53.apply(Dataset.scala:3364)
    at org.apache.spark.sql.execution.SQLExecution$$anonfun$withNewExecutionId$1.apply(SQLExecution.scala:78)
    at org.apache.spark.sql.execution.SQLExecution$.withSQLConfPropagated(SQLExecution.scala:125)
    at org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:73)
    at org.apache.spark.sql.Dataset.withAction(Dataset.scala:3363)
    at org.apache.spark.sql.Dataset.count(Dataset.scala:2829)
    ... 49 elided
Caused by: java.lang.NullPointerException
    at org.apache.hadoop.conf.Configuration.<init>(Configuration.java:820)
    at org.apache.hadoop.mapred.JobConf.<init>(JobConf.java:440)
    at com.qubole.spark.hiveacid.rdd.HiveAcidRDD.getJobConf(HiveAcidRDD.scala:168)
    at com.qubole.spark.hiveacid.rdd.HiveAcidRDD$$anon$1.<init>(HiveAcidRDD.scala:262)
    at com.qubole.spark.hiveacid.rdd.HiveAcidRDD.compute(HiveAcidRDD.scala:258)
    at com.qubole.spark.hiveacid.rdd.HiveAcidRDD.compute(HiveAcidRDD.scala:84)
    at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:324)
    at org.apache.spark.rdd.RDD.iterator(RDD.scala:288)
    at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
    at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:324)
    at org.apache.spark.rdd.RDD.iterator(RDD.scala:288)
    at org.apache.spark.rdd.UnionRDD.compute(UnionRDD.scala:105)
    at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:324)
    at org.apache.spark.rdd.RDD.iterator(RDD.scala:288)
    at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
    at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:324)
    at org.apache.spark.rdd.RDD.iterator(RDD.scala:288)
    at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
    at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:324)
    at org.apache.spark.rdd.RDD.iterator(RDD.scala:288)
    at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:99)
    at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:55)
    at org.apache.spark.scheduler.Task.run(Task.scala:121)
    at org.apache.spark.executor.Executor$TaskRunner$$anonfun$10.apply(Executor.scala:402)
    at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1360)
    at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:408)
    at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
    at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
    at java.lang.Thread.run(Thread.java:748)
Running on HDI 4.0 with Spark 2.4.0 and Hive 3.1.2.
--stack-trace--
val a: org.apache.hadoop.conf.Configuration = null
new JobConf(a)
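A pure-Scala analogue of why this minimal repro throws: a copy constructor that dereferences its argument fails with NullPointerException when handed null, which matches the `Configuration.<init>` frame in the stack trace. The class below is a toy stand-in for illustration, not Hadoop's Configuration.

```scala
// Toy stand-in: the auxiliary constructor reads fields off `other`,
// so passing null blows up exactly like new JobConf(null: Configuration).
class Conf(val resources: List[String]) {
  def this(other: Conf) = this(other.resources) // NPE if other == null
}

val outcome =
  try { new Conf(null: Conf); "constructed" }
  catch { case _: NullPointerException => "NPE" }
```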
The value is coming as null in this line--->
https://github.com/qubole/spark-acid/blob/f445eeef4416ee27192905e0e69a43076db7b2b1/src/main/scala/com/qubole/spark/datasources/hiveacid/rdd/Hive3Rdd.scala#L138
-- Even tried setting spark.hadoop.cloneConf; it seems to break in both the if and the else branch. -- Also tried HDI 3.6 with Spark 2.3.0, but there was an issue with the Guava JAR version being 24.1.1, so it was throwing because the method was not found
in this line-->
https://github.com/qubole/spark-acid/blob/f445eeef4416ee27192905e0e69a43076db7b2b1/src/main/scala/com/qubole/spark/datasources/hiveacid/rdd/Hive3Rdd.scala#L51
Can you guys please help me with this?
Thanks and Regards, Vinay K L