qubole / spark-acid

ACID Data Source for Apache Spark based on Hive ACID
Apache License 2.0

Not able to perform an action on Hive tables using spark-acid jar #34

Open vinay-kl opened 4 years ago

vinay-kl commented 4 years ago

Running on HDI 4.0 with Spark 2.4.0 and Hive 3.1.2.

spark-shell --packages qubole:spark-acid:0.4.0-s_2.11

val df = spark.read.format("HiveAcid").options(Map("table" -> "default.college")).load()
df.count

--stack-trace--

Caused by: java.lang.NullPointerException
  at org.apache.hadoop.conf.Configuration.<init>(Configuration.java:820)
  at org.apache.hadoop.mapred.JobConf.<init>(JobConf.java:440)
  at com.qubole.spark.datasources.hiveacid.rdd.Hive3RDD.getJobConf(Hive3Rdd.scala:165)
  at com.qubole.spark.datasources.hiveacid.rdd.Hive3RDD$$anon$1.<init>(Hive3Rdd.scala:257)
  at com.qubole.spark.datasources.hiveacid.rdd.Hive3RDD.compute(Hive3Rdd.scala:252)
  at com.qubole.spark.datasources.hiveacid.rdd.Hive3RDD.compute(Hive3Rdd.scala:86)
  at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:324)
  at org.apache.spark.rdd.RDD.iterator(RDD.scala:288)
  at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
  at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:324)
  at org.apache.spark.rdd.RDD.iterator(RDD.scala:288)
  at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
  at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:324)
  at org.apache.spark.rdd.RDD.iterator(RDD.scala:288)
  at org.apache.spark.rdd.UnionRDD.compute(UnionRDD.scala:105)
  at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:324)
  at org.apache.spark.rdd.RDD.iterator(RDD.scala:288)
  at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
  at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:324)
  at org.apache.spark.rdd.RDD.iterator(RDD.scala:288)
  at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
  at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:324)
  at org.apache.spark.rdd.RDD.iterator(RDD.scala:288)
  at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:99)
  at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:55)
  at org.apache.spark.scheduler.Task.run(Task.scala:121)
  at org.apache.spark.executor.Executor$TaskRunner$$anonfun$10.apply(Executor.scala:402)
  at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1360)
  at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:408)
  at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
  at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
  at java.lang.Thread.run(Thread.java:748)

JobConf is picked from hadoop-mapreduce-client-core-3.1.1.jar

Configuration is picked from hadoop-common-3.1.1.jar
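(For reference, one way to confirm which jar each class is loaded from, straight from spark-shell; this is plain JVM reflection, not a spark-acid API:)

println(classOf[org.apache.hadoop.mapred.JobConf].getProtectionDomain.getCodeSource.getLocation)
println(classOf[org.apache.hadoop.conf.Configuration].getProtectionDomain.getCodeSource.getLocation)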

val a: org.apache.hadoop.conf.Configuration = null
new JobConf(a)

java.lang.NullPointerException
  at org.apache.hadoop.conf.Configuration.<init>(Configuration.java:820)
  at org.apache.hadoop.mapred.JobConf.<init>(JobConf.java:440)
  ... 49 elided

The value is coming as null in this line---> https://github.com/qubole/spark-acid/blob/f445eeef4416ee27192905e0e69a43076db7b2b1/src/main/scala/com/qubole/spark/datasources/hiveacid/rdd/Hive3Rdd.scala#L138
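For context, getJobConf at that line appears to follow the same pattern as Spark's own HadoopRDD: the driver broadcasts the Hadoop Configuration, and each executor unwraps it and wraps it in a JobConf. A rough sketch of that pattern (names are illustrative, not the exact spark-acid code) shows where a null unwrapped value turns into the NPE above:

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.mapred.JobConf
import org.apache.spark.SerializableWritable
import org.apache.spark.broadcast.Broadcast

def getJobConf(broadcastedConf: Broadcast[SerializableWritable[Configuration]]): JobConf = {
  // Runs on the executor: if the broadcast unwraps to null here,
  // Configuration's copy constructor dereferences it and throws the NPE seen above.
  val conf: Configuration = broadcastedConf.value.value
  new JobConf(conf)
}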

I even tried setting spark.hadoop.cloneConf, but it seems to break in both the if and the else branches. I also tried HDI 3.6 with Spark 2.3.0, but there was an issue with the Guava jar version being 24.1.1, so it threw the following error because the method was not found:

java.lang.NoSuchMethodError: com.google.common.collect.MapMaker.softValues()Lcom/google/common/collect/MapMaker;
  at com.qubole.spark.datasources.hiveacid.rdd.Cache$.<init>(Hive3Rdd.scala:53)
  at com.qubole.spark.datasources.hiveacid.rdd.Cache$.<clinit>(Hive3Rdd.scala)
  at com.qubole.spark.datasources.hiveacid.rdd.Hive3RDD.getJobConf(Hive3Rdd.scala:160)
  at com.qubole.spark.datasources.hiveacid.rdd.Hive3RDD.getPartitions(Hive3Rdd.scala:196)
  at org.apache.spark.rdd.RDD$$anonfun$partitions$2.apply(RDD.scala:253)
  at org.apache.spark.rdd.RDD$$anonfun$partitions$2.apply(RDD.scala:251)
  at scala.Option.getOrElse(Option.scala:121)
  at org.apache.spark.rdd.RDD.partitions(RDD.scala:251)
  at org.apache.spark.rdd.MapPartitionsRDD.getPartitions(MapPartitionsRDD.scala:46)
  at org.apache.spark.rdd.RDD$$anonfun$partitions$2.apply(RDD.scala:253)
  at org.apache.spark.rdd.RDD$$anonfun$partitions$2.apply(RDD.scala:251)
  at scala.Option.getOrElse(Option.scala:121)
  at org.apache.spark.rdd.RDD.partitions(RDD.scala:251)
  at org.apache.spark.rdd.MapPartitionsRDD.getPartitions(MapPartitionsRDD.scala:46)
  at org.apache.spark.rdd.RDD$$anonfun$partitions$2.apply(RDD.scala:253)
  at org.apache.spark.rdd.RDD$$anonfun$partitions$2.apply(RDD.scala:251)
  at scala.Option.getOrElse(Option.scala:121)
  at org.apache.spark.rdd.RDD.partitions(RDD.scala:251)
  at org.apache.spark.rdd.UnionRDD$$anonfun$1.apply(UnionRDD.scala:84)
  at org.apache.spark.rdd.UnionRDD$$anonfun$1.apply(UnionRDD.scala:84)
  at scala.collection.TraversableLike$$anonfun$map$1.apply(TraversableLike.scala:234)
  at scala.collection.TraversableLike$$anonfun$map$1.apply(TraversableLike.scala:234)
  at scala.collection.immutable.List.foreach(List.scala:381)
  at scala.collection.TraversableLike$class.map(TraversableLike.scala:234)
  at scala.collection.immutable.List.map(List.scala:285)
  at org.apache.spark.rdd.UnionRDD.getPartitions(UnionRDD.scala:84)
  at com.qubole.spark.datasources.hiveacid.rdd.AcidLockUnionRDD.getPartitions(AcidLockUnionRDD.scala:37)
  at org.apache.spark.rdd.RDD$$anonfun$partitions$2.apply(RDD.scala:253)
  at org.apache.spark.rdd.RDD$$anonfun$partitions$2.apply(RDD.scala:251)
  at scala.Option.getOrElse(Option.scala:121)
  at org.apache.spark.rdd.RDD.partitions(RDD.scala:251)
  at org.apache.spark.rdd.MapPartitionsRDD.getPartitions(MapPartitionsRDD.scala:46)
  at org.apache.spark.rdd.RDD$$anonfun$partitions$2.apply(RDD.scala:253)
  at org.apache.spark.rdd.RDD$$anonfun$partitions$2.apply(RDD.scala:251)
  at scala.Option.getOrElse(Option.scala:121)
  at org.apache.spark.rdd.RDD.partitions(RDD.scala:251)
  at org.apache.spark.rdd.MapPartitionsRDD.getPartitions(MapPartitionsRDD.scala:46)
  at org.apache.spark.rdd.RDD$$anonfun$partitions$2.apply(RDD.scala:253)
  at org.apache.spark.rdd.RDD$$anonfun$partitions$2.apply(RDD.scala:251)
  at scala.Option.getOrElse(Option.scala:121)
  at org.apache.spark.rdd.RDD.partitions(RDD.scala:251)
  at org.apache.spark.ShuffleDependency.<init>(Dependency.scala:91)
  at org.apache.spark.sql.execution.exchange.ShuffleExchangeExec$.prepareShuffleDependency(ShuffleExchangeExec.scala:322)
  at org.apache.spark.sql.execution.exchange.ShuffleExchangeExec.prepareShuffleDependency(ShuffleExchangeExec.scala:91)
  at org.apache.spark.sql.execution.exchange.ShuffleExchangeExec$$anonfun$doExecute$1.apply(ShuffleExchangeExec.scala:128)
  at org.apache.spark.sql.execution.exchange.ShuffleExchangeExec$$anonfun$doExecute$1.apply(ShuffleExchangeExec.scala:119)
  at org.apache.spark.sql.catalyst.errors.package$.attachTree(package.scala:52)
  at org.apache.spark.sql.execution.exchange.ShuffleExchangeExec.doExecute(ShuffleExchangeExec.scala:119)
  at org.apache.spark.sql.execution.SparkPlan$$anonfun$execute$1.apply(SparkPlan.scala:131)
  at org.apache.spark.sql.execution.SparkPlan$$anonfun$execute$1.apply(SparkPlan.scala:127)
  at org.apache.spark.sql.execution.SparkPlan$$anonfun$executeQuery$1.apply(SparkPlan.scala:155)
  at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
  at org.apache.spark.sql.execution.SparkPlan.executeQuery(SparkPlan.scala:152)
  at org.apache.spark.sql.execution.SparkPlan.execute(SparkPlan.scala:127)
  at org.apache.spark.sql.execution.InputAdapter.inputRDDs(WholeStageCodegenExec.scala:371)
  at org.apache.spark.sql.execution.aggregate.HashAggregateExec.inputRDDs(HashAggregateExec.scala:150)
  at org.apache.spark.sql.execution.WholeStageCodegenExec.doExecute(WholeStageCodegenExec.scala:605)
  at org.apache.spark.sql.execution.SparkPlan$$anonfun$execute$1.apply(SparkPlan.scala:131)
  at org.apache.spark.sql.execution.SparkPlan$$anonfun$execute$1.apply(SparkPlan.scala:127)
  at org.apache.spark.sql.execution.SparkPlan$$anonfun$executeQuery$1.apply(SparkPlan.scala:155)
  at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
  at org.apache.spark.sql.execution.SparkPlan.executeQuery(SparkPlan.scala:152)
  at org.apache.spark.sql.execution.SparkPlan.execute(SparkPlan.scala:127)
  at org.apache.spark.sql.execution.SparkPlan.getByteArrayRdd(SparkPlan.scala:247)
  at org.apache.spark.sql.execution.SparkPlan.executeCollect(SparkPlan.scala:294)
  at org.apache.spark.sql.Dataset$$anonfun$count$1.apply(Dataset.scala:2775)
  at org.apache.spark.sql.Dataset$$anonfun$count$1.apply(Dataset.scala:2774)
  at org.apache.spark.sql.Dataset$$anonfun$52.apply(Dataset.scala:3259)
  at org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:77)
  at org.apache.spark.sql.Dataset.withAction(Dataset.scala:3258)
  at org.apache.spark.sql.Dataset.count(Dataset.scala:2774)
  ... 49 elided

The error comes from this line --> https://github.com/qubole/spark-acid/blob/f445eeef4416ee27192905e0e69a43076db7b2b1/src/main/scala/com/qubole/spark/datasources/hiveacid/rdd/Hive3Rdd.scala#L51
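For reference, that line appears to build a soft-value map with Guava's MapMaker, the same caching pattern Spark's HadoopRDD uses for JobConfs; MapMaker.softValues() no longer exists in newer Guava releases, which is why class initialization fails against Guava 24.1.1. A sketch of the old pattern and a CacheBuilder-based equivalent that newer Guava still provides (illustrative only, not the exact spark-acid code):

import com.google.common.cache.CacheBuilder
import java.util.concurrent.ConcurrentMap

// Old pattern, which only compiles against older Guava (e.g. the 14.x that Spark 2.x bundles):
//   val cache = new com.google.common.collect.MapMaker().softValues().makeMap[String, AnyRef]()

// Equivalent via the CacheBuilder API, which Guava 24.1.1 still provides:
val cache: ConcurrentMap[String, AnyRef] =
  CacheBuilder.newBuilder().softValues().build[String, AnyRef]().asMap()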

Can you guys please help me with this..

Thanks and Regards, Vinay K L

amoghmargoor commented 4 years ago

Hi @vinay-kl, looks like you are using the old jar; there has been some major refactoring since then. Can you build the latest master and use its jar? Let me know if you need any help with that.

vinay-kl commented 4 years ago

@amoghmargoor sure, I am trying that and will post an update soon.

vinay-kl commented 4 years ago

@amoghmargoor the issue persists with the latest jar as well; however, if I change to the JobConf constructor with no arguments, there seems to be no issue. Are there any repercussions if it's done this way? This is the line: https://github.com/qubole/spark-acid/blob/2e66f76b3a3c974a73f03d9e80d9c66a0475a552/src/main/scala/com/qubole/spark/hiveacid/rdd/HiveAcidRDD.scala#L174

amoghmargoor commented 4 years ago

Hey @vinay-kl, I guess it is because the conf being passed as an argument to JobConf is null. That means the conf being passed at the following location will be lost, which is not desirable: https://github.com/qubole/spark-acid/blob/2e66f76b3a3c974a73f03d9e80d9c66a0475a552/src/main/scala/com/qubole/spark/hiveacid/reader/hive/HiveAcidReader.scala#L93 Can you check if readerOptions.hadoopConf is actually null at that HiveAcidReader location? Also, what's the stack trace now?
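For illustration, the check could be as simple as the following (a debugging sketch, assuming the readerOptions.hadoopConf name used at the linked line; not a proposed patch):

// Temporary sanity check at the HiveAcidReader location linked above.
require(readerOptions.hadoopConf != null,
  "readerOptions.hadoopConf is null before it reaches HiveAcidRDD / JobConf")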

vinay-kl commented 4 years ago

Hey @amoghmargoor, readerOptions.hadoopConf is not null, and the broadcast copy of it is also non-null. The configuration resources printed are: core-default.xml, core-site.xml, mapred-default.xml, mapred-site.xml, yarn-default.xml, yarn-site.xml, hdfs-default.xml, hdfs-site.xml, __spark_hadoop_conf__.xml, file:/etc/spark2/3.1.2.2-1/0/hive-site.xml.

I'm able to use the API properly only if the no-args constructor is used instead at src/main/scala/com/qubole/spark/hiveacid/rdd/HiveAcidRDD.scala line 174:

val newJobConf = new JobConf()
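For comparison, a defensive variant that keeps the broadcast conf when it is present and only falls back to the defaults when it is null (a sketch, assuming the surrounding variable is named conf as in the earlier Hive3Rdd snippet) would be:

val newJobConf = if (conf != null) new JobConf(conf) else new JobConf()

That way the Hadoop settings loaded on the driver are not silently dropped whenever the broadcast value does arrive intact.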

amoghmargoor commented 4 years ago

@vinay-kl The original issue which you created said that this value is null:

The value is coming as null in this line---> https://github.com/qubole/spark-acid/blob/f445eeef4416ee27192905e0e69a43076db7b2b1/src/main/scala/com/qubole/spark/datasources/hiveacid/rdd/Hive3Rdd.scala#L138

So, going by your last comment, I assume this problem is solved after the jar upgrade.

Now, if the broadcast conf is not null, what error do you get (with the stack trace)?

vinay-kl commented 4 years ago

@amoghmargoor my bad for not being clear on that. The issue still persists even after the jar upgrade, and the stack trace remains the same. I was only able to bypass it by using the no-args constructor, as mentioned above.

And warnings of this sort are thrown:

20/01/30 07:08:01 WARN HiveConf: HiveConf of name hive.enforce.sorting does not exist
20/01/30 07:08:01 WARN HiveConf: HiveConf of name hive.hook.proto.base-directory does not exist
20/01/30 07:08:01 WARN HiveConf: HiveConf of name hive.strict.managed.tables does not exist
20/01/30 07:08:01 WARN HiveConf: HiveConf of name hive.stats.fetch.partition.stats does not exist
20/01/30 07:08:01 WARN HiveConf: HiveConf of name hive.heapsize does not exist
20/01/30 07:08:01 WARN HiveConf: HiveConf of name hive.enforce.bucketing does not exist

All these config keys are present in hive-site.xml

amoghmargoor commented 4 years ago

But you just said the conf is not null, so it cannot be the same issue. Can you please paste the current stack trace even if you think it is the same?


vinay-kl commented 4 years ago

Print statements:

conf::::Configuration: core-default.xml, core-site.xml, mapred-default.xml, mapred-site.xml, yarn-default.xml, yarn-site.xml, hdfs-default.xml, hdfs-site.xml, __spark_hadoop_conf__.xml, file:/etc/spark2/3.1.2.2-1/0/hive-site.xml false
conf:::Configuration: core-default.xml, core-site.xml, mapred-default.xml, mapred-site.xml, yarn-default.xml, yarn-site.xml, hdfs-default.xml, hdfs-site.xml, __spark_hadoop_conf__.xml, file:/etc/spark2/3.1.2.2-1/0/hive-site.xml false
20/01/30 15:05:55 ERROR AcidUtils: Failed to get files with ID; using regular API: Only supported for DFS; got class org.apache.hadoop.fs.azure.NativeAzureFileSystem
conf::::Configuration: core-default.xml, core-site.xml, mapred-default.xml, mapred-site.xml, yarn-default.xml, yarn-site.xml, hdfs-default.xml, hdfs-site.xml, __spark_hadoop_conf__.xml, file:/etc/spark2/3.1.2.2-1/0/hive-site.xml false
conf:::Configuration: core-default.xml, core-site.xml, mapred-default.xml, mapred-site.xml, yarn-default.xml, yarn-site.xml, hdfs-default.xml, hdfs-site.xml, __spark_hadoop_conf__.xml, file:/etc/spark2/3.1.2.2-1/0/hive-site.xml false
20/01/30 15:05:55 ERROR AcidUtils: Failed to get files with ID; using regular API: Only supported for DFS; got class org.apache.hadoop.fs.azure.NativeAzureFileSystem
conf::::Configuration: core-default.xml, core-site.xml, mapred-default.xml, mapred-site.xml, yarn-default.xml, yarn-site.xml, hdfs-default.xml, hdfs-site.xml, __spark_hadoop_conf__.xml, file:/etc/spark2/3.1.2.2-1/0/hive-site.xml false
conf:::Configuration: core-default.xml, core-site.xml, mapred-default.xml, mapred-site.xml, yarn-default.xml, yarn-site.xml, hdfs-default.xml, hdfs-site.xml, __spark_hadoop_conf__.xml, file:/etc/spark2/3.1.2.2-1/0/hive-site.xml false

WARN TaskSetManager: Lost task 3.0 in stage 0.0 (TID 3, wn1-sp-hiv.3e3rqto3nr5evmsjbqz0pkrj4g.tx.internal.cloudapp.net, executor 2): java.lang.NullPointerException
  at org.apache.hadoop.conf.Configuration.<init>(Configuration.java:820)
  at org.apache.hadoop.mapred.JobConf.<init>(JobConf.java:440)
  at com.qubole.spark.hiveacid.rdd.HiveAcidRDD.getJobConf(HiveAcidRDD.scala:168)
  at com.qubole.spark.hiveacid.rdd.HiveAcidRDD$$anon$1.<init>(HiveAcidRDD.scala:262)
  at com.qubole.spark.hiveacid.rdd.HiveAcidRDD.compute(HiveAcidRDD.scala:258)
  at com.qubole.spark.hiveacid.rdd.HiveAcidRDD.compute(HiveAcidRDD.scala:84)
  at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:324)
  at org.apache.spark.rdd.RDD.iterator(RDD.scala:288)
  at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
  at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:324)
  at org.apache.spark.rdd.RDD.iterator(RDD.scala:288)
  at org.apache.spark.rdd.UnionRDD.compute(UnionRDD.scala:105)
  at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:324)
  at org.apache.spark.rdd.RDD.iterator(RDD.scala:288)
  at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
  at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:324)
  at org.apache.spark.rdd.RDD.iterator(RDD.scala:288)
  at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
  at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:324)
  at org.apache.spark.rdd.RDD.iterator(RDD.scala:288)
  at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:99)
  at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:55)
  at org.apache.spark.scheduler.Task.run(Task.scala:121)
  at org.apache.spark.executor.Executor$TaskRunner$$anonfun$10.apply(Executor.scala:402)
  at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1360)
  at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:408)
  at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
  at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
  at java.lang.Thread.run(Thread.java:748)

[Stage 0:> (0 + 6) / 7]
20/01/30 15:05:57 ERROR TaskSetManager: Task 1 in stage 0.0 failed 4 times; aborting job

org.apache.spark.SparkException: Job aborted due to stage failure: Task 1 in stage 0.0 failed 4 times, most recent failure: Lost task 1.3 in stage 0.0 (TID 18, wn4-sp-hiv.3e3rqto3nr5evmsjbqz0pkrj4g.tx.internal.cloudapp.net, executor 3): java.lang.NullPointerException
  at org.apache.hadoop.conf.Configuration.<init>(Configuration.java:820)
  at org.apache.hadoop.mapred.JobConf.<init>(JobConf.java:440)
  at com.qubole.spark.hiveacid.rdd.HiveAcidRDD.getJobConf(HiveAcidRDD.scala:168)
  at com.qubole.spark.hiveacid.rdd.HiveAcidRDD$$anon$1.<init>(HiveAcidRDD.scala:262)
  at com.qubole.spark.hiveacid.rdd.HiveAcidRDD.compute(HiveAcidRDD.scala:258)
  at com.qubole.spark.hiveacid.rdd.HiveAcidRDD.compute(HiveAcidRDD.scala:84)
  at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:324)
  at org.apache.spark.rdd.RDD.iterator(RDD.scala:288)
  at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
  at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:324)
  at org.apache.spark.rdd.RDD.iterator(RDD.scala:288)
  at org.apache.spark.rdd.UnionRDD.compute(UnionRDD.scala:105)
  at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:324)
  at org.apache.spark.rdd.RDD.iterator(RDD.scala:288)
  at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
  at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:324)
  at org.apache.spark.rdd.RDD.iterator(RDD.scala:288)
  at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
  at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:324)
  at org.apache.spark.rdd.RDD.iterator(RDD.scala:288)
  at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:99)
  at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:55)
  at org.apache.spark.scheduler.Task.run(Task.scala:121)
  at org.apache.spark.executor.Executor$TaskRunner$$anonfun$10.apply(Executor.scala:402)
  at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1360)
  at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:408)
  at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
  at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
  at java.lang.Thread.run(Thread.java:748)

Driver stacktrace:
  at org.apache.spark.scheduler.DAGScheduler.org$apache$spark$scheduler$DAGScheduler$$failJobAndIndependentStages(DAGScheduler.scala:1889)
  at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1877)
  at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1876)
  at scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59)
  at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:48)
  at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:1876)
  at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:926)
  at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:926)
  at scala.Option.foreach(Option.scala:257)
  at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:926)
  at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:2110)
  at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2059)
  at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2048)
  at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:49)
  at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:737)
  at org.apache.spark.SparkContext.runJob(SparkContext.scala:2061)
  at org.apache.spark.SparkContext.runJob(SparkContext.scala:2082)
  at org.apache.spark.SparkContext.runJob(SparkContext.scala:2101)
  at org.apache.spark.SparkContext.runJob(SparkContext.scala:2126)
  at org.apache.spark.rdd.RDD$$anonfun$collect$1.apply(RDD.scala:945)
  at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
  at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
  at org.apache.spark.rdd.RDD.withScope(RDD.scala:363)
  at org.apache.spark.rdd.RDD.collect(RDD.scala:944)
  at org.apache.spark.sql.execution.SparkPlan.executeCollect(SparkPlan.scala:299)
  at org.apache.spark.sql.Dataset$$anonfun$count$1.apply(Dataset.scala:2830)
  at org.apache.spark.sql.Dataset$$anonfun$count$1.apply(Dataset.scala:2829)
  at org.apache.spark.sql.Dataset$$anonfun$53.apply(Dataset.scala:3364)
  at org.apache.spark.sql.execution.SQLExecution$$anonfun$withNewExecutionId$1.apply(SQLExecution.scala:78)
  at org.apache.spark.sql.execution.SQLExecution$.withSQLConfPropagated(SQLExecution.scala:125)
  at org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:73)
  at org.apache.spark.sql.Dataset.withAction(Dataset.scala:3363)
  at org.apache.spark.sql.Dataset.count(Dataset.scala:2829)
  ... 49 elided
Caused by: java.lang.NullPointerException
  at org.apache.hadoop.conf.Configuration.<init>(Configuration.java:820)
  at org.apache.hadoop.mapred.JobConf.<init>(JobConf.java:440)
  at com.qubole.spark.hiveacid.rdd.HiveAcidRDD.getJobConf(HiveAcidRDD.scala:168)
  at com.qubole.spark.hiveacid.rdd.HiveAcidRDD$$anon$1.<init>(HiveAcidRDD.scala:262)
  at com.qubole.spark.hiveacid.rdd.HiveAcidRDD.compute(HiveAcidRDD.scala:258)
  at com.qubole.spark.hiveacid.rdd.HiveAcidRDD.compute(HiveAcidRDD.scala:84)
  at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:324)
  at org.apache.spark.rdd.RDD.iterator(RDD.scala:288)
  at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
  at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:324)
  at org.apache.spark.rdd.RDD.iterator(RDD.scala:288)
  at org.apache.spark.rdd.UnionRDD.compute(UnionRDD.scala:105)
  at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:324)
  at org.apache.spark.rdd.RDD.iterator(RDD.scala:288)
  at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
  at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:324)
  at org.apache.spark.rdd.RDD.iterator(RDD.scala:288)
  at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
  at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:324)
  at org.apache.spark.rdd.RDD.iterator(RDD.scala:288)
  at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:99)
  at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:55)
  at org.apache.spark.scheduler.Task.run(Task.scala:121)
  at org.apache.spark.executor.Executor$TaskRunner$$anonfun$10.apply(Executor.scala:402)
  at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1360)
  at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:408)
  at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
  at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
  at java.lang.Thread.run(Thread.java:748)
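Since the conf is non-null on the driver but the NPE fires inside getJobConf on the executors, one way to narrow this down is to check whether a Hadoop Configuration survives the broadcast/deserialization path on this cluster at all. A minimal diagnostic sketch for spark-shell (it uses Spark's public SerializableWritable wrapper; this is an assumption about the failure mode, not spark-acid's exact mechanism):

import org.apache.spark.SerializableWritable

val hconf = spark.sparkContext.hadoopConfiguration
val bc = spark.sparkContext.broadcast(new SerializableWritable(hconf))

spark.sparkContext.parallelize(1 to 2, 2).map { _ =>
  // Unwrap on the executor, roughly where the RDD would build its JobConf.
  val c = bc.value.value
  if (c == null) "conf is null on executor" else s"ok on executor, ${c.size()} entries"
}.collect().foreach(println)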