Stratio / Spark-MongoDB

Spark library for easy MongoDB access
http://www.stratio.com
Apache License 2.0

Exception in thread "main" org.apache.spark.SparkException: Job aborted due to stage failure: Task 0 in stage 0.0 failed 1 times #182

[Open] odigetti opened this issue 6 years ago

odigetti commented 6 years ago

I'm working on a Spark ML sentiment-analysis platform that uses a mobile feedback app as its data source. The data (feedback) is stored in MongoDB, and I want to read it with Spark. I tried to connect Spark to MongoDB, but with no luck; I keep getting this error:

Exception in thread "main" org.apache.spark.SparkException: Job aborted due to stage failure: Task 0 in stage 0.0 failed 1 times, most recent failure: Lost task 0.0 in stage 0.0 (TID 0, localhost, executor driver): java.lang.NoSuchMethodError: org.apache.spark.sql.catalyst.analysis.TypeCoercion$.findTightestCommonTypeOfTwo()Lscala/Function2;
    at com.stratio.datasource.mongodb.schema.MongodbSchema.com$stratio$datasource$mongodb$schema$MongodbSchema$$compatibleType(MongodbSchema.scala:85)
    at com.stratio.datasource.mongodb.schema.MongodbSchema$$anonfun$3.apply(MongodbSchema.scala:46)
    at com.stratio.datasource.mongodb.schema.MongodbSchema$$anonfun$3.apply(MongodbSchema.scala:46)
    at org.apache.spark.util.collection.ExternalSorter$$anonfun$5.apply(ExternalSorter.scala:189)
    at org.apache.spark.util.collection.ExternalSorter$$anonfun$5.apply(ExternalSorter.scala:188)
    at org.apache.spark.util.collection.AppendOnlyMap.changeValue(AppendOnlyMap.scala:150)
    at org.apache.spark.util.collection.SizeTrackingAppendOnlyMap.changeValue(SizeTrackingAppendOnlyMap.scala:32)
    at org.apache.spark.util.collection.ExternalSorter.insertAll(ExternalSorter.scala:194)
    at org.apache.spark.shuffle.sort.SortShuffleWriter.write(SortShuffleWriter.scala:63)
    at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:96)
    at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:53)
    at org.apache.spark.scheduler.Task.run(Task.scala:109)
    at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:345)
    at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
    at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
    at java.lang.Thread.run(Thread.java:748)

Driver stacktrace:
    at org.apache.spark.scheduler.DAGScheduler.org$apache$spark$scheduler$DAGScheduler$$failJobAndIndependentStages(DAGScheduler.scala:1599)
    at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1587)
    at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1586)
    at scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59)
    at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:48)
    at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:1586)
    at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:831)
    at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:831)
    at scala.Option.foreach(Option.scala:257)
    at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:831)
    at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:1820)
    at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1769)
    at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1758)
    at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:48)
    at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:642)
    at org.apache.spark.SparkContext.runJob(SparkContext.scala:2027)
    at org.apache.spark.SparkContext.runJob(SparkContext.scala:2124)
    at org.apache.spark.rdd.RDD$$anonfun$aggregate$1.apply(RDD.scala:1118)
    at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
    at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
    at org.apache.spark.rdd.RDD.withScope(RDD.scala:363)
    at org.apache.spark.rdd.RDD.aggregate(RDD.scala:1111)
    at com.stratio.datasource.mongodb.schema.MongodbSchema.schema(MongodbSchema.scala:46)
    at com.stratio.datasource.mongodb.MongodbRelation.com$stratio$datasource$mongodb$MongodbRelation$$lazySchema$lzycompute(MongodbRelation.scala:63)
    at com.stratio.datasource.mongodb.MongodbRelation.com$stratio$datasource$mongodb$MongodbRelation$$lazySchema(MongodbRelation.scala:60)
    at com.stratio.datasource.mongodb.MongodbRelation$$anonfun$1.apply(MongodbRelation.scala:65)
    at com.stratio.datasource.mongodb.MongodbRelation$$anonfun$1.apply(MongodbRelation.scala:65)
    at scala.Option.getOrElse(Option.scala:121)
    at com.stratio.datasource.mongodb.MongodbRelation.<init>(MongodbRelation.scala:65)
    at com.stratio.datasource.mongodb.DefaultSource.createRelation(DefaultSource.scala:36)
    at org.apache.spark.sql.execution.datasources.DataSource.resolveRelation(DataSource.scala:340)
    at org.apache.spark.sql.DataFrameReader.loadV1Source(DataFrameReader.scala:239)
    at org.apache.spark.sql.DataFrameReader.load(DataFrameReader.scala:227)
    at org.apache.spark.sql.DataFrameReader.load(DataFrameReader.scala:164)
    at com.spml.MongoDbSparkCon.main(MongoDbSparkCon.java:36)
Caused by: java.lang.NoSuchMethodError: org.apache.spark.sql.catalyst.analysis.TypeCoercion$.findTightestCommonTypeOfTwo()Lscala/Function2;
    at com.stratio.datasource.mongodb.schema.MongodbSchema.com$stratio$datasource$mongodb$schema$MongodbSchema$$compatibleType(MongodbSchema.scala:85)
    at com.stratio.datasource.mongodb.schema.MongodbSchema$$anonfun$3.apply(MongodbSchema.scala:46)
    at com.stratio.datasource.mongodb.schema.MongodbSchema$$anonfun$3.apply(MongodbSchema.scala:46)
    at org.apache.spark.util.collection.ExternalSorter$$anonfun$5.apply(ExternalSorter.scala:189)
    at org.apache.spark.util.collection.ExternalSorter$$anonfun$5.apply(ExternalSorter.scala:188)
    at org.apache.spark.util.collection.AppendOnlyMap.changeValue(AppendOnlyMap.scala:150)
    at org.apache.spark.util.collection.SizeTrackingAppendOnlyMap.changeValue(SizeTrackingAppendOnlyMap.scala:32)
    at org.apache.spark.util.collection.ExternalSorter.insertAll(ExternalSorter.scala:194)
    at org.apache.spark.shuffle.sort.SortShuffleWriter.write(SortShuffleWriter.scala:63)
    at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:96)
    at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:53)
    at org.apache.spark.scheduler.Task.run(Task.scala:109)
    at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:345)
    at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
    at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
    at java.lang.Thread.run(Thread.java:748)
2018-05-28 18:54:24 INFO DAGScheduler:54 - Job 0 failed: aggregate at MongodbSchema.scala:46, took 0,855790 s

My Spark code (I'm using Java):

import java.util.HashMap;
import java.util.Map;

import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SQLContext;

public class MongoDbSparkCon {

    public static void main(String[] args) {

        // Local Spark context for testing
        JavaSparkContext sc = new JavaSparkContext("local[*]", "test spark-mongodb java");
        SQLContext sqlContext = new SQLContext(sc);

        // Connection options for the Stratio spark-mongodb data source
        Map<String, String> options = new HashMap<>();
        options.put("host", "localhost:27017");
        options.put("database", "feedbackuib");
        options.put("collection", "Feedback");

        Dataset<Row> df = sqlContext.read().format("com.stratio.datasource.mongodb").options(options).load();

        // createOrReplaceTempView replaces the deprecated registerTempTable
        df.createOrReplaceTempView("Feedback");
        sqlContext.sql("SELECT * FROM Feedback");
        df.show();
    }
}
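
For comparison, the same read can also go through the official MongoDB Spark connector (mongo-spark-connector, which already appears in the pom below). This is a minimal sketch only, assuming a connector artifact matching the Scala and Spark versions on the classpath; the host, database, and collection names are taken from the code above:

import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SparkSession;

public class MongoSparkConnectorSketch {

    public static void main(String[] args) {
        // spark.mongodb.input.uri selects database "feedbackuib" and
        // collection "Feedback" on the local mongod.
        SparkSession spark = SparkSession.builder()
                .master("local[*]")
                .appName("mongo-spark-connector test")
                .config("spark.mongodb.input.uri",
                        "mongodb://localhost:27017/feedbackuib.Feedback")
                .getOrCreate();

        // The official connector registers itself under this data source name.
        Dataset<Row> df = spark.read()
                .format("com.mongodb.spark.sql.DefaultSource")
                .load();

        df.createOrReplaceTempView("Feedback");
        spark.sql("SELECT * FROM Feedback").show();
    }
}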

pom.xml dependencies (groupId:artifactId:version):

org.apache.spark:spark-core_2.11:2.2.0
org.apache.spark:spark-mllib_2.11:2.3.0
org.apache.spark:spark-sql_2.11:2.2.0
org.apache.spark:spark-streaming_2.11:2.3.0 (scope: provided)
org.mongodb.spark:mongo-spark-connector_2.10:2.2.2
com.stratio.datasource:spark-mongodb_2.11:0.12.0
org.mongodb:mongo-java-driver:3.6.3
org.mongodb:casbah-core_2.11:3.1.1
org.mongodb:casbah-commons_2.11:3.1.1
org.mongodb:casbah-query_2.11:3.1.1
org.scala-lang:scala-library:2.11.0
com.typesafe.akka:akka-actor_2.11:2.5.12
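
For what it's worth, a NoSuchMethodError like the one above usually points at a binary mismatch: TypeCoercion.findTightestCommonTypeOfTwo appears to have been removed around Spark 2.2, while spark-mongodb 0.12.0 was compiled against an older Spark; the list above also mixes Spark 2.2.0 and 2.3.0 artifacts, and a Scala 2.10 connector (mongo-spark-connector_2.10) among otherwise _2.11 dependencies. A sketch of what an aligned dependency block could look like, assuming Spark 2.2.0 and Scala 2.11 throughout (versions are illustrative, not a confirmed fix):

<!-- Sketch only: all Spark artifacts pinned to one version, all Scala
     suffixes pinned to _2.11; adjust versions to the cluster in use. -->
<dependency>
    <groupId>org.apache.spark</groupId>
    <artifactId>spark-core_2.11</artifactId>
    <version>2.2.0</version>
</dependency>
<dependency>
    <groupId>org.apache.spark</groupId>
    <artifactId>spark-sql_2.11</artifactId>
    <version>2.2.0</version>
</dependency>
<dependency>
    <groupId>org.mongodb.spark</groupId>
    <artifactId>mongo-spark-connector_2.11</artifactId>
    <version>2.2.2</version>
</dependency>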

Any help would be appreciated!