rapidsai / spark-examples

[ARCHIVED] Moved to github.com/NVIDIA/spark-xgboost-examples
https://github.com/NVIDIA/spark-xgboost-examples
Apache License 2.0

Training: ExceptionFailure #85

[Open] liuandy-good opened this issue 4 years ago

liuandy-good commented 4 years ago

2020-05-30 19:33:31 WARN TaskSetManager:66 - Lost task 0.0 in stage 0.0 (TID 0, 192.168.1.200, executor 0): java.lang.UnsatisfiedLinkError: ai.rapids.cudf.Table.gdfReadCSV([Ljava/lang/String;[Ljava/lang/String;[Ljava/lang/String;Ljava/lang/String;JJIBBB[Ljava/lang/String;[Ljava/lang/String;[Ljava/lang/String;)[J
    at ai.rapids.cudf.Table.gdfReadCSV(Native Method)
    at ai.rapids.cudf.Table.readCSV(Table.java:314)
    at ml.dmlc.xgboost4j.scala.spark.rapids.CSVPartitionReader.readToTable(GpuCSVScan.scala:214)
    at ml.dmlc.xgboost4j.scala.spark.rapids.CSVPartitionReader.readBatch(GpuCSVScan.scala:194)
    at ml.dmlc.xgboost4j.scala.spark.rapids.CSVPartitionReader.next(GpuCSVScan.scala:230)
    at ml.dmlc.xgboost4j.scala.spark.rapids.PartitionedFileReader.next(FilePartitionReaderFactory.scala:41)
    at ml.dmlc.xgboost4j.scala.spark.rapids.FilePartitionReader.next(FilePartitionReader.scala:69)
    at ml.dmlc.xgboost4j.scala.spark.rapids.GpuDatasetRDD$$anon$1.hasNext(GpuDatasetRDD.scala:51)
    at ml.dmlc.xgboost4j.scala.spark.Watches$.ml$dmlc$xgboost4j$scala$spark$Watches$$buildDMatrixIncrementally(XGBoost.scala:966)
    at ml.dmlc.xgboost4j.scala.spark.Watches$$anonfun$42.apply(XGBoost.scala:1026)
    at ml.dmlc.xgboost4j.scala.spark.Watches$$anonfun$42.apply(XGBoost.scala:1026)
    at ml.dmlc.xgboost4j.scala.spark.rapids.GpuDataset$.time(GpuDataset.scala:508)
    at ml.dmlc.xgboost4j.scala.spark.Watches$.buildWatches(XGBoost.scala:1025)
    at ml.dmlc.xgboost4j.scala.spark.XGBoost$$anonfun$ml$dmlc$xgboost4j$scala$spark$XGBoost$$trainForGpuDataset$1.apply(XGBoost.scala:628)
    at ml.dmlc.xgboost4j.scala.spark.XGBoost$$anonfun$ml$dmlc$xgboost4j$scala$spark$XGBoost$$trainForGpuDataset$1.apply(XGBoost.scala:625)
    at ml.dmlc.xgboost4j.scala.spark.rapids.GpuDataset$$anonfun$ml$dmlc$xgboost4j$scala$spark$rapids$GpuDataset$$getBatchMapper$1.apply(GpuDataset.scala:516)
    at ml.dmlc.xgboost4j.scala.spark.rapids.GpuDataset$$anonfun$ml$dmlc$xgboost4j$scala$spark$rapids$GpuDataset$$getBatchMapper$1.apply(GpuDataset.scala:515)
    at org.apache.spark.rdd.RDD$$anonfun$mapPartitions$1$$anonfun$apply$23.apply(RDD.scala:801)
    at org.apache.spark.rdd.RDD$$anonfun$mapPartitions$1$$anonfun$apply$23.apply(RDD.scala:801)
    at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:49)
    at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:324)
    at org.apache.spark.rdd.RDD$$anonfun$7.apply(RDD.scala:337)
    at org.apache.spark.rdd.RDD$$anonfun$7.apply(RDD.scala:335)
    at org.apache.spark.storage.BlockManager$$anonfun$doPutIterator$1.apply(BlockManager.scala:1094)
    at org.apache.spark.storage.BlockManager$$anonfun$doPutIterator$1.apply(BlockManager.scala:1085)
    at org.apache.spark.storage.BlockManager.doPut(BlockManager.scala:1020)
    at org.apache.spark.storage.BlockManager.doPutIterator(BlockManager.scala:1085)
    at org.apache.spark.storage.BlockManager.getOrElseUpdate(BlockManager.scala:811)
    at org.apache.spark.rdd.RDD.getOrCompute(RDD.scala:335)
    at org.apache.spark.rdd.RDD.iterator(RDD.scala:286)
    at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:87)
    at org.apache.spark.scheduler.Task.run(Task.scala:109)
    at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:345)
    at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
    at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
    at java.lang.Thread.run(Thread.java:748)

2020-05-30 19:33:31 INFO TaskSetManager:54 - Starting task 0.1 in stage 0.0 (TID 1, 192.168.1.200, executor 0, partition 0, ANY, 8496 bytes)
2020-05-30 19:33:31 ERROR XGBoostTaskFailedListener:178 - Training Task Failed during XGBoost Training: ExceptionFailure(java.lang.UnsatisfiedLinkError,ai.rapids.cudf.Table.gdfReadCSV([Ljava/lang/String;[Ljava/lang/String;[Ljava/lang/String;Ljava/lang/String;JJIBBB[Ljava/lang/String;[Ljava/lang/String;[Ljava/lang/String;)[J,[Ljava.lang.StackTraceElement;@6c432b9b,java.lang.UnsatisfiedLinkError: ai.rapids.cudf.Table.gdfReadCSV([Ljava/lang/String;[Ljava/lang/String;[Ljava/lang/String;Ljava/lang/String;JJIBBB[Ljava/lang/String;[Ljava/lang/String;[Ljava/lang/String;)[J [stack trace identical to the one above],Some(org.apache.spark.ThrowableSerializationWrapper@6284c863),Vector(AccumulableInfo(3,None,Some(5960),None,false,true,None), AccumulableInfo(5,None,Some(0),None,false,true,None), AccumulableInfo(6,None,Some(57),None,false,true,None)),Vector(LongAccumulator(id: 3, name: Some(internal.metrics.executorRunTime), value: 5960), LongAccumulator(id: 5, name: Some(internal.metrics.resultSize), value: 0), LongAccumulator(id: 6, name: Some(internal.metrics.jvmGCTime), value: 57))), stopping SparkContext
2020-05-30 19:33:31 INFO AbstractConnector:318 - Stopped Spark@541733dc{HTTP/1.1,[http/1.1]}{0.0.0.0:4040}
2020-05-30 19:33:31 INFO SparkUI:54 - Stopped Spark web UI at http://ecs-gpu-dl:4040
2020-05-30 19:33:31 INFO DAGScheduler:54 - Job 0 failed: foreachPartition at XGBoost.scala:686, took 7.389169 s
2020-05-30 19:33:31 INFO DAGScheduler:54 - ResultStage 0 (foreachPartition at XGBoost.scala:686) failed in 7.356 s due to Stage cancelled because SparkContext was shut down
2020-05-30 19:33:31 ERROR RabitTracker:91 - Uncaught exception thrown by worker: org.apache.spark.SparkException: Job 0 cancelled because SparkContext was shut down
    at org.apache.spark.scheduler.DAGScheduler$$anonfun$cleanUpAfterSchedulerStop$1.apply(DAGScheduler.scala:837)
    at org.apache.spark.scheduler.DAGScheduler$$anonfun$cleanUpAfterSchedulerStop$1.apply(DAGScheduler.scala:835)
    at scala.collection.mutable.HashSet.foreach(HashSet.scala:78)
    at org.apache.spark.scheduler.DAGScheduler.cleanUpAfterSchedulerStop(DAGScheduler.scala:835)
    at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onStop(DAGScheduler.scala:1900)
    at org.apache.spark.util.EventLoop.stop(EventLoop.scala:83)
    at org.apache.spark.scheduler.DAGScheduler.stop(DAGScheduler.scala:1813)
    at org.apache.spark.SparkContext$$anonfun$stop$8.apply$mcV$sp(SparkContext.scala:1931)
    at org.apache.spark.util.Utils$.tryLogNonFatalError(Utils.scala:1361)
    at org.apache.spark.SparkContext.stop(SparkContext.scala:1930)
    at org.apache.spark.TaskFailedListener$$anon$1$$anonfun$run$1.apply$mcV$sp(SparkParallelismTracker.scala:197)
    at org.apache.spark.TaskFailedListener$$anon$1$$anonfun$run$1.apply(SparkParallelismTracker.scala:197)
    at org.apache.spark.TaskFailedListener$$anon$1$$anonfun$run$1.apply(SparkParallelismTracker.scala:197)
    at scala.util.DynamicVariable.withValue(DynamicVariable.scala:58)
    at org.apache.spark.TaskFailedListener$$anon$1.run(SparkParallelismTracker.scala:196)
    at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:642)
    at org.apache.spark.SparkContext.runJob(SparkContext.scala:2034)
    at org.apache.spark.SparkContext.runJob(SparkContext.scala:2055)
    at org.apache.spark.SparkContext.runJob(SparkContext.scala:2074)
    at org.apache.spark.SparkContext.runJob(SparkContext.scala:2099)
    at org.apache.spark.rdd.RDD$$anonfun$foreachPartition$1.apply(RDD.scala:935)
    at org.apache.spark.rdd.RDD$$anonfun$foreachPartition$1.apply(RDD.scala:933)
    at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
    at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
    at org.apache.spark.rdd.RDD.withScope(RDD.scala:363)
    at org.apache.spark.rdd.RDD.foreachPartition(RDD.scala:933)
    at ml.dmlc.xgboost4j.scala.spark.XGBoost$$anonfun$trainDistributedForGpuDataset$1$$anon$1.run(XGBoost.scala:686)
2020-05-30 19:33:31 INFO StandaloneSchedulerBackend:54 - Shutting down all executors
2020-05-30 19:33:31 INFO CoarseGrainedSchedu
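The UnsatisfiedLinkError is raised when the JVM binds ai.rapids.cudf.Table.gdfReadCSV to its native implementation, so it can be reproduced without Spark. A minimal sketch to isolate the problem, assuming a cudf build whose Java API exposes ai.rapids.cudf.NativeDepsLoader.loadNativeDeps() and ai.rapids.cudf.Cuda.getRuntimeVersion() (verify both against your cudf jar):

// Run on an executor host with the same cudf jar on the classpath. It fails
// immediately (e.g. with an UnsatisfiedLinkError or an IOException from the
// loader) if the bundled libcudf does not match the installed CUDA runtime.
import ai.rapids.cudf.{Cuda, NativeDepsLoader}

object CudfSanityCheck {
  def main(args: Array[String]): Unit = {
    // Forces the native libraries to load up front instead of at first use.
    NativeDepsLoader.loadNativeDeps()
    // cudaRuntimeGetVersion encodes the version as 1000*major + 10*minor,
    // e.g. 10010 for CUDA 10.1.
    println(s"cudf native deps loaded; CUDA runtime version: ${Cuda.getRuntimeVersion()}")
  }
}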

firestarman commented 4 years ago

This java.lang.UnsatisfiedLinkError on ai.rapids.cudf.Table.gdfReadCSV usually means the cudf native library could not be loaded for the CUDA runtime installed on the executors. Please make sure the cudf jar you use matches your local CUDA runtime version:

cudf-<ver>.jar           -> CUDA 9.2
cudf-<ver>-cuda10.jar    -> CUDA 10.0
cudf-<ver>-cuda10-1.jar  -> CUDA 10.1
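If you build with sbt, the CUDA variant is selected via the Maven classifier on the ai.rapids:cudf artifact. A minimal sketch; the 0.9.2 version number is illustrative, not taken from this thread, so use the cudf release your xgboost4j jars were built against:

// build.sbt: pick the cudf classifier matching the executors' CUDA runtime
libraryDependencies += "ai.rapids" % "cudf" % "0.9.2"                             // CUDA 9.2 (no classifier)
// libraryDependencies += "ai.rapids" % "cudf" % "0.9.2" classifier "cuda10"      // CUDA 10.0
// libraryDependencies += "ai.rapids" % "cudf" % "0.9.2" classifier "cuda10-1"    // CUDA 10.1

The same rule applies to a jar passed via --jars on spark-submit: check the CUDA runtime on each executor host (for example with nvidia-smi) and ship the matching cudf-<ver>[-cudaXX].jar.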