Job aborted due to stage failure: Task 0 in stage 9.0 failed 1 times, most recent failure: Lost task 0.0 in stage 9.0 (TID 44, localhost, executor driver): java.util.concurrent.ExecutionException: Layer info: TFTrainingHelper[44456754]/TFNet[5a094281]
java.lang.IllegalArgumentException: Incompatible shapes: [0] vs. [3]
[[Node: sparse_softmax_cross_entropy_loss/xentropy/assert_equal/Equal = Equal[T=DT_INT32, _device="/job:localhost/replica:0/task:0/device:CPU:0"](sparse_softmax_cross_entropy_loss/xentropy/Shape_1, sparse_softmax_cross_entropy_loss/xentropy/strided_slice)]]
at org.tensorflow.Session.run(Native Method)
at org.tensorflow.Session.access$100(Session.java:48)
at org.tensorflow.Session$Runner.runHelper(Session.java:298)
at org.tensorflow.Session$Runner.run(Session.java:248)
at com.intel.analytics.zoo.pipeline.api.net.TFNet.updateOutput(TFNet.scala:252)
at com.intel.analytics.bigdl.nn.abstractnn.AbstractModule.forward(AbstractModule.scala:257)
at com.intel.analytics.zoo.pipeline.api.net.TFTrainingHelper.updateOutput(TFTrainingHelper.scala:100)
at com.intel.analytics.bigdl.nn.abstractnn.AbstractModule.forward(AbstractModule.scala:257)
at com.intel.analytics.bigdl.optim.DistriOptimizer$$anonfun$4$$anonfun$6$$anonfun$apply$2.apply$mcI$sp(DistriOptimizer.scala:252)
at com.intel.analytics.bigdl.optim.DistriOptimizer$$anonfun$4$$anonfun$6$$anonfun$apply$2.apply(DistriOptimizer.scala:245)
at com.intel.analytics.bigdl.optim.DistriOptimizer$$anonfun$4$$anonfun$6$$anonfun$apply$2.apply(DistriOptimizer.scala:245)
at com.intel.analytics.bigdl.utils.ThreadPool$$anonfun$1$$anon$4.call(ThreadPool.scala:112)
at java.util.concurrent.FutureTask.run(FutureTask.java:266)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
at java.lang.Thread.run(Thread.java:748)
at java.util.concurrent.FutureTask.report(FutureTask.java:122)
at java.util.concurrent.FutureTask.get(FutureTask.java:192)
at com.intel.analytics.bigdl.optim.DistriOptimizer$$anonfun$4$$anonfun$8.apply(DistriOptimizer.scala:264)
at com.intel.analytics.bigdl.optim.DistriOptimizer$$anonfun$4$$anonfun$8.apply(DistriOptimizer.scala:264)
at scala.collection.TraversableLike$$anonfun$map$1.apply(TraversableLike.scala:234)
at scala.collection.TraversableLike$$anonfun$map$1.apply(TraversableLike.scala:234)
at scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59)
at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:48)
at scala.collection.TraversableLike$class.map(TraversableLike.scala:234)
at scala.collection.AbstractTraversable.map(Traversable.scala:104)
at com.intel.analytics.bigdl.optim.DistriOptimizer$$anonfun$4.apply(DistriOptimizer.scala:264)
at com.intel.analytics.bigdl.optim.DistriOptimizer$$anonfun$4.apply(DistriOptimizer.scala:202)
at org.apache.spark.rdd.ZippedPartitionsRDD2.compute(ZippedPartitionsRDD.scala:89)
at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:324)
at org.apache.spark.rdd.RDD.iterator(RDD.scala:288)
at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:87)
at org.apache.spark.scheduler.Task.run(Task.scala:109)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:345)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
at java.lang.Thread.run(Thread.java:748)
Caused by: Layer info: TFTrainingHelper[44456754]/TFNet[5a094281]
java.lang.IllegalArgumentException: Incompatible shapes: [0] vs. [3]
[[Node: sparse_softmax_cross_entropy_loss/xentropy/assert_equal/Equal = Equal[T=DT_INT32, _device="/job:localhost/replica:0/task:0/device:CPU:0"](sparse_softmax_cross_entropy_loss/xentropy/Shape_1, sparse_softmax_cross_entropy_loss/xentropy/strided_slice)]]
at org.tensorflow.Session.run(Native Method)
at org.tensorflow.Session.access$100(Session.java:48)
at org.tensorflow.Session$Runner.runHelper(Session.java:298)
at org.tensorflow.Session$Runner.run(Session.java:248)
at com.intel.analytics.zoo.pipeline.api.net.TFNet.updateOutput(TFNet.scala:252)
at com.intel.analytics.bigdl.nn.abstractnn.AbstractModule.forward(AbstractModule.scala:257)
at com.intel.analytics.zoo.pipeline.api.net.TFTrainingHelper.updateOutput(TFTrainingHelper.scala:100)
at com.intel.analytics.bigdl.nn.abstractnn.AbstractModule.forward(AbstractModule.scala:257)
at com.intel.analytics.bigdl.optim.DistriOptimizer$$anonfun$4$$anonfun$6$$anonfun$apply$2.apply$mcI$sp(DistriOptimizer.scala:252)
at com.intel.analytics.bigdl.optim.DistriOptimizer$$anonfun$4$$anonfun$6$$anonfun$apply$2.apply(DistriOptimizer.scala:245)
at com.intel.analytics.bigdl.optim.DistriOptimizer$$anonfun$4$$anonfun$6$$anonfun$apply$2.apply(DistriOptimizer.scala:245)
at com.intel.analytics.bigdl.utils.ThreadPool$$anonfun$1$$anon$4.call(ThreadPool.scala:112)
at java.util.concurrent.FutureTask.run(FutureTask.java:266)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
at java.lang.Thread.run(Thread.java:748)
at com.intel.analytics.bigdl.nn.abstractnn.AbstractModule.forward(AbstractModule.scala:263)
at com.intel.analytics.zoo.pipeline.api.net.TFTrainingHelper.updateOutput(TFTrainingHelper.scala:100)
at com.intel.analytics.bigdl.nn.abstractnn.AbstractModule.forward(AbstractModule.scala:257)
at com.intel.analytics.bigdl.optim.DistriOptimizer$$anonfun$4$$anonfun$6$$anonfun$apply$2.apply$mcI$sp(DistriOptimizer.scala:252)
at com.intel.analytics.bigdl.optim.DistriOptimizer$$anonfun$4$$anonfun$6$$anonfun$apply$2.apply(DistriOptimizer.scala:245)
at com.intel.analytics.bigdl.optim.DistriOptimizer$$anonfun$4$$anonfun$6$$anonfun$apply$2.apply(DistriOptimizer.scala:245)
at com.intel.analytics.bigdl.utils.ThreadPool$$anonfun$1$$anon$4.call(ThreadPool.scala:112)
at java.util.concurrent.FutureTask.run(FutureTask.java:266)
... 3 more
Driver stacktrace:
When I use TFOptimizer to train a TensorFlow model built with slim, I get an error.
I also ran the example at https://github.com/intel-analytics/analytics-zoo/blob/5212eb75956965fbedc64a0f0bb563bfc0b855b6/pyzoo/zoo/examples/tensorflow/distributed_training/train_lenet.py and got the same error.