eaplatanios / tensorflow_scala

TensorFlow API for the Scala Programming Language
http://platanios.org/tensorflow_scala/
Apache License 2.0
936 stars 96 forks source link

No TensorBoard at https://localhost:6006 #145

Closed SrTobi closed 5 years ago

SrTobi commented 5 years ago

I tried to assemble the example from the readme and got it to compile. But I couldn't go to https://localhost:6006 because there was no server listening there.

I then tried the code from the MNIST example and that worked as well but still does not open a server on 6006... Is there something I need to do additionally? Some external software?

Here is the code I use:

import java.nio.file.Paths

import com.typesafe.scalalogging.Logger
import org.platanios.tensorflow.api.core.Shape
import org.platanios.tensorflow.api.learn.ClipGradientsByGlobalNorm
import org.platanios.tensorflow.api.{tf, _}
import org.platanios.tensorflow.data.image.MNISTLoader
import org.slf4j.LoggerFactory

/** Reproduction program posted by the issue reporter.
  *
  * Trains a small multi-layer perceptron on MNIST with an `InMemoryEstimator` that is configured with a
  * `TensorBoardConfig`, and then logs the train/test accuracy. All summaries and checkpoints are written to
  * the same directory (`temp/mnist-mlp`).
  */
object Main {

  // Logger registered under the name "Examples / MNIST" (this name shows up in the pasted console output).
  private val logger = Logger(LoggerFactory.getLogger("Examples / MNIST"))

  /** Loads MNIST, builds the model and estimator, trains, and logs the final accuracies.
    *
    * @param args Command-line arguments; not read anywhere in this method.
    */
  def main(args: Array[String]): Unit = {
    // The data set is loaded from the relative path "datasets/MNIST".
    val dataSet = MNISTLoader.load(Paths.get("datasets/MNIST"))
    // Images are cast to Float and labels to Long before being fed to the model.
    val trainImages = tf.data.datasetFromTensorSlices(dataSet.trainImages).map(_.toFloat)
    val trainLabels = tf.data.datasetFromTensorSlices(dataSet.trainLabels).map(_.toLong)
    val testImages = tf.data.datasetFromTensorSlices(dataSet.testImages).map(_.toFloat)
    val testLabels = tf.data.datasetFromTensorSlices(dataSet.testLabels).map(_.toLong)
    // Training pipeline: (image, label) pairs, repeated, shuffled, batched by 256, prefetched.
    // NOTE(review): 10000 is presumably the shuffle buffer size and 10 the prefetch buffer size -- confirm
    // against the tensorflow_scala data API.
    val trainData =
      trainImages.zip(trainLabels)
        .repeat()
        .shuffle(10000)
        .batch(256)
        .prefetch(10)
    // Evaluation pipelines: a single pass (no repeat/shuffle) in batches of 1000.
    val evalTrainData = trainImages.zip(trainLabels).batch(1000).prefetch(10)
    val evalTestData = testImages.zip(testLabels).batch(1000).prefetch(10)

    // NOTE(review): the message says "logistic regression", but the layer stack below has three hidden
    // Linear+ReLU blocks before the output layer.
    logger.info("Building the logistic regression model.")

    // Model input takes its 2nd and 3rd dimensions from the training images tensor.
    // NOTE(review): the leading -1 presumably denotes an unknown batch size -- confirm.
    val input = tf.learn.Input(FLOAT32, Shape(-1, dataSet.trainImages.shape(1), dataSet.trainImages.shape(2)))
    val trainInput = tf.learn.Input(INT64, Shape(-1))
    // Flatten -> Linear(128) -> ReLU -> Linear(64) -> ReLU -> Linear(32) -> ReLU -> Linear(10).
    // NOTE(review): the 0.1f passed to ReLU is presumably a leaky-ReLU slope -- confirm.
    val layer = tf.learn.Flatten[Float]("Input/Flatten") >>
      tf.learn.Linear[Float]("Layer_0/Linear", 128) >> tf.learn.ReLU[Float]("Layer_0/ReLU", 0.1f) >>
      tf.learn.Linear[Float]("Layer_1/Linear", 64) >> tf.learn.ReLU[Float]("Layer_1/ReLU", 0.1f) >>
      tf.learn.Linear[Float]("Layer_2/Linear", 32) >> tf.learn.ReLU[Float]("Layer_2/ReLU", 0.1f) >>
      tf.learn.Linear[Float]("OutputLayer/Linear", 10)
    // Loss: sparse softmax cross-entropy, then a mean, then a scalar summary tagged "Loss".
    val loss = tf.learn.SparseSoftmaxCrossEntropy[Float, Long, Float]("Loss/CrossEntropy") >>
      tf.learn.Mean[Float]("Loss/Mean") >>
      tf.learn.ScalarSummary[Float]("Loss/Summary", "Loss")
    val optimizer = tf.train.YellowFin()

    // Supervised model with gradients clipped by global norm (5.0).
    val model = tf.learn.Model.simpleSupervised(
      input = input,
      trainInput = trainInput,
      layer = layer,
      loss = loss,
      optimizer = optimizer,
      clipGradients = ClipGradientsByGlobalNorm(5.0f))

    logger.info("Training the linear regression model.")
    // Single directory shared by the estimator configuration, the summary/checkpoint hooks and TensorBoard.
    // The second pasted log shows parameters being restored from a checkpoint in this directory when it
    // already exists, so leftovers from a different model must be removed before re-running.
    val summariesDir = Paths.get("temp/mnist-mlp")
    // Accuracy metric: the first tuple element is arg-maxed over the last axis and compared (as Float) with
    // the second element of the nested tuple.
    // NOTE(review): the label is annotated as Output[Int] while `trainInput` is declared INT64 -- confirm
    // which element type is intended here.
    val accMetric = tf.metrics.MapMetric(
      (v: (Output[Float], (Output[Float], Output[Int]))) => {
        (tf.argmax(v._1, -1, INT64).toFloat, v._2._2.toFloat)
      }, tf.metrics.Accuracy("Accuracy"))
    // Hooks: loss logging every 100 steps, evaluation on both data sets every 1000 steps, step-rate and
    // summary saving every 100 steps, checkpointing every 1000 steps.
    // NOTE(review): `tensorBoardConfig` is presumably what makes the library launch a TensorBoard server
    // for `summariesDir` -- this is the behaviour the issue reports as missing.
    val estimator = tf.learn.InMemoryEstimator(
      model,
      tf.learn.Configuration(Some(summariesDir)),
      tf.learn.StopCriteria(maxSteps = Some(100000)),
      Set(
        tf.learn.LossLogger(trigger = tf.learn.StepHookTrigger(100)),
        tf.learn.Evaluator(
          log = true, datasets = Seq(("Train", () => evalTrainData), ("Test", () => evalTestData)),
          metrics = Seq(accMetric), trigger = tf.learn.StepHookTrigger(1000), name = "Evaluator"),
        tf.learn.StepRateLogger(log = false, summaryDir = summariesDir, trigger = tf.learn.StepHookTrigger(100)),
        tf.learn.SummarySaver(summariesDir, tf.learn.StepHookTrigger(100)),
        tf.learn.CheckpointSaver(summariesDir, tf.learn.StepHookTrigger(1000))),
      tensorBoardConfig = tf.learn.TensorBoardConfig(summariesDir, reloadInterval = 1))
    // This is the call site that appears in the reporter's stack trace (InMemoryEstimator.train).
    // NOTE(review): the stop criteria passed here (10000 steps) differ from the ones given to the estimator
    // above (100000 steps); presumably the ones passed to `train` take effect for this call -- confirm.
    estimator.train(() => trainData, tf.learn.StopCriteria(maxSteps = Some(10000)))

    // Fraction of `images` whose predicted class (arg-max over axis 1) equals the corresponding label.
    def accuracy(images: Tensor[UByte], labels: Tensor[UByte]): Float = {
      val predictions = estimator.infer(() => images.toFloat)
      predictions
        .argmax(1).toUByte
        .equal(labels).toFloat
        .mean().scalar
    }

    logger.info(s"Train accuracy = ${accuracy(dataSet.trainImages, dataSet.trainLabels)}")
    logger.info(s"Test accuracy = ${accuracy(dataSet.testImages, dataSet.testLabels)}")
  }
}
mandar2812 commented 5 years ago

@SrTobi, when you run this code snippet, tensorflow-scala should print some logs on the screen that confirm whether TensorBoard started as expected or failed. Often a failure happens because the tensorboard program (installed with the tensorflow package) is not available in your system Python installation or virtual environment.

SrTobi commented 5 years ago

I don't see any error message there... even after I installed the package tensorboard. I can run tensorboard --logdir temp and that starts the tensorboard server... but tensorflow scala does not...

This is the log (if the temp directory doesn't exist):

/usr/lib/jvm/java-8-openjdk/bin/java -javaagent:/opt/JetBrains/apps/IDEA-U/ch-0/183.5153.38/lib/idea_rt.jar=46469:/opt/JetBrains/apps/IDEA-U/ch-0/183.5153.38/bin -Dfile.encoding=UTF-8 -classpath /usr/lib/jvm/java-8-openjdk/jre/lib/charsets.jar:/usr/lib/jvm/java-8-openjdk/jre/lib/ext/cldrdata.jar:/usr/lib/jvm/java-8-openjdk/jre/lib/ext/dnsns.jar:/usr/lib/jvm/java-8-openjdk/jre/lib/ext/jaccess.jar:/usr/lib/jvm/java-8-openjdk/jre/lib/ext/localedata.jar:/usr/lib/jvm/java-8-openjdk/jre/lib/ext/nashorn.jar:/usr/lib/jvm/java-8-openjdk/jre/lib/ext/sunec.jar:/usr/lib/jvm/java-8-openjdk/jre/lib/ext/sunjce_provider.jar:/usr/lib/jvm/java-8-openjdk/jre/lib/ext/sunpkcs11.jar:/usr/lib/jvm/java-8-openjdk/jre/lib/ext/zipfs.jar:/usr/lib/jvm/java-8-openjdk/jre/lib/jce.jar:/usr/lib/jvm/java-8-openjdk/jre/lib/jsse.jar:/usr/lib/jvm/java-8-openjdk/jre/lib/management-agent.jar:/usr/lib/jvm/java-8-openjdk/jre/lib/resources.jar:/usr/lib/jvm/java-8-openjdk/jre/lib/rt.jar:/home/tobi/workspace/tensorflow-test/target/scala-2.12/classes:/home/tobi/.ivy2/cache/ch.qos.logback/logback-classic/jars/logback-classic-1.2.3.jar:/home/tobi/.ivy2/cache/org.platanios/tensorflow-data_2.12/jars/tensorflow-data_2.12-0.4.1.jar:/home/tobi/.ivy2/cache/org.objenesis/objenesis/jars/objenesis-2.6.jar:/home/tobi/.ivy2/cache/org.apache.commons/commons-compress/jars/commons-compress-1.15.jar:/home/tobi/.ivy2/cache/org.typelevel/macro-compat_2.12/jars/macro-compat_2.12-1.1.1.jar:/home/tobi/.ivy2/cache/org.typelevel/machinist_2.12/jars/machinist_2.12-0.6.5.jar:/home/tobi/.ivy2/cache/org.typelevel/cats-macros_2.12/jars/cats-macros_2.12-1.4.0.jar:/home/tobi/.ivy2/cache/org.typelevel/cats-kernel_2.12/jars/cats-kernel_2.12-1.4.0.jar:/home/tobi/.ivy2/cache/org.typelevel/cats-core_2.12/jars/cats-core_2.12-1.4.0.jar:/home/tobi/.ivy2/cache/org.tensorflow/proto/jars/proto-1.11.0.jar:/home/tobi/.ivy2/cache/org.spire-math/jawn-parser_2.12/jars/jawn-parser_2.12-0.13.0.jar:/home/tobi/.ivy2/cache/org.slf4j/slf4j-api/jars/slf4j-api-1.7
.25.jar:/home/tobi/.ivy2/cache/org.scalactic/scalactic_2.12/bundles/scalactic_2.12-3.0.5.jar:/home/tobi/.ivy2/cache/org.platanios/tensorflow-jni_2.12/jars/tensorflow-jni_2.12-0.4.1.jar:/home/tobi/.ivy2/cache/org.platanios/tensorflow-api_2.12/jars/tensorflow-api_2.12-0.4.1.jar:/home/tobi/.ivy2/cache/org.hamcrest/hamcrest-core/jars/hamcrest-core-1.3.jar:/home/tobi/.ivy2/cache/junit/junit/jars/junit-4.12.jar:/home/tobi/.ivy2/cache/io.circe/circe-parser_2.12/jars/circe-parser_2.12-0.10.0.jar:/home/tobi/.ivy2/cache/io.circe/circe-numbers_2.12/jars/circe-numbers_2.12-0.10.0.jar:/home/tobi/.ivy2/cache/io.circe/circe-jawn_2.12/jars/circe-jawn_2.12-0.10.0.jar:/home/tobi/.ivy2/cache/io.circe/circe-generic_2.12/jars/circe-generic_2.12-0.10.0.jar:/home/tobi/.ivy2/cache/io.circe/circe-core_2.12/jars/circe-core_2.12-0.10.0.jar:/home/tobi/.ivy2/cache/com.typesafe.scala-logging/scala-logging_2.12/bundles/scala-logging_2.12-3.9.0.jar:/home/tobi/.ivy2/cache/com.google.protobuf/protobuf-java/bundles/protobuf-java-3.5.1.jar:/home/tobi/.ivy2/cache/com.github.ghik/silencer-lib_2.12/jars/silencer-lib_2.12-0.6.jar:/home/tobi/.ivy2/cache/com.chuusai/shapeless_2.12/bundles/shapeless_2.12-2.3.3.jar:/home/tobi/.ivy2/cache/ch.qos.logback/logback-core/jars/logback-core-1.2.3.jar:/home/tobi/.ivy2/cache/org.scala-lang/scala-library/jars/scala-library-2.12.8.jar:/home/tobi/.ivy2/cache/org.scala-lang/scala-reflect/jars/scala-reflect-2.12.8.jar:/home/tobi/.ivy2/cache/org.platanios/tensorflow_2.12/jars/tensorflow_2.12-0.4.1-linux-cpu-x86_64.jar Main
2019-01-15 10:30:39.908 [main] INFO  MNIST Data Loader - Extracting images from file 'datasets/MNIST/train-images-idx3-ubyte.gz'.
2019-01-15 10:30:42.852 [main] INFO  MNIST Data Loader - Extracting labels from file 'datasets/MNIST/train-labels-idx1-ubyte.gz'.
2019-01-15 10:30:42.855 [main] INFO  MNIST Data Loader - Extracting images from file 'datasets/MNIST/t10k-images-idx3-ubyte.gz'.
2019-01-15 10:30:42.934 [main] INFO  MNIST Data Loader - Extracting labels from file 'datasets/MNIST/t10k-labels-idx1-ubyte.gz'.
2019-01-15 10:30:42.934 [main] INFO  MNIST Data Loader - Finished loading the MNIST dataset.
2019-01-15 10:30:43.039 [main] INFO  Examples / MNIST - Building the logistic regression model.
2019-01-15 10:30:43.084 [main] INFO  Examples / MNIST - Training the linear regression model.
2019-01-15 10:30:43.293444: I tensorflow/core/platform/cpu_feature_guard.cc:141] Your CPU supports instructions that this TensorFlow binary was not compiled to use: SSE4.1 SSE4.2 AVX AVX2 FMA
2019-01-15 10:30:46.428 [main] INFO  Learn / Hooks / Checkpoint Saver - Saving checkpoint for step 0.
2019-01-15 10:30:46.428 [main] INFO  Variables / Saver - Saving parameters to '/home/tobi/workspace/tensorflow-test/temp/mnist-mlp/model.ckpt-0'.
2019-01-15 10:30:46.612 [main] INFO  Variables / Saver - Saved parameters to '/home/tobi/workspace/tensorflow-test/temp/mnist-mlp/model.ckpt-0'.
2019-01-15 10:30:46.635 [main] INFO  Learn / Hooks / Loss Logger - (    N/A    ) Step:      0, Loss: 746491.1875
2019-01-15 10:30:46.692 [main] INFO  Variables / Saver - Restoring parameters from '/home/tobi/workspace/tensorflow-test/temp/mnist-mlp/model.ckpt-0'.
2019-01-15 10:30:46.761 [main] INFO  Learn / Hooks / Evaluation - Step 0 Evaluator:
2019-01-15 10:30:46.762 [main] INFO  Learn / Hooks / Evaluation - ╔═══════╤════════════╗
2019-01-15 10:30:46.763 [main] INFO  Learn / Hooks / Evaluation - ║       │   Accuracy ║
2019-01-15 10:30:46.763 [main] INFO  Learn / Hooks / Evaluation - ╟───────┼────────────╢
2019-01-15 10:30:48.902 [main] INFO  Learn / Hooks / Evaluation - ║ Train │     0.1258 ║
2019-01-15 10:30:49.189 [main] INFO  Learn / Hooks / Evaluation - ║  Test │     0.1194 ║
2019-01-15 10:30:49.193 [main] INFO  Learn / Hooks / Evaluation - ╚═══════╧════════════╝
2019-01-15 10:30:50.186 [main] INFO  Learn / Hooks / Loss Logger - (    3.550 s) Step:    100, Loss: 1080.4623
2019-01-15 10:30:51.279 [main] INFO  Learn / Hooks / Loss Logger - (    1.093 s) Step:    200, Loss: 1923.3953

Also, when I start the program while the temp directory already exists, I get the following exception:

/usr/lib/jvm/java-8-openjdk/bin/java -javaagent:/opt/JetBrains/apps/IDEA-U/ch-0/183.5153.38/lib/idea_rt.jar=38885:/opt/JetBrains/apps/IDEA-U/ch-0/183.5153.38/bin -Dfile.encoding=UTF-8 -classpath /usr/lib/jvm/java-8-openjdk/jre/lib/charsets.jar:/usr/lib/jvm/java-8-openjdk/jre/lib/ext/cldrdata.jar:/usr/lib/jvm/java-8-openjdk/jre/lib/ext/dnsns.jar:/usr/lib/jvm/java-8-openjdk/jre/lib/ext/jaccess.jar:/usr/lib/jvm/java-8-openjdk/jre/lib/ext/localedata.jar:/usr/lib/jvm/java-8-openjdk/jre/lib/ext/nashorn.jar:/usr/lib/jvm/java-8-openjdk/jre/lib/ext/sunec.jar:/usr/lib/jvm/java-8-openjdk/jre/lib/ext/sunjce_provider.jar:/usr/lib/jvm/java-8-openjdk/jre/lib/ext/sunpkcs11.jar:/usr/lib/jvm/java-8-openjdk/jre/lib/ext/zipfs.jar:/usr/lib/jvm/java-8-openjdk/jre/lib/jce.jar:/usr/lib/jvm/java-8-openjdk/jre/lib/jsse.jar:/usr/lib/jvm/java-8-openjdk/jre/lib/management-agent.jar:/usr/lib/jvm/java-8-openjdk/jre/lib/resources.jar:/usr/lib/jvm/java-8-openjdk/jre/lib/rt.jar:/home/tobi/workspace/tensorflow-test/target/scala-2.12/classes:/home/tobi/.ivy2/cache/ch.qos.logback/logback-classic/jars/logback-classic-1.2.3.jar:/home/tobi/.ivy2/cache/org.platanios/tensorflow-data_2.12/jars/tensorflow-data_2.12-0.4.1.jar:/home/tobi/.ivy2/cache/org.objenesis/objenesis/jars/objenesis-2.6.jar:/home/tobi/.ivy2/cache/org.apache.commons/commons-compress/jars/commons-compress-1.15.jar:/home/tobi/.ivy2/cache/org.typelevel/macro-compat_2.12/jars/macro-compat_2.12-1.1.1.jar:/home/tobi/.ivy2/cache/org.typelevel/machinist_2.12/jars/machinist_2.12-0.6.5.jar:/home/tobi/.ivy2/cache/org.typelevel/cats-macros_2.12/jars/cats-macros_2.12-1.4.0.jar:/home/tobi/.ivy2/cache/org.typelevel/cats-kernel_2.12/jars/cats-kernel_2.12-1.4.0.jar:/home/tobi/.ivy2/cache/org.typelevel/cats-core_2.12/jars/cats-core_2.12-1.4.0.jar:/home/tobi/.ivy2/cache/org.tensorflow/proto/jars/proto-1.11.0.jar:/home/tobi/.ivy2/cache/org.spire-math/jawn-parser_2.12/jars/jawn-parser_2.12-0.13.0.jar:/home/tobi/.ivy2/cache/org.slf4j/slf4j-api/jars/slf4j-api-1.7
.25.jar:/home/tobi/.ivy2/cache/org.scalactic/scalactic_2.12/bundles/scalactic_2.12-3.0.5.jar:/home/tobi/.ivy2/cache/org.platanios/tensorflow-jni_2.12/jars/tensorflow-jni_2.12-0.4.1.jar:/home/tobi/.ivy2/cache/org.platanios/tensorflow-api_2.12/jars/tensorflow-api_2.12-0.4.1.jar:/home/tobi/.ivy2/cache/org.hamcrest/hamcrest-core/jars/hamcrest-core-1.3.jar:/home/tobi/.ivy2/cache/junit/junit/jars/junit-4.12.jar:/home/tobi/.ivy2/cache/io.circe/circe-parser_2.12/jars/circe-parser_2.12-0.10.0.jar:/home/tobi/.ivy2/cache/io.circe/circe-numbers_2.12/jars/circe-numbers_2.12-0.10.0.jar:/home/tobi/.ivy2/cache/io.circe/circe-jawn_2.12/jars/circe-jawn_2.12-0.10.0.jar:/home/tobi/.ivy2/cache/io.circe/circe-generic_2.12/jars/circe-generic_2.12-0.10.0.jar:/home/tobi/.ivy2/cache/io.circe/circe-core_2.12/jars/circe-core_2.12-0.10.0.jar:/home/tobi/.ivy2/cache/com.typesafe.scala-logging/scala-logging_2.12/bundles/scala-logging_2.12-3.9.0.jar:/home/tobi/.ivy2/cache/com.google.protobuf/protobuf-java/bundles/protobuf-java-3.5.1.jar:/home/tobi/.ivy2/cache/com.github.ghik/silencer-lib_2.12/jars/silencer-lib_2.12-0.6.jar:/home/tobi/.ivy2/cache/com.chuusai/shapeless_2.12/bundles/shapeless_2.12-2.3.3.jar:/home/tobi/.ivy2/cache/ch.qos.logback/logback-core/jars/logback-core-1.2.3.jar:/home/tobi/.ivy2/cache/org.scala-lang/scala-library/jars/scala-library-2.12.8.jar:/home/tobi/.ivy2/cache/org.scala-lang/scala-reflect/jars/scala-reflect-2.12.8.jar:/home/tobi/.ivy2/cache/org.platanios/tensorflow_2.12/jars/tensorflow_2.12-0.4.1-linux-cpu-x86_64.jar Main
2019-01-15 10:39:31.288 [main] INFO  MNIST Data Loader - Extracting images from file 'datasets/MNIST/train-images-idx3-ubyte.gz'.
2019-01-15 10:39:32.512 [main] INFO  MNIST Data Loader - Extracting labels from file 'datasets/MNIST/train-labels-idx1-ubyte.gz'.
2019-01-15 10:39:32.514 [main] INFO  MNIST Data Loader - Extracting images from file 'datasets/MNIST/t10k-images-idx3-ubyte.gz'.
2019-01-15 10:39:32.554 [main] INFO  MNIST Data Loader - Extracting labels from file 'datasets/MNIST/t10k-labels-idx1-ubyte.gz'.
2019-01-15 10:39:32.554 [main] INFO  MNIST Data Loader - Finished loading the MNIST dataset.
2019-01-15 10:39:32.643 [main] INFO  Examples / MNIST - Building the logistic regression model.
2019-01-15 10:39:32.685 [main] INFO  Examples / MNIST - Training the linear regression model.
2019-01-15 10:39:32.865456: I tensorflow/core/platform/cpu_feature_guard.cc:141] Your CPU supports instructions that this TensorFlow binary was not compiled to use: SSE4.1 SSE4.2 AVX AVX2 FMA
2019-01-15 10:39:34.078 [main] INFO  Variables / Saver - Restoring parameters from '/home/tobi/workspace/tensorflow-test/temp/mnist-mlp/model.ckpt-1000'.
Exception in thread "main" org.platanios.tensorflow.jni.InvalidArgumentException: Incompatible shapes: [64,32] vs. [784,128]
     [[{{node Estimator/Train/Model/YellowFin_1/GradientsVariance/ExponentialMovingAverage/Assign/Estimator/Train/Model/ClipGradients/ClipByGlobalNorm/Identity_5/Identity/ExponentialMovingAverage/Subtract}}]]
    at org.platanios.tensorflow.jni.Session$.run(Native Method)
    at org.platanios.tensorflow.api.core.client.Session.runHelper(Session.scala:165)
    at org.platanios.tensorflow.api.learn.SessionWrapper.runHelper(SessionWrapper.scala:124)
    at org.platanios.tensorflow.api.core.client.Session.run(Session.scala:83)
    at org.platanios.tensorflow.api.learn.estimators.InMemoryEstimator.$anonfun$train$1(InMemoryEstimator.scala:207)
    at scala.runtime.java8.JFunction0$mcV$sp.apply(JFunction0$mcV$sp.java:23)
    at scala.util.DynamicVariable.withValue(DynamicVariable.scala:62)
    at org.platanios.tensorflow.api.ops.Op$.createWith(Op.scala:2043)
    at org.platanios.tensorflow.api.learn.estimators.InMemoryEstimator.train(InMemoryEstimator.scala:190)
    at Main$.main(Main.scala:70)
    at Main.main(Main.scala)

Interestingly, the line Incompatible shapes: [64,32] vs. [784,128] is different every time (e.g. Incompatible shapes: [64,32] vs. [64], etc.).

eaplatanios commented 5 years ago

@SrTobi I'll investigate the TensorBoard issue. Regarding the incompatible shapes, I see that you're trying to restore the model parameters from /home/tobi/workspace/tensorflow-test/temp/mnist-mlp/model.ckpt-1000. In this case, you need to make sure that the model saved in that checkpoint is the same as the one you're currently building and trying to load in your code.

eaplatanios commented 5 years ago

@SrTobi This has now been fixed. Thanks for finding the bug! :)