eaplatanios / tensorflow_scala

TensorFlow API for the Scala Programming Language
http://platanios.org/tensorflow_scala/
Apache License 2.0
936 stars 96 forks source link

Conv2D crash on MNIST in tensorflow lib #193

Open novakov-alexey-zz opened 3 years ago

novakov-alexey-zz commented 3 years ago

Using a tf.learn.Conv2D layer on the MNIST dataset leads to a fatal error somewhere in the C++ code of the TensorFlow library.

How to reproduce

The code below leads to an error. It is based on the existing MNIST and CIFAR examples:

import org.platanios.tensorflow.api._
import org.platanios.tensorflow.api.core.types.UByte
import org.platanios.tensorflow.api.implicits.helpers.{
  OutputStructure,
  OutputToDataType,
  OutputToShape
}
import org.platanios.tensorflow.api.learn.ClipGradientsByGlobalNorm
import org.platanios.tensorflow.api.ops.Output
import org.platanios.tensorflow.data.image.MNISTLoader
import org.platanios.tensorflow.examples
import org.platanios.tensorflow.api.ops.NN.{SameConvPadding, ValidConvPadding}

import com.typesafe.scalalogging.Logger
import org.slf4j.LoggerFactory

import java.nio.file.Paths

// Reproduction script: builds a small convolutional model for MNIST and trains it with an
// in-memory estimator. According to this report, running it ends in a native SIGFPE.
// The MNIST data is loaded from the local "datasets/MNIST" directory.
val dataSet = MNISTLoader.load(Paths.get("datasets/MNIST"), MNISTLoader.MNIST)

val colorChannels = 1 // grey
  // Shape shared by the image reshapes and the model input below.
  // NOTE(review): -1 presumably leaves the leading (batch) dimension unspecified — confirm.
  val inputShape = Shape(-1, 28, 28, colorChannels)
  val trainImagesReshaped = dataSet.trainImages.reshape(inputShape)
  // Training images: one dataset element per tensor slice, converted to float and divided by 255.
  val trainImages =
    tf.data
      .datasetFromTensorSlices(
        trainImagesReshaped
      )
      .map(v => v.toFloat / 255)
  // Training labels are converted to float as well.
  val trainLabels =
    tf.data.datasetFromTensorSlices(dataSet.trainLabels).map(_.toFloat)
  // Test images and labels go through the same reshape / conversion steps as the training data.
  val testImageReshaped = dataSet.testImages.reshape(inputShape)
  val testImages =
    tf.data
      .datasetFromTensorSlices(testImageReshaped)
      .map(v => v.toFloat / 255)
  val testLabels =
    tf.data.datasetFromTensorSlices(dataSet.testLabels).map(_.toFloat)
  // Training pipeline: (image, label) pairs, repeated, shuffled, batched by 256, prefetched.
  val trainData =
    trainImages
      .zip(trainLabels)
      .repeat()
      .shuffle(10000)
      .batch(256)
      .prefetch(10)
  // The two evaluation datasets are built here but are not referenced again in this snippet.
  val evalTrainData = trainImages.zip(trainLabels).batch(1000).prefetch(10)
  val evalTestData = testImages.zip(testLabels).batch(1000).prefetch(10)  
  // Model input placeholder for the images (same shape as the reshaped image tensors).
  val input = tf.learn.Input(
    FLOAT32,
    inputShape
  )
  // Training input placeholder for the labels.
  val trainInput = tf.learn.Input(FLOAT32, Shape(-1))
  // Layer stack: Conv2D -> AddBias -> ReLU -> MaxPool -> Flatten -> Linear(256) -> ReLU -> Linear(10).
  // The Conv2D layer is the one this report associates with the crash.
  // NOTE(review): filterShape is presumably (height, width, inChannels, outChannels) and the
  // 0.1f passed to ReLU is presumably a leaky-ReLU slope — confirm against the library API.
  val layer = tf.learn.Conv2D[Float](
    "Layer_0/Conv2D",
    filterShape = Shape(5, 5, colorChannels, 8),
    stride1 = 1,
    stride2 = 1,
    SameConvPadding
  ) >>
    tf.learn.AddBias[Float]("Layer_0/Bias") >>
    tf.learn.ReLU[Float]("Layer_0/ReLU", 0.1f) >>
    tf.learn.MaxPool[Float](
      "Layer_0/MaxPool",
      windowSize = Seq(1, 2, 2, 1),
      stride1 = 1,
      stride2 = 1,
      SameConvPadding
    ) >>    
    tf.learn.Flatten[Float]("Layer_2/Flatten") >>
    tf.learn.Linear[Float]("Layer_2/Linear", 256) >>
    tf.learn.ReLU[Float]("Layer_2/ReLU", 0.1f) >>
    tf.learn.Linear[Float]("OutputLayer/Linear", 10)
  // Loss: softmax cross-entropy, followed by a mean and a scalar summary tagged "Loss".
  val loss = tf.learn.SoftmaxCrossEntropy[Float, Float](
    "Loss/CrossEntropy"
  ) >>
    tf.learn.Mean[Float]("Loss/Mean") >>
    tf.learn.ScalarSummary[Float]("Loss/Summary", "Loss")

  // Adam optimizer with the library's default arguments.
  val optimizer = tf.train.Adam()  

  // Supervised model combining the pieces above; gradients are clipped by global norm (5.0f).
  val model = tf.learn.Model.simpleSupervised(
    input = input,
    trainInput = trainInput,
    layer = layer,
    loss = loss,
    optimizer = optimizer,
    clipGradients = ClipGradientsByGlobalNorm(5.0f)
  )

  // One directory is shared by the estimator configuration, the step-rate logger,
  // the checkpoint saver and TensorBoard.
  val summariesDir = Paths.get("temp/mnist-cnn")
  // Estimator with loss logging and step-rate logging hooks (StepHookTrigger(100)) and a
  // checkpoint saver hook (StepHookTrigger(1000)); TensorBoard is configured on summariesDir.
  val estimator = tf.learn.InMemoryEstimator(
    model,
    tf.learn.Configuration(Some(summariesDir)),
    tf.learn.StopCriteria(maxSteps = Some(100000)),
    Set(
      tf.learn.LossLogger(trigger = tf.learn.StepHookTrigger(100)),
      tf.learn.StepRateLogger(
        log = false,
        summaryDir = summariesDir,
        trigger = tf.learn.StepHookTrigger(100)
      ),
      tf.learn.CheckpointSaver(summariesDir, tf.learn.StepHookTrigger(1000))
    ),
    tensorBoardConfig =
      tf.learn.TensorBoardConfig(summariesDir, reloadInterval = 1)
  )
  // Training call. The native stack trace in this report passes through
  // InMemoryEstimator.train and Session.run before reaching grappler's OpLevelCostEstimator.
  // NOTE(review): the stop criteria passed here (1000 steps) differ from the ones given to the
  // estimator constructor (100000 steps); which one takes precedence is not visible here.
  estimator.train(() => trainData, tf.learn.StopCriteria(maxSteps = Some(1000)))

Version:

scalaVersion := "2.13.5"

lazy val tensorFlowScalaVer = "0.5.10"
"org.platanios" %% "tensorflow-data" % tensorFlowScalaVer,
"org.platanios" %% "tensorflow" % tensorFlowScalaVer classifier "darwin"

Error:

2021-04-01 15:12:06.111 [main] INFO  MNIST Data Loader - Extracting images from file 'datasets/MNIST/train-images-idx3-ubyte.gz'.
2021-04-01 15:12:17.523 [main] INFO  MNIST Data Loader - Extracting labels from file 'datasets/MNIST/train-labels-idx1-ubyte.gz'.
2021-04-01 15:12:17.529 [main] INFO  MNIST Data Loader - Extracting images from file 'datasets/MNIST/t10k-images-idx3-ubyte.gz'.
2021-04-01 15:12:17.603 [main] INFO  MNIST Data Loader - Extracting labels from file 'datasets/MNIST/t10k-labels-idx1-ubyte.gz'.
2021-04-01 15:12:17.606 [main] INFO  MNIST Data Loader - Finished loading the MNIST dataset.
2021-04-01 15:12:17.701597: I tensorflow/compiler/xla/service/service.cc:168] XLA service 0x7fa0e92c8ee0 initialized for platform Host (this does not guarantee that XLA will be used). Devices:
2021-04-01 15:12:17.701635: I tensorflow/compiler/xla/service/service.cc:176]   StreamExecutor device (0): Host, Default Version
2021-04-01 15:12:19.815 [main] INFO  Learn / Hooks / TensorBoard - Launching TensorBoard in 'localhost:6006' for log directory '/Users/<user...>/dev/git/tensorflow-scala-cnn/temp/mnist-cnn'.
#
# A fatal error has been detected by the Java Runtime Environment:
#
#  SIGFPE (0x8) at pc=0x000000014002d9b3, pid=80972, tid=9987
#
# JRE version: OpenJDK Runtime Environment AdoptOpenJDK (11.0.9+11) (build 11.0.9+11)
# Java VM: OpenJDK 64-Bit Server VM AdoptOpenJDK (11.0.9+11, mixed mode, tiered, compressed oops, g1 gc, bsd-amd64)
# Problematic frame:
# C  [libtensorflow.2.dylib+0x974a9b3]  _ZN10tensorflow8grappler20OpLevelCostEstimator31ConvolutionDimensionsFromInputsERKNS_16TensorShapeProtoES4_RKNS_6OpInfoEPb+0x3d3
#
# No core dump will be written. Core dumps have been disabled. To enable core dumping, try "ulimit -c unlimited" before starting Java again
#
# An error report file with more information is saved as:
# /Users/<user...>/dev/git/tensorflow-scala-cnn/hs_err_pid80972.log
[thread 30723 also had an error]
#
# If you would like to submit a bug report, please visit:
#   https://github.com/AdoptOpenJDK/openjdk-support/issues
# The crash happened outside the Java Virtual Machine in native code.
# See problematic frame for where to report the bug.
#

Observations:

The error above contains a suspicious message about the problematic frame:

_ZN10tensorflow8grappler20OpLevelCostEstimator31ConvolutionDimensionsFromInputsERKNS_16TensorShapeProtoES4_RKNS_6OpInfoEPb

An extract from the error log:

---------------  S U M M A R Y ------------

Command Line: -agentlib:jdwp=transport=dt_socket,server=y,suspend=y,quiet=n -Duser.dir=/Users/<user>/dev/git/tensorflow-scala-cnn org.platanios.tensorflow.examples.CNN

...
Time: Thu Apr  1 15:12:22 2021 CEST elapsed time: 17.863516 seconds (0d 0h 0m 17s)

---------------  T H R E A D  ---------------

Current thread (0x00007fa0e4808800):  JavaThread "main" [_thread_in_native, id=9987, stack(0x000070000b34f000,0x000070000b44f000)]

Stack: [0x000070000b34f000,0x000070000b44f000],  sp=0x000070000b448e90,  free space=999k
Native frames: (J=compiled Java code, A=aot compiled Java code, j=interpreted, Vv=VM code, C=native code)
C  [libtensorflow.2.dylib+0x974a9b3]  _ZN10tensorflow8grappler20OpLevelCostEstimator31ConvolutionDimensionsFromInputsERKNS_16TensorShapeProtoES4_RKNS_6OpInfoEPb+0x3d3
C  [libtensorflow.2.dylib+0x974e17c]  _ZN10tensorflow8grappler20OpLevelCostEstimator35CountConv2DBackpropFilterOperationsERKNS_6OpInfoEPNS1_21ConvolutionDimensionsEPb+0xcc
C  [libtensorflow.2.dylib+0x974256e]  _ZNK10tensorflow8grappler20OpLevelCostEstimator27PredictConv2DBackpropFilterERKNS0_9OpContextE+0x2e
C  [libtensorflow.2.dylib+0x974f7ba]  _ZNSt3__110__function6__funcIZZN10tensorflow8grappler20OpLevelCostEstimatorC1EvENK3$_2clEMS4_KFNS3_5CostsERKNS3_9OpContextEEEUlS9_E_NS_9allocatorISC_EEFS6_S9_EEclES9_+0x2a
C  [libtensorflow.2.dylib+0x97481ef]  _ZNK10tensorflow8grappler20OpLevelCostEstimator12PredictCostsERKNS0_9OpContextE+0xbf
C  [libtensorflow.2.dylib+0x973bef7]  _ZNK10tensorflow8grappler23AnalyticalCostEstimator12PredictCostsERKNS_8GraphDefEPNS_11RunMetadataEPNS0_5CostsE+0x3a7
C  [libtensorflow.2.dylib+0x973a2eb]  _ZN10tensorflow8grappler14VirtualCluster3RunERKNS0_12GrapplerItemEPNS_11RunMetadataE+0x7b
C  [libtensorflow.2.dylib+0x96c70ae]  _ZN10tensorflow8grappler11GraphMemory15InferStaticallyERKNSt3__113unordered_mapINS2_12basic_stringIcNS2_11char_traitsIcEENS2_9allocatorIcEEEENS_16DevicePropertiesENS2_4hashIS9_EENS2_8equal_toIS9_EENS7_INS2_4pairIKS9_SA_EEEEEE+0x9e
C  [libtensorflow.2.dylib+0x96b565d]  _ZN10tensorflow8grappler12_GLOBAL__N_112SwappingPassENS_25RewriterConfig_MemOptTypeEPNS0_7ClusterEPNSt3__110unique_ptrINS0_11GraphMemoryENS5_14default_deleteIS7_EEEEPNS0_12GrapplerItemEPNS5_13unordered_setINS5_12basic_stringIcNS5_11char_traitsIcEENS5_9allocatorIcEEEENS5_4hashISK_EENS5_8equal_toISK_EENSI_ISK_EEEE+0x1ead
C  [libtensorflow.2.dylib+0x96abe4d]  _ZN10tensorflow8grappler15MemoryOptimizer8OptimizeEPNS0_7ClusterERKNS0_12GrapplerItemEPNS_8GraphDefE+0x140d
C  [libtensorflow.2.dylib+0x95fa8bb]  _ZN10tensorflow8grappler13MetaOptimizer12RunOptimizerEPNS0_14GraphOptimizerEPNS0_7ClusterEPNS0_12GrapplerItemEPNS_8GraphDefEPNS1_23GraphOptimizationResultE+0x32b
C  [libtensorflow.2.dylib+0x95f9008]  _ZN10tensorflow8grappler13MetaOptimizer13OptimizeGraphEPNS0_7ClusterEONS0_12GrapplerItemEPNS_8GraphDefE+0x908
C  [libtensorflow.2.dylib+0x95fc67b]  _ZN10tensorflow8grappler13MetaOptimizer19OptimizeConsumeItemEPNS0_7ClusterEONS0_12GrapplerItemEPNS_8GraphDefE+0x27b
C  [libtensorflow.2.dylib+0x95fe393]  _ZN10tensorflow8grappler16RunMetaOptimizerEONS0_12GrapplerItemERKNS_11ConfigProtoEPNS_10DeviceBaseEPNS0_7ClusterEPNS_8GraphDefE+0xc3
C  [libtensorflow.2.dylib+0x95f42cb]  _ZN10tensorflow19GraphExecutionState13OptimizeGraphERKNS_17BuildGraphOptionsEPNSt3__110unique_ptrINS_5GraphENS4_14default_deleteIS6_EEEEPNS5_INS_25FunctionLibraryDefinitionENS7_ISB_EEEE+0x1ddb
C  [libtensorflow.2.dylib+0x95efa3a]  _ZN10tensorflow19GraphExecutionState10BuildGraphERKNS_17BuildGraphOptionsEPNSt3__110unique_ptrINS_11ClientGraphENS4_14default_deleteIS6_EEEE+0xaa
C  [libtensorflow.2.dylib+0x95c106b]  _ZN10tensorflow13DirectSession12CreateGraphsERKNS_17BuildGraphOptionsEPNSt3__113unordered_mapINS4_12basic_stringIcNS4_11char_traitsIcEENS4_9allocatorIcEEEENS4_10unique_ptrINS_5GraphENS4_14default_deleteISD_EEEENS4_4hashISB_EENS4_8equal_toISB_EENS9_INS4_4pairIKSB_SG_EEEEEEPNSC_INS_25FunctionLibraryDefinitionENSE_ISR_EEEEPNS0_12RunStateArgsEPN4absl14lts_2020_02_2513InlinedVectorINS_8DataTypeELm4ENS9_IS10_EEEES13_Px+0x22b
C  [libtensorflow.2.dylib+0x95bfb81]  _ZN10tensorflow13DirectSession15CreateExecutorsERKNS_15CallableOptionsEPNSt3__110unique_ptrINS0_16ExecutorsAndKeysENS4_14default_deleteIS6_EEEEPNS5_INS0_12FunctionInfoENS7_ISB_EEEEPNS0_12RunStateArgsE+0x231
C  [libtensorflow.2.dylib+0x95bbef6]  _ZN10tensorflow13DirectSession20GetOrCreateExecutorsEN4absl14lts_2020_02_254SpanIKNSt3__112basic_stringIcNS4_11char_traitsIcEENS4_9allocatorIcEEEEEESC_SC_PPNS0_16ExecutorsAndKeysEPNS0_12RunStateArgsE+0xd76
C  [libtensorflow.2.dylib+0x95ba0af]  _ZN10tensorflow13DirectSession3RunERKNS_10RunOptionsERKNSt3__16vectorINS4_4pairINS4_12basic_stringIcNS4_11char_traitsIcEENS4_9allocatorIcEEEENS_6TensorEEENSA_ISE_EEEERKNS5_ISC_NSA_ISC_EEEESM_PNS5_ISD_NSA_ISD_EEEEPNS_11RunMetadataERKNS_6thread17ThreadPoolOptionsE+0x3df
C  [libtensorflow.2.dylib+0x95b9cc1]  _ZN10tensorflow13DirectSession3RunERKNS_10RunOptionsERKNSt3__16vectorINS4_4pairINS4_12basic_stringIcNS4_11char_traitsIcEENS4_9allocatorIcEEEENS_6TensorEEENSA_ISE_EEEERKNS5_ISC_NSA_ISC_EEEESM_PNS5_ISD_NSA_ISD_EEEEPNS_11RunMetadataE+0x31
C  [libtensorflow.2.dylib+0xc0428]  _ZL13TF_Run_HelperPN10tensorflow7SessionEPKcPK9TF_BufferRKNSt3__16vectorINS7_4pairINS7_12basic_stringIcNS7_11char_traitsIcEENS7_9allocatorIcEEEENS_6TensorEEENSD_ISH_EEEERKNS8_ISF_NSD_ISF_EEEEPP9TF_TensorSP_PS4_P9TF_Status+0x5c8
C  [libtensorflow.2.dylib+0xcf710]  TF_SessionRun+0x4c0
C  [libtensorflow_jni.so+0xb867f]  Java_org_platanios_tensorflow_jni_Session_00024_run+0x6ef
j  org.platanios.tensorflow.jni.Session$.run(J[B[J[J[I[J[I[JZ[J)[B+0
j  org.platanios.tensorflow.api.core.client.Session.runHelper(Lorg/platanios/tensorflow/api/core/client/FeedMap;Ljava/lang/Object;Ljava/lang/Object;Lscala/Option;ZLorg/platanios/tensorflow/api/utilities/DefaultsTo;Lorg/platanios/tensorflow/api/implicits/helpers/OutputStructure;Lorg/platanios/tensorflow/api/utilities/DefaultsTo;Lorg/platanios/tensorflow/api/implicits/helpers/OpStructure;Lorg/platanios/tensorflow/api/implicits/helpers/OutputToTensor;)Lscala/Tuple2;+554
j  org.platanios.tensorflow.api.learn.SessionWrapper.runHelper(Lorg/platanios/tensorflow/api/core/client/FeedMap;Ljava/lang/Object;Ljava/lang/Object;Lscala/Option;ZLorg/platanios/tensorflow/api/utilities/DefaultsTo;Lorg/platanios/tensorflow/api/implicits/helpers/OutputStructure;Lorg/platanios/tensorflow/api/utilities/DefaultsTo;Lorg/platanios/tensorflow/api/implicits/helpers/OpStructure;Lorg/platanios/tensorflow/api/implicits/helpers/OutputToTensor;)Lscala/Tuple2;+294
j  org.platanios.tensorflow.api.core.client.Session.run(Lorg/platanios/tensorflow/api/core/client/FeedMap;Ljava/lang/Object;Ljava/lang/Object;Lscala/Option;Lorg/platanios/tensorflow/api/utilities/DefaultsTo;Lorg/platanios/tensorflow/api/implicits/helpers/OutputStructure;Lorg/platanios/tensorflow/api/utilities/DefaultsTo;Lorg/platanios/tensorflow/api/implicits/helpers/OpStructure;Lorg/platanios/tensorflow/api/implicits/helpers/OutputToTensor;)Ljava/lang/Object;+20
j  org.platanios.tensorflow.api.learn.estimators.InMemoryEstimator.$anonfun$train$1(Lorg/platanios/tensorflow/api/learn/estimators/InMemoryEstimator;Lscala/Function0;Lorg/platanios/tensorflow/api/implicits/helpers/OutputToDataType;Lorg/platanios/tensorflow/api/implicits/helpers/OutputToShape;Lorg/platanios/tensorflow/api/learn/StopCriteria;)V+392
j  org.platanios.tensorflow.api.learn.estimators.InMemoryEstimator$$Lambda$564.apply$mcV$sp()V+20
j  scala.runtime.java8.JFunction0$mcV$sp.apply()Ljava/lang/Object;+1
J 2001 c1 scala.util.DynamicVariable.withValue(Ljava/lang/Object;Lscala/Function0;)Ljava/lang/Object; (44 bytes) @ 0x000000011b9506c4 [0x000000011b9500c0+0x0000000000000604]
J 2246 c1 org.platanios.tensorflow.api.ops.Op$.createWith(Lorg/platanios/tensorflow/api/core/Graph;Ljava/lang/String;Ljava/lang/String;Lscala/Option;Lscala/collection/immutable/Set;Lscala/collection/immutable/Set;Lscala/collection/immutable/Map;Ljava/lang/String;Lscala/Function0;)Ljava/lang/Object; (1278 bytes) @ 0x000000011b9ffaf4 [0x000000011b9fa1a0+0x0000000000005954]
j  org.platanios.tensorflow.api.learn.estimators.InMemoryEstimator.train(Lscala/Function0;Lorg/platanios/tensorflow/api/learn/StopCriteria;Lorg/platanios/tensorflow/api/implicits/helpers/OutputToDataType;Lorg/platanios/tensorflow/api/implicits/helpers/OutputToShape;)V+83
j  org.platanios.tensorflow.examples.CNN$.delayedEndpoint$org$platanios$tensorflow$examples$CNN$1()V+2854
j  org.platanios.tensorflow.examples.CNN$delayedInit$body.apply()Ljava/lang/Object;+4
j  scala.Function0.apply$mcV$sp()V+1
j  scala.Function0.apply$mcV$sp$(Lscala/Function0;)V+1
j  scala.runtime.AbstractFunction0.apply$mcV$sp()V+1
j  scala.App.$anonfun$main$1(Lscala/Function0;)V+1
j  scala.App.$anonfun$main$1$adapted(Lscala/Function0;)Ljava/lang/Object;+1
j  scala.App$$Lambda$1.apply(Ljava/lang/Object;)Ljava/lang/Object;+4
j  scala.collection.IterableOnceOps.foreach(Lscala/Function1;)V+26
j  scala.collection.IterableOnceOps.foreach$(Lscala/collection/IterableOnceOps;Lscala/Function1;)V+2
j  scala.collection.AbstractIterable.foreach(Lscala/Function1;)V+2
j  scala.App.main([Ljava/lang/String;)V+18
j  scala.App.main$(Lscala/App;[Ljava/lang/String;)V+2
j  org.platanios.tensorflow.examples.CNN$.main([Ljava/lang/String;)V+2
j  org.platanios.tensorflow.examples.CNN.main([Ljava/lang/String;)V+4
v  ~StubRoutines::call_stub
V  [libjvm.dylib+0x3b1880]  _ZN9JavaCalls11call_helperEP9JavaValueRK12methodHandleP17JavaCallArgumentsP6Thread+0x21a
V  [libjvm.dylib+0x3f65ba]  _ZL17jni_invoke_staticP7JNIEnv_P9JavaValueP8_jobject11JNICallTypeP10_jmethodIDP18JNI_ArgumentPusherP6Thread+0x10a
V  [libjvm.dylib+0x3f9daa]  jni_CallStaticVoidMethod+0x1d2
C  [libjli.dylib+0x4b29]  JavaMain+0xab0
C  [libsystem_pthread.dylib+0x6950]  _pthread_start+0xe0
C  [libsystem_pthread.dylib+0x247b]  thread_start+0xf

Java frames: (J=compiled Java code, j=interpreted, Vv=VM code)
j  org.platanios.tensorflow.jni.Session$.run(J[B[J[J[I[J[I[JZ[J)[B+0
j  org.platanios.tensorflow.api.core.client.Session.runHelper(Lorg/platanios/tensorflow/api/core/client/FeedMap;Ljava/lang/Object;Ljava/lang/Object;Lscala/Option;ZLorg/platanios/tensorflow/api/utilities/DefaultsTo;Lorg/platanios/tensorflow/api/implicits/helpers/OutputStructure;Lorg/platanios/tensorflow/api/utilities/DefaultsTo;Lorg/platanios/tensorflow/api/implicits/helpers/OpStructure;Lorg/platanios/tensorflow/api/implicits/helpers/OutputToTensor;)Lscala/Tuple2;+554
j  org.platanios.tensorflow.api.learn.SessionWrapper.runHelper(Lorg/platanios/tensorflow/api/core/client/FeedMap;Ljava/lang/Object;Ljava/lang/Object;Lscala/Option;ZLorg/platanios/tensorflow/api/utilities/DefaultsTo;Lorg/platanios/tensorflow/api/implicits/helpers/OutputStructure;Lorg/platanios/tensorflow/api/utilities/DefaultsTo;Lorg/platanios/tensorflow/api/implicits/helpers/OpStructure;Lorg/platanios/tensorflow/api/implicits/helpers/OutputToTensor;)Lscala/Tuple2;+294
j  org.platanios.tensorflow.api.core.client.Session.run(Lorg/platanios/tensorflow/api/core/client/FeedMap;Ljava/lang/Object;Ljava/lang/Object;Lscala/Option;Lorg/platanios/tensorflow/api/utilities/DefaultsTo;Lorg/platanios/tensorflow/api/implicits/helpers/OutputStructure;Lorg/platanios/tensorflow/api/utilities/DefaultsTo;Lorg/platanios/tensorflow/api/implicits/helpers/OpStructure;Lorg/platanios/tensorflow/api/implicits/helpers/OutputToTensor;)Ljava/lang/Object;+20
j  org.platanios.tensorflow.api.learn.estimators.InMemoryEstimator.$anonfun$train$1(Lorg/platanios/tensorflow/api/learn/estimators/InMemoryEstimator;Lscala/Function0;Lorg/platanios/tensorflow/api/implicits/helpers/OutputToDataType;Lorg/platanios/tensorflow/api/implicits/helpers/OutputToShape;Lorg/platanios/tensorflow/api/learn/StopCriteria;)V+392
j  org.platanios.tensorflow.api.learn.estimators.InMemoryEstimator$$Lambda$564.apply$mcV$sp()V+20
j  scala.runtime.java8.JFunction0$mcV$sp.apply()Ljava/lang/Object;+1
J 2001 c1 scala.util.DynamicVariable.withValue(Ljava/lang/Object;Lscala/Function0;)Ljava/lang/Object; (44 bytes) @ 0x000000011b9506c4 [0x000000011b9500c0+0x0000000000000604]
J 2246 c1 org.platanios.tensorflow.api.ops.Op$.createWith(Lorg/platanios/tensorflow/api/core/Graph;Ljava/lang/String;Ljava/lang/String;Lscala/Option;Lscala/collection/immutable/Set;Lscala/collection/immutable/Set;Lscala/collection/immutable/Map;Ljava/lang/String;Lscala/Function0;)Ljava/lang/Object; (1278 bytes) @ 0x000000011b9ffaf4 [0x000000011b9fa1a0+0x0000000000005954]
j  org.platanios.tensorflow.api.learn.estimators.InMemoryEstimator.train(Lscala/Function0;Lorg/platanios/tensorflow/api/learn/StopCriteria;Lorg/platanios/tensorflow/api/implicits/helpers/OutputToDataType;Lorg/platanios/tensorflow/api/implicits/helpers/OutputToShape;)V+83
j  org.platanios.tensorflow.examples.CNN$.delayedEndpoint$org$platanios$tensorflow$examples$CNN$1()V+2854
j  org.platanios.tensorflow.examples.CNN$delayedInit$body.apply()Ljava/lang/Object;+4
j  scala.Function0.apply$mcV$sp()V+1
j  scala.Function0.apply$mcV$sp$(Lscala/Function0;)V+1
j  scala.runtime.AbstractFunction0.apply$mcV$sp()V+1
j  scala.App.$anonfun$main$1(Lscala/Function0;)V+1
j  scala.App.$anonfun$main$1$adapted(Lscala/Function0;)Ljava/lang/Object;+1
j  scala.App$$Lambda$1.apply(Ljava/lang/Object;)Ljava/lang/Object;+4
j  scala.collection.IterableOnceOps.foreach(Lscala/Function1;)V+26
j  scala.collection.IterableOnceOps.foreach$(Lscala/collection/IterableOnceOps;Lscala/Function1;)V+2
j  scala.collection.AbstractIterable.foreach(Lscala/Function1;)V+2
j  scala.App.main([Ljava/lang/String;)V+18
j  scala.App.main$(Lscala/App;[Ljava/lang/String;)V+2
j  org.platanios.tensorflow.examples.CNN$.main([Ljava/lang/String;)V+2
j  org.platanios.tensorflow.examples.CNN.main([Ljava/lang/String;)V+4
v  ~StubRoutines::call_stub
mandar2812 commented 3 years ago

@eaplatanios I can verify that this happens whenever a convolutional layer is used. I have reproduced this in 0.6.0-SNAPSHOT.

The error seems to be caused by the graph optimizer that TensorFlow uses in the backend. What do you think?

eaplatanios commented 3 years ago

This does indeed look related to grappler (the TF graph optimizer). Does it reproduce with version 0.6.3?

novakov-alexey-zz commented 3 years ago

Just tried with 0.6.4. It still fails.

windymelt commented 1 year ago

Still failing with version 0.6.5 (Linux) with CIFAR model.

    // CIFAR-100 variant of the reproduction. In this listing both Conv2D layers are commented
    // out; the author reports that the script then runs without the SIGFPE.
    import tensorflow.data.image.CIFARLoader
    val dataSet = CIFARLoader.load(Paths.get("/home/windymelt/Downloads/cifar-100-python"), CIFARLoader.CIFAR_100)

    import tensorflow.api.ops.data.Data
    import tensorflow.api.::
    // The datasets are wrapped in functions, so each call builds a fresh dataset.
    // Images are converted to float.
    val trainImages = () => Data.datasetFromTensorSlices(dataSet.trainImages, "TrainImages").map(_.toFloat)

    // Labels: column 1 of the label tensor, converted to long.
    // NOTE(review): presumably column 1 holds the fine-grained CIFAR-100 label — confirm.
    val trainLabels = () => Data.datasetFromTensorSlices(dataSet.trainLabels(::, 1), "TrainLabels").map(_.toLong)
    // Training pipeline: (image, label) pairs, repeated, shuffled, batched by 32, prefetched.
    val trainData = () => trainImages().zip(trainLabels())
      .repeat()
      .shuffle(10000)
      .batch(32)
      .prefetch(10)

    import tensorflow.api._
    import tensorflow.api.learn.layers._
    // Model input: dimensions 1 to 3 are taken from the loaded image tensor; the leading one is -1.
    val input = Input(
      FLOAT32,
      Shape(-1, dataSet.trainImages.shape(1), dataSet.trainImages.shape(2), dataSet.trainImages.shape(3))
    )
    // Training input placeholder for the INT64 labels.
    val trainInput = Input(INT64, Shape(-1))

    import tensorflow.api.ops.NN.SameConvPadding
    // Layer stack with the two Conv2D layers disabled (kept below as comments for reference).
    val layer = //Conv2D[Float]("Layer_0/Conv2D", Shape(2, 2, 3, 16), 1, 1, SameConvPadding) >>
        AddBias[Float]("Layer_0/Bias") >>
        ReLU[Float]("Layer_0/ReLU", 0.1f) >>
        MaxPool[Float]("Layer_0/MaxPool", Seq(1, 2, 2, 1), 1, 1, SameConvPadding) >>
        // Conv2D[Float]("Layer_1/Conv2D", Shape(2, 2, 16, 32), 1, 1, SameConvPadding) >>
        AddBias[Float]("Bias_1") >>
        ReLU[Float]("Layer_1/ReLU", 0.1f) >>
        MaxPool[Float]("Layer_1/MaxPool", Seq(1, 2, 2, 1), 1, 1, SameConvPadding) >>
        Flatten[Float]("Layer_2/Flatten") >>
        Linear[Float]("Layer_2/Linear", 256) >>
        ReLU[Float]("Layer_2/ReLU", 0.1f) >>
    Linear[Float]("OutputLayer/Linear", 100)

    // Loss: sparse softmax cross-entropy, followed by a mean and a scalar summary tagged "Loss".
    val loss = SparseSoftmaxCrossEntropy[Float, Long, Float]("Loss/CrossEntropy") >>
    Mean[Float]("Loss/Mean") >>
    ScalarSummary[Float]("Loss/Summary", "Loss")

    // AdaGrad optimizer constructed with 0.1f.
    // NOTE(review): 0.1f is presumably the learning rate — confirm.
    val optimizer = tf.train.AdaGrad(0.1f)

    // Supervised model combining the pieces above (no gradient clipping is passed here).
    val model = tf.learn.Model.simpleSupervised(
      input = input,
      trainInput = trainInput,
      layer = layer,
      loss = loss,
      optimizer = optimizer)

    // One directory is shared by the estimator configuration, the hooks and TensorBoard.
    val summariesDir = Paths.get("temp/cnn-cifar")

    // Estimator with loss logging, step-rate logging and checkpoint saver hooks.
    val estimator = tensorflow.api.learn.estimators.InMemoryEstimator(
      model,
      tensorflow.api.learn.Configuration(Some(summariesDir)),
      tensorflow.api.learn.StopCriteria(maxSteps = Some(100000)),
      Set(
        tensorflow.api.learn.hooks.LossLogger(trigger = tf.learn.StepHookTrigger(100)),
        tensorflow.api.learn.hooks.StepRateLogger(log = false, summaryDir = summariesDir, trigger = tensorflow.api.learn.hooks.StepHookTrigger(100)),
        tensorflow.api.learn.hooks.CheckpointSaver(summariesDir, tensorflow.api.learn.hooks.StepHookTrigger(1000))),
      tensorBoardConfig = tensorflow.api.config.TensorBoardConfig(summariesDir, reloadInterval = 1))

    // Training call; the stop criteria passed here use maxSteps = 10000.
    estimator.train(trainData, tensorflow.api.learn.StopCriteria(maxSteps = Some(10000)))

When I remove the Conv2D layers, as in the code snippet above, it runs without the SIGFPE.

Error message follows:

#
# A fatal error has been detected by the Java Runtime Environment:
#
#  SIGFPE (0x8) at pc=0x00007f056cd2240b, pid=10315, tid=10938
#
# JRE version: OpenJDK Runtime Environment Corretto-17.0.5.8.1 (17.0.5+8) (build 17.0.5+8-LTS)
# Java VM: OpenJDK 64-Bit Server VM Corretto-17.0.5.8.1 (17.0.5+8-LTS, mixed mode, sharing, tiered, compressed oops, compressed class ptrs, g1 gc, linux-amd64)
# Problematic frame:
# C  [libtensorflow.so.2+0xa92240b]  tensorflow::grappler::OpLevelCostEstimator::ConvolutionDimensionsFromInputs(tensorflow::TensorShapeProto const&, tensorflow::TensorShapeProto const&, tensorflow::OpInfo const&, bool*)+0x2fb
#
# Core dump will be written. Default location: Core dumps may be processed with "/bin/false" (or dumping to /home/windymelt/src/github.com/windymelt/tensorflow-scala-exercice/core.10315)
#
# An error report file with more information is saved as:
# /home/windymelt/src/github.com/windymelt/tensorflow-scala-exercice/hs_err_pid10315.log
#
# If you would like to submit a bug report, please visit:
#   https://github.com/corretto/corretto-17/issues/
# The crash happened outside the Java Virtual Machine in native code.
# See problematic frame for where to report the bug.
#
[1]    10315 IOT instruction (core dumped)  sbt run