apple / tensorflow_macos

TensorFlow for macOS 11.0+ accelerated using Apple's ML Compute framework.
Other
3.67k stars 310 forks source link

TransposeMLCBytes nullptr error during training process #277

Open iRonJ opened 3 years ago

iRonJ commented 3 years ago
InternalError                             Traceback (most recent call last)
<ipython-input-4-0e66376570e5> in <module>
     15 
     16 # Train and evaluate using tf.keras.Model.fit()
---> 17 history = model.fit(train_dataset, epochs=2, steps_per_epoch=115,
     18                     validation_data=valid_dataset, validation_steps=7)
     19 

~/miniforge3/envs/tfm1/lib/python3.8/site-packages/tensorflow/python/keras/engine/training.py in fit(self, x, y, batch_size, epochs, verbose, callbacks, validation_split, validation_data, shuffle, class_weight, sample_weight, initial_epoch, steps_per_epoch, validation_steps, validation_batch_size, validation_freq, max_queue_size, workers, use_multiprocessing)
   1098                 _r=1):
   1099               callbacks.on_train_batch_begin(step)
-> 1100               tmp_logs = self.train_function(iterator)
   1101               if data_handler.should_sync:
   1102                 context.async_wait()

~/miniforge3/envs/tfm1/lib/python3.8/site-packages/tensorflow/python/eager/def_function.py in __call__(self, *args, **kwds)
    826     tracing_count = self.experimental_get_tracing_count()
    827     with trace.Trace(self._name) as tm:
--> 828       result = self._call(*args, **kwds)
    829       compiler = "xla" if self._experimental_compile else "nonXla"
    830       new_tracing_count = self.experimental_get_tracing_count()

~/miniforge3/envs/tfm1/lib/python3.8/site-packages/tensorflow/python/eager/def_function.py in _call(self, *args, **kwds)
    853       # In this case we have created variables on the first call, so we run the
    854       # defunned version which is guaranteed to never create variables.
--> 855       return self._stateless_fn(*args, **kwds)  # pylint: disable=not-callable
    856     elif self._stateful_fn is not None:
    857       # Release the lock early so that multiple threads can perform the call

~/miniforge3/envs/tfm1/lib/python3.8/site-packages/tensorflow/python/eager/function.py in __call__(self, *args, **kwargs)
   2940       (graph_function,
   2941        filtered_flat_args) = self._maybe_define_function(args, kwargs)
-> 2942     return graph_function._call_flat(
   2943         filtered_flat_args, captured_inputs=graph_function.captured_inputs)  # pylint: disable=protected-access
   2944 

~/miniforge3/envs/tfm1/lib/python3.8/site-packages/tensorflow/python/eager/function.py in _call_flat(self, args, captured_inputs, cancellation_manager)
   1916         and executing_eagerly):
   1917       # No tape is watching; skip to running the function.
-> 1918       return self._build_call_outputs(self._inference_function.call(
   1919           ctx, args, cancellation_manager=cancellation_manager))
   1920     forward_backward = self._select_forward_and_backward_functions(

~/miniforge3/envs/tfm1/lib/python3.8/site-packages/tensorflow/python/eager/function.py in call(self, ctx, args, cancellation_manager)
    553       with _InterpolateFunctionError(self):
    554         if cancellation_manager is None:
--> 555           outputs = execute.execute(
    556               str(self.signature.name),
    557               num_outputs=self._num_outputs,

~/miniforge3/envs/tfm1/lib/python3.8/site-packages/tensorflow/python/eager/execute.py in quick_execute(op_name, num_outputs, inputs, attrs, ctx, name)
     57   try:
     58     ctx.ensure_initialized()
---> 59     tensors = pywrap_tfe.TFE_Py_Execute(ctx._handle, device_name, op_name,
     60                                         inputs, attrs, num_outputs)
     61   except core._NotOkStatusException as e:

InternalError:  TransposeMLCBytes: Input bytes is nullptr.
     [[node gradient_tape/tf_bert_for_sequence_classification/bert/encoder/layer_._11/intermediate/dense/MLCMatMul_1 (defined at /Users/ronj/miniforge3/envs/tfm1/lib/python3.8/site-packages/tensorflow/compiler/tf2mlcompute/ops/gen_mlc_ops.py:5535) ]]
     [[MLCSubgraphOp_0_39]] [Op:__inference_train_function_36201]

Errors may have originated from an input operation.
Input Source operations connected to node gradient_tape/tf_bert_for_sequence_classification/bert/encoder/layer_._11/intermediate/dense/MLCMatMul_1:
 tf_bert_for_sequence_classification/bert/encoder/layer_._11/attention/output/LayerNorm/MLCLayerNorm (defined at /Users/ronj/miniforge3/envs/tfm1/lib/python3.8/site-packages/tensorflow/compiler/tf2mlcompute/ops/gen_mlc_ops.py:4934) 
 Adam/gradients/AddN (defined at <ipython-input-4-0e66376570e5>:17)

This error occurs about two-thirds of the way through one epoch. I was trying to do a quick evaluation of TensorFlow for M1 by following the "Quick tour TF 2.0 training and PyTorch interoperability" section of this page: https://github.com/lcskrishna/transformers

Since the error happened in the middle of an epoch, I'm not sure why there would suddenly be null values here. I'll try to do some more testing around this.

matyasbohacek commented 3 years ago

Same problem here