Open miguelgfierro opened 2 years ago
The notebook https://github.com/microsoft/recommenders/blob/main/examples/02_model_collaborative_filtering/standard_vae_deep_dive.ipynb has multiple errors:
> raise error
E papermill.exceptions.PapermillExecutionError:
E ---------------------------------------------------------------------------
E Exception encountered at "In [2]":
E ---------------------------------------------------------------------------
E TypeError Traceback (most recent call last)
E /tmp/ipykernel_27602/2176430598.py in <module>
E 14 # temporary Path to save the optimal model's weights
E 15 tmp_dir = TemporaryDirectory()
E ---> 16 WEIGHTS_PATH = os.path.join(tmp_dir, "svae_weights.hdf5")
E 17
E 18 SEED = 98765
E
E /anaconda/envs/reco_gpu/lib/python3.7/posixpath.py in join(a, *p)
E 78 will be discarded. An empty last part will result in a path that
E 79 ends with a separator."""
E ---> 80 a = os.fspath(a)
E 81 sep = _get_sep(a)
E 82 path = a
E
E TypeError: expected str, bytes or os.PathLike object, not TemporaryDirectory
/anaconda/envs/reco_gpu/lib/python3.7/site-packages/papermill/execute.py:234: PapermillExecutionError
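The `TypeError` above is raised because `os.path.join` receives the `TemporaryDirectory` object itself instead of its path string (`tmp_dir.name`). In addition, the weights are not downloaded. We need to rerun the notebook with the new version of TF.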
TensorFlow version:
$ pip list | grep tensorflow
tensorflow 2.7.1
tensorflow-estimator 2.7.0
tensorflow-io-gcs-filesystem 0.24.0
Execute the notebook via papermill:
$ git checkout miguel/missing
$ pytest tests/integration/examples/test_notebooks_gpu.py::test_standard_vae_deep_dive_integration
===================================================================================== FAILURES ======================================================================================
_________________________________________________________ test_standard_vae_deep_dive_integration[1m-100-expected_values0] __________________________________________________________
notebooks = {'als_deep_dive': '/home/hoaphumanoid/notebooks/recommenders/examples/02_model_collaborative_filtering/als_deep_dive.i.../home/hoaphumanoid/notebooks/recommenders/examples/02_model_collaborative_filtering/cornac_bivae_deep_dive.ipynb', ...}
output_notebook = 'output.ipynb', kernel_name = 'python3', size = '1m', epochs = 100
expected_values = {'eval_map_2': 0.138111, 'eval_map_4': 0.171624, 'eval_ndcg_2': 0.392379, 'eval_ndcg_4': 0.443328, ...}
@pytest.mark.gpu
@pytest.mark.integration
@pytest.mark.parametrize(
"size, epochs, expected_values",
[
(
"1m",
100,
dict(
eval_map_2=0.138111,
eval_ndcg_2=0.392379,
eval_precision_2=0.231383,
eval_recall_2=0.354346,
eval_map_4=0.171624,
eval_ndcg_4=0.443328,
eval_precision_4=0.251867,
eval_recall_4=0.409650,
),
),
],
)
def test_standard_vae_deep_dive_integration(
notebooks, output_notebook, kernel_name, size, epochs, expected_values
):
notebook_path = notebooks["standard_vae_deep_dive"]
pm.execute_notebook(
notebook_path,
output_notebook,
kernel_name=kernel_name,
> parameters=dict(MOVIELENS_DATA_SIZE=size, EPOCHS=epochs),
)
tests/integration/examples/test_notebooks_gpu.py:743:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
/anaconda/envs/reco_gpu/lib/python3.7/site-packages/papermill/execute.py:122: in execute_notebook
raise_for_execution_errors(nb, output_path)
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
nb = {'cells': [{'id': '30e12c14', 'cell_type': 'markdown', 'source': '<span style="color:red; font-family:Helvetica Neue, ...nd_time': '2022-04-11T09:21:34.449190', 'duration': 54.905805, 'exception': True}}, 'nbformat': 4, 'nbformat_minor': 5}
output_path = 'output.ipynb'
def raise_for_execution_errors(nb, output_path):
"""Assigned parameters into the appropriate place in the input notebook
Parameters
----------
nb : NotebookNode
Executable notebook object
output_path : str
Path to write executed notebook
"""
error = None
for index, cell in enumerate(nb.cells):
if cell.get("outputs") is None:
continue
for output in cell.outputs:
if output.output_type == "error":
if output.ename == "SystemExit" and (output.evalue == "" or output.evalue == "0"):
continue
error = PapermillExecutionError(
cell_index=index,
exec_count=cell.execution_count,
source=cell.source,
ename=output.ename,
evalue=output.evalue,
traceback=output.traceback,
)
break
if error:
# Write notebook back out with the Error Message at the top of the Notebook, and a link to
# the relevant cell (by adding a note just before the failure with an HTML anchor)
error_msg = ERROR_MESSAGE_TEMPLATE % str(error.exec_count)
error_msg_cell = nbformat.v4.new_markdown_cell(error_msg)
error_msg_cell.metadata['tags'] = [ERROR_MARKER_TAG]
error_anchor_cell = nbformat.v4.new_markdown_cell(ERROR_ANCHOR_MSG)
error_anchor_cell.metadata['tags'] = [ERROR_MARKER_TAG]
# put the anchor before the cell with the error, before all the indices change due to the
# heading-prepending
nb.cells.insert(error.cell_index, error_anchor_cell)
nb.cells.insert(0, error_msg_cell)
write_ipynb(nb, output_path)
> raise error
E papermill.exceptions.PapermillExecutionError:
E ---------------------------------------------------------------------------
E Exception encountered at "In [25]":
E ---------------------------------------------------------------------------
E TypeError Traceback (most recent call last)
E /tmp/ipykernel_21487/2395358905.py in <module>
E 4 x_val_tr=val_data_tr,
E 5 x_val_te=val_data_te_ratings, # with the original ratings
E ----> 6 mapper=am_val
E 7 )
E 8 print("Took {} seconds for training.".format(t))
E
E ~/notebooks/recommenders/recommenders/models/vae/standard_vae.py in fit(self, x_train, x_valid, x_val_tr, x_val_te, mapper)
E 406 verbose=self.verbose,
E 407 callbacks=[metrics, history, self.reduce_lr],
E --> 408 validation_data=(x_valid, x_valid),
E 409 )
E 410
E
E /anaconda/envs/reco_gpu/lib/python3.7/site-packages/keras/engine/training.py in fit_generator(self, generator, steps_per_epoch, epochs, verbose, callbacks, validation_data, validation_steps, validation_freq, class_weight, max_queue_size, workers, use_multiprocessing, shuffle, initial_epoch)
E 2028 use_multiprocessing=use_multiprocessing,
E 2029 shuffle=shuffle,
E -> 2030 initial_epoch=initial_epoch)
E 2031
E 2032 @doc_controls.do_not_generate_docs
E
E /anaconda/envs/reco_gpu/lib/python3.7/site-packages/keras/utils/traceback_utils.py in error_handler(*args, **kwargs)
E 65 except Exception as e: # pylint: disable=broad-except
E 66 filtered_tb = _process_traceback_frames(e.__traceback__)
E ---> 67 raise e.with_traceback(filtered_tb) from None
E 68 finally:
E 69 del filtered_tb
E
E /anaconda/envs/reco_gpu/lib/python3.7/site-packages/tensorflow/python/framework/func_graph.py in autograph_handler(*args, **kwargs)
E 1127 except Exception as e: # pylint:disable=broad-except
E 1128 if hasattr(e, "ag_error_metadata"):
E -> 1129 raise e.ag_error_metadata.to_exception(e)
E 1130 else:
E 1131 raise
E
E TypeError: in user code:
E
E File "/anaconda/envs/reco_gpu/lib/python3.7/site-packages/keras/engine/training.py", line 878, in train_function *
E return step_function(self, iterator)
E File "/anaconda/envs/reco_gpu/lib/python3.7/site-packages/keras/engine/training.py", line 867, in step_function **
E outputs = model.distribute_strategy.run(run_step, args=(data,))
E File "/anaconda/envs/reco_gpu/lib/python3.7/site-packages/keras/engine/training.py", line 860, in run_step **
E outputs = model.train_step(data)
E File "/anaconda/envs/reco_gpu/lib/python3.7/site-packages/keras/engine/training.py", line 810, in train_step
E y, y_pred, sample_weight, regularization_losses=self.losses)
E File "/anaconda/envs/reco_gpu/lib/python3.7/site-packages/keras/engine/compile_utils.py", line 240, in __call__
E total_loss_metric_value, sample_weight=batch_dim)
E File "/anaconda/envs/reco_gpu/lib/python3.7/site-packages/keras/utils/metrics_utils.py", line 73, in decorated
E update_op = update_state_fn(*args, **kwargs)
E File "/anaconda/envs/reco_gpu/lib/python3.7/site-packages/keras/metrics.py", line 177, in update_state_fn
E return ag_update_state(*args, **kwargs)
E File "/anaconda/envs/reco_gpu/lib/python3.7/site-packages/keras/metrics.py", line 452, in update_state **
E sample_weight, values)
E File "/anaconda/envs/reco_gpu/lib/python3.7/site-packages/keras/engine/keras_tensor.py", line 256, in __array__
E f'You are passing {self}, an intermediate Keras symbolic input/output, '
E
E TypeError: You are passing KerasTensor(type_spec=TensorSpec(shape=(), dtype=tf.float32, name=None), name='Placeholder:0', description="created by layer 'tf.cast_4'"), an intermediate Keras symbolic input/output, to a TF API that does not allow registering custom dispatchers, such as `tf.cond`, `tf.function`, gradient tapes, or `tf.map_fn`. Keras Functional model construction only supports TF API calls that *do* support dispatching, such as `tf.math.add` or `tf.reshape`. Other APIs cannot be called directly on symbolic Kerasinputs/outputs. You can work around this limitation by putting the operation in a custom Keras layer `call` and calling that layer on this symbolic input/output.
Trying again with the original TF version that the notebook was written for:
$ pip list | grep tensorflow
tensorflow 2.2.0rc1
tensorflow-estimator 2.2.0
tensorflow-io-gcs-filesystem 0.24.0
$ pip list | grep Keras
Keras 2.3.1
Keras-Applications 1.0.8
Keras-Preprocessing 1.1.2
Execute the notebook via papermill:
$ git checkout miguel/missing
$ pytest tests/integration/examples/test_notebooks_gpu.py::test_standard_vae_deep_dive_integration
/anaconda/envs/reco_gpu/lib/python3.7/site-packages/papermill/execute.py:122: in execute_notebook
raise_for_execution_errors(nb, output_path)
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
nb = {'cells': [{'id': '32477e14', 'cell_type': 'markdown', 'source': '<span style="color:red; font-family:Helvetica Neue, ...nd_time': '2022-04-11T09:44:58.317346', 'duration': 55.282067, 'exception': True}}, 'nbformat': 4, 'nbformat_minor': 5}
output_path = 'output.ipynb'
def raise_for_execution_errors(nb, output_path):
"""Assigned parameters into the appropriate place in the input notebook
Parameters
----------
nb : NotebookNode
Executable notebook object
output_path : str
Path to write executed notebook
"""
error = None
for index, cell in enumerate(nb.cells):
if cell.get("outputs") is None:
continue
for output in cell.outputs:
if output.output_type == "error":
if output.ename == "SystemExit" and (output.evalue == "" or output.evalue == "0"):
continue
error = PapermillExecutionError(
cell_index=index,
exec_count=cell.execution_count,
source=cell.source,
ename=output.ename,
evalue=output.evalue,
traceback=output.traceback,
)
break
if error:
# Write notebook back out with the Error Message at the top of the Notebook, and a link to
# the relevant cell (by adding a note just before the failure with an HTML anchor)
error_msg = ERROR_MESSAGE_TEMPLATE % str(error.exec_count)
error_msg_cell = nbformat.v4.new_markdown_cell(error_msg)
error_msg_cell.metadata['tags'] = [ERROR_MARKER_TAG]
error_anchor_cell = nbformat.v4.new_markdown_cell(ERROR_ANCHOR_MSG)
error_anchor_cell.metadata['tags'] = [ERROR_MARKER_TAG]
# put the anchor before the cell with the error, before all the indices change due to the
# heading-prepending
nb.cells.insert(error.cell_index, error_anchor_cell)
nb.cells.insert(0, error_msg_cell)
write_ipynb(nb, output_path)
> raise error
E papermill.exceptions.PapermillExecutionError:
E ---------------------------------------------------------------------------
E Exception encountered at "In [25]":
E ---------------------------------------------------------------------------
E TypeError Traceback (most recent call last)
E /anaconda/envs/reco_gpu/lib/python3.7/site-packages/tensorflow/python/eager/execute.py in quick_execute(op_name, num_outputs, inputs, attrs, ctx, name)
E 59 tensors = pywrap_tfe.TFE_Py_Execute(ctx._handle, device_name, op_name,
E ---> 60 inputs, attrs, num_outputs)
E 61 except core._NotOkStatusException as e:
E
E TypeError: An op outside of the function building code is being passed
E a "Graph" tensor. It is possible to have Graph tensors
E leak out of the function building context by including a
E tf.init_scope in your function building code.
E For example, the following function will fail:
E @tf.function
E def has_init_scope():
E my_constant = tf.constant(1.)
E with tf.init_scope():
E added = my_constant * 2
E The graph tensor has name: dense_2/Identity:0
E
E During handling of the above exception, another exception occurred:
E
E _SymbolicException Traceback (most recent call last)
E /tmp/ipykernel_3741/2395358905.py in <module>
E 4 x_val_tr=val_data_tr,
E 5 x_val_te=val_data_te_ratings, # with the original ratings
E ----> 6 mapper=am_val
E 7 )
E 8 print("Took {} seconds for training.".format(t))
E
E ~/notebooks/recommenders/recommenders/models/vae/standard_vae.py in fit(self, x_train, x_valid, x_val_tr, x_val_te, mapper)
E 406 verbose=self.verbose,
E 407 callbacks=[metrics, history, self.reduce_lr],
E --> 408 validation_data=(x_valid, x_valid),
E 409 )
E 410
E
E /anaconda/envs/reco_gpu/lib/python3.7/site-packages/tensorflow/python/util/deprecation.py in new_func(*args, **kwargs)
E 322 'in a future version' if date is None else ('after %s' % date),
E 323 instructions)
E --> 324 return func(*args, **kwargs)
E 325 return tf_decorator.make_decorator(
E 326 func, new_func, 'deprecated',
E
E /anaconda/envs/reco_gpu/lib/python3.7/site-packages/tensorflow/python/keras/engine/training.py in fit_generator(self, generator, steps_per_epoch, epochs, verbose, callbacks, validation_data, validation_steps, validation_freq, class_weight, max_queue_size, workers, use_multiprocessing, shuffle, initial_epoch)
E 1412 use_multiprocessing=use_multiprocessing,
E 1413 shuffle=shuffle,
E -> 1414 initial_epoch=initial_epoch)
E 1415
E 1416 @deprecation.deprecated(
E
E /anaconda/envs/reco_gpu/lib/python3.7/site-packages/tensorflow/python/keras/engine/training.py in _method_wrapper(self, *args, **kwargs)
E 63 def _method_wrapper(self, *args, **kwargs):
E 64 if not self._in_multi_worker_mode(): # pylint: disable=protected-access
E ---> 65 return method(self, *args, **kwargs)
E 66
E 67 # Running inside `run_distribute_coordinator` already.
E
E /anaconda/envs/reco_gpu/lib/python3.7/site-packages/tensorflow/python/keras/engine/training.py in fit(self, x, y, batch_size, epochs, verbose, callbacks, validation_split, validation_data, shuffle, class_weight, sample_weight, initial_epoch, steps_per_epoch, validation_steps, validation_batch_size, validation_freq, max_queue_size, workers, use_multiprocessing, **kwargs)
E 781 batch_size=batch_size):
E 782 callbacks.on_train_batch_begin(step)
E --> 783 tmp_logs = train_function(iterator)
E 784 # Catch OutOfRangeError for Datasets of unknown size.
E 785 # This blocks until the batch has finished executing.
E
E /anaconda/envs/reco_gpu/lib/python3.7/site-packages/tensorflow/python/eager/def_function.py in __call__(self, *args, **kwds)
E 578 xla_context.Exit()
E 579 else:
E --> 580 result = self._call(*args, **kwds)
E 581
E 582 if tracing_count == self._get_tracing_count():
E
E /anaconda/envs/reco_gpu/lib/python3.7/site-packages/tensorflow/python/eager/def_function.py in _call(self, *args, **kwds)
E 642 # Lifting succeeded, so variables are initialized and we can run the
E 643 # stateless function.
E --> 644 return self._stateless_fn(*args, **kwds)
E 645 else:
E 646 canon_args, canon_kwds = \
E
E /anaconda/envs/reco_gpu/lib/python3.7/site-packages/tensorflow/python/eager/function.py in __call__(self, *args, **kwargs)
E 2418 with self._lock:
E 2419 graph_function, args, kwargs = self._maybe_define_function(args, kwargs)
E -> 2420 return graph_function._filtered_call(args, kwargs) # pylint: disable=protected-access
E 2421
E 2422 @property
E
E /anaconda/envs/reco_gpu/lib/python3.7/site-packages/tensorflow/python/eager/function.py in _filtered_call(self, args, kwargs)
E 1663 if isinstance(t, (ops.Tensor,
E 1664 resource_variable_ops.BaseResourceVariable))),
E -> 1665 self.captured_inputs)
E 1666
E 1667 def _call_flat(self, args, captured_inputs, cancellation_manager=None):
E
E /anaconda/envs/reco_gpu/lib/python3.7/site-packages/tensorflow/python/eager/function.py in _call_flat(self, args, captured_inputs, cancellation_manager)
E 1744 # No tape is watching; skip to running the function.
E 1745 return self._build_call_outputs(self._inference_function.call(
E -> 1746 ctx, args, cancellation_manager=cancellation_manager))
E 1747 forward_backward = self._select_forward_and_backward_functions(
E 1748 args,
E
E /anaconda/envs/reco_gpu/lib/python3.7/site-packages/tensorflow/python/eager/function.py in call(self, ctx, args, cancellation_manager)
E 596 inputs=args,
E 597 attrs=attrs,
E --> 598 ctx=ctx)
E 599 else:
E 600 outputs = execute.execute_with_cancellation(
E
E /anaconda/envs/reco_gpu/lib/python3.7/site-packages/tensorflow/python/eager/execute.py in quick_execute(op_name, num_outputs, inputs, attrs, ctx, name)
E 72 raise core._SymbolicException(
E 73 "Inputs to eager execution function cannot be Keras symbolic "
E ---> 74 "tensors, but found {}".format(keras_symbolic_tensors))
E 75 raise e
E 76 # pylint: enable=protected-access
E
E _SymbolicException: Inputs to eager execution function cannot be Keras symbolic tensors, but found [<tf.Tensor 'dense_2/Identity:0' shape=(None, 70) dtype=float32>, <tf.Tensor 'dense_1/Identity:0' shape=(None, 70) dtype=float32>]
/anaconda/envs/reco_gpu/lib/python3.7/site-packages/papermill/execute.py:234: PapermillExecutionError
From Andreas: One thing I noticed in this code (https://github.com/microsoft/recommenders/blob/main/recommenders/models/vae/standard_vae.py) is that it uses methods from `keras.backend` in several places. These methods are not available in the new API (https://www.tensorflow.org/api_docs/python/tf/keras/backend), but they are available in the old one (https://www.tensorflow.org/versions/r1.15/api_docs/python/tf/keras/backend).
So, one thing to try is to replace these references with equivalents that use `tf.compat.v1.keras.backend`.
Description
https://github.com/microsoft/recommenders/search?q=vae
FYI @anargyri @pradnyeshjoshi
In which platform does it happen?
How do we replicate the issue?
Expected behavior (i.e. solution)
Other Comments