NVIDIA-Merlin / models

Merlin Models is a collection of deep learning recommender system model reference implementations
https://nvidia-merlin.github.io/models/main/index.html
Apache License 2.0
262 stars 50 forks source link

[BUG] Getting error when serving a reloaded session-based model on Triton #1217

Closed: rnyak closed this issue 11 months ago

rnyak commented 11 months ago

Bug description

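I am getting the following error when I try to serve a reloaded model (one that was saved and then loaded back) on Triton. The error is raised while constructing `PredictTensorflow(reloaded_model)`, before the ensemble is exported.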

ValueError                                Traceback (most recent call last)
Cell In[7], line 12
      6 from merlin.systems.dag.ops.workflow import TransformWorkflow
      9 wf = Workflow.load(os.path.join(DATA_FOLDER, "workflow_etl"))
---> 12 inf_ops = wf.input_schema.column_names >> TransformWorkflow(wf) >> PredictTensorflow(reloaded_model)
     14 ensemble = Ensemble(inf_ops, wf.input_schema)
     15 ensemble.export(os.path.join('/workspace/', 'ensemble_reloaded'))

File /usr/local/lib/python3.10/dist-packages/merlin/systems/dag/ops/tensorflow.py:57, in PredictTensorflow.__init__(self, model_or_path, custom_objects)
     54     self.path = None
     55     self.model = model_or_path
---> 57 self.input_schema, self.output_schema = _construct_schemas_from_model(self.model)

File /usr/local/lib/python3.10/dist-packages/merlin/systems/dag/ops/tensorflow.py:144, in _construct_schemas_from_model(model, signature_name, tag_set)
    142 # save to disk to generate signature from saved model
    143 with tempfile.TemporaryDirectory() as saved_model_dir:
--> 144     tf.saved_model.save(model, saved_model_dir)
    145     meta_graph_def = saved_model_utils.get_meta_graph_def(saved_model_dir, tag_set)
    146     signature_def = meta_graph_def.signature_def[signature_name]

File /usr/local/lib/python3.10/dist-packages/tensorflow/python/saved_model/save.py:1240, in save(obj, export_dir, signatures, options)
   1238 # pylint: enable=line-too-long
   1239 metrics.IncrementWriteApi(_SAVE_V2_LABEL)
-> 1240 save_and_return_nodes(obj, export_dir, signatures, options)
   1242 metrics.IncrementWrite(write_version="2")

File /usr/local/lib/python3.10/dist-packages/tensorflow/python/saved_model/save.py:1276, in save_and_return_nodes(obj, export_dir, signatures, options, experimental_skip_checkpoint)
   1272 saved_model = saved_model_pb2.SavedModel()
   1273 meta_graph_def = saved_model.meta_graphs.add()
   1275 _, exported_graph, object_saver, asset_info, saved_nodes, node_paths = (
-> 1276     _build_meta_graph(obj, signatures, options, meta_graph_def))
   1277 saved_model.saved_model_schema_version = (
   1278     constants.SAVED_MODEL_SCHEMA_VERSION)
   1280 # Write the checkpoint, copy assets into the assets directory, and write out
   1281 # the SavedModel proto itself.

File /usr/local/lib/python3.10/dist-packages/tensorflow/python/saved_model/save.py:1455, in _build_meta_graph(obj, signatures, options, meta_graph_def)
   1428 """Creates a MetaGraph under a save context.
   1429 
   1430 Args:
   (...)
   1451   saveable_view.node_paths: _SaveableView paths.
   1452 """
   1454 with save_context.save_context(options):
-> 1455   return _build_meta_graph_impl(obj, signatures, options, meta_graph_def)

File /usr/local/lib/python3.10/dist-packages/tensorflow/python/saved_model/save.py:1398, in _build_meta_graph_impl(obj, signatures, options, meta_graph_def)
   1396 augmented_graph_view = _AugmentedGraphView(obj)
   1397 if signatures is None:
-> 1398   signatures = signature_serialization.find_function_to_export(
   1399       augmented_graph_view)
   1401 signatures, wrapped_functions = (
   1402     signature_serialization.canonicalize_signatures(signatures))
   1403 signature_serialization.validate_augmented_graph_view(augmented_graph_view)

File /usr/local/lib/python3.10/dist-packages/tensorflow/python/saved_model/signature_serialization.py:103, in find_function_to_export(saveable_view)
     99 # TODO(b/205014194): Discuss removing this behaviour. It can lead to WTFs when
    100 # a user decides to annotate more functions with tf.function and suddenly
    101 # serving that model way later in the process stops working.
    102 possible_signatures = []
--> 103 for name, child in children:
    104   if not isinstance(child, (def_function.Function, defun.ConcreteFunction)):
    105     continue

File /usr/local/lib/python3.10/dist-packages/tensorflow/python/saved_model/save.py:180, in _AugmentedGraphView.list_children(self, obj)
    177 if obj not in self._children_cache:
    178   children = self._children_cache[obj] = {}
--> 180   for name, child in super(_AugmentedGraphView, self).list_children(
    181       obj,
    182       save_type=base.SaveType.SAVEDMODEL,
    183       cache=self._serialization_cache):
    184     if isinstance(child, defun.ConcreteFunction):
    185       child = self._maybe_uncache_variable_captures(child)

File /usr/local/lib/python3.10/dist-packages/tensorflow/python/checkpoint/graph_view.py:75, in ObjectGraphView.list_children(self, obj, save_type, **kwargs)
     64 """Returns list of all child trackables attached to obj.
     65 
     66 Args:
   (...)
     72   List of all children attached to the object.
     73 """
     74 children = []
---> 75 for name, ref in super(ObjectGraphView,
     76                        self).children(obj, save_type, **kwargs).items():
     77   children.append(base.TrackableReference(name, ref))
     79 # GraphView objects may define children of the root object that are not
     80 # actually attached, e.g. a Checkpoint object's save_counter.

File /usr/local/lib/python3.10/dist-packages/tensorflow/python/checkpoint/trackable_view.py:84, in TrackableView.children(cls, obj, save_type, **kwargs)
     82 obj._maybe_initialize_trackable()
     83 children = {}
---> 84 for name, ref in obj._trackable_children(save_type, **kwargs).items():
     85   ref = converter.convert_to_trackable(ref, parent=obj)
     86   children[name] = ref

File /usr/local/lib/python3.10/dist-packages/keras/engine/training.py:3733, in Model._trackable_children(self, save_type, **kwargs)
   3730     self.predict_function = None
   3731     self.train_tf_function = None
-> 3733 children = super()._trackable_children(save_type, **kwargs)
   3735 if save_type == "savedmodel":
   3736     self.train_function = train_function

File /usr/local/lib/python3.10/dist-packages/keras/engine/base_layer.py:3466, in Layer._trackable_children(self, save_type, **kwargs)
   3462     cache = kwargs["cache"]
   3463     # TODO(b/213628533): This must be called before super() to ensure
   3464     # that any input shape changes are applied before getting the config
   3465     # of the model.
-> 3466     children = self._trackable_saved_model_saver.trackable_children(
   3467         cache
   3468     )
   3469 else:
   3470     children = {}

File /usr/local/lib/python3.10/dist-packages/keras/saving/legacy/saved_model/base_serialization.py:61, in SavedModelSaver.trackable_children(self, serialization_cache)
     58 if not utils.should_save_traces():
     59     return {}
---> 61 children = self.objects_to_serialize(serialization_cache)
     62 children.update(self.functions_to_serialize(serialization_cache))
     63 return children

File /usr/local/lib/python3.10/dist-packages/keras/saving/legacy/saved_model/layer_serialization.py:79, in LayerSavedModelSaver.objects_to_serialize(self, serialization_cache)
     78 def objects_to_serialize(self, serialization_cache):
---> 79     return self._get_serialized_attributes(
     80         serialization_cache
     81     ).objects_to_serialize

File /usr/local/lib/python3.10/dist-packages/keras/saving/legacy/saved_model/layer_serialization.py:106, in LayerSavedModelSaver._get_serialized_attributes(self, serialization_cache)
    100 if (
    101     save_impl.should_skip_serialization(self.obj)
    102     or self.obj._must_restore_from_config
    103 ):
    104     return serialized_attr
--> 106 object_dict, function_dict = self._get_serialized_attributes_internal(
    107     serialization_cache
    108 )
    110 serialized_attr.set_and_validate_objects(object_dict)
    111 serialized_attr.set_and_validate_functions(function_dict)

File /usr/local/lib/python3.10/dist-packages/keras/saving/legacy/saved_model/model_serialization.py:53, in ModelSavedModelSaver._get_serialized_attributes_internal(self, serialization_cache)
     50 # Create a default signature function if this is the only object in the
     51 # cache (i.e. this is the root level object).
     52 if len(serialization_cache[constants.KERAS_CACHE_KEY]) == 1:
---> 53     default_signature = save_impl.default_save_signature(self.obj)
     55 # Other than the default signature function, all other attributes match
     56 # with the ones serialized by Layer.
     57 objects, functions = super()._get_serialized_attributes_internal(
     58     serialization_cache
     59 )

File /usr/local/lib/python3.10/dist-packages/keras/saving/legacy/saved_model/save_impl.py:234, in default_save_signature(layer)
    232 def default_save_signature(layer):
    233     original_losses = _reset_layer_losses(layer)
--> 234     fn = saving_utils.trace_model_call(layer)
    235     _restore_layer_losses(original_losses)
    236     return fn

File /usr/local/lib/python3.10/dist-packages/keras/saving/legacy/saving_utils.py:158, in trace_model_call(model, input_signature)
    155     outputs = tf.nest.flatten(outputs)
    156     return {name: output for name, output in zip(output_names, outputs)}
--> 158 return _wrapped_model.get_concrete_function(*model_args, **model_kwargs)

File /usr/local/lib/python3.10/dist-packages/tensorflow/python/eager/polymorphic_function/polymorphic_function.py:1258, in Function.get_concrete_function(self, *args, **kwargs)
   1256 def get_concrete_function(self, *args, **kwargs):
   1257   # Implements GenericFunction.get_concrete_function.
-> 1258   concrete = self._get_concrete_function_garbage_collected(*args, **kwargs)
   1259   concrete._garbage_collector.release()  # pylint: disable=protected-access
   1260   return concrete

File /usr/local/lib/python3.10/dist-packages/tensorflow/python/eager/polymorphic_function/polymorphic_function.py:1238, in Function._get_concrete_function_garbage_collected(self, *args, **kwargs)
   1236   if self._variable_creation_fn is None:
   1237     initializers = []
-> 1238     self._initialize(args, kwargs, add_initializers_to=initializers)
   1239     self._initialize_uninitialized_variables(initializers)
   1241 if self._created_variables:
   1242   # In this case we have created variables on the first call, so we run the
   1243   # version which is guaranteed to never create variables.

File /usr/local/lib/python3.10/dist-packages/tensorflow/python/eager/polymorphic_function/polymorphic_function.py:763, in Function._initialize(self, args, kwds, add_initializers_to)
    760 self._lifted_initializer_graph = lifted_initializer_graph
    761 self._graph_deleter = FunctionDeleter(self._lifted_initializer_graph)
    762 self._concrete_variable_creation_fn = (
--> 763     self._variable_creation_fn    # pylint: disable=protected-access
    764     ._get_concrete_function_internal_garbage_collected(
    765         *args, **kwds))
    767 def invalid_creator_scope(*unused_args, **unused_kwds):
    768   """Disables variable creation."""

File /usr/local/lib/python3.10/dist-packages/tensorflow/python/eager/polymorphic_function/tracing_compiler.py:171, in TracingCompiler._get_concrete_function_internal_garbage_collected(self, *args, **kwargs)
    169 """Returns a concrete function which cleans up its graph function."""
    170 with self._lock:
--> 171   concrete_function, _ = self._maybe_define_concrete_function(args, kwargs)
    172 return concrete_function

File /usr/local/lib/python3.10/dist-packages/tensorflow/python/eager/polymorphic_function/tracing_compiler.py:166, in TracingCompiler._maybe_define_concrete_function(self, args, kwargs)
    163   args = self.input_signature
    164   kwargs = {}
--> 166 return self._maybe_define_function(args, kwargs)

File /usr/local/lib/python3.10/dist-packages/tensorflow/python/eager/polymorphic_function/tracing_compiler.py:396, in TracingCompiler._maybe_define_function(self, args, kwargs)
    393   args = placeholder_bound_args.args
    394 kwargs = placeholder_bound_args.kwargs
--> 396 concrete_function = self._create_concrete_function(
    397     args, kwargs, func_graph)
    399 # TODO(b/263520817): Remove access to private attribute.
    400 graph_capture_container = concrete_function.graph._function_captures  # pylint: disable=protected-access

File /usr/local/lib/python3.10/dist-packages/tensorflow/python/eager/polymorphic_function/tracing_compiler.py:300, in TracingCompiler._create_concrete_function(self, args, kwargs, func_graph)
    296 else:
    297   arg_names = base_arg_names
    299 concrete_function = monomorphic_function.ConcreteFunction(
--> 300     func_graph_module.func_graph_from_py_func(
    301         self._name,
    302         self._python_function,
    303         args,
    304         kwargs,
    305         None,
    306         func_graph=func_graph,
    307         autograph=self._autograph,
    308         autograph_options=self._autograph_options,
    309         arg_names=arg_names,
    310         capture_by_value=self._capture_by_value,
    311         create_placeholders=False),
    312     self._function_attributes,
    313     spec=self.function_spec,
    314     # Tell the ConcreteFunction to clean up its graph once it goes out of
    315     # scope. This is not the default behavior since it gets used in some
    316     # places (like Keras) where the FuncGraph lives longer than the
    317     # ConcreteFunction.
    318     shared_func_graph=False)
    319 return concrete_function

File /usr/local/lib/python3.10/dist-packages/tensorflow/python/framework/func_graph.py:1214, in func_graph_from_py_func(name, python_func, args, kwargs, signature, func_graph, autograph, autograph_options, add_control_dependencies, arg_names, op_return_value, collections, capture_by_value, create_placeholders, acd_record_initial_resource_uses)
   1211 else:
   1212   _, original_func = tf_decorator.unwrap(python_func)
-> 1214 func_outputs = python_func(*func_args, **func_kwargs)
   1216 # invariant: `func_outputs` contains only Tensors, CompositeTensors,
   1217 # TensorArrays and `None`s.
   1218 func_outputs = variable_utils.convert_variables_to_tensors(func_outputs)

File /usr/local/lib/python3.10/dist-packages/tensorflow/python/eager/polymorphic_function/polymorphic_function.py:667, in Function._compiler_with_scope.<locals>.wrapped_fn(*args, **kwds)
    663 with default_graph._variable_creator_scope(scope, priority=50):  # pylint: disable=protected-access
    664   # __wrapped__ allows AutoGraph to swap in a converted function. We give
    665   # the function a weak reference to itself to avoid a reference cycle.
    666   with OptionalXlaContext(compile_with_xla):
--> 667     out = weak_wrapped_fn().__wrapped__(*args, **kwds)
    668   return out

File /usr/local/lib/python3.10/dist-packages/tensorflow/python/framework/func_graph.py:1200, in func_graph_from_py_func.<locals>.autograph_handler(*args, **kwargs)
   1198 except Exception as e:  # pylint:disable=broad-except
   1199   if hasattr(e, "ag_error_metadata"):
-> 1200     raise e.ag_error_metadata.to_exception(e)
   1201   else:
   1202     raise

File /usr/local/lib/python3.10/dist-packages/tensorflow/python/framework/func_graph.py:1189, in func_graph_from_py_func.<locals>.autograph_handler(*args, **kwargs)
   1187 # TODO(mdan): Push this block higher in tf.function's call stack.
   1188 try:
-> 1189   return autograph.converted_call(
   1190       original_func,
   1191       args,
   1192       kwargs,
   1193       options=autograph.ConversionOptions(
   1194           recursive=True,
   1195           optional_features=autograph_options,
   1196           user_requested=True,
   1197       ))
   1198 except Exception as e:  # pylint:disable=broad-except
   1199   if hasattr(e, "ag_error_metadata"):

File /usr/local/lib/python3.10/dist-packages/tensorflow/python/autograph/impl/api.py:439, in converted_call(f, args, kwargs, caller_fn_scope, options)
    437 try:
    438   if kwargs is not None:
--> 439     result = converted_f(*effective_args, **kwargs)
    440   else:
    441     result = converted_f(*effective_args)

File /tmp/__autograph_generated_file4rb9ekl5.py:14, in outer_factory.<locals>.inner_factory.<locals>.tf___wrapped_model(*args, **kwargs)
     12 (args, kwargs) = ag__.converted_call(ag__.ld(model)._call_spec.set_arg_value, ('training', False, ag__.ld(args), ag__.ld(kwargs)), dict(inputs_in_args=True), fscope)
     13 with ag__.ld(base_layer_utils).call_context().enter(ag__.ld(model), inputs=None, build_graph=False, training=False, saving=True):
---> 14     outputs = ag__.converted_call(ag__.ld(model), tuple(ag__.ld(args)), dict(**ag__.ld(kwargs)), fscope)
     15 output_names = ag__.ld(model).output_names
     17 def get_state():

File /usr/local/lib/python3.10/dist-packages/tensorflow/python/autograph/impl/api.py:377, in converted_call(f, args, kwargs, caller_fn_scope, options)
    374   return _call_unconverted(f, args, kwargs, options)
    376 if not options.user_requested and conversion.is_allowlisted(f):
--> 377   return _call_unconverted(f, args, kwargs, options)
    379 # internal_convert_user_code is for example turned off when issuing a dynamic
    380 # call conversion from generated code while in nonrecursive mode. In that
    381 # case we evidently don't want to recurse, but we still have to convert
    382 # things like builtins.
    383 if not options.internal_convert_user_code:

File /usr/local/lib/python3.10/dist-packages/tensorflow/python/autograph/impl/api.py:458, in _call_unconverted(f, args, kwargs, options, update_cache)
    455   return f.__self__.call(args, kwargs)
    457 if kwargs is not None:
--> 458   return f(*args, **kwargs)
    459 return f(*args)

File /usr/local/lib/python3.10/dist-packages/keras/utils/traceback_utils.py:70, in filter_traceback.<locals>.error_handler(*args, **kwargs)
     67     filtered_tb = _process_traceback_frames(e.__traceback__)
     68     # To get the full stack trace, call:
     69     # `tf.debugging.disable_traceback_filtering()`
---> 70     raise e.with_traceback(filtered_tb) from None
     71 finally:
     72     del filtered_tb

File /usr/local/lib/python3.10/dist-packages/tensorflow/python/saved_model/function_deserialization.py:295, in recreate_function.<locals>.restored_function_body(*args, **kwargs)
    291   positional, keyword = concrete_function.structured_input_signature
    292   signature_descriptions.append(
    293       "Option {}:\n  {}\n  Keyword arguments: {}".format(
    294           index + 1, _pretty_format_positional(positional), keyword))
--> 295 raise ValueError(
    296     "Could not find matching concrete function to call loaded from the "
    297     f"SavedModel. Got:\n  {_pretty_format_positional(args)}\n  Keyword "
    298     f"arguments: {kwargs}\n\n Expected these arguments to match one of the "
    299     f"following {len(saved_function.concrete_functions)} option(s):\n\n"
    300     f"{(chr(10)+chr(10)).join(signature_descriptions)}")

ValueError: in user code:

    File "/usr/local/lib/python3.10/dist-packages/keras/saving/legacy/saving_utils.py", line 147, in _wrapped_model  *
        outputs = model(*args, **kwargs)
    File "/usr/local/lib/python3.10/dist-packages/keras/utils/traceback_utils.py", line 70, in error_handler  **
        raise e.with_traceback(filtered_tb) from None

    ValueError: Could not find matching concrete function to call loaded from the SavedModel. Got:
      Positional arguments (5 total):
        * {'category-list__offsets': <tf.Tensor 'inputs:0' shape=(None,) dtype=int32>,
     'category-list__values': <tf.Tensor 'inputs_1:0' shape=(None,) dtype=int64>,
     'dayofweek-first': <tf.Tensor 'inputs_2:0' shape=(None,) dtype=int64>,
     'item_id-list__offsets': <tf.Tensor 'inputs_3:0' shape=(None,) dtype=int32>,
     'item_id-list__values': <tf.Tensor 'inputs_4:0' shape=(None,) dtype=int64>}
        * None
        * False
        * None
        * None
      Keyword arguments: {}

     Expected these arguments to match one of the following 4 option(s):

    Option 1:
      Positional arguments (5 total):
        * {'category-list__offsets': TensorSpec(shape=(None,), dtype=tf.int32, name='category-list__offsets'),
     'category-list__values': TensorSpec(shape=(None,), dtype=tf.int64, name='category-list__values'),
     'dayofweek-first': TensorSpec(shape=(None,), dtype=tf.int64, name='dayofweek-first'),
     'item_id-list__offsets': TensorSpec(shape=(None,), dtype=tf.int32, name='item_id-list__offsets'),
     'item_id-list__values': TensorSpec(shape=(None,), dtype=tf.int64, name='item_id-list__values')}
        * None
        * True
        * False
        * False
      Keyword arguments: {}

    Option 2:
      Positional arguments (5 total):
        * {'category-list__offsets': TensorSpec(shape=(None,), dtype=tf.int32, name='category-list__offsets'),
     'category-list__values': TensorSpec(shape=(None,), dtype=tf.int64, name='category-list__values'),
     'dayofweek-first': TensorSpec(shape=(None,), dtype=tf.int64, name='dayofweek-first'),
     'item_id-list__offsets': TensorSpec(shape=(None,), dtype=tf.int32, name='item_id-list__offsets'),
     'item_id-list__values': TensorSpec(shape=(None,), dtype=tf.int64, name='item_id-list__values')}
        * None
        * False
        * False
        * False
      Keyword arguments: {}

    Option 3:
      Positional arguments (5 total):
        * {'category-list__offsets': TensorSpec(shape=(None,), dtype=tf.int32, name='inputs_category_list__offsets'),
     'category-list__values': TensorSpec(shape=(None,), dtype=tf.int64, name='inputs_category_list__values'),
     'dayofweek-first': TensorSpec(shape=(None,), dtype=tf.int64, name='inputs_dayofweek_first'),
     'item_id-list__offsets': TensorSpec(shape=(None,), dtype=tf.int32, name='inputs_item_id_list__offsets'),
     'item_id-list__values': TensorSpec(shape=(None,), dtype=tf.int64, name='inputs_item_id_list__values')}
        * None
        * True
        * False
        * False
      Keyword arguments: {}

    Option 4:
      Positional arguments (5 total):
        * {'category-list__offsets': TensorSpec(shape=(None,), dtype=tf.int32, name='inputs_category_list__offsets'),
     'category-list__values': TensorSpec(shape=(None,), dtype=tf.int64, name='inputs_category_list__values'),
     'dayofweek-first': TensorSpec(shape=(None,), dtype=tf.int64, name='inputs_dayofweek_first'),
     'item_id-list__offsets': TensorSpec(shape=(None,), dtype=tf.int32, name='inputs_item_id_list__offsets'),
     'item_id-list__values': TensorSpec(shape=(None,), dtype=tf.int64, name='inputs_item_id_list__values')}
        * None
        * False
        * False
        * False
      Keyword arguments: {}

Steps/Code to reproduce bug

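Please run the code in this gist first. Then save the trained model, load it back as `reloaded_model` (the loading call is not included in this report), and run the steps below: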

model_transformer.save('./saved_model')

import os
from nvtabular.workflow import Workflow

from merlin.systems.dag.ops.tensorflow import PredictTensorflow
from merlin.systems.dag.ensemble import Ensemble
from merlin.systems.dag.ops.workflow import TransformWorkflow

wf = Workflow.load(os.path.join(DATA_FOLDER, "workflow_etl"))
inf_ops = wf.input_schema.column_names >> TransformWorkflow(wf) >> PredictTensorflow(reloaded_model)
ensemble = Ensemble(inf_ops, wf.input_schema)
ensemble.export(os.path.join('/workspace/', 'ensemble_reloaded'))

Expected behavior

Environment details

Additional context