Closed rnyak closed 1 year ago
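I am getting the following error when I try to serve a reloaded model (saved with `model.save` and loaded back with `tf.keras.models.load_model`) on Triton: constructing `PredictTensorflow(reloaded_model)` raises a `ValueError`.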
ValueError Traceback (most recent call last) Cell In[7], line 12 6 from merlin.systems.dag.ops.workflow import TransformWorkflow 9 wf = Workflow.load(os.path.join(DATA_FOLDER, "workflow_etl")) ---> 12 inf_ops = wf.input_schema.column_names >> TransformWorkflow(wf) >> PredictTensorflow(reloaded_model) 14 ensemble = Ensemble(inf_ops, wf.input_schema) 15 ensemble.export(os.path.join('/workspace/', 'ensemble_reloaded')) File /usr/local/lib/python3.10/dist-packages/merlin/systems/dag/ops/tensorflow.py:57, in PredictTensorflow.__init__(self, model_or_path, custom_objects) 54 self.path = None 55 self.model = model_or_path ---> 57 self.input_schema, self.output_schema = _construct_schemas_from_model(self.model) File /usr/local/lib/python3.10/dist-packages/merlin/systems/dag/ops/tensorflow.py:144, in _construct_schemas_from_model(model, signature_name, tag_set) 142 # save to disk to generate signature from saved model 143 with tempfile.TemporaryDirectory() as saved_model_dir: --> 144 tf.saved_model.save(model, saved_model_dir) 145 meta_graph_def = saved_model_utils.get_meta_graph_def(saved_model_dir, tag_set) 146 signature_def = meta_graph_def.signature_def[signature_name] File /usr/local/lib/python3.10/dist-packages/tensorflow/python/saved_model/save.py:1240, in save(obj, export_dir, signatures, options) 1238 # pylint: enable=line-too-long 1239 metrics.IncrementWriteApi(_SAVE_V2_LABEL) -> 1240 save_and_return_nodes(obj, export_dir, signatures, options) 1242 metrics.IncrementWrite(write_version="2") File /usr/local/lib/python3.10/dist-packages/tensorflow/python/saved_model/save.py:1276, in save_and_return_nodes(obj, export_dir, signatures, options, experimental_skip_checkpoint) 1272 saved_model = saved_model_pb2.SavedModel() 1273 meta_graph_def = saved_model.meta_graphs.add() 1275 _, exported_graph, object_saver, asset_info, saved_nodes, node_paths = ( -> 1276 _build_meta_graph(obj, signatures, options, meta_graph_def)) 1277 saved_model.saved_model_schema_version = ( 1278 
constants.SAVED_MODEL_SCHEMA_VERSION) 1280 # Write the checkpoint, copy assets into the assets directory, and write out 1281 # the SavedModel proto itself. File /usr/local/lib/python3.10/dist-packages/tensorflow/python/saved_model/save.py:1455, in _build_meta_graph(obj, signatures, options, meta_graph_def) 1428 """Creates a MetaGraph under a save context. 1429 1430 Args: (...) 1451 saveable_view.node_paths: _SaveableView paths. 1452 """ 1454 with save_context.save_context(options): -> 1455 return _build_meta_graph_impl(obj, signatures, options, meta_graph_def) File /usr/local/lib/python3.10/dist-packages/tensorflow/python/saved_model/save.py:1398, in _build_meta_graph_impl(obj, signatures, options, meta_graph_def) 1396 augmented_graph_view = _AugmentedGraphView(obj) 1397 if signatures is None: -> 1398 signatures = signature_serialization.find_function_to_export( 1399 augmented_graph_view) 1401 signatures, wrapped_functions = ( 1402 signature_serialization.canonicalize_signatures(signatures)) 1403 signature_serialization.validate_augmented_graph_view(augmented_graph_view) File /usr/local/lib/python3.10/dist-packages/tensorflow/python/saved_model/signature_serialization.py:103, in find_function_to_export(saveable_view) 99 # TODO(b/205014194): Discuss removing this behaviour. It can lead to WTFs when 100 # a user decides to annotate more functions with tf.function and suddenly 101 # serving that model way later in the process stops working. 
102 possible_signatures = [] --> 103 for name, child in children: 104 if not isinstance(child, (def_function.Function, defun.ConcreteFunction)): 105 continue File /usr/local/lib/python3.10/dist-packages/tensorflow/python/saved_model/save.py:180, in _AugmentedGraphView.list_children(self, obj) 177 if obj not in self._children_cache: 178 children = self._children_cache[obj] = {} --> 180 for name, child in super(_AugmentedGraphView, self).list_children( 181 obj, 182 save_type=base.SaveType.SAVEDMODEL, 183 cache=self._serialization_cache): 184 if isinstance(child, defun.ConcreteFunction): 185 child = self._maybe_uncache_variable_captures(child) File /usr/local/lib/python3.10/dist-packages/tensorflow/python/checkpoint/graph_view.py:75, in ObjectGraphView.list_children(self, obj, save_type, **kwargs) 64 """Returns list of all child trackables attached to obj. 65 66 Args: (...) 72 List of all children attached to the object. 73 """ 74 children = [] ---> 75 for name, ref in super(ObjectGraphView, 76 self).children(obj, save_type, **kwargs).items(): 77 children.append(base.TrackableReference(name, ref)) 79 # GraphView objects may define children of the root object that are not 80 # actually attached, e.g. a Checkpoint object's save_counter. 
File /usr/local/lib/python3.10/dist-packages/tensorflow/python/checkpoint/trackable_view.py:84, in TrackableView.children(cls, obj, save_type, **kwargs) 82 obj._maybe_initialize_trackable() 83 children = {} ---> 84 for name, ref in obj._trackable_children(save_type, **kwargs).items(): 85 ref = converter.convert_to_trackable(ref, parent=obj) 86 children[name] = ref File /usr/local/lib/python3.10/dist-packages/keras/engine/training.py:3733, in Model._trackable_children(self, save_type, **kwargs) 3730 self.predict_function = None 3731 self.train_tf_function = None -> 3733 children = super()._trackable_children(save_type, **kwargs) 3735 if save_type == "savedmodel": 3736 self.train_function = train_function File /usr/local/lib/python3.10/dist-packages/keras/engine/base_layer.py:3466, in Layer._trackable_children(self, save_type, **kwargs) 3462 cache = kwargs["cache"] 3463 # TODO(b/213628533): This must be called before super() to ensure 3464 # that any input shape changes are applied before getting the config 3465 # of the model. 
-> 3466 children = self._trackable_saved_model_saver.trackable_children( 3467 cache 3468 ) 3469 else: 3470 children = {} File /usr/local/lib/python3.10/dist-packages/keras/saving/legacy/saved_model/base_serialization.py:61, in SavedModelSaver.trackable_children(self, serialization_cache) 58 if not utils.should_save_traces(): 59 return {} ---> 61 children = self.objects_to_serialize(serialization_cache) 62 children.update(self.functions_to_serialize(serialization_cache)) 63 return children File /usr/local/lib/python3.10/dist-packages/keras/saving/legacy/saved_model/layer_serialization.py:79, in LayerSavedModelSaver.objects_to_serialize(self, serialization_cache) 78 def objects_to_serialize(self, serialization_cache): ---> 79 return self._get_serialized_attributes( 80 serialization_cache 81 ).objects_to_serialize File /usr/local/lib/python3.10/dist-packages/keras/saving/legacy/saved_model/layer_serialization.py:106, in LayerSavedModelSaver._get_serialized_attributes(self, serialization_cache) 100 if ( 101 save_impl.should_skip_serialization(self.obj) 102 or self.obj._must_restore_from_config 103 ): 104 return serialized_attr --> 106 object_dict, function_dict = self._get_serialized_attributes_internal( 107 serialization_cache 108 ) 110 serialized_attr.set_and_validate_objects(object_dict) 111 serialized_attr.set_and_validate_functions(function_dict) File /usr/local/lib/python3.10/dist-packages/keras/saving/legacy/saved_model/model_serialization.py:53, in ModelSavedModelSaver._get_serialized_attributes_internal(self, serialization_cache) 50 # Create a default signature function if this is the only object in the 51 # cache (i.e. this is the root level object). 52 if len(serialization_cache[constants.KERAS_CACHE_KEY]) == 1: ---> 53 default_signature = save_impl.default_save_signature(self.obj) 55 # Other than the default signature function, all other attributes match 56 # with the ones serialized by Layer. 
57 objects, functions = super()._get_serialized_attributes_internal( 58 serialization_cache 59 ) File /usr/local/lib/python3.10/dist-packages/keras/saving/legacy/saved_model/save_impl.py:234, in default_save_signature(layer) 232 def default_save_signature(layer): 233 original_losses = _reset_layer_losses(layer) --> 234 fn = saving_utils.trace_model_call(layer) 235 _restore_layer_losses(original_losses) 236 return fn File /usr/local/lib/python3.10/dist-packages/keras/saving/legacy/saving_utils.py:158, in trace_model_call(model, input_signature) 155 outputs = tf.nest.flatten(outputs) 156 return {name: output for name, output in zip(output_names, outputs)} --> 158 return _wrapped_model.get_concrete_function(*model_args, **model_kwargs) File /usr/local/lib/python3.10/dist-packages/tensorflow/python/eager/polymorphic_function/polymorphic_function.py:1258, in Function.get_concrete_function(self, *args, **kwargs) 1256 def get_concrete_function(self, *args, **kwargs): 1257 # Implements GenericFunction.get_concrete_function. -> 1258 concrete = self._get_concrete_function_garbage_collected(*args, **kwargs) 1259 concrete._garbage_collector.release() # pylint: disable=protected-access 1260 return concrete File /usr/local/lib/python3.10/dist-packages/tensorflow/python/eager/polymorphic_function/polymorphic_function.py:1238, in Function._get_concrete_function_garbage_collected(self, *args, **kwargs) 1236 if self._variable_creation_fn is None: 1237 initializers = [] -> 1238 self._initialize(args, kwargs, add_initializers_to=initializers) 1239 self._initialize_uninitialized_variables(initializers) 1241 if self._created_variables: 1242 # In this case we have created variables on the first call, so we run the 1243 # version which is guaranteed to never create variables. 
File /usr/local/lib/python3.10/dist-packages/tensorflow/python/eager/polymorphic_function/polymorphic_function.py:763, in Function._initialize(self, args, kwds, add_initializers_to) 760 self._lifted_initializer_graph = lifted_initializer_graph 761 self._graph_deleter = FunctionDeleter(self._lifted_initializer_graph) 762 self._concrete_variable_creation_fn = ( --> 763 self._variable_creation_fn # pylint: disable=protected-access 764 ._get_concrete_function_internal_garbage_collected( 765 *args, **kwds)) 767 def invalid_creator_scope(*unused_args, **unused_kwds): 768 """Disables variable creation.""" File /usr/local/lib/python3.10/dist-packages/tensorflow/python/eager/polymorphic_function/tracing_compiler.py:171, in TracingCompiler._get_concrete_function_internal_garbage_collected(self, *args, **kwargs) 169 """Returns a concrete function which cleans up its graph function.""" 170 with self._lock: --> 171 concrete_function, _ = self._maybe_define_concrete_function(args, kwargs) 172 return concrete_function File /usr/local/lib/python3.10/dist-packages/tensorflow/python/eager/polymorphic_function/tracing_compiler.py:166, in TracingCompiler._maybe_define_concrete_function(self, args, kwargs) 163 args = self.input_signature 164 kwargs = {} --> 166 return self._maybe_define_function(args, kwargs) File /usr/local/lib/python3.10/dist-packages/tensorflow/python/eager/polymorphic_function/tracing_compiler.py:396, in TracingCompiler._maybe_define_function(self, args, kwargs) 393 args = placeholder_bound_args.args 394 kwargs = placeholder_bound_args.kwargs --> 396 concrete_function = self._create_concrete_function( 397 args, kwargs, func_graph) 399 # TODO(b/263520817): Remove access to private attribute. 
400 graph_capture_container = concrete_function.graph._function_captures # pylint: disable=protected-access File /usr/local/lib/python3.10/dist-packages/tensorflow/python/eager/polymorphic_function/tracing_compiler.py:300, in TracingCompiler._create_concrete_function(self, args, kwargs, func_graph) 296 else: 297 arg_names = base_arg_names 299 concrete_function = monomorphic_function.ConcreteFunction( --> 300 func_graph_module.func_graph_from_py_func( 301 self._name, 302 self._python_function, 303 args, 304 kwargs, 305 None, 306 func_graph=func_graph, 307 autograph=self._autograph, 308 autograph_options=self._autograph_options, 309 arg_names=arg_names, 310 capture_by_value=self._capture_by_value, 311 create_placeholders=False), 312 self._function_attributes, 313 spec=self.function_spec, 314 # Tell the ConcreteFunction to clean up its graph once it goes out of 315 # scope. This is not the default behavior since it gets used in some 316 # places (like Keras) where the FuncGraph lives longer than the 317 # ConcreteFunction. 318 shared_func_graph=False) 319 return concrete_function File /usr/local/lib/python3.10/dist-packages/tensorflow/python/framework/func_graph.py:1214, in func_graph_from_py_func(name, python_func, args, kwargs, signature, func_graph, autograph, autograph_options, add_control_dependencies, arg_names, op_return_value, collections, capture_by_value, create_placeholders, acd_record_initial_resource_uses) 1211 else: 1212 _, original_func = tf_decorator.unwrap(python_func) -> 1214 func_outputs = python_func(*func_args, **func_kwargs) 1216 # invariant: `func_outputs` contains only Tensors, CompositeTensors, 1217 # TensorArrays and `None`s. 
1218 func_outputs = variable_utils.convert_variables_to_tensors(func_outputs) File /usr/local/lib/python3.10/dist-packages/tensorflow/python/eager/polymorphic_function/polymorphic_function.py:667, in Function._compiler_with_scope.<locals>.wrapped_fn(*args, **kwds) 663 with default_graph._variable_creator_scope(scope, priority=50): # pylint: disable=protected-access 664 # __wrapped__ allows AutoGraph to swap in a converted function. We give 665 # the function a weak reference to itself to avoid a reference cycle. 666 with OptionalXlaContext(compile_with_xla): --> 667 out = weak_wrapped_fn().__wrapped__(*args, **kwds) 668 return out File /usr/local/lib/python3.10/dist-packages/tensorflow/python/framework/func_graph.py:1200, in func_graph_from_py_func.<locals>.autograph_handler(*args, **kwargs) 1198 except Exception as e: # pylint:disable=broad-except 1199 if hasattr(e, "ag_error_metadata"): -> 1200 raise e.ag_error_metadata.to_exception(e) 1201 else: 1202 raise File /usr/local/lib/python3.10/dist-packages/tensorflow/python/framework/func_graph.py:1189, in func_graph_from_py_func.<locals>.autograph_handler(*args, **kwargs) 1187 # TODO(mdan): Push this block higher in tf.function's call stack. 
1188 try: -> 1189 return autograph.converted_call( 1190 original_func, 1191 args, 1192 kwargs, 1193 options=autograph.ConversionOptions( 1194 recursive=True, 1195 optional_features=autograph_options, 1196 user_requested=True, 1197 )) 1198 except Exception as e: # pylint:disable=broad-except 1199 if hasattr(e, "ag_error_metadata"): File /usr/local/lib/python3.10/dist-packages/tensorflow/python/autograph/impl/api.py:439, in converted_call(f, args, kwargs, caller_fn_scope, options) 437 try: 438 if kwargs is not None: --> 439 result = converted_f(*effective_args, **kwargs) 440 else: 441 result = converted_f(*effective_args) File /tmp/__autograph_generated_file4rb9ekl5.py:14, in outer_factory.<locals>.inner_factory.<locals>.tf___wrapped_model(*args, **kwargs) 12 (args, kwargs) = ag__.converted_call(ag__.ld(model)._call_spec.set_arg_value, ('training', False, ag__.ld(args), ag__.ld(kwargs)), dict(inputs_in_args=True), fscope) 13 with ag__.ld(base_layer_utils).call_context().enter(ag__.ld(model), inputs=None, build_graph=False, training=False, saving=True): ---> 14 outputs = ag__.converted_call(ag__.ld(model), tuple(ag__.ld(args)), dict(**ag__.ld(kwargs)), fscope) 15 output_names = ag__.ld(model).output_names 17 def get_state(): File /usr/local/lib/python3.10/dist-packages/tensorflow/python/autograph/impl/api.py:377, in converted_call(f, args, kwargs, caller_fn_scope, options) 374 return _call_unconverted(f, args, kwargs, options) 376 if not options.user_requested and conversion.is_allowlisted(f): --> 377 return _call_unconverted(f, args, kwargs, options) 379 # internal_convert_user_code is for example turned off when issuing a dynamic 380 # call conversion from generated code while in nonrecursive mode. In that 381 # case we evidently don't want to recurse, but we still have to convert 382 # things like builtins. 
383 if not options.internal_convert_user_code: File /usr/local/lib/python3.10/dist-packages/tensorflow/python/autograph/impl/api.py:458, in _call_unconverted(f, args, kwargs, options, update_cache) 455 return f.__self__.call(args, kwargs) 457 if kwargs is not None: --> 458 return f(*args, **kwargs) 459 return f(*args) File /usr/local/lib/python3.10/dist-packages/keras/utils/traceback_utils.py:70, in filter_traceback.<locals>.error_handler(*args, **kwargs) 67 filtered_tb = _process_traceback_frames(e.__traceback__) 68 # To get the full stack trace, call: 69 # `tf.debugging.disable_traceback_filtering()` ---> 70 raise e.with_traceback(filtered_tb) from None 71 finally: 72 del filtered_tb File /usr/local/lib/python3.10/dist-packages/tensorflow/python/saved_model/function_deserialization.py:295, in recreate_function.<locals>.restored_function_body(*args, **kwargs) 291 positional, keyword = concrete_function.structured_input_signature 292 signature_descriptions.append( 293 "Option {}:\n {}\n Keyword arguments: {}".format( 294 index + 1, _pretty_format_positional(positional), keyword)) --> 295 raise ValueError( 296 "Could not find matching concrete function to call loaded from the " 297 f"SavedModel. Got:\n {_pretty_format_positional(args)}\n Keyword " 298 f"arguments: {kwargs}\n\n Expected these arguments to match one of the " 299 f"following {len(saved_function.concrete_functions)} option(s):\n\n" 300 f"{(chr(10)+chr(10)).join(signature_descriptions)}") ValueError: in user code: File "/usr/local/lib/python3.10/dist-packages/keras/saving/legacy/saving_utils.py", line 147, in _wrapped_model * outputs = model(*args, **kwargs) File "/usr/local/lib/python3.10/dist-packages/keras/utils/traceback_utils.py", line 70, in error_handler ** raise e.with_traceback(filtered_tb) from None ValueError: Could not find matching concrete function to call loaded from the SavedModel. 
Got: Positional arguments (5 total): * {'category-list__offsets': <tf.Tensor 'inputs:0' shape=(None,) dtype=int32>, 'category-list__values': <tf.Tensor 'inputs_1:0' shape=(None,) dtype=int64>, 'dayofweek-first': <tf.Tensor 'inputs_2:0' shape=(None,) dtype=int64>, 'item_id-list__offsets': <tf.Tensor 'inputs_3:0' shape=(None,) dtype=int32>, 'item_id-list__values': <tf.Tensor 'inputs_4:0' shape=(None,) dtype=int64>} * None * False * None * None Keyword arguments: {} Expected these arguments to match one of the following 4 option(s): Option 1: Positional arguments (5 total): * {'category-list__offsets': TensorSpec(shape=(None,), dtype=tf.int32, name='category-list__offsets'), 'category-list__values': TensorSpec(shape=(None,), dtype=tf.int64, name='category-list__values'), 'dayofweek-first': TensorSpec(shape=(None,), dtype=tf.int64, name='dayofweek-first'), 'item_id-list__offsets': TensorSpec(shape=(None,), dtype=tf.int32, name='item_id-list__offsets'), 'item_id-list__values': TensorSpec(shape=(None,), dtype=tf.int64, name='item_id-list__values')} * None * True * False * False Keyword arguments: {} Option 2: Positional arguments (5 total): * {'category-list__offsets': TensorSpec(shape=(None,), dtype=tf.int32, name='category-list__offsets'), 'category-list__values': TensorSpec(shape=(None,), dtype=tf.int64, name='category-list__values'), 'dayofweek-first': TensorSpec(shape=(None,), dtype=tf.int64, name='dayofweek-first'), 'item_id-list__offsets': TensorSpec(shape=(None,), dtype=tf.int32, name='item_id-list__offsets'), 'item_id-list__values': TensorSpec(shape=(None,), dtype=tf.int64, name='item_id-list__values')} * None * False * False * False Keyword arguments: {} Option 3: Positional arguments (5 total): * {'category-list__offsets': TensorSpec(shape=(None,), dtype=tf.int32, name='inputs_category_list__offsets'), 'category-list__values': TensorSpec(shape=(None,), dtype=tf.int64, name='inputs_category_list__values'), 'dayofweek-first': TensorSpec(shape=(None,), 
dtype=tf.int64, name='inputs_dayofweek_first'), 'item_id-list__offsets': TensorSpec(shape=(None,), dtype=tf.int32, name='inputs_item_id_list__offsets'), 'item_id-list__values': TensorSpec(shape=(None,), dtype=tf.int64, name='inputs_item_id_list__values')} * None * True * False * False Keyword arguments: {} Option 4: Positional arguments (5 total): * {'category-list__offsets': TensorSpec(shape=(None,), dtype=tf.int32, name='inputs_category_list__offsets'), 'category-list__values': TensorSpec(shape=(None,), dtype=tf.int64, name='inputs_category_list__values'), 'dayofweek-first': TensorSpec(shape=(None,), dtype=tf.int64, name='inputs_dayofweek_first'), 'item_id-list__offsets': TensorSpec(shape=(None,), dtype=tf.int32, name='inputs_item_id_list__offsets'), 'item_id-list__values': TensorSpec(shape=(None,), dtype=tf.int64, name='inputs_item_id_list__values')} * None * False * False * False Keyword arguments: {}
Please run the code in this gist first and then do the steps below:
model_transformer.save('./saved_model')
import merlin.models.tf as mm
import tensorflow as tf
reloaded_model= tf.keras.models.load_model('./saved_model/')
import os
from nvtabular.workflow import Workflow
from merlin.systems.dag.ops.tensorflow import PredictTensorflow
from merlin.systems.dag.ensemble import Ensemble
from merlin.systems.dag.ops.workflow import TransformWorkflow
wf = Workflow.load(os.path.join(DATA_FOLDER, "workflow_etl"))
inf_ops = wf.input_schema.column_names >> TransformWorkflow(wf) >> PredictTensorflow(reloaded_model)
ensemble = Ensemble(inf_ops, wf.input_schema)
ensemble.export(os.path.join('/workspace/', 'ensemble_reloaded'))
Bug description
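I am getting the following error when I try to serve a reloaded model (saved with `model.save` and loaded back with `tf.keras.models.load_model`) on Triton: constructing `PredictTensorflow(reloaded_model)` raises a `ValueError`.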
Steps/Code to reproduce bug
Please run the code in this gist first and then do the steps below:
model_transformer.save('./saved_model')
Expected behavior
Environment details
Additional context