SeldonIO / alibi

Algorithms for explaining machine learning models
https://docs.seldon.io/projects/alibi/en/stable/
Other
2.41k stars 252 forks source link

Eager mode support in Counterfactuals and CEM #425

Closed InterferencePattern closed 1 year ago

InterferencePattern commented 3 years ago

Hi, I'm finding that my ONNX image classification model (loaded with the ONNX package and converted to TensorFlow) works with AnchorImage but not with Counterfactuals or CEM. I've tried providing the model directly to the CounterFactual object, but I've also tried with a predict function (since the model expects inputs of a different shape). Neither way is successful, but the latter approach is shown below.

Is there a reason that AnchorImage and Counterfactuals/CEM would treat this model differently under the hood?

Here is the failing code, followed by the error.

Thank you for your help.

# Alibi's CounterFactual is written with TF1.x graph constructs (see the
# maintainer replies in this thread), so eager execution is switched off and
# the default graph is reset before anything else is created.
tf.compat.v1.disable_eager_execution()
tf.compat.v1.reset_default_graph()

# Load the ONNX model from disk and convert it with `prepare`
# (presumably onnx_tf.backend.prepare -- the import is not shown; confirm).
model = onnx.load('./model.onnx')
tf_model = prepare(model)

# Hyperparameters handed to CounterFactual further down. `img_stack` is
# defined outside this snippet; it is assumed to be a NumPy array holding a
# stack of images -- TODO confirm.
shape = (1,) + img_stack.shape[1:] # replace first dimension with 1 (one explanation at a time)
target_proba = 1.0
tol = 0.01 # want counterfactuals with p(class)>0.99
target_class = 'other' # any class other than the predicted one will do
max_iter = 1000
lam_init = 1e-1
max_lam_steps = 10
learning_rate_init = 0.1
# Value range taken from the data itself: (global minimum, global maximum).
feature_range = (img_stack.min(),img_stack.max())

def predict_fn(img_stack):
    """Run the converted ONNX model on ``img_stack`` and return its first output.

    The axes of the input are permuted with ``(0, 3, 1, 2)`` (presumably
    channels-last to channels-first -- confirm against the model's expected
    input layout) and the result is cast to ``float32`` before inference.
    """
    reordered = np.asarray(img_stack.transpose(0, 3, 1, 2), dtype=np.float32)
    outputs = tf_model.run(reordered)  # Inference
    return outputs[0]

# Build the counterfactual explainer around the black-box `predict_fn`.
# NOTE: per the traceback in this report, the constructor itself calls
# predict_fn(np.zeros(shape)) to infer the number of classes, so this line is
# where the model is first run with eager mode disabled.
cf = CounterFactual(predict_fn, shape=shape, target_proba=target_proba, tol=tol,
                    target_class=target_class, max_iter=max_iter, lam_init=lam_init,
                    max_lam_steps=max_lam_steps, learning_rate_init=learning_rate_init,
                    feature_range=feature_range)

RuntimeError                              Traceback (most recent call last)
<ipython-input-...> in <module>
     12                     target_class=target_class, max_iter=max_iter, lam_init=lam_init,
     13                     max_lam_steps=max_lam_steps, learning_rate_init=learning_rate_init,
---> 14                     feature_range=feature_range)
     15
     16 start_time = time()

/.../lib/python3.7/site-packages/alibi/explainers/counterfactual.py in __init__(self, predict_fn, shape, distance_fn, target_proba, target_class, max_iter, early_stop, lam_init, max_lam_steps, tol, learning_rate_init, feature_range, eps, init, decay, write_dir, debug, sess)
    177             self.model = False
    178
--> 179         self.n_classes = self.predict_fn(np.zeros(shape)).shape[1]
    180
    181         # flag to keep track if explainer is fit or not

<ipython-input-...> in predict_img(img_stack)
      3     # img = img.reshape(1,3,224,224) # Transform to Input Tensor
      4     img_stack = np.asarray(img_stack, dtype=np.float32)
----> 5     return tf_model.run(img_stack)[0] #Inference

/.../lib/python3.7/site-packages/onnx_tf/backend_rep.py in run(self, inputs, **kwargs)
     91     input_dict = dict([(x[0], tf.constant(x[1])) for x in feed_dict.items()])
     92
---> 93     output_values = self.tf_module(**input_dict)
     94     output_values = [
     95         val.numpy() if isinstance(val, tf.Tensor) else val

/.../lib/python3.7/site-packages/tensorflow/python/eager/def_function.py in __call__(self, *args, **kwds)
    826     tracing_count = self.experimental_get_tracing_count()
    827     with trace.Trace(self._name) as tm:
--> 828       result = self._call(*args, **kwds)
    829       compiler = "xla" if self._experimental_compile else "nonXla"
    830       new_tracing_count = self.experimental_get_tracing_count()

/.../lib/python3.7/site-packages/tensorflow/python/eager/def_function.py in _call(self, *args, **kwds)
    860       # In this case we have not created variables on the first call. So we can
    861       # run the first trace but we should fail if variables are created.
--> 862       results = self._stateful_fn(*args, **kwds)
    863       if self._created_variables:
    864         raise ValueError("Creating variables on a non-first call to a function"

/.../lib/python3.7/site-packages/tensorflow/python/eager/function.py in __call__(self, *args, **kwargs)
   2941        filtered_flat_args) = self._maybe_define_function(args, kwargs)
   2942     return graph_function._call_flat(
-> 2943         filtered_flat_args, captured_inputs=graph_function.captured_inputs) # pylint: disable=protected-access
   2944
   2945   @property

/.../lib/python3.7/site-packages/tensorflow/python/eager/function.py in _call_flat(self, args, captured_inputs, cancellation_manager)
   1930             {"PartitionedCall": self._get_gradient_function(),
   1931              "StatefulPartitionedCall": self._get_gradient_function()}):
-> 1932           flat_outputs = forward_function.call(ctx, args_with_tangents)
   1933         forward_backward.record(flat_outputs)
   1934     return self._build_call_outputs(flat_outputs)

/.../lib/python3.7/site-packages/tensorflow/python/eager/function.py in call(self, ctx, args, cancellation_manager)
    587               executing_eagerly=executing_eagerly,
    588               config=config,
--> 589               executor_type=executor_type)
    590
    591     for i, func_graph_output in enumerate(self._func_graph_outputs):

/...t/lib/python3.7/site-packages/tensorflow/python/ops/functional_ops.py in partitioned_call(args, f, tout, executing_eagerly, config, executor_type)
   1187   # The generated binding returns an empty list for functions that don't
   1188   # return any Tensors, hence the need to use `create_op` directly.
-> 1189   args = [ops.convert_to_tensor(x) for x in args]
   1190   tin_attr = attr_value_pb2.AttrValue(
   1191       list=attr_value_pb2.AttrValue.ListValue(

/.../lib/python3.7/site-packages/tensorflow/python/ops/functional_ops.py in <listcomp>(.0)
   1187   # The generated binding returns an empty list for functions that don't
   1188   # return any Tensors, hence the need to use `create_op` directly.
-> 1189   args = [ops.convert_to_tensor(x) for x in args]
   1190   tin_attr = attr_value_pb2.AttrValue(
   1191       list=attr_value_pb2.AttrValue.ListValue(

/.../lib/python3.7/site-packages/tensorflow/python/profiler/trace.py in wrapped(*args, **kwargs)
    161         with Trace(trace_name, **trace_kwargs):
    162           return func(*args, **kwargs)
--> 163       return func(*args, **kwargs)
    164
    165     return wrapped

/.../lib/python3.7/site-packages/tensorflow/python/framework/ops.py in convert_to_tensor(value, dtype, name, as_ref, preferred_dtype, dtype_hint, ctx, accepted_result_types)
   1497       graph = get_default_graph()
   1498       if not graph.building_function:
-> 1499         raise RuntimeError("Attempting to capture an EagerTensor without "
   1500                            "building a function.")
   1501       return graph.capture(value, name=name)

RuntimeError: Attempting to capture an EagerTensor without building a function.

jklaise commented 3 years ago

@jimbudarz interesting, we would have to dig deeper to determine the root cause, but I suspect it's some mismatch between eager mode behaviour (does onnx runtime work in eager mode only?) and the graph mode that is required to run CFs in alibi due to the core algorithm being written using TF1.x constructs (for now). The reason Anchors work is that they don't have any TF code internally.

Just as an aside, do Anchors work if you disable eager mode and pass the same predict_fn as in the example?

InterferencePattern commented 3 years ago

@jklaise Thanks for the response. Anchors also fails when eager mode is disabled. RuntimeError: Attempting to capture an EagerTensor without building a function.

However, if I don't disable eager mode for CF and CEM, I get the following:


RuntimeError                              Traceback (most recent call last)
<ipython-input-...> in <module>
     10           gamma=100, max_iterations=1000,
     11           c_init=1., c_steps=10, learning_rate_init=1e-2,
---> 12           clip=(-1000.,1000.), no_info_val=-1.)

/.../lib/python3.7/site-packages/alibi/explainers/cem.py in __init__(self, predict, mode, shape, kappa, beta, feature_range, gamma, ae_model, learning_rate_init, max_iterations, c_init, c_steps, eps, clip, update_num_grad, no_info_val, write_dir, sess)
    148
    149         # define placeholders that will be assigned to relevant variables
--> 150         self.assign_orig = tf.placeholder(tf.float32, shape, name='assign_orig')
    151         self.assign_adv = tf.placeholder(tf.float32, shape, name='assign_adv')
    152         self.assign_adv_s = tf.placeholder(tf.float32, shape, name='assign_adv_s')

/.../lib/python3.7/site-packages/tensorflow/python/ops/array_ops.py in placeholder(dtype, shape, name)
   3174   """
   3175   if context.executing_eagerly():
-> 3176     raise RuntimeError("tf.placeholder() is not compatible with "
   3177                        "eager execution.")
   3178

RuntimeError: tf.placeholder() is not compatible with eager execution.

InterferencePattern commented 3 years ago

Sorry- to answer your first question:

> does onnx runtime work in eager mode only?

Making predictions with eager mode disabled results in the same error: RuntimeError: Attempting to capture an EagerTensor without building a function.

jklaise commented 3 years ago

@jimbudarz thanks for the follow-up. To summarize, eager mode needs to be disabled to run CF/CEM algorithms because we use TF1.x constructs in the code. This seems to be incompatible with the way ONNX runtime works with TF2.x models.

In the long run we will port the CF/CEM code to TF2.x constructs (see #403) but there are a few hurdles with performance and some more development time is needed.

In the short term, we would need to investigate whether running ONNX TF2.x models with eager mode disabled is feasible at all; this would require digging into the ONNX protocol a bit more. A very simple thing to check first (if you can) is if you can make predictions using the ONNX runtime with a TF2.x model with eager mode disabled.

InterferencePattern commented 3 years ago

Thanks for looking into this.

> A very simple thing to check first (if you can) is if you can make predictions using the ONNX runtime with a TF2.x model with eager mode disabled.

I was able to test this- it fails to make predictions with eager mode disabled.

jklaise commented 3 years ago

@jimbudarz right, that confirms my suspicion. The next step would be to look into the ONNX docs on running TensorFlow models with eager mode disabled; I suspect either this is not supported (for TF2.x models) or some extra things need to be done when either exporting to ONNX or configuring the runtime. Another avenue worth exploring is TF1.x ONNX support: TF1.x doesn't have a concept of eager mode, so if support exists then lessons can possibly be taken from it to enable similar functionality for TF2.x (i.e. running without eager mode).

Another thing to explore is ONNX support with TF2.x models that have tf.function decorators as that would also force graph mode.

InterferencePattern commented 3 years ago

I've just replicated this experiment with the original Keras model, and I realize that this might not be an ONNX issue - it might be an incompatibility with any Keras model trained with eager mode enabled.

When I try to run contrastive explanations without eager mode disabled

# Explain one instance at a time: keep the per-image dimensions of `img_stack`
# (defined outside this snippet) and force the leading dimension to 1.
shape = (1,) + img_stack.shape[1:]
# 'PN' selects pertinent-negative explanations in alibi's CEM
# (the alternative being 'PP') -- confirm against the alibi CEM docs.
mode = 'PN'
# The Keras model's own `predict` method is passed as the black-box prediction
# function; the feature range is taken from the data's global min and max.
cem = CEM(model.predict, mode, shape, kappa=0., beta=.1,
          feature_range=(img_stack.min(), img_stack.max()),
          gamma=100, max_iterations=1000,
          c_init=1., c_steps=10, learning_rate_init=1e-2,
          clip=(-1000.,1000.), no_info_val=-1.)

I get the following error:

---------------------------------------------------------------------------
RuntimeError                              Traceback (most recent call last)
<ipython-input-17-dff1c063c46f> in <module>
     12                     target_class=target_class, max_iter=max_iter, lam_init=lam_init,
     13                     max_lam_steps=max_lam_steps, learning_rate_init=learning_rate_init,
---> 14                     feature_range=feature_range)
     15 
     16 start_time = time()

/.../lib/python3.7/site-packages/alibi/explainers/counterfactual.py in __init__(self, predict_fn, shape, distance_fn, target_proba, target_class, max_iter, early_stop, lam_init, max_lam_steps, tol, learning_rate_init, feature_range, eps, init, decay, write_dir, debug, sess)
    199 
    200             # lambda hyperparameter - placeholder instead of variable as annealed in first epoch
--> 201             self.lam = tf.placeholder(tf.float32, shape=(self.batch_size), name='lam')
    202 
    203             # define placeholders that will be assigned to relevant variables

/.../lib/python3.7/site-packages/tensorflow/python/ops/array_ops.py in placeholder(dtype, shape, name)
   3174   """
   3175   if context.executing_eagerly():
-> 3176     raise RuntimeError("tf.placeholder() is not compatible with "
   3177                        "eager execution.")
   3178 

RuntimeError: tf.placeholder() is not compatible with eager execution.

And if I run it with eager mode disabled, I can't make predictions at all:

ValueError: Calling `Model.predict` in graph mode is not supported when the `Model` instance was constructed with eager mode enabled. Please construct your `Model` instance in graph mode or call `Model.predict` with eager mode enabled.

I don't know if this is valuable information for you as you build out TF2 support, but I figured I could provide it anyway.

InterferencePattern commented 1 year ago

Stale