tensorflow / model-optimization

A toolkit to optimize ML models for deployment for Keras and TensorFlow, including quantization and pruning.
https://www.tensorflow.org/model_optimization
Apache License 2.0

Activity Regularizer not working with quantization aware training (QAT) TF 2.6 #842

Open bayesian-mind opened 2 years ago

bayesian-mind commented 2 years ago

Describe the Issue

The activity regularizer does not work with quantization aware training (QAT): fitting the quantized model fails with TypeError: An op outside of the function building code is being passed a "Graph" tensor.

System information

TensorFlow version (installed from source or binary): TF 2.6

TensorFlow Model Optimization version (installed from source or binary): pip install

Python version: 3.6.9

Describe the expected behavior

This happens only on quantized models; for the non-quantized model the activity regularization loss has a concrete numpy value. The model losses, i.e. the regularization losses, should be accumulated without empty tensors/placeholders, and it should be possible to operate on them outside the graph when executing in eager mode.

Describe the current behavior

Currently the activity regularization loss tensor is of type <class 'tensorflow.python.framework.ops.Tensor'> with no numpy value, while the kernel regularization loss tensor is also a tf.Tensor but does carry a numpy value. I am not sure whether that is what throws the error, but I am unable to operate on the tf op.
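
A quick way to see which entries are affected, as a minimal diagnostic sketch (assuming quantized_model from the reproduction script below has been built): a concrete eager loss exposes .numpy(), while the leaked symbolic ActivityRegularizer tensors do not.

# Diagnostic sketch: classify each entry of quantized_model.losses.
# Eager tensors carry a concrete value via .numpy(); symbolic tensors
# leaked from the build graph do not have that attribute.
for t in quantized_model.losses:
    if hasattr(t, 'numpy'):
        print('eager value:', float(t.numpy()))
    else:
        print('graph tensor:', t.name)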

Code to reproduce the issue

import numpy as np
import tensorflow as tf
from tensorflow_model_optimization.python.core.quantization.keras import quantize

l = tf.keras.layers

tf.config.run_functions_eagerly(True)

def functional_model():
    """Builds an MNIST functional model."""
    inp = tf.keras.Input(shape=image_input_shape())
    x = l.Conv2D(filters=32, kernel_size=5, padding='same', activation='relu',
                 kernel_regularizer=tf.keras.regularizers.l2(l=0.0001))(inp)
    x = l.ActivityRegularization(l2=0.0001)(x)
    x = l.MaxPooling2D(pool_size=(2, 2), strides=(2, 2), padding='same')(x)
    # TODO(pulkitb): Add BatchNorm when transformations are ready.
    # x = l.BatchNormalization()(x)
    x = l.Conv2D(filters=64, kernel_size=5, padding='same', activation='relu',
                 kernel_regularizer=tf.keras.regularizers.l2(l=0.0001))(x)
    x = l.ActivityRegularization(l2=0.0001)(x)
    x = l.MaxPooling2D(pool_size=(2, 2), strides=(2, 2), padding='same')(x)
    x = l.Flatten()(x)
    x = l.Dense(1024, activation='relu')(x)
    x = l.Dropout(0.4)(x)
    out = l.Dense(10, activation='softmax')(x)

    return tf.keras.Model(inp, [out])

def image_input_shape(img_rows=28, img_cols=28):
    if tf.keras.backend.image_data_format() == 'channels_first':
        return 1, img_rows, img_cols
    else:
        return img_rows, img_cols, 1

def preprocessed_data(img_rows=28,
                      img_cols=28,
                      num_classes=10):
    """Get data for mnist training and evaluation."""
    (x_train, y_train), (x_test, y_test) = tf.keras.datasets.mnist.load_data()

    if tf.keras.backend.image_data_format() == 'channels_first':
        x_train = x_train.reshape(x_train.shape[0], 1, img_rows, img_cols)
        x_test = x_test.reshape(x_test.shape[0], 1, img_rows, img_cols)
    else:
        x_train = x_train.reshape(x_train.shape[0], img_rows, img_cols, 1)
        x_test = x_test.reshape(x_test.shape[0], img_rows, img_cols, 1)

    x_train = x_train.astype('float32')
    x_test = x_test.astype('float32')
    x_train /= 255
    x_test /= 255

    # convert class vectors to binary class matrices
    y_train = tf.keras.utils.to_categorical(y_train, num_classes)
    y_test = tf.keras.utils.to_categorical(y_test, num_classes)

    return x_train, y_train, x_test, y_test

model = functional_model()
model.summary()
x_train, y_train, x_test, y_test = preprocessed_data()

model.compile(
    loss='categorical_crossentropy', optimizer='sgd', metrics=['accuracy'])
model.fit(x_train, y_train, batch_size=500)
_, model_accuracy = model.evaluate(x_test, y_test, verbose=0)

print("Quantizing model")

quantized_model = quantize.quantize_model(model)
print(quantized_model.losses)
quantized_model.compile(
    loss='categorical_crossentropy', optimizer='sgd', metrics=['accuracy'])
print(quantized_model.losses)

quantized_model.fit(x_train, y_train, batch_size=500)
_, quantized_model_accuracy = quantized_model.evaluate(
    x_test, y_test, verbose=0)

Error Output

  120/120 [==============================] - 3s 19ms/step - loss: 2.1668 - accuracy: 0.3804
Quantizing model
[<tf.Tensor: shape=(), dtype=float32, numpy=0.00022464147>, <tf.Tensor 'activity_regularization/ActivityRegularizer/truediv:0' shape=() dtype=float32>, <tf.Tensor: shape=(), dtype=float32, numpy=0.0042893193>, <tf.Tensor 'activity_regularization_1/ActivityRegularizer/truediv:0' shape=() dtype=float32>]
[<tf.Tensor: shape=(), dtype=float32, numpy=0.00022464147>, <tf.Tensor 'activity_regularization/ActivityRegularizer/truediv:0' shape=() dtype=float32>, <tf.Tensor: shape=(), dtype=float32, numpy=0.0042893193>, <tf.Tensor 'activity_regularization_1/ActivityRegularizer/truediv:0' shape=() dtype=float32>]
Traceback (most recent call last):
  File "/home/user/git/archive-code/tempTrain.py", line 80, in <module>
    quantized_model.fit(x_train, y_train, batch_size=500)
  File "/home/user/TF2_6Venv/lib/python3.6/site-packages/keras/engine/training.py", line 1184, in fit
    tmp_logs = self.train_function(iterator)
  File "/home/user/TF2_6Venv/lib/python3.6/site-packages/keras/engine/training.py", line 853, in train_function
    return step_function(self, iterator)
  File "/home/user/TF2_6Venv/lib/python3.6/site-packages/keras/engine/training.py", line 842, in step_function
    outputs = model.distribute_strategy.run(run_step, args=(data,))
  File "/home/user/TF2_6Venv/lib/python3.6/site-packages/tensorflow/python/distribute/distribute_lib.py", line 1286, in run
    return self._extended.call_for_each_replica(fn, args=args, kwargs=kwargs)
  File "/home/user/TF2_6Venv/lib/python3.6/site-packages/tensorflow/python/distribute/distribute_lib.py", line 2849, in call_for_each_replica
    return self._call_for_each_replica(fn, args, kwargs)
  File "/home/user/TF2_6Venv/lib/python3.6/site-packages/tensorflow/python/distribute/distribute_lib.py", line 3632, in _call_for_each_replica
    return fn(*args, **kwargs)
  File "/home/user/TF2_6Venv/lib/python3.6/site-packages/tensorflow/python/autograph/impl/api.py", line 597, in wrapper
    return func(*args, **kwargs)
  File "/home/user/TF2_6Venv/lib/python3.6/site-packages/keras/engine/training.py", line 835, in run_step
    outputs = model.train_step(data)
  File "/home/user/TF2_6Venv/lib/python3.6/site-packages/keras/engine/training.py", line 789, in train_step
    y, y_pred, sample_weight, regularization_losses=self.losses)
  File "/home/user/TF2_6Venv/lib/python3.6/site-packages/keras/engine/compile_utils.py", line 231, in __call__
    reg_loss = tf.add_n(regularization_losses)
  File "/home/user/TF2_6Venv/lib/python3.6/site-packages/tensorflow/python/util/dispatch.py", line 206, in wrapper
    return target(*args, **kwargs)
  File "/home/user/TF2_6Venv/lib/python3.6/site-packages/tensorflow/python/ops/math_ops.py", line 4000, in add_n
    return gen_math_ops.add_n(inputs, name=name)
  File "/home/user/TF2_6Venv/lib/python3.6/site-packages/tensorflow/python/ops/gen_math_ops.py", line 392, in add_n
    inputs, name=name, ctx=_ctx)
  File "/home/user/TF2_6Venv/lib/python3.6/site-packages/tensorflow/python/ops/gen_math_ops.py", line 425, in add_n_eager_fallback
    ctx=ctx, name=name)
  File "/home/user/TF2_6Venv/lib/python3.6/site-packages/tensorflow/python/eager/execute.py", line 75, in quick_execute
    raise e
  File "/home/user/TF2_6Venv/lib/python3.6/site-packages/tensorflow/python/eager/execute.py", line 60, in quick_execute
    inputs, attrs, num_outputs)
TypeError: An op outside of the function building code is being passed
a "Graph" tensor. It is possible to have Graph tensors
leak out of the function building context by including a
tf.init_scope in your function building code.
For example, the following function will fail:
  @tf.function
  def has_init_scope():
    my_constant = tf.constant(1.)
    with tf.init_scope():
      added = my_constant * 2
The graph tensor has name: activity_regularization/ActivityRegularizer/truediv:0
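
A possible workaround, as a sketch rather than an official fix: drop the ActivityRegularization layers before calling quantize_model and re-add an equivalent L2 activity penalty by hand in a custom training loop. Here functional_model_no_ar() is a hypothetical copy of functional_model() above with the two ActivityRegularization layers removed; the division by batch size mirrors the ActivityRegularizer/truediv op named in the traceback.

import numpy as np
import tensorflow as tf
from tensorflow_model_optimization.python.core.quantization.keras import quantize

base_model = functional_model_no_ar()  # hypothetical: functional_model() minus ActivityRegularization
q_model = quantize.quantize_model(base_model)

# Build a feature extractor exposing the conv activations we want to penalize.
conv_outs = [layer.output for layer in q_model.layers if 'conv2d' in layer.name]
extractor = tf.keras.Model(q_model.input, conv_outs + [q_model.output])

optimizer = tf.keras.optimizers.SGD()
loss_fn = tf.keras.losses.CategoricalCrossentropy()

@tf.function
def train_step(x, y):
    with tf.GradientTape() as tape:
        *activations, y_pred = extractor(x, training=True)
        loss = loss_fn(y, y_pred)
        # Kernel L2 penalties still flow through the normal add_loss path.
        loss += tf.add_n(extractor.losses)
        for a in activations:
            # Same form as ActivityRegularization(l2=1e-4): l2 * sum(x^2),
            # divided by the batch size as Keras does.
            loss += 1e-4 * tf.reduce_sum(tf.square(a)) / tf.cast(tf.shape(a)[0], a.dtype)
    grads = tape.gradient(loss, q_model.trainable_variables)
    optimizer.apply_gradients(zip(grads, q_model.trainable_variables))
    return loss

for _ in range(120):
    idx = np.random.choice(len(x_train), 500, replace=False)
    train_step(x_train[idx], y_train[idx])
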
wellido commented 2 years ago

Any suggestions for this problem?

UlionTse commented 2 years ago

I hit the same problem on TF 2.7. My model:

import tensorflow as tf

tfk = tf.keras  # alias used below

class TNN(tfk.Model):
    def __init__(self, input_dim):
        super(TNN, self).__init__(name='TNN')
        self.fn_dense_input = tfk.layers.Dense(units=256, input_dim=input_dim)
        self.fn_dense_hidden = tfk.layers.Dense(units=128)
        self.fn_dense_output = tfk.layers.Dense(units=1, activation='sigmoid')
        self.fn_reg_1 = tfk.layers.ActivityRegularization(l1=0.01, l2=0.01, input_dim=256)
        self.fn_reg_2 = tfk.layers.ActivityRegularization(l1=0, l2=0.01, input_dim=128)
        self.fn_bn_1 = tfk.layers.BatchNormalization()
        self.fn_bn_2 = tfk.layers.BatchNormalization()  # separate instance; BatchNorm layers are not reused
        self.fn_af = tfk.layers.Activation(activation='relu')
        self.fn_dropout = tfk.layers.AlphaDropout(rate=0.85)

    @tf.function
    def call(self, inputs):
        x = self.fn_dense_input(inputs)
        x = self.fn_reg_1(x)
        x = self.fn_bn_1(x)
        x = self.fn_af(x)
        # x = self.fn_dropout(x)

        x = self.fn_dense_hidden(x)
        x = self.fn_reg_2(x)
        x = self.fn_bn_2(x)
        x = self.fn_af(x)
        # x = self.fn_dropout(x)

        outputs = self.fn_dense_output(x)
        return outputs

Error output:

InaccessibleTensorError: tf.Graph captured an external symbolic tensor. The symbolic tensor <tf.Tensor 'activity_regularization/ActivityRegularizer/truediv:0' shape=() dtype=float32> is captured by FuncGraph(name=train_function, id=139760322273232), but it is defined at FuncGraph(name=call, id=139760322164240). A tf.Graph is not allowed to capture symoblic tensors from another graph. Use return values, explicit Python locals or TensorFlow collections to access it. Please see https://www.tensorflow.org/guide/function#all_outputs_of_a_tffunction_must_be_return_values for more information.
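
One thing that may be worth trying here, as a hedged sketch rather than a confirmed fix: leave call() undecorated and let Keras do its own tracing. The add_loss() that ActivityRegularization performs inside a user-applied @tf.function is what creates the cross-graph tensor named in the error above.

import tensorflow as tf

tfk = tf.keras

class TNNFixed(tfk.Model):
    """Same pattern as TNN above, but call() is not wrapped in @tf.function,
    so Keras traces it itself and the activity-regularization loss stays
    reachable from the train function's graph."""
    def __init__(self, input_dim):
        super().__init__(name='TNNFixed')
        self.fn_dense_input = tfk.layers.Dense(units=256, input_dim=input_dim)
        self.fn_reg_1 = tfk.layers.ActivityRegularization(l1=0.01, l2=0.01)
        self.fn_dense_output = tfk.layers.Dense(units=1, activation='sigmoid')

    def call(self, inputs):  # note: no @tf.function decorator
        x = self.fn_dense_input(inputs)
        x = self.fn_reg_1(x)
        return self.fn_dense_output(x)
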
mrj-taffy commented 2 years ago

Same problem in TF 2.4.2. Any suggestions?

EClemMarq commented 1 year ago

I see the same issue as UlionTse reported above (TF 2.7, model_optimization v0.7.3). I'm just removing the ActivityRegularization layers for now.