tensorflow / addons

Useful extra functionality for TensorFlow 2.x maintained by SIG-addons
Apache License 2.0
1.69k stars 611 forks source link

Multi-GPU support for WeightNormalization #740

Closed shreyaskamathkm closed 4 years ago

shreyaskamathkm commented 4 years ago

Describe the feature and the current behavior/state. When I run weight normalization on a single GPU, it works as expected. However, when I run it under MirroredStrategy, it throws an error. Could it be extended to support multiple GPUs?

Relevant information

Which API type would this fall under (layer, metric, optimizer, etc.) layer/wrapper

Who will benefit with this feature?

Any other info.

seanpmorgan commented 4 years ago

Hi @shreyaskamathkm thank you for identifying this! Would you mind adding a minimal reproducible example along with the error that you receive?

shreyaskamathkm commented 4 years ago

Sure thing. I will add it over the weekend.

shreyaskamathkm commented 4 years ago

Hi, @seanpmorgan I edited the distributed strategy version from the TensorFlow example. Setting n_gpus = 1 works as expected. When n_gpus = 2, we get an error. The complete log is also attached after the code.

from __future__ import absolute_import, division, print_function, unicode_literals
import tensorflow as tf
# Device-placement logging is switched off here; pass True instead to display
# where the ops and variables are being placed.
tf.debugging.set_log_device_placement(False)
# Reset the default graph, first through the compat.v1 API and then through
# the framework `ops` module.
tf.compat.v1.reset_default_graph()
from tensorflow.python.framework import ops
ops.reset_default_graph()
from tensorflow.compat.v1 import ConfigProto
from tensorflow.compat.v1 import InteractiveSession
# Open a v1 InteractiveSession whose config has the GPU `allow_growth`
# option turned on.
config = ConfigProto()
config.gpu_options.allow_growth = True
session = InteractiveSession(config=config)
import numpy as np
import os
import tensorflow_addons as tfa

# Report the TensorFlow version in use.
print(tf.__version__)

def conv2d_weightnorm(filters, kernel_size, padding='valid', activation=None, **kwargs):
    """Return a `Conv2D` layer wrapped in `tfa.layers.WeightNormalization`.

    Args:
        filters: Forwarded positionally to `tf.keras.layers.Conv2D`.
        kernel_size: Forwarded positionally to `tf.keras.layers.Conv2D`.
        padding: Forwarded as the `padding` keyword of `Conv2D`.
        activation: Forwarded as the `activation` keyword of `Conv2D`.
        **kwargs: Any further keyword arguments for `Conv2D`.

    Returns:
        The `WeightNormalization` wrapper, created with `data_init=False`.
    """
    conv_layer = tf.keras.layers.Conv2D(
        filters,
        kernel_size,
        padding=padding,
        activation=activation,
        **kwargs
    )
    return tfa.layers.WeightNormalization(conv_layer, data_init=False)

"""## Download the fashion MNIST dataset"""
# Number of GPUs to train on; this value decides which distribution strategy
# is constructed further down in the script.
n_gpus = 2
fashion_mnist = tf.keras.datasets.fashion_mnist

(train_images, train_labels), (test_images, test_labels) = fashion_mnist.load_data()
# Append a trailing axis of length 1 to both image arrays
# (presumably the channel dimension expected by Conv2D -- TODO confirm).
train_images = train_images[..., None]
test_images = test_images[..., None]

# Getting the images in [0, 1] range.
train_images = train_images / np.float32(255)
test_images = test_images / np.float32(255)

# Pick the distribution strategy from `n_gpus`.
# If the list of devices is not specified in the
# `tf.distribute.MirroredStrategy` constructor, it will be auto-detected.
if n_gpus == 1:
    strategy = tf.distribute.OneDeviceStrategy(device="/gpu:0")
elif n_gpus > 1:
    strategy = tf.distribute.MirroredStrategy()
else:
    # Without this branch `strategy` would stay undefined and the print below
    # would fail with an unrelated-looking NameError.
    raise ValueError(
        'n_gpus must be a positive integer, got {!r}'.format(n_gpus))

print('Number of devices: {}'.format(strategy.num_replicas_in_sync))

# Shuffle buffer spans the whole training set.
BUFFER_SIZE = len(train_images)

# The global batch is the per-replica batch multiplied by the replica count.
BATCH_SIZE_PER_REPLICA = 64
GLOBAL_BATCH_SIZE = BATCH_SIZE_PER_REPLICA * strategy.num_replicas_in_sync

EPOCHS = 10

# Both pipelines drop the final partial batch (`drop_remainder=True`).
train_dataset = (
    tf.data.Dataset.from_tensor_slices((train_images, train_labels))
    .shuffle(BUFFER_SIZE)
    .batch(GLOBAL_BATCH_SIZE, drop_remainder=True)
)
test_dataset = (
    tf.data.Dataset.from_tensor_slices((test_images, test_labels))
    .batch(GLOBAL_BATCH_SIZE, drop_remainder=True)
)

# Hand the datasets to the strategy so they can be iterated as distributed input.
train_dist_dataset = strategy.experimental_distribute_dataset(train_dataset)
test_dist_dataset = strategy.experimental_distribute_dataset(test_dataset)

def create_model():
  """Build the classifier used by this repro script.

  Two weight-normalised 3x3 convolutions (32 and 64 filters, ReLU), each
  followed by max pooling, then a flatten, a 64-unit ReLU dense layer and a
  10-unit softmax output.

  Returns:
    A new `tf.keras.Sequential` model.
  """
  layer_stack = [
      conv2d_weightnorm(filters=32, kernel_size=3, activation='relu'),
      tf.keras.layers.MaxPooling2D(),
      conv2d_weightnorm(filters=64, kernel_size=3, activation='relu'),
      tf.keras.layers.MaxPooling2D(),
      tf.keras.layers.Flatten(),
      tf.keras.layers.Dense(64, activation='relu'),
      tf.keras.layers.Dense(10, activation='softmax'),
  ]
  return tf.keras.Sequential(layer_stack)

# Directory that holds the checkpoints, and the path prefix handed to
# `checkpoint.save(...)`. Only the path strings are built here; nothing is
# created on disk by these two lines.
checkpoint_dir = './training_checkpoints'
checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt")

with strategy.scope():
  # Set reduction to `none` so we can do the reduction afterwards and divide by
  # global batch size.
  loss_object = tf.keras.losses.SparseCategoricalCrossentropy(
      reduction=tf.keras.losses.Reduction.NONE)
  # or loss_fn = tf.keras.losses.sparse_categorical_crossentropy
  def compute_loss(labels, predictions):
    """Return the per-example loss averaged over `GLOBAL_BATCH_SIZE`.

    Args:
      labels: Labels passed to `loss_object`.
      predictions: Model outputs passed to `loss_object`.

    Returns:
      The result of `tf.nn.compute_average_loss` on the unreduced losses.
    """
    return tf.nn.compute_average_loss(
        loss_object(labels, predictions),
        global_batch_size=GLOBAL_BATCH_SIZE)

with strategy.scope():
  # Running mean of the loss values reported by `test_step`.
  test_loss = tf.keras.metrics.Mean(name='test_loss')

  # Accuracy trackers for the training and test passes.
  train_accuracy = tf.keras.metrics.SparseCategoricalAccuracy(name='train_accuracy')
  test_accuracy = tf.keras.metrics.SparseCategoricalAccuracy(name='test_accuracy')

"""## Training loop"""

# model and optimizer must be created under `strategy.scope`.
with strategy.scope():
  model = create_model()

  optimizer = tf.keras.optimizers.Adam()

  # The checkpoint object tracks both the optimizer and the model.
  checkpoint = tf.train.Checkpoint(optimizer=optimizer, model=model)

with strategy.scope():
  def train_step(inputs):
    """Run one optimisation step on a batch and return its loss.

    Args:
      inputs: An `(images, labels)` pair.

    Returns:
      The loss computed by `compute_loss` for this batch.
    """
    batch_images, batch_labels = inputs

    with tf.GradientTape() as tape:
      outputs = model(batch_images, training=True)
      step_loss = compute_loss(batch_labels, outputs)

    # Read the variable list only after the forward pass has run.
    weights = model.trainable_variables
    grads = tape.gradient(step_loss, weights)
    optimizer.apply_gradients(zip(grads, weights))

    train_accuracy.update_state(batch_labels, outputs)
    return step_loss

  def test_step(inputs):
    """Evaluate one batch, updating `test_loss` and `test_accuracy`.

    Args:
      inputs: An `(images, labels)` pair.
    """
    batch_images, batch_labels = inputs

    outputs = model(batch_images, training=False)

    test_loss.update_state(loss_object(batch_labels, outputs))
    test_accuracy.update_state(batch_labels, outputs)

with strategy.scope():
  # `experimental_run_v2` replicates the provided computation and runs it
  # with the distributed input.
  @tf.function
  def distributed_train_step(dataset_inputs):
    """Run `train_step` on every replica and sum the per-replica losses.

    Args:
      dataset_inputs: One element drawn from the distributed training dataset.

    Returns:
      The SUM-reduced loss across replicas.
    """
    replica_losses = strategy.experimental_run_v2(
        train_step, args=(dataset_inputs,))
    return strategy.reduce(
        tf.distribute.ReduceOp.SUM, replica_losses, axis=None)

  @tf.function
  def distributed_test_step(dataset_inputs):
    """Run `test_step` on every replica for one distributed batch.

    Args:
      dataset_inputs: One element drawn from the distributed test dataset.

    Returns:
      Whatever `strategy.experimental_run_v2` returns for `test_step`.
    """
    return strategy.experimental_run_v2(
        test_step, args=(dataset_inputs,))

  # Main training loop: one full pass over the training and test datasets per
  # epoch, with a progress line printed at the end of each epoch.
  for epoch in range(EPOCHS):
    # TRAIN LOOP
    # Accumulate the reduced loss of every step so the epoch mean can be
    # reported below.
    total_loss = 0.0
    num_batches = 0
    for x in train_dist_dataset:
      total_loss += distributed_train_step(x)
      num_batches += 1
    train_loss = total_loss / num_batches

    # TEST LOOP
    for x in test_dist_dataset:
      distributed_test_step(x)

    # Save a checkpoint on every even-numbered epoch index (0, 2, 4, ...).
    if epoch % 2 == 0:
      checkpoint.save(checkpoint_prefix)

    template = ("Epoch {}, Loss: {}, Accuracy: {}, Test Loss: {}, "
                "Test Accuracy: {}")
    print (template.format(epoch+1, train_loss,
                           train_accuracy.result()*100, test_loss.result(),
                           test_accuracy.result()*100))

    # Clear the metric state so the next epoch starts from zero.
    test_loss.reset_states()
    train_accuracy.reset_states()
    test_accuracy.reset_states()

# Restore the latest checkpoint into a freshly built model, outside any
# strategy scope, and measure its accuracy on the test set.
eval_accuracy = tf.keras.metrics.SparseCategoricalAccuracy(name='eval_accuracy')

new_model = create_model()
new_optimizer = tf.keras.optimizers.Adam()

test_dataset = (
    tf.data.Dataset.from_tensor_slices((test_images, test_labels))
    .batch(GLOBAL_BATCH_SIZE)
)

@tf.function
def eval_step(images, labels):
  """Feed one batch through `new_model` and update `eval_accuracy`.

  Args:
    images: Batch of input images.
    labels: Matching labels.
  """
  eval_accuracy(labels, new_model(images, training=False))

checkpoint = tf.train.Checkpoint(optimizer=new_optimizer, model=new_model)
checkpoint.restore(tf.train.latest_checkpoint(checkpoint_dir))

for images, labels in test_dataset:
  eval_step(images, labels)

print('Accuracy after restoring the saved model without strategy: {}'.format(
    eval_accuracy.result()*100))

# Train for a fixed number of steps per epoch by pulling batches from an
# explicit iterator instead of looping over the whole dataset.
with strategy.scope():
  # The loop variable must be named `epoch`: the progress line below prints
  # `epoch+1`, and with a throwaway `_` it would reuse the stale value left
  # over from the earlier training loop and report the same epoch every time.
  for epoch in range(EPOCHS):
    total_loss = 0.0
    num_batches = 0
    train_iter = iter(train_dist_dataset)

    # Only 10 training steps are run per epoch here.
    for _ in range(10):
      total_loss += distributed_train_step(next(train_iter))
      num_batches += 1
    average_train_loss = total_loss / num_batches

    template = ("Epoch {}, Loss: {}, Accuracy: {}")
    print (template.format(epoch+1, average_train_loss, train_accuracy.result()*100))
    train_accuracy.reset_states()

with strategy.scope():
  @tf.function
  def distributed_train_epoch(dataset):
    """Run `train_step` over every batch of `dataset` inside one `tf.function`.

    Args:
      dataset: Distributed dataset to iterate over.

    Returns:
      The summed per-batch loss divided by the number of batches.
    """
    loss_sum = 0.0
    batch_count = 0
    for batch in dataset:
      replica_losses = strategy.experimental_run_v2(
          train_step, args=(batch,))
      loss_sum += strategy.reduce(
          tf.distribute.ReduceOp.SUM, replica_losses, axis=None)
      batch_count += 1
    return loss_sum / tf.cast(batch_count, dtype=tf.float32)

  # One call to the traced function per epoch, followed by a progress line.
  for epoch in range(EPOCHS):
    train_loss = distributed_train_epoch(train_dist_dataset)

    template = ("Epoch {}, Loss: {}, Accuracy: {}")
    print(template.format(
        epoch+1, train_loss, train_accuracy.result()*100))

    train_accuracy.reset_states()

Log :

2019-12-15 13:33:56.842121: I tensorflow/core/platform/cpu_feature_guard.cc:142] Your CPU supports instructions that this TensorFlow binary was not compiled to use: AVX2 FMA
2019-12-15 13:33:56.865580: I tensorflow/core/platform/profile_utils/cpu_utils.cc:94] CPU Frequency: 3600000000 Hz
2019-12-15 13:33:56.866834: I tensorflow/compiler/xla/service/service.cc:168] XLA service 0x559827eb2c80 executing computations on platform Host. Devices:
2019-12-15 13:33:56.866850: I tensorflow/compiler/xla/service/service.cc:175]   StreamExecutor device (0): Host, Default Version
2019-12-15 13:33:56.868376: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library libcuda.so.1
2019-12-15 13:33:56.950021: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:1006] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2019-12-15 13:33:56.950311: I tensorflow/compiler/xla/service/service.cc:168] XLA service 0x559827f5e360 executing computations on platform CUDA. Devices:
2019-12-15 13:33:56.950322: I tensorflow/compiler/xla/service/service.cc:175]   StreamExecutor device (0): GeForce RTX 2080, Compute Capability 7.5
2019-12-15 13:33:56.950424: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:1006] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2019-12-15 13:33:56.950624: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1618] Found device 0 with properties: 
name: GeForce RTX 2080 major: 7 minor: 5 memoryClockRate(GHz): 1.71
pciBusID: 0000:01:00.0
2019-12-15 13:33:56.950753: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library libcudart.so.10.0
2019-12-15 13:33:56.951395: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library libcublas.so.10.0
2019-12-15 13:33:56.952012: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library libcufft.so.10.0
2019-12-15 13:33:56.952152: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library libcurand.so.10.0
2019-12-15 13:33:56.952910: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library libcusolver.so.10.0
2019-12-15 13:33:56.953559: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library libcusparse.so.10.0
2019-12-15 13:33:56.955396: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library libcudnn.so.7
2019-12-15 13:33:56.955462: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:1006] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2019-12-15 13:33:56.955694: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:1006] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2019-12-15 13:33:56.955885: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1746] Adding visible gpu devices: 0
2019-12-15 13:33:56.955906: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library libcudart.so.10.0
2019-12-15 13:33:56.956475: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1159] Device interconnect StreamExecutor with strength 1 edge matrix:
2019-12-15 13:33:56.956483: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1165]      0 
2019-12-15 13:33:56.956486: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1178] 0:   N 
2019-12-15 13:33:56.956540: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:1006] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2019-12-15 13:33:56.956756: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:1006] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2019-12-15 13:33:56.956964: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1304] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 6802 MB memory) -> physical GPU (device: 0, name: GeForce RTX 2080, pci bus id: 0000:01:00.0, compute capability: 7.5)
2.0.0
2019-12-15 13:33:57.263344: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:1006] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2019-12-15 13:33:57.263612: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1618] Found device 0 with properties: 
name: GeForce RTX 2080 major: 7 minor: 5 memoryClockRate(GHz): 1.71
pciBusID: 0000:01:00.0
2019-12-15 13:33:57.263649: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library libcudart.so.10.0
2019-12-15 13:33:57.263658: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library libcublas.so.10.0
2019-12-15 13:33:57.263666: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library libcufft.so.10.0
2019-12-15 13:33:57.263674: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library libcurand.so.10.0
2019-12-15 13:33:57.263682: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library libcusolver.so.10.0
2019-12-15 13:33:57.263690: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library libcusparse.so.10.0
2019-12-15 13:33:57.263698: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library libcudnn.so.7
2019-12-15 13:33:57.263733: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:1006] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2019-12-15 13:33:57.263968: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:1006] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2019-12-15 13:33:57.264151: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1746] Adding visible gpu devices: 0
2019-12-15 13:33:57.264552: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:1006] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2019-12-15 13:33:57.264742: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1618] Found device 0 with properties: 
name: GeForce RTX 2080 major: 7 minor: 5 memoryClockRate(GHz): 1.71
pciBusID: 0000:01:00.0
2019-12-15 13:33:57.264755: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library libcudart.so.10.0
2019-12-15 13:33:57.264764: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library libcublas.so.10.0
2019-12-15 13:33:57.264771: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library libcufft.so.10.0
2019-12-15 13:33:57.264779: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library libcurand.so.10.0
2019-12-15 13:33:57.264787: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library libcusolver.so.10.0
2019-12-15 13:33:57.264794: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library libcusparse.so.10.0
2019-12-15 13:33:57.264802: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library libcudnn.so.7
2019-12-15 13:33:57.264827: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:1006] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2019-12-15 13:33:57.265505: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:1006] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2019-12-15 13:33:57.265690: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1746] Adding visible gpu devices: 0
2019-12-15 13:33:57.265705: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1159] Device interconnect StreamExecutor with strength 1 edge matrix:
2019-12-15 13:33:57.265708: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1165]      0 
2019-12-15 13:33:57.265710: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1178] 0:   N 
2019-12-15 13:33:57.265751: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:1006] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2019-12-15 13:33:57.265957: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:1006] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2019-12-15 13:33:57.266145: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1304] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 6802 MB memory) -> physical GPU (device: 0, name: GeForce RTX 2080, pci bus id: 0000:01:00.0, compute capability: 7.5)
2019-12-15 13:33:57.267710: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:1006] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2019-12-15 13:33:57.268112: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1618] Found device 0 with properties: 
name: GeForce RTX 2080 major: 7 minor: 5 memoryClockRate(GHz): 1.71
pciBusID: 0000:01:00.0
2019-12-15 13:33:57.268128: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library libcudart.so.10.0
2019-12-15 13:33:57.268137: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library libcublas.so.10.0
2019-12-15 13:33:57.268145: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library libcufft.so.10.0
2019-12-15 13:33:57.268153: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library libcurand.so.10.0
2019-12-15 13:33:57.268160: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library libcusolver.so.10.0
2019-12-15 13:33:57.268168: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library libcusparse.so.10.0
2019-12-15 13:33:57.268176: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library libcudnn.so.7
2019-12-15 13:33:57.268204: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:1006] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2019-12-15 13:33:57.268417: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:1006] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2019-12-15 13:33:57.268654: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1746] Adding visible gpu devices: 0
2019-12-15 13:33:57.268664: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1159] Device interconnect StreamExecutor with strength 1 edge matrix:
2019-12-15 13:33:57.268667: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1165]      0 
2019-12-15 13:33:57.268669: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1178] 0:   N 
2019-12-15 13:33:57.268742: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:1006] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2019-12-15 13:33:57.269018: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:1006] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2019-12-15 13:33:57.269267: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1304] Created TensorFlow device (/device:GPU:0 with 6802 MB memory) -> physical GPU (device: 0, name: GeForce RTX 2080, pci bus id: 0000:01:00.0, compute capability: 7.5)
Number of devices: 1
WARNING:tensorflow:From /opt/anaconda/anaconda3/lib/python3.7/site-packages/tensorflow_addons/layers/wrappers.py:84: Layer.add_variable (from tensorflow.python.keras.engine.base_layer) is deprecated and will be removed in a future version.
Instructions for updating:
Please use `layer.add_weight` method instead.
INFO:tensorflow:Error reported to Coordinator: in converted code:

    /home/shreyas/Desktop/custom_training.py:113 train_step  *
        predictions = model(images, training=True)
    /opt/anaconda/anaconda3/lib/python3.7/site-packages/tensorflow_core/python/keras/engine/base_layer.py:847 __call__
        outputs = call_fn(cast_inputs, *args, **kwargs)
    /opt/anaconda/anaconda3/lib/python3.7/site-packages/tensorflow_addons/layers/wrappers.py:113 _update_weights  *
        with tf.control_dependencies(self._initialize_weights(inputs)):
    /opt/anaconda/anaconda3/lib/python3.7/site-packages/tensorflow_core/python/ops/control_flow_ops.py:1389 cond_for_tf_v2
        return cond(pred, true_fn=true_fn, false_fn=false_fn, strict=True, name=name)
    /opt/anaconda/anaconda3/lib/python3.7/site-packages/tensorflow_core/python/util/deprecation.py:507 new_func
        return func(*args, **kwargs)
    /opt/anaconda/anaconda3/lib/python3.7/site-packages/tensorflow_core/python/ops/control_flow_ops.py:1174 cond
        return cond_v2.cond_v2(pred, true_fn, false_fn, name)
    /opt/anaconda/anaconda3/lib/python3.7/site-packages/tensorflow_core/python/ops/cond_v2.py:91 cond_v2
        op_return_value=pred)
    /opt/anaconda/anaconda3/lib/python3.7/site-packages/tensorflow_core/python/framework/func_graph.py:915 func_graph_from_py_func
        func_outputs = python_func(*func_args, **func_kwargs)
    /tmp/tmp0jpsvtyk.py:31 _update_weights
        with tf.control_dependencies(self._initialize_weights(inputs)):
    /opt/anaconda/anaconda3/lib/python3.7/site-packages/tensorflow_addons/layers/wrappers.py:148 _initialize_weights
        assign_tensors = self._init_norm()
    /opt/anaconda/anaconda3/lib/python3.7/site-packages/tensorflow_addons/layers/wrappers.py:157 _init_norm
        g_tensor = self.g.assign(tf.reshape(v_norm, (self.layer_depth,)))
    /opt/anaconda/anaconda3/lib/python3.7/site-packages/tensorflow_core/python/distribute/values.py:1036 assign
        return self._assign_func(f=assign_fn, *args, **kwargs)
    /opt/anaconda/anaconda3/lib/python3.7/site-packages/tensorflow_core/python/distribute/values.py:1016 _assign_func
        variable_type="MirroredVariable"))

    ValueError: You must specify an aggregation method to update a MirroredVariable in Replica Context. You can do so by passing an explicit value for argument `aggregation` to tf.Variable(..).e.g. `tf.Variable(..., aggregation=tf.VariableAggregation.SUM)``tf.VariableAggregation` lists the possible aggregation methods.This is required because MirroredVariable should always be kept in sync. When updating them or assigning to them in a replica context, we automatically try to aggregate the values before updating the variable. For this aggregation, we need to know the aggregation method. Another alternative is to not try to update such MirroredVariable in replica context, but in cross replica context. You can enter cross replica context by calling `tf.distribute.get_replica_context().merge_call(merge_fn, ..)`.Inside `merge_fn`, you can then update the MirroredVariable using `tf.distribute.StrategyExtended.update()`.
Traceback (most recent call last):
  File "/opt/anaconda/anaconda3/lib/python3.7/site-packages/tensorflow_core/python/training/coordinator.py", line 297, in stop_on_exception
    yield
  File "/opt/anaconda/anaconda3/lib/python3.7/site-packages/tensorflow_core/python/distribute/mirrored_strategy.py", line 879, in run
    self.main_result = self.main_fn(*self.main_args, **self.main_kwargs)
  File "/opt/anaconda/anaconda3/lib/python3.7/site-packages/tensorflow_core/python/autograph/impl/api.py", line 237, in wrapper
    raise e.ag_error_metadata.to_exception(e)
ValueError: in converted code:

    /home/shreyas/Desktop/custom_training.py:113 train_step  *
        predictions = model(images, training=True)
    /opt/anaconda/anaconda3/lib/python3.7/site-packages/tensorflow_core/python/keras/engine/base_layer.py:847 __call__
        outputs = call_fn(cast_inputs, *args, **kwargs)
    /opt/anaconda/anaconda3/lib/python3.7/site-packages/tensorflow_addons/layers/wrappers.py:113 _update_weights  *
        with tf.control_dependencies(self._initialize_weights(inputs)):
    /opt/anaconda/anaconda3/lib/python3.7/site-packages/tensorflow_core/python/ops/control_flow_ops.py:1389 cond_for_tf_v2
        return cond(pred, true_fn=true_fn, false_fn=false_fn, strict=True, name=name)
    /opt/anaconda/anaconda3/lib/python3.7/site-packages/tensorflow_core/python/util/deprecation.py:507 new_func
        return func(*args, **kwargs)
    /opt/anaconda/anaconda3/lib/python3.7/site-packages/tensorflow_core/python/ops/control_flow_ops.py:1174 cond
        return cond_v2.cond_v2(pred, true_fn, false_fn, name)
    /opt/anaconda/anaconda3/lib/python3.7/site-packages/tensorflow_core/python/ops/cond_v2.py:91 cond_v2
        op_return_value=pred)
    /opt/anaconda/anaconda3/lib/python3.7/site-packages/tensorflow_core/python/framework/func_graph.py:915 func_graph_from_py_func
        func_outputs = python_func(*func_args, **func_kwargs)
    /tmp/tmp0jpsvtyk.py:31 _update_weights
        with tf.control_dependencies(self._initialize_weights(inputs)):
    /opt/anaconda/anaconda3/lib/python3.7/site-packages/tensorflow_addons/layers/wrappers.py:148 _initialize_weights
        assign_tensors = self._init_norm()
    /opt/anaconda/anaconda3/lib/python3.7/site-packages/tensorflow_addons/layers/wrappers.py:157 _init_norm
        g_tensor = self.g.assign(tf.reshape(v_norm, (self.layer_depth,)))
    /opt/anaconda/anaconda3/lib/python3.7/site-packages/tensorflow_core/python/distribute/values.py:1036 assign
        return self._assign_func(f=assign_fn, *args, **kwargs)
    /opt/anaconda/anaconda3/lib/python3.7/site-packages/tensorflow_core/python/distribute/values.py:1016 _assign_func
        variable_type="MirroredVariable"))

    ValueError: You must specify an aggregation method to update a MirroredVariable in Replica Context. You can do so by passing an explicit value for argument `aggregation` to tf.Variable(..).e.g. `tf.Variable(..., aggregation=tf.VariableAggregation.SUM)``tf.VariableAggregation` lists the possible aggregation methods.This is required because MirroredVariable should always be kept in sync. When updating them or assigning to them in a replica context, we automatically try to aggregate the values before updating the variable. For this aggregation, we need to know the aggregation method. Another alternative is to not try to update such MirroredVariable in replica context, but in cross replica context. You can enter cross replica context by calling `tf.distribute.get_replica_context().merge_call(merge_fn, ..)`.Inside `merge_fn`, you can then update the MirroredVariable using `tf.distribute.StrategyExtended.update()`.

Traceback (most recent call last):

  File "/home/shreyas/Desktop/custom_training.py", line 150, in <module>
    total_loss += distributed_train_step(x)

  File "/opt/anaconda/anaconda3/lib/python3.7/site-packages/tensorflow_core/python/eager/def_function.py", line 457, in __call__
    result = self._call(*args, **kwds)

  File "/opt/anaconda/anaconda3/lib/python3.7/site-packages/tensorflow_core/python/eager/def_function.py", line 503, in _call
    self._initialize(args, kwds, add_initializers_to=initializer_map)

  File "/opt/anaconda/anaconda3/lib/python3.7/site-packages/tensorflow_core/python/eager/def_function.py", line 408, in _initialize
    *args, **kwds))

  File "/opt/anaconda/anaconda3/lib/python3.7/site-packages/tensorflow_core/python/eager/function.py", line 1848, in _get_concrete_function_internal_garbage_collected
    graph_function, _, _ = self._maybe_define_function(args, kwargs)

  File "/opt/anaconda/anaconda3/lib/python3.7/site-packages/tensorflow_core/python/eager/function.py", line 2150, in _maybe_define_function
    graph_function = self._create_graph_function(args, kwargs)

  File "/opt/anaconda/anaconda3/lib/python3.7/site-packages/tensorflow_core/python/eager/function.py", line 2041, in _create_graph_function
    capture_by_value=self._capture_by_value),

  File "/opt/anaconda/anaconda3/lib/python3.7/site-packages/tensorflow_core/python/framework/func_graph.py", line 915, in func_graph_from_py_func
    func_outputs = python_func(*func_args, **func_kwargs)

  File "/opt/anaconda/anaconda3/lib/python3.7/site-packages/tensorflow_core/python/eager/def_function.py", line 358, in wrapped_fn
    return weak_wrapped_fn().__wrapped__(*args, **kwds)

  File "/opt/anaconda/anaconda3/lib/python3.7/site-packages/tensorflow_core/python/framework/func_graph.py", line 905, in wrapper
    raise e.ag_error_metadata.to_exception(e)

ValueError: in converted code:

    /home/shreyas/Desktop/custom_training.py:136 distributed_train_step  *
        per_replica_losses = strategy.experimental_run_v2(train_step,
    /opt/anaconda/anaconda3/lib/python3.7/site-packages/tensorflow_core/python/distribute/distribute_lib.py:760 experimental_run_v2
        return self._extended.call_for_each_replica(fn, args=args, kwargs=kwargs)
    /home/shreyas/Desktop/custom_training.py:113 train_step  *
        predictions = model(images, training=True)
    /opt/anaconda/anaconda3/lib/python3.7/site-packages/tensorflow_core/python/keras/engine/base_layer.py:847 __call__
        outputs = call_fn(cast_inputs, *args, **kwargs)
    /opt/anaconda/anaconda3/lib/python3.7/site-packages/tensorflow_addons/layers/wrappers.py:113 _update_weights  *
        with tf.control_dependencies(self._initialize_weights(inputs)):
    /opt/anaconda/anaconda3/lib/python3.7/site-packages/tensorflow_core/python/ops/control_flow_ops.py:1389 cond_for_tf_v2
        return cond(pred, true_fn=true_fn, false_fn=false_fn, strict=True, name=name)
    /opt/anaconda/anaconda3/lib/python3.7/site-packages/tensorflow_core/python/util/deprecation.py:507 new_func
        return func(*args, **kwargs)
    /opt/anaconda/anaconda3/lib/python3.7/site-packages/tensorflow_core/python/ops/control_flow_ops.py:1174 cond
        return cond_v2.cond_v2(pred, true_fn, false_fn, name)
    /opt/anaconda/anaconda3/lib/python3.7/site-packages/tensorflow_core/python/ops/cond_v2.py:91 cond_v2
        op_return_value=pred)
    /opt/anaconda/anaconda3/lib/python3.7/site-packages/tensorflow_core/python/framework/func_graph.py:915 func_graph_from_py_func
        func_outputs = python_func(*func_args, **func_kwargs)
    /tmp/tmp0jpsvtyk.py:31 _update_weights
        with tf.control_dependencies(self._initialize_weights(inputs)):
    /opt/anaconda/anaconda3/lib/python3.7/site-packages/tensorflow_addons/layers/wrappers.py:148 _initialize_weights
        assign_tensors = self._init_norm()
    /opt/anaconda/anaconda3/lib/python3.7/site-packages/tensorflow_addons/layers/wrappers.py:157 _init_norm
        g_tensor = self.g.assign(tf.reshape(v_norm, (self.layer_depth,)))
    /opt/anaconda/anaconda3/lib/python3.7/site-packages/tensorflow_core/python/distribute/values.py:1036 assign
        return self._assign_func(f=assign_fn, *args, **kwargs)
    /opt/anaconda/anaconda3/lib/python3.7/site-packages/tensorflow_core/python/distribute/values.py:1016 _assign_func
        variable_type="MirroredVariable"))

    ValueError: You must specify an aggregation method to update a MirroredVariable in Replica Context. You can do so by passing an explicit value for argument `aggregation` to tf.Variable(..).e.g. `tf.Variable(..., aggregation=tf.VariableAggregation.SUM)``tf.VariableAggregation` lists the possible aggregation methods.This is required because MirroredVariable should always be kept in sync. When updating them or assigning to them in a replica context, we automatically try to aggregate the values before updating the variable. For this aggregation, we need to know the aggregation method. Another alternative is to not try to update such MirroredVariable in replica context, but in cross replica context. You can enter cross replica context by calling `tf.distribute.get_replica_context().merge_call(merge_fn, ..)`.Inside `merge_fn`, you can then update the MirroredVariable using `tf.distribute.StrategyExtended.update()`.

Please take a look.

Thank you, Best, Shreyas Kamath

Squadrick commented 4 years ago

@shreyaskamathkm Could you please send the pytorch code you used that worked on multiple GPUs?

shreyaskamathkm commented 4 years ago

I was working on super-resolution and had used the code provided here. To perform data parallelism, I used torch.nn.DataParallel(model). Please let me know if you need complete PyTorch code for demonstration.

Thank you, Best, SK

Squadrick commented 4 years ago

@shreyaskamathkm Figured out the problem, the fix should be easy. I'll open a PR once #819 is merged.

Unfortunately, I don't have access to multiple GPUs. Could you help out with the testing?

shreyaskamathkm commented 4 years ago

Sure, I can test the code.

veqtor commented 4 years ago

Any updates on this?

Squadrick commented 4 years ago

@veqtor Didn't find the time to get around to solving this, I'll try to get it done soon.

sourcecode369 commented 4 years ago

I am facing the same issue. I am using TensorFlow 2.2.0-rc3 and TensorFlow Addons 0.9.1. Below is my code to reproduce it:

import tensorflow as tf
print(f"tf.__version__: {tf.__version__}")  # log the TensorFlow version in use
tf.config.optimizer.set_jit(True)  # enable XLA JIT compilation via the graph optimizer setting
import tensorflow_addons as tfa;tfa.options.TF_ADDONS_PY_OPS = True  # presumably selects TFA's pure-Python op fallbacks -- confirm against tfa.options docs
from tensorflow.keras import backend as K  # NOTE(review): K, mnist and os are never used in this script
from tensorflow.keras.datasets import mnist
import tensorflow_datasets as tfds
import os
# Single MirroredStrategy for the whole script; the model is created under its scope further down.
strategy = tf.distribute.MirroredStrategy()
def get_dataset(batch_size=200):
  """Return (train, test) MNIST pipelines from tfds, pixel-scaled and batched by `batch_size`."""
  def scale(image, label):
    # Cast pixels to float32 and divide by 255; the label passes through untouched.
    return tf.cast(image, tf.float32) / 255.0, label

  splits, _ = tfds.load(
      name='mnist', with_info=True, as_supervised=True, try_gcs=True)
  autotune = tf.data.experimental.AUTOTUNE

  # Only the training split is shuffled; both are cached, batched and prefetched.
  train_dataset = splits['train'].map(scale).cache().shuffle(10000)
  train_dataset = train_dataset.batch(batch_size).prefetch(autotune)
  test_dataset = splits['test'].map(scale).cache().batch(batch_size).prefetch(autotune)
  return train_dataset, test_dataset

def create_model():
  """Build a Sequential net: Conv2D(32, 3) -> Flatten -> Dense(128) -> Dense(10), each non-Flatten layer weight-normalized."""
  layers, wn, mish = tf.keras.layers, tfa.layers.WeightNormalization, tfa.activations.mish
  conv = wn(layers.Conv2D(32, 3, activation=mish, input_shape=(28, 28, 1)))
  hidden, logits = wn(layers.Dense(128, activation=mish)), wn(layers.Dense(10))
  return tf.keras.Sequential([conv, layers.Flatten(), hidden, logits])

train_dataset, test_dataset = get_dataset()  # datasets are built outside the strategy scope
with strategy.scope():  # model creation and compile both happen under the MirroredStrategy scope
  model = create_model()
  model.compile(optimizer='adam',
                loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
                metrics=['sparse_categorical_accuracy'])
# The traceback below points at this call (script line 41); it fails inside tensorflow_addons wrappers.py `build`.
model.fit(train_dataset,epochs=10,validation_data=test_dataset, callbacks=[tfa.callbacks.TQDMProgressBar()],verbose=0)

Error:

INFO:tensorflow:Error reported to Coordinator: You must specify an aggregation method to update a MirroredVariable in Replica Context. You can do so by passing an explicit value for argument `aggregation` to tf.Variable(..).e.g. `tf.Variable(..., aggregation=tf.VariableAggregation.SUM)``tf.VariableAggregation` lists the possible aggregation methods.This is required because MirroredVariable should always be kept in sync. When updating them or assigning to them in a replica context, we automatically try to aggregate the values before updating the variable. For this aggregation, we need to know the aggregation method. Another alternative is to not try to update such MirroredVariable in replica context, but in cross replica context. You can enter cross replica context by calling `tf.distribute.get_replica_context().merge_call(merge_fn, ..)`.Inside `merge_fn`, you can then update the MirroredVariable using `tf.distribute.StrategyExtended.update()`.
Traceback (most recent call last):
  File "/usr/local/lib/python3.6/dist-packages/tensorflow/python/training/coordinator.py", line 297, in stop_on_exception
    yield
  File "/usr/local/lib/python3.6/dist-packages/tensorflow/python/distribute/mirrored_run.py", line 321, in run
    self.main_result = self.main_fn(*self.main_args, **self.main_kwargs)
  File "/usr/local/lib/python3.6/dist-packages/tensorflow/python/autograph/impl/api.py", line 259, in wrapper
    return converted_call(f, args, kwargs, options=options)
  File "/usr/local/lib/python3.6/dist-packages/tensorflow/python/autograph/impl/api.py", line 526, in converted_call
    return _call_unconverted(f, args, kwargs, options)
  File "/usr/local/lib/python3.6/dist-packages/tensorflow/python/autograph/impl/api.py", line 343, in _call_unconverted
    return f(*args, **kwargs)
  File "/usr/local/lib/python3.6/dist-packages/tensorflow/python/keras/engine/training.py", line 611, in run_step
    outputs = model.train_step(data)
  File "/usr/local/lib/python3.6/dist-packages/tensorflow/python/keras/engine/training.py", line 569, in train_step
    y_pred = self(x, training=True)
  File "/usr/local/lib/python3.6/dist-packages/tensorflow/python/keras/engine/base_layer.py", line 935, in __call__
    outputs = call_fn(cast_inputs, *args, **kwargs)
  File "/usr/local/lib/python3.6/dist-packages/tensorflow/python/keras/engine/sequential.py", line 393, in call
    outputs = layer(inputs, **kwargs)
  File "/usr/local/lib/python3.6/dist-packages/tensorflow/python/keras/engine/base_layer.py", line 905, in __call__
    self._maybe_build(inputs)
  File "/usr/local/lib/python3.6/dist-packages/tensorflow/python/keras/engine/base_layer.py", line 2452, in _maybe_build
    self.build(input_shapes)  # pylint:disable=not-callable
  File "/usr/local/lib/python3.6/dist-packages/tensorflow_addons/layers/wrappers.py", line 119, in build
    self._naked_clone_layer.set_weights(self.layer.get_weights())
  File "/usr/local/lib/python3.6/dist-packages/tensorflow/python/keras/engine/base_layer.py", line 1638, in set_weights
    backend.batch_set_value(weight_value_tuples)
  File "/usr/local/lib/python3.6/dist-packages/tensorflow/python/keras/backend.py", line 3427, in batch_set_value
    x.assign(np.asarray(value, dtype=dtype(x)))
  File "/usr/local/lib/python3.6/dist-packages/tensorflow/python/distribute/values.py", line 830, in assign
    read_value=read_value)
  File "/usr/local/lib/python3.6/dist-packages/tensorflow/python/distribute/values.py", line 779, in _mirrored_update
    _aggregation_error_msg.format(variable_type="MirroredVariable"))
ValueError: You must specify an aggregation method to update a MirroredVariable in Replica Context. You can do so by passing an explicit value for argument `aggregation` to tf.Variable(..).e.g. `tf.Variable(..., aggregation=tf.VariableAggregation.SUM)``tf.VariableAggregation` lists the possible aggregation methods.This is required because MirroredVariable should always be kept in sync. When updating them or assigning to them in a replica context, we automatically try to aggregate the values before updating the variable. For this aggregation, we need to know the aggregation method. Another alternative is to not try to update such MirroredVariable in replica context, but in cross replica context. You can enter cross replica context by calling `tf.distribute.get_replica_context().merge_call(merge_fn, ..)`.Inside `merge_fn`, you can then update the MirroredVariable using `tf.distribute.StrategyExtended.update()`.
INFO:tensorflow:Error reported to Coordinator: You must specify an aggregation method to update a MirroredVariable in Replica Context. You can do so by passing an explicit value for argument `aggregation` to tf.Variable(..).e.g. `tf.Variable(..., aggregation=tf.VariableAggregation.SUM)``tf.VariableAggregation` lists the possible aggregation methods.This is required because MirroredVariable should always be kept in sync. When updating them or assigning to them in a replica context, we automatically try to aggregate the values before updating the variable. For this aggregation, we need to know the aggregation method. Another alternative is to not try to update such MirroredVariable in replica context, but in cross replica context. You can enter cross replica context by calling `tf.distribute.get_replica_context().merge_call(merge_fn, ..)`.Inside `merge_fn`, you can then update the MirroredVariable using `tf.distribute.StrategyExtended.update()`.
Traceback (most recent call last):
  File "/usr/local/lib/python3.6/dist-packages/tensorflow/python/training/coordinator.py", line 297, in stop_on_exception
    yield
  File "/usr/local/lib/python3.6/dist-packages/tensorflow/python/distribute/mirrored_run.py", line 321, in run
    self.main_result = self.main_fn(*self.main_args, **self.main_kwargs)
  File "/usr/local/lib/python3.6/dist-packages/tensorflow/python/autograph/impl/api.py", line 259, in wrapper
    return converted_call(f, args, kwargs, options=options)
  File "/usr/local/lib/python3.6/dist-packages/tensorflow/python/autograph/impl/api.py", line 526, in converted_call
    return _call_unconverted(f, args, kwargs, options)
  File "/usr/local/lib/python3.6/dist-packages/tensorflow/python/autograph/impl/api.py", line 343, in _call_unconverted
    return f(*args, **kwargs)
  File "/usr/local/lib/python3.6/dist-packages/tensorflow/python/keras/engine/training.py", line 611, in run_step
    outputs = model.train_step(data)
  File "/usr/local/lib/python3.6/dist-packages/tensorflow/python/keras/engine/training.py", line 569, in train_step
    y_pred = self(x, training=True)
  File "/usr/local/lib/python3.6/dist-packages/tensorflow/python/keras/engine/base_layer.py", line 935, in __call__
    outputs = call_fn(cast_inputs, *args, **kwargs)
  File "/usr/local/lib/python3.6/dist-packages/tensorflow/python/keras/engine/sequential.py", line 393, in call
    outputs = layer(inputs, **kwargs)
  File "/usr/local/lib/python3.6/dist-packages/tensorflow/python/keras/engine/base_layer.py", line 905, in __call__
    self._maybe_build(inputs)
  File "/usr/local/lib/python3.6/dist-packages/tensorflow/python/keras/engine/base_layer.py", line 2452, in _maybe_build
    self.build(input_shapes)  # pylint:disable=not-callable
  File "/usr/local/lib/python3.6/dist-packages/tensorflow_addons/layers/wrappers.py", line 119, in build
    self._naked_clone_layer.set_weights(self.layer.get_weights())
  File "/usr/local/lib/python3.6/dist-packages/tensorflow/python/keras/engine/base_layer.py", line 1638, in set_weights
    backend.batch_set_value(weight_value_tuples)
  File "/usr/local/lib/python3.6/dist-packages/tensorflow/python/keras/backend.py", line 3427, in batch_set_value
    x.assign(np.asarray(value, dtype=dtype(x)))
  File "/usr/local/lib/python3.6/dist-packages/tensorflow/python/distribute/values.py", line 830, in assign
    read_value=read_value)
  File "/usr/local/lib/python3.6/dist-packages/tensorflow/python/distribute/values.py", line 779, in _mirrored_update
    _aggregation_error_msg.format(variable_type="MirroredVariable"))
ValueError: You must specify an aggregation method to update a MirroredVariable in Replica Context. You can do so by passing an explicit value for argument `aggregation` to tf.Variable(..).e.g. `tf.Variable(..., aggregation=tf.VariableAggregation.SUM)``tf.VariableAggregation` lists the possible aggregation methods.This is required because MirroredVariable should always be kept in sync. When updating them or assigning to them in a replica context, we automatically try to aggregate the values before updating the variable. For this aggregation, we need to know the aggregation method. Another alternative is to not try to update such MirroredVariable in replica context, but in cross replica context. You can enter cross replica context by calling `tf.distribute.get_replica_context().merge_call(merge_fn, ..)`.Inside `merge_fn`, you can then update the MirroredVariable using `tf.distribute.StrategyExtended.update()`.
---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
<ipython-input-1-b1626bb45201> in <module>()
     39                 metrics=['sparse_categorical_accuracy'])
     40 
---> 41 model.fit(train_dataset,epochs=10,validation_data=test_dataset, callbacks=[tfa.callbacks.TQDMProgressBar()],verbose=0)

10 frames
/usr/local/lib/python3.6/dist-packages/tensorflow/python/keras/engine/training.py in _method_wrapper(self, *args, **kwargs)
     69   def _method_wrapper(self, *args, **kwargs):
     70     if not self._in_multi_worker_mode():  # pylint: disable=protected-access
---> 71       return method(self, *args, **kwargs)
     72 
     73     # Running inside `run_distribute_coordinator` already.

/usr/local/lib/python3.6/dist-packages/tensorflow/python/keras/engine/training.py in fit(self, x, y, batch_size, epochs, verbose, callbacks, validation_split, validation_data, shuffle, class_weight, sample_weight, initial_epoch, steps_per_epoch, validation_steps, validation_batch_size, validation_freq, max_queue_size, workers, use_multiprocessing, **kwargs)
    915                 batch_size=batch_size):
    916               callbacks.on_train_batch_begin(step)
--> 917               tmp_logs = train_function(iterator)
    918               if data_handler.should_sync:
    919                 context.async_wait()

/usr/local/lib/python3.6/dist-packages/tensorflow/python/eager/def_function.py in __call__(self, *args, **kwds)
    693       else:
    694         compiler = "nonXla"
--> 695         result = self._call(*args, **kwds)
    696 
    697       new_tracing_count = self._get_tracing_count()

/usr/local/lib/python3.6/dist-packages/tensorflow/python/eager/def_function.py in _call(self, *args, **kwds)
    735       # This is the first call of __call__, so we have to initialize.
    736       initializers = []
--> 737       self._initialize(args, kwds, add_initializers_to=initializers)
    738     finally:
    739       # At this point we know that the initialization is complete (or less

/usr/local/lib/python3.6/dist-packages/tensorflow/python/eager/def_function.py in _initialize(self, args, kwds, add_initializers_to)
    615     self._concrete_stateful_fn = (
    616         self._stateful_fn._get_concrete_function_internal_garbage_collected(  # pylint: disable=protected-access
--> 617             *args, **kwds))
    618 
    619     def invalid_creator_scope(*unused_args, **unused_kwds):

/usr/local/lib/python3.6/dist-packages/tensorflow/python/eager/function.py in _get_concrete_function_internal_garbage_collected(self, *args, **kwargs)
   2445       args, kwargs = None, None
   2446     with self._lock:
-> 2447       graph_function, _, _ = self._maybe_define_function(args, kwargs)
   2448     return graph_function
   2449 

/usr/local/lib/python3.6/dist-packages/tensorflow/python/eager/function.py in _maybe_define_function(self, args, kwargs)
   2773 
   2774       self._function_cache.missed.add(call_context_key)
-> 2775       graph_function = self._create_graph_function(args, kwargs)
   2776       self._function_cache.primary[cache_key] = graph_function
   2777       return graph_function, args, kwargs

/usr/local/lib/python3.6/dist-packages/tensorflow/python/eager/function.py in _create_graph_function(self, args, kwargs, override_flat_arg_shapes)
   2663             arg_names=arg_names,
   2664             override_flat_arg_shapes=override_flat_arg_shapes,
-> 2665             capture_by_value=self._capture_by_value),
   2666         self._function_attributes,
   2667         # Tell the ConcreteFunction to clean up its graph once it goes out of

/usr/local/lib/python3.6/dist-packages/tensorflow/python/framework/func_graph.py in func_graph_from_py_func(name, python_func, args, kwargs, signature, func_graph, autograph, autograph_options, add_control_dependencies, arg_names, op_return_value, collections, capture_by_value, override_flat_arg_shapes)
    979         _, original_func = tf_decorator.unwrap(python_func)
    980 
--> 981       func_outputs = python_func(*func_args, **func_kwargs)
    982 
    983       # invariant: `func_outputs` contains only Tensors, CompositeTensors,

/usr/local/lib/python3.6/dist-packages/tensorflow/python/eager/def_function.py in wrapped_fn(*args, **kwds)
    526         # __wrapped__ allows AutoGraph to swap in a converted function. We give
    527         # the function a weak reference to itself to avoid a reference cycle.
--> 528         return weak_wrapped_fn().__wrapped__(*args, **kwds)
    529     weak_wrapped_fn = weakref.ref(wrapped_fn)
    530 

/usr/local/lib/python3.6/dist-packages/tensorflow/python/framework/func_graph.py in wrapper(*args, **kwargs)
    966           except Exception as e:  # pylint:disable=broad-except
    967             if hasattr(e, "ag_error_metadata"):
--> 968               raise e.ag_error_metadata.to_exception(e)
    969             else:
    970               raise

ValueError: in user code:

    /usr/local/lib/python3.6/dist-packages/tensorflow/python/keras/engine/training.py:628 train_function  *
        return step_function(self, iterator)
    /usr/local/lib/python3.6/dist-packages/tensorflow/python/keras/engine/training.py:618 step_function  **
        outputs = model.distribute_strategy.run(run_step, args=(data,))
    /usr/local/lib/python3.6/dist-packages/tensorflow/python/distribute/distribute_lib.py:952 run
        return self._extended.call_for_each_replica(fn, args=args, kwargs=kwargs)
    /usr/local/lib/python3.6/dist-packages/tensorflow/python/distribute/distribute_lib.py:2292 call_for_each_replica
        return self._call_for_each_replica(fn, args, kwargs)
    /usr/local/lib/python3.6/dist-packages/tensorflow/python/distribute/mirrored_strategy.py:584 _call_for_each_replica
        args, kwargs)
    /usr/local/lib/python3.6/dist-packages/tensorflow/python/distribute/mirrored_run.py:96 call_for_each_replica
        return _call_for_each_replica(strategy, fn, args, kwargs)
    /usr/local/lib/python3.6/dist-packages/tensorflow/python/distribute/mirrored_run.py:235 _call_for_each_replica
        coord.join(threads)
    /usr/local/lib/python3.6/dist-packages/tensorflow/python/training/coordinator.py:389 join
        six.reraise(*self._exc_info_to_raise)
    /usr/local/lib/python3.6/dist-packages/six.py:693 reraise
        raise value
    /usr/local/lib/python3.6/dist-packages/tensorflow/python/training/coordinator.py:297 stop_on_exception
        yield
    /usr/local/lib/python3.6/dist-packages/tensorflow/python/distribute/mirrored_run.py:321 run
        self.main_result = self.main_fn(*self.main_args, **self.main_kwargs)
    /usr/local/lib/python3.6/dist-packages/tensorflow/python/keras/engine/training.py:611 run_step  **
        outputs = model.train_step(data)
    /usr/local/lib/python3.6/dist-packages/tensorflow/python/keras/engine/training.py:569 train_step
        y_pred = self(x, training=True)
    /usr/local/lib/python3.6/dist-packages/tensorflow/python/keras/engine/base_layer.py:935 __call__
        outputs = call_fn(cast_inputs, *args, **kwargs)
    /usr/local/lib/python3.6/dist-packages/tensorflow/python/keras/engine/sequential.py:393 call
        outputs = layer(inputs, **kwargs)
    /usr/local/lib/python3.6/dist-packages/tensorflow/python/keras/engine/base_layer.py:905 __call__
        self._maybe_build(inputs)
    /usr/local/lib/python3.6/dist-packages/tensorflow/python/keras/engine/base_layer.py:2452 _maybe_build
        self.build(input_shapes)  # pylint:disable=not-callable
    /usr/local/lib/python3.6/dist-packages/tensorflow_addons/layers/wrappers.py:119 build
        self._naked_clone_layer.set_weights(self.layer.get_weights())
    /usr/local/lib/python3.6/dist-packages/tensorflow/python/keras/engine/base_layer.py:1638 set_weights
        backend.batch_set_value(weight_value_tuples)
    /usr/local/lib/python3.6/dist-packages/tensorflow/python/keras/backend.py:3427 batch_set_value
        x.assign(np.asarray(value, dtype=dtype(x)))
    /usr/local/lib/python3.6/dist-packages/tensorflow/python/distribute/values.py:830 assign
        read_value=read_value)
    /usr/local/lib/python3.6/dist-packages/tensorflow/python/distribute/values.py:779 _mirrored_update
        _aggregation_error_msg.format(variable_type="MirroredVariable"))

    ValueError: You must specify an aggregation method to update a MirroredVariable in Replica Context. You can do so by passing an explicit value for argument `aggregation` to tf.Variable(..).e.g. `tf.Variable(..., aggregation=tf.VariableAggregation.SUM)``tf.VariableAggregation` lists the possible aggregation methods.This is required because MirroredVariable should always be kept in sync. When updating them or assigning to them in a replica context, we automatically try to aggregate the values before updating the variable. For this aggregation, we need to know the aggregation method. Another alternative is to not try to update such MirroredVariable in replica context, but in cross replica context. You can enter cross replica context by calling `tf.distribute.get_replica_context().merge_call(merge_fn, ..)`.Inside `merge_fn`, you can then update the MirroredVariable using `tf.distribute.StrategyExtended.update()`.

Hope to see a fix soon. Thank you.

leoffx commented 4 years ago

Facing the same problem on TF 2.2.0, tfa 0.10 on Colab's TPU. Is there any workaround? @Squadrick