Closed shreyaskamathkm closed 4 years ago
Hi @shreyaskamathkm thank you for identifying this! Would you mind adding a minimal reproducible example along with the error that you receive?
Sure thing. I will add it over the weekend.
Hi @seanpmorgan, I adapted the distribution-strategy version of the TensorFlow custom-training example. With `n_gpus = 1` it works as expected; with `n_gpus = 2` it raises an error. The complete log is attached after the code.
from __future__ import absolute_import, division, print_function, unicode_literals
import tensorflow as tf
# Device-placement logging is switched off here; pass True instead to log
# where ops and variables are placed.
tf.debugging.set_log_device_placement(False)
# The default graph is reset twice (here and via `ops` below).
# NOTE(review): the two calls look redundant - confirm one can be dropped.
tf.compat.v1.reset_default_graph()
from tensorflow.python.framework import ops
ops.reset_default_graph()
from tensorflow.compat.v1 import ConfigProto
from tensorflow.compat.v1 import InteractiveSession
# Open a TF1-style interactive session with the `allow_growth` GPU option set
# (presumably so the process does not reserve all GPU memory up front -
# TODO confirm against the ConfigProto documentation).
config = ConfigProto()
config.gpu_options.allow_growth = True
session = InteractiveSession(config=config)
import numpy as np
import os
import tensorflow_addons as tfa
# Record the TensorFlow version in the output for the bug report.
print(tf.__version__)
def conv2d_weightnorm(filters, kernel_size, padding='valid', activation=None, **kwargs):
    """Build a Conv2D layer wrapped in ``tfa.layers.WeightNormalization``.

    ``filters``, ``kernel_size``, ``padding``, ``activation`` and any extra
    keyword arguments are forwarded unchanged to ``tf.keras.layers.Conv2D``.
    The wrapper is created with ``data_init=False``.
    """
    conv = tf.keras.layers.Conv2D(
        filters,
        kernel_size,
        padding=padding,
        activation=activation,
        **kwargs
    )
    return tfa.layers.WeightNormalization(conv, data_init=False)
"""## Download the fashion MNIST dataset"""
# Number of GPUs to train on; this value picks the distribution strategy.
n_gpus = 2
fashion_mnist = tf.keras.datasets.fashion_mnist
(train_images, train_labels), (test_images, test_labels) = fashion_mnist.load_data()
# Append a trailing axis of size 1 to each image array
# (presumably the channel dimension expected by Conv2D - TODO confirm).
train_images = train_images[..., None]
test_images = test_images[..., None]
# Getting the images in [0, 1] range.
train_images = train_images / np.float32(255)
test_images = test_images / np.float32(255)
# Pick the distribution strategy from `n_gpus`.
# If the list of devices is not specified in the
# `tf.distribute.MirroredStrategy` constructor, it will be auto-detected.
if n_gpus == 1:
    strategy = tf.distribute.OneDeviceStrategy(device="/gpu:0")
elif n_gpus > 1:
    strategy = tf.distribute.MirroredStrategy()
else:
    # Without this branch `strategy` would stay unbound and the print below
    # would fail with an unrelated NameError.
    raise ValueError('n_gpus must be >= 1, got {}'.format(n_gpus))
print ('Number of devices: {}'.format(strategy.num_replicas_in_sync))
# The shuffle buffer is as large as the whole training set.
BUFFER_SIZE = len(train_images)
BATCH_SIZE_PER_REPLICA = 64
# Global batch size = per-replica batch size x number of replicas in sync.
GLOBAL_BATCH_SIZE = BATCH_SIZE_PER_REPLICA * strategy.num_replicas_in_sync
EPOCHS = 10
# Both pipelines batch by GLOBAL_BATCH_SIZE with drop_remainder=True; only the
# training pipeline is shuffled.
train_dataset = tf.data.Dataset.from_tensor_slices((train_images, train_labels)).shuffle(BUFFER_SIZE).batch(GLOBAL_BATCH_SIZE, drop_remainder=True)
test_dataset = tf.data.Dataset.from_tensor_slices((test_images, test_labels)).batch(GLOBAL_BATCH_SIZE, drop_remainder=True)
# Hand the batched datasets to the strategy for distribution.
train_dist_dataset = strategy.experimental_distribute_dataset(train_dataset)
test_dist_dataset = strategy.experimental_distribute_dataset(test_dataset)
def create_model():
    """Return a new, uncompiled Sequential CNN classifier.

    The stack is two weight-normalised 3x3 convolutions (32 then 64 filters,
    ReLU), each followed by max pooling, then Flatten, Dense(64, ReLU) and a
    10-unit softmax output.
    """
    layers = [
        conv2d_weightnorm(filters=32, kernel_size=3, activation='relu'),
        tf.keras.layers.MaxPooling2D(),
        conv2d_weightnorm(filters=64, kernel_size=3, activation='relu'),
        tf.keras.layers.MaxPooling2D(),
        tf.keras.layers.Flatten(),
        tf.keras.layers.Dense(64, activation='relu'),
        tf.keras.layers.Dense(10, activation='softmax'),
    ]
    return tf.keras.Sequential(layers)
# Directory that holds the training checkpoints. Only the path strings are
# built here; nothing is created on disk by these two lines.
checkpoint_dir = './training_checkpoints'
# Path prefix passed to `checkpoint.save`.
checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt")
with strategy.scope():
    # Reduction is NONE so the loss object yields per-example values; the
    # reduction is done afterwards by dividing by the global batch size.
    # (`tf.keras.losses.sparse_categorical_crossentropy` would also work.)
    loss_object = tf.keras.losses.SparseCategoricalCrossentropy(
        reduction=tf.keras.losses.Reduction.NONE)

    def compute_loss(labels, predictions):
        """Return the per-example loss averaged over GLOBAL_BATCH_SIZE."""
        return tf.nn.compute_average_loss(
            loss_object(labels, predictions),
            global_batch_size=GLOBAL_BATCH_SIZE)
# Metrics are created under the strategy scope: a running mean of the test
# loss, plus train and test accuracy.
with strategy.scope():
    test_loss = tf.keras.metrics.Mean(name='test_loss')
    train_accuracy = tf.keras.metrics.SparseCategoricalAccuracy(name='train_accuracy')
    test_accuracy = tf.keras.metrics.SparseCategoricalAccuracy(name='test_accuracy')
"""## Training loop"""
# The model and optimizer must be created under `strategy.scope`; the
# checkpoint object tracks both of them.
with strategy.scope():
    model = create_model()
    optimizer = tf.keras.optimizers.Adam()
    checkpoint = tf.train.Checkpoint(model=model, optimizer=optimizer)
with strategy.scope():
    def train_step(inputs):
        """Run one optimisation step on an (images, labels) batch.

        Applies the gradients, updates `train_accuracy` and returns the loss
        from `compute_loss`.
        """
        images, labels = inputs

        with tf.GradientTape() as tape:
            predictions = model(images, training=True)
            loss = compute_loss(labels, predictions)

        variables = model.trainable_variables
        gradients = tape.gradient(loss, variables)
        optimizer.apply_gradients(zip(gradients, variables))

        train_accuracy.update_state(labels, predictions)
        return loss

    def test_step(inputs):
        """Evaluate an (images, labels) batch, updating the test metrics."""
        images, labels = inputs
        predictions = model(images, training=False)

        # `loss_object` uses Reduction.NONE; the Mean metric does the averaging.
        test_loss.update_state(loss_object(labels, predictions))
        test_accuracy.update_state(labels, predictions)
with strategy.scope():
    @tf.function
    def distributed_train_step(dataset_inputs):
        """Run `train_step` on every replica and sum the per-replica losses."""
        # `experimental_run_v2` replicates the provided computation and runs
        # it with the distributed input.
        per_replica_losses = strategy.experimental_run_v2(
            train_step, args=(dataset_inputs,))
        return strategy.reduce(
            tf.distribute.ReduceOp.SUM, per_replica_losses, axis=None)

    @tf.function
    def distributed_test_step(dataset_inputs):
        """Run `test_step` on every replica."""
        return strategy.experimental_run_v2(test_step, args=(dataset_inputs,))

    # The report format does not change between epochs, so build it once.
    template = ("Epoch {}, Loss: {}, Accuracy: {}, Test Loss: {}, "
                "Test Accuracy: {}")

    for epoch in range(EPOCHS):
        # TRAIN LOOP: accumulate the summed loss and count the batches.
        total_loss = 0.0
        num_batches = 0
        for batch in train_dist_dataset:
            total_loss += distributed_train_step(batch)
            num_batches += 1
        train_loss = total_loss / num_batches

        # TEST LOOP: results are collected in the test metrics.
        for batch in test_dist_dataset:
            distributed_test_step(batch)

        # Save a checkpoint on every even-numbered epoch index.
        if epoch % 2 == 0:
            checkpoint.save(checkpoint_prefix)

        print (template.format(epoch+1, train_loss,
                               train_accuracy.result()*100, test_loss.result(),
                               test_accuracy.result()*100))

        # Start every epoch with fresh metric state.
        for metric in (test_loss, train_accuracy, test_accuracy):
            metric.reset_states()
# Rebuild the model and optimizer outside any strategy scope, restore the
# latest checkpoint into them, and measure accuracy on the test set.
eval_accuracy = tf.keras.metrics.SparseCategoricalAccuracy(name='eval_accuracy')
new_model = create_model()
new_optimizer = tf.keras.optimizers.Adam()
test_dataset = tf.data.Dataset.from_tensor_slices(
    (test_images, test_labels)).batch(GLOBAL_BATCH_SIZE)


@tf.function
def eval_step(images, labels):
    """Feed one batch through `new_model` and update `eval_accuracy`."""
    predictions = new_model(images, training=False)
    eval_accuracy(labels, predictions)


checkpoint = tf.train.Checkpoint(model=new_model, optimizer=new_optimizer)
checkpoint.restore(tf.train.latest_checkpoint(checkpoint_dir))

for images, labels in test_dataset:
    eval_step(images, labels)

print ('Accuracy after restoring the saved model without strategy: {}'.format(
    eval_accuracy.result()*100))
# Alternative training loop that pulls a fixed number of batches from an
# explicit iterator instead of looping over the whole dataset.
with strategy.scope():
    # Bind the loop variable to `epoch`: the original looped over `_` while
    # printing a stale `epoch` left over from the earlier training loop, so
    # every line reported the same epoch number.
    for epoch in range(EPOCHS):
        total_loss = 0.0
        num_batches = 0
        train_iter = iter(train_dist_dataset)

        # Each pass runs only 10 training steps.
        for _ in range(10):
            total_loss += distributed_train_step(next(train_iter))
            num_batches += 1
        average_train_loss = total_loss / num_batches

        template = ("Epoch {}, Loss: {}, Accuracy: {}")
        print (template.format(epoch+1, average_train_loss, train_accuracy.result()*100))
        train_accuracy.reset_states()
with strategy.scope():
    @tf.function
    def distributed_train_epoch(dataset):
        """Iterate over `dataset` inside one tf.function and return the mean loss.

        For each element, `train_step` runs on every replica and the
        per-replica losses are summed; the result is the accumulated sum
        divided by the number of elements seen.
        """
        loss_sum = 0.0
        step_count = 0
        for batch in dataset:
            per_replica_losses = strategy.experimental_run_v2(
                train_step, args=(batch,))
            loss_sum += strategy.reduce(
                tf.distribute.ReduceOp.SUM, per_replica_losses, axis=None)
            step_count += 1
        return loss_sum / tf.cast(step_count, dtype=tf.float32)

    for epoch in range(EPOCHS):
        train_loss = distributed_train_epoch(train_dist_dataset)

        template = ("Epoch {}, Loss: {}, Accuracy: {}")
        print (template.format(epoch+1, train_loss, train_accuracy.result()*100))
        train_accuracy.reset_states()
Log :
2019-12-15 13:33:56.842121: I tensorflow/core/platform/cpu_feature_guard.cc:142] Your CPU supports instructions that this TensorFlow binary was not compiled to use: AVX2 FMA
2019-12-15 13:33:56.865580: I tensorflow/core/platform/profile_utils/cpu_utils.cc:94] CPU Frequency: 3600000000 Hz
2019-12-15 13:33:56.866834: I tensorflow/compiler/xla/service/service.cc:168] XLA service 0x559827eb2c80 executing computations on platform Host. Devices:
2019-12-15 13:33:56.866850: I tensorflow/compiler/xla/service/service.cc:175] StreamExecutor device (0): Host, Default Version
2019-12-15 13:33:56.868376: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library libcuda.so.1
2019-12-15 13:33:56.950021: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:1006] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2019-12-15 13:33:56.950311: I tensorflow/compiler/xla/service/service.cc:168] XLA service 0x559827f5e360 executing computations on platform CUDA. Devices:
2019-12-15 13:33:56.950322: I tensorflow/compiler/xla/service/service.cc:175] StreamExecutor device (0): GeForce RTX 2080, Compute Capability 7.5
2019-12-15 13:33:56.950424: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:1006] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2019-12-15 13:33:56.950624: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1618] Found device 0 with properties:
name: GeForce RTX 2080 major: 7 minor: 5 memoryClockRate(GHz): 1.71
pciBusID: 0000:01:00.0
2019-12-15 13:33:56.950753: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library libcudart.so.10.0
2019-12-15 13:33:56.951395: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library libcublas.so.10.0
2019-12-15 13:33:56.952012: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library libcufft.so.10.0
2019-12-15 13:33:56.952152: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library libcurand.so.10.0
2019-12-15 13:33:56.952910: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library libcusolver.so.10.0
2019-12-15 13:33:56.953559: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library libcusparse.so.10.0
2019-12-15 13:33:56.955396: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library libcudnn.so.7
2019-12-15 13:33:56.955462: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:1006] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2019-12-15 13:33:56.955694: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:1006] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2019-12-15 13:33:56.955885: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1746] Adding visible gpu devices: 0
2019-12-15 13:33:56.955906: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library libcudart.so.10.0
2019-12-15 13:33:56.956475: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1159] Device interconnect StreamExecutor with strength 1 edge matrix:
2019-12-15 13:33:56.956483: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1165] 0
2019-12-15 13:33:56.956486: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1178] 0: N
2019-12-15 13:33:56.956540: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:1006] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2019-12-15 13:33:56.956756: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:1006] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2019-12-15 13:33:56.956964: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1304] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 6802 MB memory) -> physical GPU (device: 0, name: GeForce RTX 2080, pci bus id: 0000:01:00.0, compute capability: 7.5)
2.0.0
2019-12-15 13:33:57.263344: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:1006] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2019-12-15 13:33:57.263612: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1618] Found device 0 with properties:
name: GeForce RTX 2080 major: 7 minor: 5 memoryClockRate(GHz): 1.71
pciBusID: 0000:01:00.0
2019-12-15 13:33:57.263649: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library libcudart.so.10.0
2019-12-15 13:33:57.263658: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library libcublas.so.10.0
2019-12-15 13:33:57.263666: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library libcufft.so.10.0
2019-12-15 13:33:57.263674: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library libcurand.so.10.0
2019-12-15 13:33:57.263682: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library libcusolver.so.10.0
2019-12-15 13:33:57.263690: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library libcusparse.so.10.0
2019-12-15 13:33:57.263698: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library libcudnn.so.7
2019-12-15 13:33:57.263733: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:1006] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2019-12-15 13:33:57.263968: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:1006] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2019-12-15 13:33:57.264151: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1746] Adding visible gpu devices: 0
2019-12-15 13:33:57.264552: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:1006] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2019-12-15 13:33:57.264742: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1618] Found device 0 with properties:
name: GeForce RTX 2080 major: 7 minor: 5 memoryClockRate(GHz): 1.71
pciBusID: 0000:01:00.0
2019-12-15 13:33:57.264755: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library libcudart.so.10.0
2019-12-15 13:33:57.264764: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library libcublas.so.10.0
2019-12-15 13:33:57.264771: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library libcufft.so.10.0
2019-12-15 13:33:57.264779: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library libcurand.so.10.0
2019-12-15 13:33:57.264787: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library libcusolver.so.10.0
2019-12-15 13:33:57.264794: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library libcusparse.so.10.0
2019-12-15 13:33:57.264802: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library libcudnn.so.7
2019-12-15 13:33:57.264827: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:1006] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2019-12-15 13:33:57.265505: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:1006] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2019-12-15 13:33:57.265690: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1746] Adding visible gpu devices: 0
2019-12-15 13:33:57.265705: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1159] Device interconnect StreamExecutor with strength 1 edge matrix:
2019-12-15 13:33:57.265708: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1165] 0
2019-12-15 13:33:57.265710: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1178] 0: N
2019-12-15 13:33:57.265751: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:1006] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2019-12-15 13:33:57.265957: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:1006] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2019-12-15 13:33:57.266145: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1304] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 6802 MB memory) -> physical GPU (device: 0, name: GeForce RTX 2080, pci bus id: 0000:01:00.0, compute capability: 7.5)
2019-12-15 13:33:57.267710: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:1006] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2019-12-15 13:33:57.268112: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1618] Found device 0 with properties:
name: GeForce RTX 2080 major: 7 minor: 5 memoryClockRate(GHz): 1.71
pciBusID: 0000:01:00.0
2019-12-15 13:33:57.268128: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library libcudart.so.10.0
2019-12-15 13:33:57.268137: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library libcublas.so.10.0
2019-12-15 13:33:57.268145: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library libcufft.so.10.0
2019-12-15 13:33:57.268153: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library libcurand.so.10.0
2019-12-15 13:33:57.268160: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library libcusolver.so.10.0
2019-12-15 13:33:57.268168: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library libcusparse.so.10.0
2019-12-15 13:33:57.268176: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library libcudnn.so.7
2019-12-15 13:33:57.268204: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:1006] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2019-12-15 13:33:57.268417: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:1006] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2019-12-15 13:33:57.268654: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1746] Adding visible gpu devices: 0
2019-12-15 13:33:57.268664: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1159] Device interconnect StreamExecutor with strength 1 edge matrix:
2019-12-15 13:33:57.268667: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1165] 0
2019-12-15 13:33:57.268669: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1178] 0: N
2019-12-15 13:33:57.268742: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:1006] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2019-12-15 13:33:57.269018: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:1006] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2019-12-15 13:33:57.269267: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1304] Created TensorFlow device (/device:GPU:0 with 6802 MB memory) -> physical GPU (device: 0, name: GeForce RTX 2080, pci bus id: 0000:01:00.0, compute capability: 7.5)
Number of devices: 1
WARNING:tensorflow:From /opt/anaconda/anaconda3/lib/python3.7/site-packages/tensorflow_addons/layers/wrappers.py:84: Layer.add_variable (from tensorflow.python.keras.engine.base_layer) is deprecated and will be removed in a future version.
Instructions for updating:
Please use `layer.add_weight` method instead.
INFO:tensorflow:Error reported to Coordinator: in converted code:
/home/shreyas/Desktop/custom_training.py:113 train_step *
predictions = model(images, training=True)
/opt/anaconda/anaconda3/lib/python3.7/site-packages/tensorflow_core/python/keras/engine/base_layer.py:847 __call__
outputs = call_fn(cast_inputs, *args, **kwargs)
/opt/anaconda/anaconda3/lib/python3.7/site-packages/tensorflow_addons/layers/wrappers.py:113 _update_weights *
with tf.control_dependencies(self._initialize_weights(inputs)):
/opt/anaconda/anaconda3/lib/python3.7/site-packages/tensorflow_core/python/ops/control_flow_ops.py:1389 cond_for_tf_v2
return cond(pred, true_fn=true_fn, false_fn=false_fn, strict=True, name=name)
/opt/anaconda/anaconda3/lib/python3.7/site-packages/tensorflow_core/python/util/deprecation.py:507 new_func
return func(*args, **kwargs)
/opt/anaconda/anaconda3/lib/python3.7/site-packages/tensorflow_core/python/ops/control_flow_ops.py:1174 cond
return cond_v2.cond_v2(pred, true_fn, false_fn, name)
/opt/anaconda/anaconda3/lib/python3.7/site-packages/tensorflow_core/python/ops/cond_v2.py:91 cond_v2
op_return_value=pred)
/opt/anaconda/anaconda3/lib/python3.7/site-packages/tensorflow_core/python/framework/func_graph.py:915 func_graph_from_py_func
func_outputs = python_func(*func_args, **func_kwargs)
/tmp/tmp0jpsvtyk.py:31 _update_weights
with tf.control_dependencies(self._initialize_weights(inputs)):
/opt/anaconda/anaconda3/lib/python3.7/site-packages/tensorflow_addons/layers/wrappers.py:148 _initialize_weights
assign_tensors = self._init_norm()
/opt/anaconda/anaconda3/lib/python3.7/site-packages/tensorflow_addons/layers/wrappers.py:157 _init_norm
g_tensor = self.g.assign(tf.reshape(v_norm, (self.layer_depth,)))
/opt/anaconda/anaconda3/lib/python3.7/site-packages/tensorflow_core/python/distribute/values.py:1036 assign
return self._assign_func(f=assign_fn, *args, **kwargs)
/opt/anaconda/anaconda3/lib/python3.7/site-packages/tensorflow_core/python/distribute/values.py:1016 _assign_func
variable_type="MirroredVariable"))
ValueError: You must specify an aggregation method to update a MirroredVariable in Replica Context. You can do so by passing an explicit value for argument `aggregation` to tf.Variable(..).e.g. `tf.Variable(..., aggregation=tf.VariableAggregation.SUM)``tf.VariableAggregation` lists the possible aggregation methods.This is required because MirroredVariable should always be kept in sync. When updating them or assigning to them in a replica context, we automatically try to aggregate the values before updating the variable. For this aggregation, we need to know the aggregation method. Another alternative is to not try to update such MirroredVariable in replica context, but in cross replica context. You can enter cross replica context by calling `tf.distribute.get_replica_context().merge_call(merge_fn, ..)`.Inside `merge_fn`, you can then update the MirroredVariable using `tf.distribute.StrategyExtended.update()`.
Traceback (most recent call last):
File "/opt/anaconda/anaconda3/lib/python3.7/site-packages/tensorflow_core/python/training/coordinator.py", line 297, in stop_on_exception
yield
File "/opt/anaconda/anaconda3/lib/python3.7/site-packages/tensorflow_core/python/distribute/mirrored_strategy.py", line 879, in run
self.main_result = self.main_fn(*self.main_args, **self.main_kwargs)
File "/opt/anaconda/anaconda3/lib/python3.7/site-packages/tensorflow_core/python/autograph/impl/api.py", line 237, in wrapper
raise e.ag_error_metadata.to_exception(e)
ValueError: in converted code:
/home/shreyas/Desktop/custom_training.py:113 train_step *
predictions = model(images, training=True)
/opt/anaconda/anaconda3/lib/python3.7/site-packages/tensorflow_core/python/keras/engine/base_layer.py:847 __call__
outputs = call_fn(cast_inputs, *args, **kwargs)
/opt/anaconda/anaconda3/lib/python3.7/site-packages/tensorflow_addons/layers/wrappers.py:113 _update_weights *
with tf.control_dependencies(self._initialize_weights(inputs)):
/opt/anaconda/anaconda3/lib/python3.7/site-packages/tensorflow_core/python/ops/control_flow_ops.py:1389 cond_for_tf_v2
return cond(pred, true_fn=true_fn, false_fn=false_fn, strict=True, name=name)
/opt/anaconda/anaconda3/lib/python3.7/site-packages/tensorflow_core/python/util/deprecation.py:507 new_func
return func(*args, **kwargs)
/opt/anaconda/anaconda3/lib/python3.7/site-packages/tensorflow_core/python/ops/control_flow_ops.py:1174 cond
return cond_v2.cond_v2(pred, true_fn, false_fn, name)
/opt/anaconda/anaconda3/lib/python3.7/site-packages/tensorflow_core/python/ops/cond_v2.py:91 cond_v2
op_return_value=pred)
/opt/anaconda/anaconda3/lib/python3.7/site-packages/tensorflow_core/python/framework/func_graph.py:915 func_graph_from_py_func
func_outputs = python_func(*func_args, **func_kwargs)
/tmp/tmp0jpsvtyk.py:31 _update_weights
with tf.control_dependencies(self._initialize_weights(inputs)):
/opt/anaconda/anaconda3/lib/python3.7/site-packages/tensorflow_addons/layers/wrappers.py:148 _initialize_weights
assign_tensors = self._init_norm()
/opt/anaconda/anaconda3/lib/python3.7/site-packages/tensorflow_addons/layers/wrappers.py:157 _init_norm
g_tensor = self.g.assign(tf.reshape(v_norm, (self.layer_depth,)))
/opt/anaconda/anaconda3/lib/python3.7/site-packages/tensorflow_core/python/distribute/values.py:1036 assign
return self._assign_func(f=assign_fn, *args, **kwargs)
/opt/anaconda/anaconda3/lib/python3.7/site-packages/tensorflow_core/python/distribute/values.py:1016 _assign_func
variable_type="MirroredVariable"))
ValueError: You must specify an aggregation method to update a MirroredVariable in Replica Context. You can do so by passing an explicit value for argument `aggregation` to tf.Variable(..).e.g. `tf.Variable(..., aggregation=tf.VariableAggregation.SUM)``tf.VariableAggregation` lists the possible aggregation methods.This is required because MirroredVariable should always be kept in sync. When updating them or assigning to them in a replica context, we automatically try to aggregate the values before updating the variable. For this aggregation, we need to know the aggregation method. Another alternative is to not try to update such MirroredVariable in replica context, but in cross replica context. You can enter cross replica context by calling `tf.distribute.get_replica_context().merge_call(merge_fn, ..)`.Inside `merge_fn`, you can then update the MirroredVariable using `tf.distribute.StrategyExtended.update()`.
Traceback (most recent call last):
File "/home/shreyas/Desktop/custom_training.py", line 150, in <module>
total_loss += distributed_train_step(x)
File "/opt/anaconda/anaconda3/lib/python3.7/site-packages/tensorflow_core/python/eager/def_function.py", line 457, in __call__
result = self._call(*args, **kwds)
File "/opt/anaconda/anaconda3/lib/python3.7/site-packages/tensorflow_core/python/eager/def_function.py", line 503, in _call
self._initialize(args, kwds, add_initializers_to=initializer_map)
File "/opt/anaconda/anaconda3/lib/python3.7/site-packages/tensorflow_core/python/eager/def_function.py", line 408, in _initialize
*args, **kwds))
File "/opt/anaconda/anaconda3/lib/python3.7/site-packages/tensorflow_core/python/eager/function.py", line 1848, in _get_concrete_function_internal_garbage_collected
graph_function, _, _ = self._maybe_define_function(args, kwargs)
File "/opt/anaconda/anaconda3/lib/python3.7/site-packages/tensorflow_core/python/eager/function.py", line 2150, in _maybe_define_function
graph_function = self._create_graph_function(args, kwargs)
File "/opt/anaconda/anaconda3/lib/python3.7/site-packages/tensorflow_core/python/eager/function.py", line 2041, in _create_graph_function
capture_by_value=self._capture_by_value),
File "/opt/anaconda/anaconda3/lib/python3.7/site-packages/tensorflow_core/python/framework/func_graph.py", line 915, in func_graph_from_py_func
func_outputs = python_func(*func_args, **func_kwargs)
File "/opt/anaconda/anaconda3/lib/python3.7/site-packages/tensorflow_core/python/eager/def_function.py", line 358, in wrapped_fn
return weak_wrapped_fn().__wrapped__(*args, **kwds)
File "/opt/anaconda/anaconda3/lib/python3.7/site-packages/tensorflow_core/python/framework/func_graph.py", line 905, in wrapper
raise e.ag_error_metadata.to_exception(e)
ValueError: in converted code:
/home/shreyas/Desktop/custom_training.py:136 distributed_train_step *
per_replica_losses = strategy.experimental_run_v2(train_step,
/opt/anaconda/anaconda3/lib/python3.7/site-packages/tensorflow_core/python/distribute/distribute_lib.py:760 experimental_run_v2
return self._extended.call_for_each_replica(fn, args=args, kwargs=kwargs)
/home/shreyas/Desktop/custom_training.py:113 train_step *
predictions = model(images, training=True)
/opt/anaconda/anaconda3/lib/python3.7/site-packages/tensorflow_core/python/keras/engine/base_layer.py:847 __call__
outputs = call_fn(cast_inputs, *args, **kwargs)
/opt/anaconda/anaconda3/lib/python3.7/site-packages/tensorflow_addons/layers/wrappers.py:113 _update_weights *
with tf.control_dependencies(self._initialize_weights(inputs)):
/opt/anaconda/anaconda3/lib/python3.7/site-packages/tensorflow_core/python/ops/control_flow_ops.py:1389 cond_for_tf_v2
return cond(pred, true_fn=true_fn, false_fn=false_fn, strict=True, name=name)
/opt/anaconda/anaconda3/lib/python3.7/site-packages/tensorflow_core/python/util/deprecation.py:507 new_func
return func(*args, **kwargs)
/opt/anaconda/anaconda3/lib/python3.7/site-packages/tensorflow_core/python/ops/control_flow_ops.py:1174 cond
return cond_v2.cond_v2(pred, true_fn, false_fn, name)
/opt/anaconda/anaconda3/lib/python3.7/site-packages/tensorflow_core/python/ops/cond_v2.py:91 cond_v2
op_return_value=pred)
/opt/anaconda/anaconda3/lib/python3.7/site-packages/tensorflow_core/python/framework/func_graph.py:915 func_graph_from_py_func
func_outputs = python_func(*func_args, **func_kwargs)
/tmp/tmp0jpsvtyk.py:31 _update_weights
with tf.control_dependencies(self._initialize_weights(inputs)):
/opt/anaconda/anaconda3/lib/python3.7/site-packages/tensorflow_addons/layers/wrappers.py:148 _initialize_weights
assign_tensors = self._init_norm()
/opt/anaconda/anaconda3/lib/python3.7/site-packages/tensorflow_addons/layers/wrappers.py:157 _init_norm
g_tensor = self.g.assign(tf.reshape(v_norm, (self.layer_depth,)))
/opt/anaconda/anaconda3/lib/python3.7/site-packages/tensorflow_core/python/distribute/values.py:1036 assign
return self._assign_func(f=assign_fn, *args, **kwargs)
/opt/anaconda/anaconda3/lib/python3.7/site-packages/tensorflow_core/python/distribute/values.py:1016 _assign_func
variable_type="MirroredVariable"))
ValueError: You must specify an aggregation method to update a MirroredVariable in Replica Context. You can do so by passing an explicit value for argument `aggregation` to tf.Variable(..).e.g. `tf.Variable(..., aggregation=tf.VariableAggregation.SUM)``tf.VariableAggregation` lists the possible aggregation methods.This is required because MirroredVariable should always be kept in sync. When updating them or assigning to them in a replica context, we automatically try to aggregate the values before updating the variable. For this aggregation, we need to know the aggregation method. Another alternative is to not try to update such MirroredVariable in replica context, but in cross replica context. You can enter cross replica context by calling `tf.distribute.get_replica_context().merge_call(merge_fn, ..)`.Inside `merge_fn`, you can then update the MirroredVariable using `tf.distribute.StrategyExtended.update()`.
Please take a look.
Thank you, Best, Shreyas Kamath
@shreyaskamathkm Could you please send the pytorch code you used that worked on multiple GPUs?
I was working on super-resolution and had used the code provided here. To perform data parallelism I used torch.nn.DataParallel(model). Please let me know if you require complete PyTorch code for demonstration.
Thank you, Best, SK
@shreyaskamathkm Figured out the problem, the fix should be easy. I'll open a PR once #819 is merged.
Unfortunately, I don't have access to multiple GPUs; could you help out with the testing?
Sure, I can test the code.
Any updates on this?
@veqtor Didn't find the time to get around to solving this, I'll try to get it done soon.
I am facing the same issue. I am using TensorFlow 2.2.0-rc3 and TensorFlow Addons 0.9.1. Below is my code to reproduce it:
# Reproduction script setup: imports plus global TensorFlow / Addons configuration.
import tensorflow as tf
print(f"tf.__version__: {tf.__version__}")
# Switch on JIT compilation through the TensorFlow optimizer config.
tf.config.optimizer.set_jit(True)
# Set the TF_ADDONS_PY_OPS option immediately after importing Addons
# (presumably selects the pure-Python op implementations — confirm against the tfa docs).
import tensorflow_addons as tfa;tfa.options.TF_ADDONS_PY_OPS = True
from tensorflow.keras import backend as K
from tensorflow.keras.datasets import mnist
import tensorflow_datasets as tfds
import os
# Distribution strategy whose scope is entered further down when the model is created.
strategy = tf.distribute.MirroredStrategy()
def get_dataset(batch_size=200):
    """Load MNIST through TensorFlow Datasets and build the input pipelines.

    Images are cast to float32 and divided by 255.0. Both splits are cached,
    batched and prefetched; only the training split is shuffled (buffer of
    10000 elements) before batching.

    Args:
        batch_size: Number of examples per batch for both splits.

    Returns:
        A ``(train_dataset, test_dataset)`` tuple of batched datasets.
    """
    def scale(image, label):
        # Cast to float32 and divide by 255.0; the label passes through untouched.
        scaled = tf.cast(image, tf.float32) / 255.0
        return scaled, label

    # The dataset info object is requested (with_info=True) but not used.
    splits, _ = tfds.load(
        name='mnist',
        with_info=True,
        as_supervised=True,
        try_gcs=True,
    )
    train_split = splits['train']
    test_split = splits['test']
    autotune = tf.data.experimental.AUTOTUNE

    train_dataset = train_split.map(scale).cache()
    train_dataset = train_dataset.shuffle(10000).batch(batch_size)
    train_dataset = train_dataset.prefetch(autotune)

    test_dataset = test_split.map(scale).cache()
    test_dataset = test_dataset.batch(batch_size).prefetch(autotune)

    return train_dataset, test_dataset
def create_model():
    """Build the MNIST classifier used to reproduce the issue.

    The stack is: a weight-normalized Conv2D (32 filters, kernel size 3, mish
    activation, 28x28x1 input), a Flatten layer, a weight-normalized Dense
    layer with 128 units and mish activation, and a weight-normalized Dense
    layer with 10 units and no activation.

    Returns:
        An uncompiled ``tf.keras.Sequential`` model.
    """
    keras_layers = tf.keras.layers
    weight_norm = tfa.layers.WeightNormalization
    mish = tfa.activations.mish

    # Layers are instantiated in the same order as they appear in the model.
    stack = [
        weight_norm(
            keras_layers.Conv2D(32, 3, activation=mish, input_shape=(28, 28, 1))
        ),
        keras_layers.Flatten(),
        weight_norm(keras_layers.Dense(128, activation=mish)),
        weight_norm(keras_layers.Dense(10)),
    ]
    return tf.keras.Sequential(stack)
# Build the train/test pipelines with the default batch size.
train_dataset, test_dataset = get_dataset()
# Create and compile the model under the distribution strategy's scope.
# NOTE(review): indentation was lost in this paste; the model creation and the
# compile call presumably belong inside the `with` body — confirm.
with strategy.scope():
model = create_model()
# The loss is configured with from_logits=True; the model's last Dense layer has no activation.
model.compile(optimizer='adam',
loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
metrics=['sparse_categorical_accuracy'])
# Train for 10 epochs, validating on the test split; verbose=0 is passed together with
# the TQDM progress-bar callback. The traceback that follows points at this call.
model.fit(train_dataset,epochs=10,validation_data=test_dataset, callbacks=[tfa.callbacks.TQDMProgressBar()],verbose=0)
Error:
INFO:tensorflow:Error reported to Coordinator: You must specify an aggregation method to update a MirroredVariable in Replica Context. You can do so by passing an explicit value for argument `aggregation` to tf.Variable(..).e.g. `tf.Variable(..., aggregation=tf.VariableAggregation.SUM)``tf.VariableAggregation` lists the possible aggregation methods.This is required because MirroredVariable should always be kept in sync. When updating them or assigning to them in a replica context, we automatically try to aggregate the values before updating the variable. For this aggregation, we need to know the aggregation method. Another alternative is to not try to update such MirroredVariable in replica context, but in cross replica context. You can enter cross replica context by calling `tf.distribute.get_replica_context().merge_call(merge_fn, ..)`.Inside `merge_fn`, you can then update the MirroredVariable using `tf.distribute.StrategyExtended.update()`.
Traceback (most recent call last):
File "/usr/local/lib/python3.6/dist-packages/tensorflow/python/training/coordinator.py", line 297, in stop_on_exception
yield
File "/usr/local/lib/python3.6/dist-packages/tensorflow/python/distribute/mirrored_run.py", line 321, in run
self.main_result = self.main_fn(*self.main_args, **self.main_kwargs)
File "/usr/local/lib/python3.6/dist-packages/tensorflow/python/autograph/impl/api.py", line 259, in wrapper
return converted_call(f, args, kwargs, options=options)
File "/usr/local/lib/python3.6/dist-packages/tensorflow/python/autograph/impl/api.py", line 526, in converted_call
return _call_unconverted(f, args, kwargs, options)
File "/usr/local/lib/python3.6/dist-packages/tensorflow/python/autograph/impl/api.py", line 343, in _call_unconverted
return f(*args, **kwargs)
File "/usr/local/lib/python3.6/dist-packages/tensorflow/python/keras/engine/training.py", line 611, in run_step
outputs = model.train_step(data)
File "/usr/local/lib/python3.6/dist-packages/tensorflow/python/keras/engine/training.py", line 569, in train_step
y_pred = self(x, training=True)
File "/usr/local/lib/python3.6/dist-packages/tensorflow/python/keras/engine/base_layer.py", line 935, in __call__
outputs = call_fn(cast_inputs, *args, **kwargs)
File "/usr/local/lib/python3.6/dist-packages/tensorflow/python/keras/engine/sequential.py", line 393, in call
outputs = layer(inputs, **kwargs)
File "/usr/local/lib/python3.6/dist-packages/tensorflow/python/keras/engine/base_layer.py", line 905, in __call__
self._maybe_build(inputs)
File "/usr/local/lib/python3.6/dist-packages/tensorflow/python/keras/engine/base_layer.py", line 2452, in _maybe_build
self.build(input_shapes) # pylint:disable=not-callable
File "/usr/local/lib/python3.6/dist-packages/tensorflow_addons/layers/wrappers.py", line 119, in build
self._naked_clone_layer.set_weights(self.layer.get_weights())
File "/usr/local/lib/python3.6/dist-packages/tensorflow/python/keras/engine/base_layer.py", line 1638, in set_weights
backend.batch_set_value(weight_value_tuples)
File "/usr/local/lib/python3.6/dist-packages/tensorflow/python/keras/backend.py", line 3427, in batch_set_value
x.assign(np.asarray(value, dtype=dtype(x)))
File "/usr/local/lib/python3.6/dist-packages/tensorflow/python/distribute/values.py", line 830, in assign
read_value=read_value)
File "/usr/local/lib/python3.6/dist-packages/tensorflow/python/distribute/values.py", line 779, in _mirrored_update
_aggregation_error_msg.format(variable_type="MirroredVariable"))
ValueError: You must specify an aggregation method to update a MirroredVariable in Replica Context. You can do so by passing an explicit value for argument `aggregation` to tf.Variable(..).e.g. `tf.Variable(..., aggregation=tf.VariableAggregation.SUM)``tf.VariableAggregation` lists the possible aggregation methods.This is required because MirroredVariable should always be kept in sync. When updating them or assigning to them in a replica context, we automatically try to aggregate the values before updating the variable. For this aggregation, we need to know the aggregation method. Another alternative is to not try to update such MirroredVariable in replica context, but in cross replica context. You can enter cross replica context by calling `tf.distribute.get_replica_context().merge_call(merge_fn, ..)`.Inside `merge_fn`, you can then update the MirroredVariable using `tf.distribute.StrategyExtended.update()`.
INFO:tensorflow:Error reported to Coordinator: You must specify an aggregation method to update a MirroredVariable in Replica Context. You can do so by passing an explicit value for argument `aggregation` to tf.Variable(..).e.g. `tf.Variable(..., aggregation=tf.VariableAggregation.SUM)``tf.VariableAggregation` lists the possible aggregation methods.This is required because MirroredVariable should always be kept in sync. When updating them or assigning to them in a replica context, we automatically try to aggregate the values before updating the variable. For this aggregation, we need to know the aggregation method. Another alternative is to not try to update such MirroredVariable in replica context, but in cross replica context. You can enter cross replica context by calling `tf.distribute.get_replica_context().merge_call(merge_fn, ..)`.Inside `merge_fn`, you can then update the MirroredVariable using `tf.distribute.StrategyExtended.update()`.
Traceback (most recent call last):
File "/usr/local/lib/python3.6/dist-packages/tensorflow/python/training/coordinator.py", line 297, in stop_on_exception
yield
File "/usr/local/lib/python3.6/dist-packages/tensorflow/python/distribute/mirrored_run.py", line 321, in run
self.main_result = self.main_fn(*self.main_args, **self.main_kwargs)
File "/usr/local/lib/python3.6/dist-packages/tensorflow/python/autograph/impl/api.py", line 259, in wrapper
return converted_call(f, args, kwargs, options=options)
File "/usr/local/lib/python3.6/dist-packages/tensorflow/python/autograph/impl/api.py", line 526, in converted_call
return _call_unconverted(f, args, kwargs, options)
File "/usr/local/lib/python3.6/dist-packages/tensorflow/python/autograph/impl/api.py", line 343, in _call_unconverted
return f(*args, **kwargs)
File "/usr/local/lib/python3.6/dist-packages/tensorflow/python/keras/engine/training.py", line 611, in run_step
outputs = model.train_step(data)
File "/usr/local/lib/python3.6/dist-packages/tensorflow/python/keras/engine/training.py", line 569, in train_step
y_pred = self(x, training=True)
File "/usr/local/lib/python3.6/dist-packages/tensorflow/python/keras/engine/base_layer.py", line 935, in __call__
outputs = call_fn(cast_inputs, *args, **kwargs)
File "/usr/local/lib/python3.6/dist-packages/tensorflow/python/keras/engine/sequential.py", line 393, in call
outputs = layer(inputs, **kwargs)
File "/usr/local/lib/python3.6/dist-packages/tensorflow/python/keras/engine/base_layer.py", line 905, in __call__
self._maybe_build(inputs)
File "/usr/local/lib/python3.6/dist-packages/tensorflow/python/keras/engine/base_layer.py", line 2452, in _maybe_build
self.build(input_shapes) # pylint:disable=not-callable
File "/usr/local/lib/python3.6/dist-packages/tensorflow_addons/layers/wrappers.py", line 119, in build
self._naked_clone_layer.set_weights(self.layer.get_weights())
File "/usr/local/lib/python3.6/dist-packages/tensorflow/python/keras/engine/base_layer.py", line 1638, in set_weights
backend.batch_set_value(weight_value_tuples)
File "/usr/local/lib/python3.6/dist-packages/tensorflow/python/keras/backend.py", line 3427, in batch_set_value
x.assign(np.asarray(value, dtype=dtype(x)))
File "/usr/local/lib/python3.6/dist-packages/tensorflow/python/distribute/values.py", line 830, in assign
read_value=read_value)
File "/usr/local/lib/python3.6/dist-packages/tensorflow/python/distribute/values.py", line 779, in _mirrored_update
_aggregation_error_msg.format(variable_type="MirroredVariable"))
ValueError: You must specify an aggregation method to update a MirroredVariable in Replica Context. You can do so by passing an explicit value for argument `aggregation` to tf.Variable(..).e.g. `tf.Variable(..., aggregation=tf.VariableAggregation.SUM)``tf.VariableAggregation` lists the possible aggregation methods.This is required because MirroredVariable should always be kept in sync. When updating them or assigning to them in a replica context, we automatically try to aggregate the values before updating the variable. For this aggregation, we need to know the aggregation method. Another alternative is to not try to update such MirroredVariable in replica context, but in cross replica context. You can enter cross replica context by calling `tf.distribute.get_replica_context().merge_call(merge_fn, ..)`.Inside `merge_fn`, you can then update the MirroredVariable using `tf.distribute.StrategyExtended.update()`.
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
<ipython-input-1-b1626bb45201> in <module>()
39 metrics=['sparse_categorical_accuracy'])
40
---> 41 model.fit(train_dataset,epochs=10,validation_data=test_dataset, callbacks=[tfa.callbacks.TQDMProgressBar()],verbose=0)
10 frames
/usr/local/lib/python3.6/dist-packages/tensorflow/python/keras/engine/training.py in _method_wrapper(self, *args, **kwargs)
69 def _method_wrapper(self, *args, **kwargs):
70 if not self._in_multi_worker_mode(): # pylint: disable=protected-access
---> 71 return method(self, *args, **kwargs)
72
73 # Running inside `run_distribute_coordinator` already.
/usr/local/lib/python3.6/dist-packages/tensorflow/python/keras/engine/training.py in fit(self, x, y, batch_size, epochs, verbose, callbacks, validation_split, validation_data, shuffle, class_weight, sample_weight, initial_epoch, steps_per_epoch, validation_steps, validation_batch_size, validation_freq, max_queue_size, workers, use_multiprocessing, **kwargs)
915 batch_size=batch_size):
916 callbacks.on_train_batch_begin(step)
--> 917 tmp_logs = train_function(iterator)
918 if data_handler.should_sync:
919 context.async_wait()
/usr/local/lib/python3.6/dist-packages/tensorflow/python/eager/def_function.py in __call__(self, *args, **kwds)
693 else:
694 compiler = "nonXla"
--> 695 result = self._call(*args, **kwds)
696
697 new_tracing_count = self._get_tracing_count()
/usr/local/lib/python3.6/dist-packages/tensorflow/python/eager/def_function.py in _call(self, *args, **kwds)
735 # This is the first call of __call__, so we have to initialize.
736 initializers = []
--> 737 self._initialize(args, kwds, add_initializers_to=initializers)
738 finally:
739 # At this point we know that the initialization is complete (or less
/usr/local/lib/python3.6/dist-packages/tensorflow/python/eager/def_function.py in _initialize(self, args, kwds, add_initializers_to)
615 self._concrete_stateful_fn = (
616 self._stateful_fn._get_concrete_function_internal_garbage_collected( # pylint: disable=protected-access
--> 617 *args, **kwds))
618
619 def invalid_creator_scope(*unused_args, **unused_kwds):
/usr/local/lib/python3.6/dist-packages/tensorflow/python/eager/function.py in _get_concrete_function_internal_garbage_collected(self, *args, **kwargs)
2445 args, kwargs = None, None
2446 with self._lock:
-> 2447 graph_function, _, _ = self._maybe_define_function(args, kwargs)
2448 return graph_function
2449
/usr/local/lib/python3.6/dist-packages/tensorflow/python/eager/function.py in _maybe_define_function(self, args, kwargs)
2773
2774 self._function_cache.missed.add(call_context_key)
-> 2775 graph_function = self._create_graph_function(args, kwargs)
2776 self._function_cache.primary[cache_key] = graph_function
2777 return graph_function, args, kwargs
/usr/local/lib/python3.6/dist-packages/tensorflow/python/eager/function.py in _create_graph_function(self, args, kwargs, override_flat_arg_shapes)
2663 arg_names=arg_names,
2664 override_flat_arg_shapes=override_flat_arg_shapes,
-> 2665 capture_by_value=self._capture_by_value),
2666 self._function_attributes,
2667 # Tell the ConcreteFunction to clean up its graph once it goes out of
/usr/local/lib/python3.6/dist-packages/tensorflow/python/framework/func_graph.py in func_graph_from_py_func(name, python_func, args, kwargs, signature, func_graph, autograph, autograph_options, add_control_dependencies, arg_names, op_return_value, collections, capture_by_value, override_flat_arg_shapes)
979 _, original_func = tf_decorator.unwrap(python_func)
980
--> 981 func_outputs = python_func(*func_args, **func_kwargs)
982
983 # invariant: `func_outputs` contains only Tensors, CompositeTensors,
/usr/local/lib/python3.6/dist-packages/tensorflow/python/eager/def_function.py in wrapped_fn(*args, **kwds)
526 # __wrapped__ allows AutoGraph to swap in a converted function. We give
527 # the function a weak reference to itself to avoid a reference cycle.
--> 528 return weak_wrapped_fn().__wrapped__(*args, **kwds)
529 weak_wrapped_fn = weakref.ref(wrapped_fn)
530
/usr/local/lib/python3.6/dist-packages/tensorflow/python/framework/func_graph.py in wrapper(*args, **kwargs)
966 except Exception as e: # pylint:disable=broad-except
967 if hasattr(e, "ag_error_metadata"):
--> 968 raise e.ag_error_metadata.to_exception(e)
969 else:
970 raise
ValueError: in user code:
/usr/local/lib/python3.6/dist-packages/tensorflow/python/keras/engine/training.py:628 train_function *
return step_function(self, iterator)
/usr/local/lib/python3.6/dist-packages/tensorflow/python/keras/engine/training.py:618 step_function **
outputs = model.distribute_strategy.run(run_step, args=(data,))
/usr/local/lib/python3.6/dist-packages/tensorflow/python/distribute/distribute_lib.py:952 run
return self._extended.call_for_each_replica(fn, args=args, kwargs=kwargs)
/usr/local/lib/python3.6/dist-packages/tensorflow/python/distribute/distribute_lib.py:2292 call_for_each_replica
return self._call_for_each_replica(fn, args, kwargs)
/usr/local/lib/python3.6/dist-packages/tensorflow/python/distribute/mirrored_strategy.py:584 _call_for_each_replica
args, kwargs)
/usr/local/lib/python3.6/dist-packages/tensorflow/python/distribute/mirrored_run.py:96 call_for_each_replica
return _call_for_each_replica(strategy, fn, args, kwargs)
/usr/local/lib/python3.6/dist-packages/tensorflow/python/distribute/mirrored_run.py:235 _call_for_each_replica
coord.join(threads)
/usr/local/lib/python3.6/dist-packages/tensorflow/python/training/coordinator.py:389 join
six.reraise(*self._exc_info_to_raise)
/usr/local/lib/python3.6/dist-packages/six.py:693 reraise
raise value
/usr/local/lib/python3.6/dist-packages/tensorflow/python/training/coordinator.py:297 stop_on_exception
yield
/usr/local/lib/python3.6/dist-packages/tensorflow/python/distribute/mirrored_run.py:321 run
self.main_result = self.main_fn(*self.main_args, **self.main_kwargs)
/usr/local/lib/python3.6/dist-packages/tensorflow/python/keras/engine/training.py:611 run_step **
outputs = model.train_step(data)
/usr/local/lib/python3.6/dist-packages/tensorflow/python/keras/engine/training.py:569 train_step
y_pred = self(x, training=True)
/usr/local/lib/python3.6/dist-packages/tensorflow/python/keras/engine/base_layer.py:935 __call__
outputs = call_fn(cast_inputs, *args, **kwargs)
/usr/local/lib/python3.6/dist-packages/tensorflow/python/keras/engine/sequential.py:393 call
outputs = layer(inputs, **kwargs)
/usr/local/lib/python3.6/dist-packages/tensorflow/python/keras/engine/base_layer.py:905 __call__
self._maybe_build(inputs)
/usr/local/lib/python3.6/dist-packages/tensorflow/python/keras/engine/base_layer.py:2452 _maybe_build
self.build(input_shapes) # pylint:disable=not-callable
/usr/local/lib/python3.6/dist-packages/tensorflow_addons/layers/wrappers.py:119 build
self._naked_clone_layer.set_weights(self.layer.get_weights())
/usr/local/lib/python3.6/dist-packages/tensorflow/python/keras/engine/base_layer.py:1638 set_weights
backend.batch_set_value(weight_value_tuples)
/usr/local/lib/python3.6/dist-packages/tensorflow/python/keras/backend.py:3427 batch_set_value
x.assign(np.asarray(value, dtype=dtype(x)))
/usr/local/lib/python3.6/dist-packages/tensorflow/python/distribute/values.py:830 assign
read_value=read_value)
/usr/local/lib/python3.6/dist-packages/tensorflow/python/distribute/values.py:779 _mirrored_update
_aggregation_error_msg.format(variable_type="MirroredVariable"))
ValueError: You must specify an aggregation method to update a MirroredVariable in Replica Context. You can do so by passing an explicit value for argument `aggregation` to tf.Variable(..).e.g. `tf.Variable(..., aggregation=tf.VariableAggregation.SUM)``tf.VariableAggregation` lists the possible aggregation methods.This is required because MirroredVariable should always be kept in sync. When updating them or assigning to them in a replica context, we automatically try to aggregate the values before updating the variable. For this aggregation, we need to know the aggregation method. Another alternative is to not try to update such MirroredVariable in replica context, but in cross replica context. You can enter cross replica context by calling `tf.distribute.get_replica_context().merge_call(merge_fn, ..)`.Inside `merge_fn`, you can then update the MirroredVariable using `tf.distribute.StrategyExtended.update()`.
Hope to see a fix soon. Thank you.
Facing the same problem on TF 2.2.0, tfa 0.10 on Colab's TPU. Is there any workaround? @Squadrick
Describe the feature and the current behavior/state. When I run weight normalization on a single GPU, it works as expected. However, when I run it under MirroredStrategy, it throws an error. Could it be extended to support multiple GPUs?
Relevant information
Which API type would this fall under (layer, metric, optimizer, etc.)? layer/wrapper
Who will benefit with this feature?
Any other info.