apple / tensorflow_macos

TensorFlow for macOS 11.0+ accelerated using Apple's ML Compute framework.

Errors from gen_mlc_ops.py on M1 #132

Open vashat opened 3 years ago

vashat commented 3 years ago

Hi!

I'm trying to run a DCGAN network on the M1. The code works fine on my Intel MacBook Pro running Catalina with the standard (non-Apple) TensorFlow 2.4, but on the M1 the exact same code crashes with this error:

  File "dgan-face2.py", line 371, in <module>
    train(train_images, EPOCHS)
  File "dgan-face2.py", line 317, in train
    train_step(image_batch)
  File "dgan-face2.py", line 289, in train_step
    gradients_of_generator = gen_tape.gradient(gen_loss, generator.trainable_variables)
  File "/opt/homebrew/Caskroom/miniforge/base/envs/tf24/lib/python3.8/site-packages/tensorflow/python/eager/backprop.py", line 1080, in gradient
    flat_grad = imperative_grad.imperative_grad(
  File "/opt/homebrew/Caskroom/miniforge/base/envs/tf24/lib/python3.8/site-packages/tensorflow/python/eager/imperative_grad.py", line 71, in imperative_grad
    return pywrap_tfe.TFE_Py_TapeGradient(
  File "/opt/homebrew/Caskroom/miniforge/base/envs/tf24/lib/python3.8/site-packages/tensorflow/python/eager/function.py", line 1283, in _backward_function_wrapper
    return backward._call_flat(  # pylint: disable=protected-access
  File "/opt/homebrew/Caskroom/miniforge/base/envs/tf24/lib/python3.8/site-packages/tensorflow/python/eager/function.py", line 1918, in _call_flat
    return self._build_call_outputs(self._inference_function.call(
  File "/opt/homebrew/Caskroom/miniforge/base/envs/tf24/lib/python3.8/site-packages/tensorflow/python/eager/function.py", line 555, in call
    outputs = execute.execute(
  File "/opt/homebrew/Caskroom/miniforge/base/envs/tf24/lib/python3.8/site-packages/tensorflow/python/eager/execute.py", line 59, in quick_execute
    tensors = pywrap_tfe.TFE_Py_Execute(ctx._handle, device_name, op_name,
tensorflow.python.framework.errors_impl.InternalError:  Missing 0-th output from node gradients/binary_crossentropy/logistic_loss/MLCSigmoidCrossEntropyWithLogits_grad/MLCSigmoidCrossEntropyWithLogitsGrad (defined at /opt/homebrew/Caskroom/miniforge/base/envs/tf24/lib/python3.8/site-packages/tensorflow/compiler/tf2mlcompute/ops/gen_mlc_ops.py:6409)  [Op:__inference___backward_generator_loss_879_946]

Errors may have originated from an input operation.
Input Source operations connected to node gradients/binary_crossentropy/logistic_loss/MLCSigmoidCrossEntropyWithLogits_grad/MLCSigmoidCrossEntropyWithLogitsGrad:
 gradients/AddN_3 (defined at dgan-face2.py:286)

Function call stack:
__backward_generator_loss_879
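
For what it's worth, the failing node appears to come from BinaryCrossentropy(from_logits = True), which lowers to the fused sigmoid cross-entropy op whose gradient (MLCSigmoidCrossEntropyWithLogitsGrad) is named in the traceback. A much smaller snippet along these lines should exercise the same loss/gradient path (a minimal, untested sketch; the full script is below):

import tensorflow as tf

cross_entropy = tf.keras.losses.BinaryCrossentropy(from_logits = True)

# Mirrors the @tf.function-wrapped loss from the full script; the failing
# node sits inside the backward function of generator_loss.
@tf.function
def generator_loss(fake_output):
  return cross_entropy(tf.ones_like(fake_output), fake_output)

model = tf.keras.Sequential([tf.keras.layers.Dense(1, input_shape = (100,))])
noise = tf.random.normal([16, 100])

with tf.GradientTape() as tape:
  loss = generator_loss(model(noise, training = True))

# In the full script, the analogous tape.gradient call is where the
# InternalError is raised.
grads = tape.gradient(loss, model.trainable_variables)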

The dataset is available here: https://www.kaggle.com/jessicali9530/celeba-dataset. Here's the code used:

from tensorflow.python.compiler.mlcompute import mlcompute  # required for set_mlc_device below

mlcompute.set_mlc_device(device_name='cpu')

import tensorflow as tf

#from tensorflow.python.framework.ops import disable_eager_execution
#disable_eager_execution()
#print(tf.executing_eagerly())

import glob # The glob module finds all the pathnames matching a specified pattern according to the rules used by the Unix shell, although results are returned in arbitrary order. 
import imageio # Imageio is a Python library that provides an easy interface to read and write a wide range of image data, including animated images, volumetric data, and scientific formats. 
import matplotlib.pyplot as plt 
import tensorflow.keras.layers as layers # Keras layers API
import time
from IPython import display # For displaying image

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory
import os
import PIL

path = "archive/img_align_celeba/img_align_celeba/"

def load_image(infilename):
    img = PIL.Image.open(infilename)
    img = img.crop([25, 65, 153, 193])
    img = img.resize((64, 64))
    data = np.asarray(img, dtype="int32")
    return data

load_image(path + "000001.jpg").shape

plt.imshow(load_image(path + "000451.jpg"))

#plt.show()

train_images = np.array(os.listdir(path))
np.random.shuffle(train_images)
BUFFER_SIZE = 100 # 200000 # number of images used for training
BATCH_SIZE = 5 # 500 # number of sections np.split produces below, not the per-batch size (see note after the split)
# shuffle and batch the data
np.random.shuffle(train_images)
train_images = np.split(train_images[:BUFFER_SIZE],BATCH_SIZE)
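# Note: np.split(x, n) splits x into n equal-length sections, so this yields
# BATCH_SIZE lists of BUFFER_SIZE / BATCH_SIZE = 20 filenames each.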

np.array(train_images).shape

def make_generator_model():
  # In Keras, you assemble layers to build models. A model is (usually) a graph of layers; the most common type is a stack of layers.
  model = tf.keras.Sequential()

  """
  Add a densely-connected layer to the model.
  The model will take as input arrays of shape (*, 100)
  and output arrays of shape (*, 4*4*1024).

  After the first layer, you don't need to specify the size of the input anymore;
  shape inference is automatic.
  """
  model.add(layers.Dense(4*4*1024, use_bias = False, input_shape = (100,)))

  """
  You can think of batch normalization as doing preprocessing at every layer of the network.
  See "Batch Normalization: Accelerating Deep Network Training by Reducing Internal Covariate Shift" (Ioffe & Szegedy, 2015).
  """
  model.add(layers.BatchNormalization())

  """
  ReLU is linear (identity) for all positive values, and zero for all negative values. 
  Leaky ReLU has a small slope for negative values, instead of altogether zero. For example, leaky ReLU may have y = 0.01x when x < 0
  """
  model.add(layers.LeakyReLU())

  # reshape the flattened output to something with a shape of (4, 4, 1024)
  model.add(layers.Reshape(( 4, 4, 1024)))
  assert model.output_shape == (None, 4, 4, 1024) # Note: None is the batch size

  """
  The generator uses transposed convolution (upsampling) layers to produce an image from a seed (random noise).

  512 is the number of output filters,
  (5, 5) is the height and width of the 2D convolution window,
  and strides = (2,2) doubles the spatial resolution at each layer.
  """
  model.add(layers.Conv2DTranspose(512, (5, 5), strides = (2,2), padding = "same", use_bias = False))
  assert model.output_shape == (None, 8, 8, 512)
  model.add(layers.BatchNormalization())
  model.add(layers.LeakyReLU())

  # Another transposed convolutional layer (upsampling)
  model.add(layers.Conv2DTranspose(256, (5,5), strides = (2,2), padding = "same", use_bias = False))
  assert model.output_shape == (None, 16, 16, 256)
  model.add(layers.BatchNormalization())
  model.add(layers.LeakyReLU())

  # Another transposed convolutional layer (upsampling)
  model.add(layers.Conv2DTranspose(128, (5,5), strides = (2,2), padding = "same", use_bias = False))
  assert model.output_shape == (None, 32, 32, 128)
  model.add(layers.BatchNormalization())
  model.add(layers.LeakyReLU())

  # Final output layer, also a transposed convolution; the sigmoid activation maps outputs to [0, 1], matching the image normalization in train_step
  model.add(layers.Conv2DTranspose(3, (5,5), strides = (2,2), padding = "same", use_bias = False, activation = "sigmoid"))
  assert model.output_shape == (None, 64, 64, 3)

  return model

generator = make_generator_model()

noise = tf.random.normal([1,100]) # shape is 1, 100
generated_image = generator(noise, training = False)
#plt.imshow(generated_image[0], interpolation="nearest")

def make_discriminator_model():
    model = tf.keras.Sequential()
    model.add(layers.Conv2D(64, (5, 5), strides=(2, 2), padding='same', 
                                     input_shape=[64, 64, 3]))
    model.add(layers.LeakyReLU())
    model.add(layers.Dropout(0.3)) # Dropout randomly sets a fraction of the input units to 0 at each update during training, which helps prevent overfitting.

    model.add(layers.Conv2D(128, (5, 5), strides=(2, 2), padding='same'))
    model.add(layers.LeakyReLU())
    model.add(layers.Dropout(0.3))

    """
    Flattens the input. Does not affect the batch size.
    If inputs are shaped (batch,) without a channel dimension, then flattening adds an extra channel dimension and output shapes are (batch, 1).
    """
    model.add(layers.Flatten())
    model.add(layers.Dense(1))

    return model

discriminator = make_discriminator_model()
decision = discriminator(generated_image)
print(decision)

# This method returns a helper function to compute cross entropy loss
cross_entropy = tf.keras.losses.BinaryCrossentropy(from_logits = True)
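# from_logits = True means the loss applies the sigmoid itself, which is why the
# discriminator's final Dense(1) layer has no activation (it outputs raw logits);
# cf. MLCSigmoidCrossEntropyWithLogits in the traceback above.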

"""
Discriminator Loss

This method quantifies how well the discriminator is able to distinguish real images from fakes. It compares the discriminator's predictions on real images to an array of 1s
and its predictions on fake (generated) images to an array of 0s.
"""
@tf.function
def discriminator_loss(real_output, fake_output):
  real_loss = cross_entropy(tf.ones_like(real_output), real_output)
  fake_loss = cross_entropy(tf.zeros_like(fake_output), fake_output)
  total_loss = real_loss + fake_loss

  return total_loss

"""
Generator Loss

The generator's loss quantifies how well it was able to trick the discriminator. Intuitively, if the generator is performing well, the discriminator will classify the fake images as real (or 1).
Here, we compare the discriminator's decisions on the generated images to an array of 1s.
"""
@tf.function
def generator_loss(fake_output):
  return cross_entropy(tf.ones_like(fake_output), fake_output)

"""
The discriminator and generator optimizers are separate because we train the two networks independently.
Adam is an extension of stochastic gradient descent: plain SGD maintains a single, fixed
learning rate (termed alpha) for all weight updates, whereas Adam maintains a learning rate
for each network weight (parameter) and adapts it separately as learning unfolds.
"""
generator_optimizer = tf.keras.optimizers.Adam(1e-4)
discriminator_optimizer = tf.keras.optimizers.Adam(1e-4)

checkpoint_dir = "./training_checkpoints"
checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt")
checkpoint = tf.train.Checkpoint(generator_optimizer = generator_optimizer,
                                 discriminator_optimizer = discriminator_optimizer,
                                 generator = generator,
                                 discriminator = discriminator)
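# (a saved checkpoint can later be restored with
#  checkpoint.restore(tf.train.latest_checkpoint(checkpoint_dir)))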

checkpoint_prefix

# We will reuse this seed over time to make it easier to visualize progress in the animated GIF
tf.random.set_seed(1234)
noise_dim = 100
num_examples_to_generate = 16
seed = tf.random.normal([num_examples_to_generate, noise_dim], seed=1)

seed

EPOCHS = 100

"""
The training loop begins with the generator receiving a random seed as input.
That seed is used to produce an image. The discriminator is then used to classify real images (drawn from the training set) and fake images (produced by the generator).
The loss is calculated for each of these models, and the gradients are used to update the generator and discriminator.
"""

# Note: train_step is not wrapped in tf.function here; that annotation, which
# causes a function to be "compiled" into a graph, is applied to the loss functions above.
def train_step(images):
    noise = tf.random.normal([BATCH_SIZE, noise_dim])

    # take the image links and return a cropped image
    new_images = []
    for file_name in images:
        new_pic = load_image( path + file_name)
        new_images.append(new_pic)

    images = np.array(new_images)
    images = images.reshape(images.shape[0], 64, 64, 3).astype('float32') # stack into a (batch, 64, 64, 3) float32 array
    images = images / 255 # normalize to [0,1]

    """
    GradientTape() records operations for automatic differentiation. Operations are recorded if
    they are executed within this context manager and at least one of their inputs is being "watched".
    """
    with tf.GradientTape() as gen_tape, tf.GradientTape() as disc_tape:
      generated_images = generator(noise, training=True)

      real_output = discriminator(images, training=True)
      fake_output = discriminator(generated_images, training=True)

      gen_loss = generator_loss(fake_output)
      disc_loss = discriminator_loss(real_output, fake_output)

    # this is the tape.gradient call where the InternalError above is raised (dgan-face2.py:289)
    gradients_of_generator = gen_tape.gradient(gen_loss, generator.trainable_variables)
    gradients_of_discriminator = disc_tape.gradient(disc_loss, discriminator.trainable_variables)

    generator_optimizer.apply_gradients(zip(gradients_of_generator, generator.trainable_variables)) # zip pairs each gradient with its corresponding variable
    discriminator_optimizer.apply_gradients(zip(gradients_of_discriminator, discriminator.trainable_variables))

    images = None

# ended at 20 epochs

#@tf.function
def train(dataset, epochs):  
  tf.print("Starting man!")
  for epoch in range(epochs):
    start = time.time()
    tf.print("Starting Epoch:", epoch)
    batch_count = 1
    for image_batch in dataset:
      #tf.print("Batch:", batch_count)
      train_step(image_batch)
      #print("Batch:", batch_count, "Complete")
      batch_count += 1

    # Produce images for the GIF as we go
    display.clear_output(wait=True)
    generate_and_save_images(generator,
                             epoch + 1,
                             seed)

    tf.print("Epoch:", epoch, "finished")
    tf.print()

    # Save the model every epoch
    checkpoint.save(file_prefix = checkpoint_prefix)

    tf.print('Time for epoch {} is {} sec'.format(epoch + 1, time.time() - start))

  # Generate after the final epoch
  display.clear_output(wait=True)
  generate_and_save_images(generator,
                           epochs,
                           seed)

def generate_and_save_images(model, epoch, test_input):
  # Notice `training` is set to False. 
  # This is so all layers run in inference mode (batchnorm).
  predictions = model(test_input, training=False).numpy()

  fig = plt.figure(figsize=(4,4))

  for i in range(predictions.shape[0]):
      plt.subplot(4, 4, i+1)
      plt.imshow(predictions[i])
      plt.axis('off')

  plt.savefig('log_images/image_at_epoch_{:04d}.png'.format(epoch))
  #plt.show()

from IPython.display import Image
#Image(filename='image_at_epoch_0060.png')

train(train_images, EPOCHS)
anna-tikhonova commented 3 years ago

Thank you very much for reporting this and providing a reproducible test case. We will investigate and report back.