Hey! Following this easy tutorial (https://www.tensorflow.org/tutorials/generative/dcgan), I tried to implement a GAN on the MNIST dataset (a simple CNN for the discriminator, a transposed CNN for the generator) to do some benchmarks. The code works just fine on Colab, but there seems to be a problem on the M1: the discriminator loss skyrockets after only 5 epochs, the generator loss goes to 0, and the final result looks like this:
NOTE: The model is running on CPU. Eager Execution is enabled.
Is anyone able to run this example correctly on the M1?
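For reference, this is the quick check I run first to see what TensorFlow actually reports on each machine (whether a GPU device shows up on the M1 depends on having the tensorflow-metal plugin installed, which is an assumption on my part about the setup):

import tensorflow as tf

# Version and visible devices: on the M1 this only lists the CPU for me
# (see the NOTE above), while Colab lists a GPU when one is attached.
print(tf.__version__)
print(tf.config.list_physical_devices())
# Confirms the note above: eager execution is on by default in TF 2.x
print(tf.executing_eagerly())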
CODE - Model class:
import os
import time
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
import matplotlib.pyplot as plt
class GenerativeAdversarialNetwork:
    def __init__(self, image_size, latent_dim, batch_size, checkpoint_dir, img_num=16, save_every=5):
        self.__image_size = image_size  # [height x width x 1], assuming grayscale images
        self.__latent_dim = latent_dim
        self.__batch_size = batch_size
        self.__checkpoint_dir = checkpoint_dir
        self.__img_num = img_num
        self.generator = self.__make_generator_model()
        self.discriminator = self.__make_discriminator_model()
        # Loss for both generator and discriminator.
        # from_logits=True means the loss expects raw logits, so the
        # discriminator must NOT apply a sigmoid to its output.
        self.__cross_entropy = keras.losses.BinaryCrossentropy(from_logits=True)
        self.__generator_optimizer = keras.optimizers.Adam(1e-4)
        self.__discriminator_optimizer = keras.optimizers.Adam(1e-4)
        # Fixed latent vectors, so the sample images are comparable across epochs
        self.__seed = tf.random.normal([self.__img_num, self.__latent_dim])
        # To save the model
        self.__checkpoint = tf.train.Checkpoint(step=tf.Variable(1),
                                                generator_optimizer=self.__generator_optimizer,
                                                discriminator_optimizer=self.__discriminator_optimizer,
                                                generator=self.generator,
                                                discriminator=self.discriminator)
        self.__manager = tf.train.CheckpointManager(self.__checkpoint, self.__checkpoint_dir, max_to_keep=3)
        self.__save_every = save_every

    def __make_generator_model(self):
        model = keras.Sequential()
        # Dense: transform the input vector (latent dim) into 256 low-resolution (7x7) feature maps.
        # Note: 7 x 7 works for MNIST (the final result is 28 x 28). No bias is needed here,
        # since the BatchNormalization right after re-centres the activations anyway.
        model.add(layers.Dense(256 * 7 * 7, input_shape=(self.__latent_dim,), use_bias=False))
        # Keeps the activations close to mean 0 and std 1
        model.add(layers.BatchNormalization())
        # Reshape the output into 256 "images" of 7x7
        model.add(layers.Reshape((7, 7, 256)))
        # Conv2DTranspose is the inverse of a convolution. First argument: number of output feature maps.
        # Second argument: kernel size (height and width of the window). strides: multiplier of the two spatial dims.
        # padding='same' pads evenly, so no data is lost when the kernel size does not divide the input size.
        model.add(layers.Conv2DTranspose(128, (4, 4), strides=(1, 1), padding='same'))
        model.add(layers.BatchNormalization())
        # LeakyReLU is the usual choice of activation function in GANs
        model.add(layers.LeakyReLU(alpha=0.2))
        # Output here is 64 feature maps of 14x14
        model.add(layers.Conv2DTranspose(64, (4, 4), strides=(2, 2), padding='same'))
        model.add(layers.BatchNormalization())
        model.add(layers.LeakyReLU(alpha=0.2))
        # This outputs a single image. The activation is tanh because the data is
        # normalized to [-1, 1] instead of 0-255 (grayscale image)
        model.add(layers.Conv2DTranspose(1, (4, 4), strides=(2, 2), padding='same', activation='tanh'))
        assert model.output_shape == (None, 28, 28, 1)
        return model

    def __make_discriminator_model(self):
        model = keras.Sequential()
        # Output 64 feature maps of shape 14 x 14, assuming the input is 28 x 28
        model.add(layers.Conv2D(64, (4, 4), strides=(2, 2), padding='same', input_shape=self.__image_size))
        model.add(layers.LeakyReLU(alpha=0.2))
        # Dropout to avoid overfitting
        model.add(layers.Dropout(0.4))
        # Output 128 feature maps of 7x7
        model.add(layers.Conv2D(128, (4, 4), strides=(2, 2), padding='same'))
        model.add(layers.LeakyReLU(alpha=0.2))
        model.add(layers.Dropout(0.4))
        # Flatten the output
        model.add(layers.Flatten())
        # Output neuron: a raw logit, low for fake and high for real.
        # No sigmoid here: BinaryCrossentropy(from_logits=True) applies it internally,
        # and stacking both would distort the losses.
        model.add(layers.Dense(1))
        return model

    def __discriminator_loss(self, real_output, fake_output):
        # Real loss: the discriminator should predict 1 for each of the real images
        real_loss = self.__cross_entropy(tf.ones_like(real_output), real_output)
        # Fake loss: the discriminator should predict 0 for each of the fake images
        fake_loss = self.__cross_entropy(tf.zeros_like(fake_output), fake_output)
        total_loss = real_loss + fake_loss
        return total_loss

    def __generator_loss(self, fake_output):
        # Generator loss: the discriminator should predict 1 for the generated images
        return self.__cross_entropy(tf.ones_like(fake_output), fake_output)

    def __train_step(self, images):
        # One training step for both the generator and the discriminator. Called by train()
        noise = tf.random.normal([self.__batch_size, self.__latent_dim])
        # Standard way to define a training step with gradient tapes
        with tf.GradientTape() as gen_tape, tf.GradientTape() as disc_tape:
            # Generate a batch of fake images
            generated_images = self.generator(noise, training=True)
            # Feed the discriminator both real and fake images
            real_output = self.discriminator(images, training=True)
            fake_output = self.discriminator(generated_images, training=True)
            # Compute the two losses
            gen_loss = self.__generator_loss(fake_output)
            disc_loss = self.__discriminator_loss(real_output, fake_output)
            # print(f"Generator loss: {gen_loss}\nDiscriminator loss: {disc_loss}")
        # Compute the two gradients
        gen_gradient = gen_tape.gradient(gen_loss, self.generator.trainable_variables)
        disc_gradient = disc_tape.gradient(disc_loss, self.discriminator.trainable_variables)
        # Backpropagation: update the trainable variables according to the two gradients
        self.__generator_optimizer.apply_gradients(zip(gen_gradient, self.generator.trainable_variables))
        self.__discriminator_optimizer.apply_gradients(zip(disc_gradient, self.discriminator.trainable_variables))
        return gen_loss, disc_loss

    def train(self, data, epochs, from_pretrained=False):
        if from_pretrained:
            self.load_model()
        for epoch in range(epochs):
            start = time.time()
            print("\nStart of epoch %d" % (epoch + 1,))
            for image_batch in data:
                gen_loss, disc_loss = self.__train_step(image_batch)
            self.__generate_and_save_images(epoch + 1, self.__seed)
            print('Time for epoch {} is {} sec'.format(epoch + 1, time.time() - start))
            self.__checkpoint.step.assign_add(1)
            if int(self.__checkpoint.step) % self.__save_every == 0:
                self.__save_model(gen_loss, disc_loss)

    def __generate_and_save_images(self, epoch, test_input):
        predictions = self.generator(test_input, training=False)
        fig = plt.figure(figsize=(16, 16))
        # 4x4 grid: assumes img_num == 16
        for i in range(predictions.shape[0]):
            plt.subplot(4, 4, i + 1)
            # Map the tanh output from [-1, 1] back to [0, 255]
            plt.imshow(predictions[i, :, :, 0] * 127.5 + 127.5, cmap='gray')
            plt.axis('off')
        # Make sure the output directory exists before saving
        os.makedirs('./images', exist_ok=True)
        plt.savefig('./images/image_at_epoch_{:04d}.png'.format(epoch))
        plt.show()

    def __save_model(self, gen_loss, disc_loss):
        save_path = self.__manager.save()
        print("Saved checkpoint for step {}: {}".format(int(self.__checkpoint.step), save_path))
        print("Generator Loss {:1.2f}".format(gen_loss.numpy()))
        print("Discriminator Loss {:1.2f}".format(disc_loss.numpy()))

    def load_model(self):
        self.__checkpoint.restore(self.__manager.latest_checkpoint)
        if self.__manager.latest_checkpoint:
            print("Restored from {}".format(self.__manager.latest_checkpoint))
        else:
            print("ERROR: Initializing from scratch.")
CODE - Main:
import tensorflow as tf
import matplotlib.pyplot as plt
from classes.model import GenerativeAdversarialNetwork
BATCH_SIZE = 256
EPOCHS = 50
NOISE_DIM = 100
NUMBER_OF_PREDS = 16
# No test set here: unsupervised learning (the labels are not used either)
(train_images, _), (_, _) = tf.keras.datasets.mnist.load_data()
# Reshape from 60k x 28 x 28 to 60k x 28 x 28 x 1: Conv2D expects three dims (height, width, channels) per image
train_images = train_images.reshape(train_images.shape[0], 28, 28, 1).astype('float32')
# Normalizing between -1 and 1 (that's why we have tanh at the end of the generator)
train_images = (train_images - 127.5) / 127.5
# tf dataset from the array
train_dataset = tf.data.Dataset.from_tensor_slices(train_images).shuffle(60000).batch(BATCH_SIZE)
# Defining the model
GAN = GenerativeAdversarialNetwork(image_size=[28, 28, 1], latent_dim=NOISE_DIM, batch_size=BATCH_SIZE,
                                   checkpoint_dir='./checkpoints', img_num=NUMBER_OF_PREDS)
# Show a sample MNIST image
plt.imshow(train_images[0, :, :, 0], cmap='gray')
plt.show()
plt.close()
GAN.train(train_dataset, EPOCHS)
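One more thing I am trying, to rule out plain randomness when comparing the Colab and M1 runs: seeding every RNG at the top of Main so both runs start from the same initial weights and the same latent noise (the seed value 42 is arbitrary):

import random
import numpy as np
import tensorflow as tf

# Seed the Python, NumPy and TensorFlow RNGs before building the model
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
tf.random.set_seed(SEED)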