microsoft / DirectML

DirectML is a high-performance, hardware-accelerated DirectX 12 library for machine learning. DirectML provides GPU acceleration for common machine learning tasks across a broad range of supported hardware and drivers, including all DirectX 12-capable GPUs from vendors such as AMD, Intel, NVIDIA, and Qualcomm.

DXGI_ERROR_DEVICE_REMOVED issue #462

Open Ranazzi opened 1 year ago

Ranazzi commented 1 year ago

Hello! I've been facing some issues while training a relatively simple generative model.

At a random iteration the training freezes (this might be at iteration 100 or at 5000, but rarely after 10k iterations). After the process has been frozen for a while, the following message is printed and the process stops:

2023-05-28 18:58:27.210590: E tensorflow/c/logging.cc:40] The DirectML device has encountered an unrecoverable error (DXGI_ERROR_DEVICE_REMOVED). This is most often caused by a timeout occurring on the GPU. Please visit https://aka.ms/tfdmltimeout for more information and troubleshooting steps.
2023-05-28 18:58:27.210904: F tensorflow/c/logging.cc:43] HRESULT failed with 0x887a0005: readback_heap->Map(0, nullptr, &readback_heap_data)

I've already tried all the solutions from https://aka.ms/tfdmltimeout, https://github.com/microsoft/tensorflow-directml/issues/209, and https://github.com/microsoft/DirectML/issues/95.

One thing I could not find anywhere on the internet is this warning message, which appears when I run the code:

2023-05-29 10:58:06.403006: W tensorflow/core/common_runtime/pluggable_device/pluggable_device_bfc_allocator.cc:37] Ignoring the value of TF_FORCE_GPU_ALLOW_GROWTH because force_memory_growth was requested by the device.
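For reference, below is how memory growth is normally requested in stock TensorFlow. My reading of the warning above is that the DirectML pluggable device forces memory growth on regardless, so these calls are presumably no-ops here; that is an assumption based on the log text, not something I've confirmed in the plugin source.

import os
import tensorflow as tf

# ask TF to allocate VRAM on demand rather than reserving it up front;
# per the warning above, the DirectML device appears to force this anyway
os.environ['TF_FORCE_GPU_ALLOW_GROWTH'] = 'true'
for gpu in tf.config.list_physical_devices('GPU'):
    tf.config.experimental.set_memory_growth(gpu, True)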

Moreover, I've verified that VRAM usage keeps increasing over the course of training, with jumps at random iterations (this doesn't occur when I run the same code on Google Colab or on this computer with the CPU device).

I've monitored the VRAM usage during training to check whether there is any correlation between the freezing and the usage; however, in many attempts the training froze with VRAM usage below 512 MB.
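In case it is useful, this is roughly how I've been sampling memory usage from inside the training loop. tf.config.experimental.get_memory_info is a standard TensorFlow API; I'm not certain the DirectML pluggable device reports through it, so treat this as a sketch:

import tensorflow as tf

def log_vram(tag):
    # 'current' and 'peak' are in bytes; they may not be populated for pluggable devices
    info = tf.config.experimental.get_memory_info('GPU:0')
    print(f"{tag}: current={info['current'] / 2**20:.1f} MiB, peak={info['peak'] / 2**20:.1f} MiB")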

Here is my computer's DxDiag report: DxDiag.txt

And an example of the VRAM increase (this screenshot was taken at approximately 1k steps): [screenshot: VRAM usage]

Many thanks

Ranazzi commented 1 year ago

EDIT: I enabled my integrated GPU (it was disabled before to save memory), and the earlier DxDiag error disappeared. However, the AMD software stopped working (which makes me think that software might be causing this behavior). Moreover, the training still gets interrupted, now when memory reaches its limit. Could this be caused by memory growth?
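To test whether hitting the memory limit is what kills the device, I'm considering capping the VRAM available to TensorFlow with a logical device configuration. This is a standard TensorFlow API, but I don't know whether the DirectML plugin honors it; the 2048 MB limit is just an arbitrary value for testing:

import tensorflow as tf

gpus = tf.config.list_physical_devices('GPU')
if gpus:
    # cap the first GPU at 2 GB (memory_limit is in MB)
    tf.config.set_logical_device_configuration(
        gpus[0],
        [tf.config.LogicalDeviceConfiguration(memory_limit=2048)])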

Ranazzi commented 1 year ago

EDIT2: Here is some code to reproduce the growth in GPU memory usage (with this code, the growth happens roughly every 5-10k iterations).


import tensorflow as tf
from tensorflow.keras import layers, Model
from tensorflow.keras.initializers import constant, RandomNormal
import numpy as np

def build_baseline_generator(latent_dim, img_shape, nfilters=32, batch_norm=None, summary=True):
    ni, nj = img_shape[0], img_shape[1]
    nchannels = img_shape[2]

    input_layer = layers.Input(shape=(latent_dim,))
    g = layers.Dense(np.ceil(ni / 8).astype(int) * np.ceil(nj / 8).astype(int) * 8 * nfilters,
                     use_bias=False, kernel_initializer=RandomNormal(stddev=0.02))(input_layer)
    g = layers.BatchNormalization(momentum=batch_norm)(g)
    g = layers.Reshape((np.ceil(ni / 8).astype(int), np.ceil(nj / 8).astype(int), 8 * nfilters))(g)
    g = layers.Conv2D(8 * nfilters, (3, 3), padding='same',
                      use_bias=False, kernel_initializer=RandomNormal(stddev=0.02))(g)
    g = layers.BatchNormalization(momentum=batch_norm)(g)
    g = layers.ReLU()(g)

    g = tf.image.resize(g, size=(np.ceil(ni / 4).astype(int), np.ceil(nj / 4).astype(int)), method='nearest')
    g = layers.Conv2D(4 * nfilters, (3, 3), padding='same',
                      use_bias=False, kernel_initializer=RandomNormal(stddev=0.02))(g)
    g = layers.BatchNormalization(momentum=batch_norm)(g)
    g = layers.ReLU()(g)

    g = tf.image.resize(g, size=(np.ceil(ni / 2).astype(int), np.ceil(nj / 2).astype(int)), method='nearest')
    g = layers.Conv2D(2 * nfilters, (3, 3), padding='same',
                      use_bias=False, kernel_initializer=RandomNormal(stddev=0.02))(g)
    g = layers.BatchNormalization(momentum=batch_norm)(g)
    g = layers.ReLU()(g)

    g = tf.image.resize(g, size=(np.ceil(ni / 1).astype(int), np.ceil(nj / 1).astype(int)), method='nearest')

    g = layers.Conv2D(1 * nfilters, (3, 3), padding='same',
                      use_bias=False, kernel_initializer=RandomNormal(stddev=0.02))(g)
    g = layers.BatchNormalization(momentum=batch_norm)(g)
    g = layers.ReLU()(g)

    image = layers.Conv2D(nchannels, (5, 5), padding='same', activation='tanh', name='last_conv')(g)
    image = tf.image.resize(image, size=img_shape[0:2])  # resize to guarantee the exact output shape
    generator = Model(inputs=[input_layer], outputs=[image], name='Generator')
    if summary is True:
        generator.summary()
    return generator

def build_baseline_discriminator(img_shape, nfilters=32, batch_norm=None, ln=False, summary=True):
    input_layer = layers.Input(shape=img_shape)  # input layer from the img shape

    d = layers.Conv2D(nfilters, (5, 5), padding='same', use_bias=True,
                      bias_initializer=constant(0.), kernel_initializer=RandomNormal(stddev=0.02))(input_layer)
    d = layers.LeakyReLU(0.2)(d)

    d = layers.Conv2D(2 * nfilters, (3, 3), padding='same', strides=(2, 2),
                      use_bias=False, kernel_initializer=RandomNormal(stddev=0.02))(d)
    d = layers.BatchNormalization(momentum=batch_norm)(d)
    d = layers.LeakyReLU(0.2)(d)

    d = layers.Conv2D(4 * nfilters, (3, 3), padding='same', strides=(2, 2),
                      use_bias=False, kernel_initializer=RandomNormal(stddev=0.02))(d)
    d = layers.BatchNormalization(momentum=batch_norm)(d)
    d = layers.LeakyReLU(0.2)(d)

    d = layers.Conv2D(8 * nfilters, (3, 3), padding='same', strides=(2, 2),
                      use_bias=False, kernel_initializer=RandomNormal(stddev=0.02))(d)
    d = layers.BatchNormalization(momentum=batch_norm)(d)
    d = layers.LeakyReLU(0.2)(d)

    d = layers.Flatten()(d)
    out = layers.Dense(1, activation='linear', kernel_initializer=RandomNormal(stddev=0.02))(d)
    discriminator = Model(inputs=[input_layer], outputs=[out], name='Discriminator')
    if summary is True:
        discriminator.summary()
    return discriminator

class SimpleGAN(object):
    def __init__(self, img_shape, latent_dim=64):
        self.img_shape = img_shape
        self.n_channels = img_shape[-1]
        self.latent_dim = latent_dim
        self.generator = build_baseline_generator(
            latent_dim=self.latent_dim, img_shape=self.img_shape,
            nfilters=32, batch_norm=0.9)  # build generator object
        self.discriminator = build_baseline_discriminator(
            img_shape=self.img_shape, nfilters=32,
            batch_norm=0.9)  # build discriminator object
        self.gen_opt = tf.keras.optimizers.Adam(learning_rate=0.0001, beta_1=0.5, beta_2=0.9)  # gen optimizer
        self.dis_opt = tf.keras.optimizers.Adam(learning_rate=0.0001, beta_1=0.5, beta_2=0.9)  # dis optimizer
        self.losses = []

    def train(self, X_train, epochs=5000, batch_size=64):
        @tf.function
        def train_step(batch, b_size):
            noise = tf.random.normal([b_size, self.latent_dim])
            with tf.GradientTape() as gen_tape, tf.GradientTape() as dis_tape:
                generated = self.generator(noise, training=True)  # generate a batch of fake samples

                r_logits = self.discriminator(batch, training=True)  # dis output for real
                f_logits = self.discriminator(generated, training=True)  # dis output for fake

                gen_loss = tf.reduce_mean(tf.nn.softplus(-f_logits))
                dis_loss_f = tf.reduce_mean(tf.nn.softplus(f_logits))
                dis_loss_r = tf.reduce_mean(tf.nn.softplus(-r_logits))
                dis_loss_total = dis_loss_r + dis_loss_f  # total discriminator loss

            # compute and apply gradients outside the tape context so the
            # gradient ops themselves are not recorded on the tapes
            gen_grad = gen_tape.gradient(target=gen_loss, sources=self.generator.trainable_variables)
            dis_grad = dis_tape.gradient(target=dis_loss_total, sources=self.discriminator.trainable_variables)
            self.gen_opt.apply_gradients(zip(gen_grad, self.generator.trainable_variables))
            self.dis_opt.apply_gradients(zip(dis_grad, self.discriminator.trainable_variables))

            return gen_loss, dis_loss_total, dis_loss_r, dis_loss_f

        for epoch in range(epochs):
            idx = np.random.choice(X_train.shape[0], batch_size, replace=False)
            train_batch = X_train[idx]
            gen_loss, dis_loss, d_loss_r, d_loss_f = train_step(train_batch, batch_size)
            self.losses.append((gen_loss, dis_loss))  # note: this stores eager tensors, not Python floats

            output = "it: %d, [D loss: %f] [G loss: %f]" % (epoch + 1, dis_loss, gen_loss)
            print(output)

(x_train, y_train), (x_test, y_test) = tf.keras.datasets.mnist.load_data()
x_train = x_train[..., np.newaxis].astype('float32')  # add the channel dimension: (N, 28, 28, 1)
x_train = (x_train - 127.5) / 127.5  # scale to [-1, 1] to match the tanh generator output

with tf.device('GPU:0'):  # pin training to the first GPU (the DirectML device)
    gan = SimpleGAN((28, 28, 1), latent_dim=64)
    gan.train(x_train, 100000, batch_size=128)
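One more thing worth flagging in the script above: self.losses.append(...) keeps eager tensors alive for the whole run, so every iteration retains a reference to device-backed memory. I don't know whether this accounts for the growth I'm seeing, but converting the losses to Python floats before storing them is an easy way to rule that out:

# inside train(), store plain floats so the history list does not
# retain references to device memory
self.losses.append((float(gen_loss), float(dis_loss)))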
