Hi everyone,
As part of my Bachelor's degree I am building a DCGAN in Caffe2. After following all the tutorials I put together an architecture, but when I run my code the generator does not update at all. I did some debugging and found that the generator's layers do not receive any gradients at all (the debug sketch right after net creation below shows how I checked this).
Even my tutor can't find the source of the issue.
I am grateful for any help! =)
Here is my code:
# -*- coding: utf-8 -*-
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals
import os
import shutil
#import pickle as pkl
import numpy as np
from matplotlib import pyplot#, image
from caffe2.python import brew, core, workspace, model_helper, optimizer, visualize, memonger, utils
from caffe2.proto import caffe2_pb2
###############################################################################
# Data Import
###############################################################################
workspace.ResetWorkspace()
def DownloadResource(url, path):
'''Downloads resources from s3 by url and unzips them to the provided path'''
    import requests, zipfile, io  # io.BytesIO works on both Python 2 and 3
print("Downloading... {} to {}".format(url, path))
r = requests.get(url, stream=True)
    z = zipfile.ZipFile(io.BytesIO(r.content))
z.extractall(path)
print("Completed download and extraction.")
current_folder = os.getcwd()
print("The current folder is {}".format(current_folder) )
data_folder = os.path.join(current_folder, 'tutorial_data', 'mnist')
root_folder = os.path.join(current_folder, 'tutorial_files', 'tutorial_mnist')
db_missing = False
if not os.path.exists(data_folder):
os.makedirs(data_folder)
print("Your data folder was not found!! This was generated: {}".format(data_folder))
# Look for existing database: lmdb
if os.path.exists(os.path.join(data_folder,"mnist-train-nchw-lmdb")):
print("lmdb train db found!")
else:
db_missing = True
if os.path.exists(os.path.join(data_folder,"mnist-test-nchw-lmdb")):
print("lmdb test db found!")
else:
db_missing = True
# attempt the download of the db if either was missing
if db_missing:
print("one or both of the MNIST lmbd dbs not found!!")
db_url = "http://download.caffe2.ai/databases/mnist-lmdb.zip"
try:
DownloadResource(db_url, data_folder)
except Exception as ex:
print("Failed to download dataset. Please download it manually from {}".format(db_url))
print("Unzip it and place the two database folders here: {}".format(data_folder))
raise ex
if os.path.exists(root_folder):
print("Looks like you ran this before, so we need to cleanup those old files...")
shutil.rmtree(root_folder)
os.makedirs(root_folder)
workspace.ResetWorkspace(root_folder)
print("training data folder:" + data_folder)
print("workspace root folder:" + root_folder)
###############################################################################
# Memory Management
###############################################################################
# Can be used later for performance optimization.
def optimize_gradient_memory(model, loss):
model.net._net = memonger.share_grad_blobs(
model.net,
loss,
set(model.param_to_grad.values()),
# Due to memonger internals, we need a namescope here. Let's make one up; we'll need it later!
namescope="memonger_needs_a_namescope",
share_activations=False)
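# A hypothetical usage sketch (the function is not called anywhere yet;
# "loss_g" below is just the generator's loss blob defined further down):
# optimize_gradient_memory(generator, ["loss_g"])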
###############################################################################
# Global Parameters
###############################################################################
device_option = caffe2_pb2.DeviceOption(device_type=caffe2_pb2.CUDA)
# A larger batch size or learning rate leads to loss_d = nan
batch_size = 32
arg_scope = {"order": "NCHW"}
learning_rate = 0.00003
# For Leaky ReLU
alpha = 0.1
# Create labels for D_Real. Only ones. Soft and noisy.
label_real = np.random.rand(batch_size, 2048, 1, 1).astype('float32') * 0.1 + 0.9
# Create labels for D_Fake. Only zeros. Soft and noisy
label_fake = np.random.rand(batch_size, 2048, 1, 1).astype('float32') * 0.1
# Create labels for G. Only ones.
label_g = np.ones(batch_size, dtype=np.int32)
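# Quick sanity check (a sketch) of the smoothed label ranges created above:
# rand() is in [0, 1), so label_real lies in [0.9, 1.0) and label_fake in [0, 0.1).
assert label_real.min() >= 0.9 and label_real.max() < 1.0
assert label_fake.min() >= 0.0 and label_fake.max() < 0.1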
# Dummy blobs to avoid a deadlock between G and D_fake.
# For G
dummyblob_g = np.ones(batch_size)
dummyblob_d_loss_fake = np.ones(1)
# For D
dummyblob_d = np.ones(batch_size*28*28).reshape(batch_size, 1, 28, 28)
# Noise Data Input for the Generator
noise = np.random.randn(batch_size, 100).astype(np.float32)
# Insert all relevant data to workspace
with core.DeviceScope(device_option):
workspace.FeedBlob("tanh", dummyblob_d.astype(np.float32))
workspace.FeedBlob("sigmoid_d_real", dummyblob_d.astype(np.float32))
workspace.FeedBlob("sigmoid_d_fake", dummyblob_g.astype(np.float32))
workspace.FeedBlob("d_loss_fake", dummyblob_d_loss_fake.astype(np.float32))
workspace.FeedBlob("label_g", label_g)
workspace.FeedBlob("label_fake", label_fake)
workspace.FeedBlob("label_real", label_real)
workspace.FeedBlob("noise", noise)
###############################################################################
# Define Models
###############################################################################
# Manage Input for D_real
def AddInput(model, batch_size, db, db_type):
# load the data
data_uint8, label = model.TensorProtosDBInput(
[], ["data_uint8", "label"], batch_size = batch_size,
db=db, db_type=db_type)
# cast the data to float
data = model.Cast(data_uint8, "data", to=core.DataType.FLOAT)
    # scale data from [0,255] to [-1,1]; Scale only multiplies, so map to
    # [0,2] first and then shift down by one
    data = model.Scale(data, data, scale=float(2. / 256))
    one = model.net.ConstantFill([], "one", shape=[1], value=1.0)
    data = model.net.Sub([data, one], "data", broadcast=1)
    # Caffe2 is very generic: (almost) everything is treated as an operator,
    # including the plain data input. Backpropagation, which updates the
    # hidden layers, has no business here, so because of its generic design
    # Caffe2 has to be told explicitly that no updates should happen to the
    # input.
    # Hence: StopGradient
data = model.StopGradient(data, data)
return data
###############################################################################
# Define Generator
def GenModel(model, data):
# Input Layer: 1x1x100
# data is noise
fc1 = brew.fc(model, data, "fc1", dim_in=100, dim_out=4096)
reshape, oldshape = model.net.Reshape(
["fc1"],
["reshape", "oldshape"],
shape = (batch_size, 1024, 2, 2))
relu0_g = brew.relu(model, reshape, "relu0_g")
    # Deconv Block 1: 1024x2x2 -> 512x4x4
deconv1 = brew.conv_transpose(
model,
relu0_g,
"deconv1",
dim_in=1024,
dim_out=512,
kernel=2,
stride=2
)
batchnorm1_g = brew.spatial_bn(
model,
deconv1,
"batchnorm1_g",
512,
epsilon=1e-5,
momentum=0.9,
is_test=False
)
relu1_g = brew.relu(model, batchnorm1_g, "relu1_g")
    # Deconv Block 2: 512x4x4 -> 256x7x7
deconv2 = brew.conv_transpose(
model,
relu1_g,
"deconv2",
dim_in=512,
dim_out=256,
kernel=4,
stride=1
)
batchnorm2_g = brew.spatial_bn(
model,
deconv2,
"batchnorm2_g",
256,
epsilon=1e-5,
momentum=0.9,
is_test=False
)
relu2_g = brew.relu(model, batchnorm2_g, 'relu2_g')
    # Deconv Block 3: 256x7x7 -> 128x14x14
deconv3 = brew.conv_transpose(
model,
relu2_g,
"deconv3",
dim_in=256,
dim_out=128,
kernel=2,
stride=2
)
batchnorm3_g = brew.spatial_bn(
model,
deconv3,
"batchnorm3_g",
128,
epsilon=1e-5,
momentum=0.9,
is_test=False
)
relu3_g = brew.relu(model, batchnorm3_g, 'relu3_g')
    # Deconv Block 4: 128x14x14 -> 1x28x28
deconv4 = brew.conv_transpose(
model,
relu3_g,
"deconv4",
dim_in=128,
dim_out=1,
kernel=2,
stride=2
)
batchnorm4_g = brew.spatial_bn(
model,
deconv4,
"batchnorm4_g",
1,
epsilon=1e-5,
momentum=0.9,
is_test=False
)
tanh = brew.tanh(model, batchnorm4_g, "tanh")
return tanh
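# A minimal sanity-check sketch for the shape comments above: with no padding,
# a ConvTranspose output side is (in - 1) * stride + kernel, so the generator
# grows 2 -> 4 -> 7 -> 14 -> 28 and "tanh" comes out as (batch_size, 1, 28, 28).
def _deconv_out(size, kernel, stride):
    return (size - 1) * stride + kernel
assert _deconv_out(2, 2, 2) == 4    # deconv1
assert _deconv_out(4, 4, 1) == 7    # deconv2
assert _deconv_out(7, 2, 2) == 14   # deconv3
assert _deconv_out(14, 2, 2) == 28  # deconv4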
###############################################################################
# Define Discriminator Real
def DisModel_real(model, data):
# convblock 1 28x28 -> 14x14
conv1_d = brew.conv(model, data, "conv1_d", dim_in=1, dim_out=128, kernel=2, stride=2, pad=0)
batchnorm1_d = brew.spatial_bn(
model,
conv1_d,
"batchnorm1_d",
128,
epsilon=1e-5,
momentum=0.9,
is_test=False
)
lrelu1_d = model.net.LeakyRelu(batchnorm1_d, "lrelu1_d", alpha=alpha)
# convblock 2 14x14 -> 7x7
conv2_d = brew.conv(model, lrelu1_d, "conv2_d", dim_in=128, dim_out=256, kernel=2, stride=2, pad=0)
batchnorm2_d = brew.spatial_bn(
model,
conv2_d,
"batchnorm2_d",
256,
epsilon=1e-5,
momentum=0.9,
is_test=False
)
lrelu2_d = model.net.LeakyRelu(batchnorm2_d, "lrelu2_d", alpha=alpha)
# convblock 3 7x7 -> 4x4
conv3_d = brew.conv(model, lrelu2_d, "conv3_d", dim_in=256, dim_out=512, kernel=1, stride=2, pad=0)
batchnorm3_d = brew.spatial_bn(
model,
conv3_d,
"batchnorm3_d",
512,
epsilon=1e-5,
momentum=0.9,
is_test=False
)
lrelu3_d = model.net.LeakyRelu(batchnorm3_d, "lrelu3_d", alpha=alpha)
# convblock 4 4x4 -> 2x2
conv4_d = brew.conv(model, lrelu3_d, "conv4_d", dim_in=512, dim_out=512, kernel=2, stride=2, pad=0)
batchnorm4_d = brew.spatial_bn(
model,
conv4_d,
"batchnorm4_d",
512,
epsilon=1e-5,
momentum=0.9,
is_test=False
)
lrelu4_d = model.net.LeakyRelu(batchnorm4_d, "lrelu4_d", alpha=alpha)
# Flatten 512x2x2 -> 2048x1x1
reshape_d, oldshape_d = model.net.Reshape(
["lrelu4_d"],
["reshape_d", "oldshape_d"],
shape=(batch_size,2048,1,1))
sigmoid_real = model.net.Sigmoid(reshape_d, "sigmoid_d_real")
return sigmoid_real
###############################################################################
# Define Discriminator Fake
def DisModel_fake(model, data):
# convblock 1 28x28 -> 14x14
conv1_d = brew.conv(model, data, "conv1_d", dim_in=1, dim_out=128, kernel=2, stride=2, pad=0)
batchnorm1_d = brew.spatial_bn(
model,
conv1_d,
"batchnorm1_d",
128,
epsilon=1e-5,
momentum=0.9,
is_test=False
)
lrelu1_d = model.net.LeakyRelu(batchnorm1_d, "lrelu1_d", alpha=alpha)
# convblock 2 14x14 -> 7x7
conv2_d = brew.conv(model, lrelu1_d, "conv2_d", dim_in=128, dim_out=256, kernel=2, stride=2, pad=0)
batchnorm2_d = brew.spatial_bn(
model,
conv2_d,
"batchnorm2_d",
256,
epsilon=1e-5,
momentum=0.9,
is_test=False
)
lrelu2_d = model.net.LeakyRelu(batchnorm2_d, "lrelu2_d", alpha=alpha)
# convblock 3 7x7 -> 4x4
conv3_d = brew.conv(model, lrelu2_d, "conv3_d", dim_in=256, dim_out=512, kernel=1, stride=2, pad=0)
batchnorm3_d = brew.spatial_bn(
model,
conv3_d,
"batchnorm3_d",
512,
epsilon=1e-5,
momentum=0.9,
is_test=False
)
lrelu3_d = model.net.LeakyRelu(batchnorm3_d, "lrelu3_d", alpha=alpha)
# convblock 4 4x4 -> 2x2
conv4_d = brew.conv(model, lrelu3_d, "conv4_d", dim_in=512, dim_out=512, kernel=2, stride=2, pad=0)
batchnorm4_d = brew.spatial_bn(
model,
conv4_d,
"batchnorm4_d",
512,
epsilon=1e-5,
momentum=0.9,
is_test=False
)
lrelu4_d = model.net.LeakyRelu(batchnorm4_d, "lrelu4_d", alpha=alpha)
# Flatten 512x2x2 -> 2048x1x1
reshape_d, oldshape_d = model.net.Reshape(
["lrelu4_d"],
["reshape_d", "oldshape_d"],
shape=(batch_size,2048,1,1))
sigmoid_fake = model.net.Sigmoid(reshape_d, "sigmoid_d_fake")
return sigmoid_fake
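# The same kind of sketch for the discriminator comments: a Conv output side
# with pad 0 is (in - kernel) // stride + 1, confirming 28 -> 14 -> 7 -> 4 -> 2.
def _conv_out(size, kernel, stride):
    return (size - kernel) // stride + 1
assert _conv_out(28, 2, 2) == 14  # conv1_d
assert _conv_out(14, 2, 2) == 7   # conv2_d
assert _conv_out(7, 1, 2) == 4    # conv3_d
assert _conv_out(4, 2, 2) == 2    # conv4_d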
###############################################################################
# Define Training
###############################################################################
# Training Operators for D
def AddTrainingOperators_D(model_r, model_f, sigmoid_r, sigmoid_f, lr=learning_rate):
xent_fake = model_f.net.SigmoidCrossEntropyWithLogits([sigmoid_f, "label_fake"], 'xent_fake')
d_loss_fake = model_f.net.AveragedLoss(xent_fake, "d_loss_fake")
xent_real = model_r.net.SigmoidCrossEntropyWithLogits([sigmoid_r, "label_real"], 'xent_real')
d_loss_real = model_r.net.AveragedLoss(xent_real, "d_loss_real")
d_loss_total_r = model_r.net.Add(["d_loss_real", "d_loss_fake"], "d_loss_total_r")
d_loss_total_f = model_f.net.Add(["d_loss_real", "d_loss_fake"], "d_loss_total_f")
model_r.AddGradientOperators([d_loss_total_r])
model_f.AddGradientOperators([d_loss_total_f])
optimizer.build_adam(model_f, lr)
optimizer.build_adam(model_r, lr)
###############################################################################
# Training Operators for G
def AddTrainingOperators_Gen(model, fake_sigmoid, learning_rate=learning_rate):
xent = model.net.LabelCrossEntropy([fake_sigmoid, "label_g"], 'xent_g')
# compute the expected loss with the help of xent.
loss_g = model.net.AveragedLoss(xent, "loss_g")
    # use the averaged loss to add gradient operators to the model
model.AddGradientOperators([loss_g])
# Use adam
optimizer.build_adam(model, learning_rate)
###############################################################################
# Create Models
###############################################################################
# Create G
generator = model_helper.ModelHelper(name="mnist_gen")
# Create D_real
discriminator_real = model_helper.ModelHelper(
name="mnist_dis_real", arg_scope=arg_scope)
# Create D_fake
discriminator_fake = model_helper.ModelHelper(
name="mnist_dis_fake", arg_scope=arg_scope)
# Apply net-definitions
with core.DeviceScope(device_option):
# Get Data
data = AddInput(
discriminator_real, batch_size=batch_size,
db=os.path.join(data_folder, 'mnist-test-nchw-lmdb'),
db_type='lmdb')
# With Data from noise vector
tanh_gen = GenModel(generator, "noise")
# Only filled with data from MNIST.
true_sigmoid = DisModel_real(discriminator_real, data)
# Only filled with data from G.
fake_sigmoid = DisModel_fake(discriminator_fake, tanh_gen)
# Add Trainingsoperators
# For G
AddTrainingOperators_Gen(generator, fake_sigmoid, learning_rate)
# For D
AddTrainingOperators_D(discriminator_real, discriminator_fake, true_sigmoid, fake_sigmoid, learning_rate)
###############################################################################
#Initialize the network
###############################################################################
workspace.RunNetOnce(discriminator_real.param_init_net)
workspace.CreateNet(discriminator_real.net)
workspace.RunNetOnce(generator.param_init_net)
workspace.CreateNet(generator.net)
workspace.RunNetOnce(discriminator_fake.param_init_net)
workspace.CreateNet(discriminator_fake.net)
print(workspace.Blobs())
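# Debug sketch showing how I checked that the generator gets no gradients
# (this assumes Caffe2's usual <param>_grad naming exposed via param_to_grad):
# run each net once, then dump the largest absolute gradient entry per parameter.
workspace.RunNet(discriminator_real.net)
workspace.RunNet(discriminator_fake.net)
workspace.RunNet(generator.net)
for param, grad in generator.param_to_grad.items():
    if workspace.HasBlob(str(grad)):
        g = workspace.FetchBlob(str(grad))
        print("{}: max |grad| = {}".format(param, np.abs(g).max()))
    else:
        print("{}: gradient blob {} was never created".format(param, grad))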
###############################################################################
# Run the training procedure
###############################################################################
# number of epochs to run
epochs = 10
steps = int(600 / batch_size)  # steps per epoch (600 samples per epoch here, not the full MNIST set)
# Containers for plotting progress
loss_d = np.zeros(steps)
loss_g = np.zeros(steps)
images = np.empty((batch_size,1,28,28), np.float32)
print("Total Number of Runs: {}".format(epochs * steps))
for e in range(epochs):
    # For debugging, print G's output. Bug: G does not update; the values stay the same.
tanh_out = workspace.FetchBlob('tanh')
#print(tanh_out[0][0][0])
for i in range(steps):
# Train D
workspace.RunNet(discriminator_real.net)
workspace.RunNet(discriminator_fake.net)
# Noise Data Input for the Generator
noise = np.random.randn(batch_size, 100).astype(np.float32)
workspace.FeedBlob("noise", noise)
# Train G
workspace.RunNet(generator.net)
loss_d[i] = workspace.FetchBlob("d_loss_total_r")
loss_g[i] = workspace.FetchBlob("loss_g")
        # Uncomment again after debugging.
"""if (i % 50) == 0:
print("Round: {}".format(i * (e+1)))
print("LOSS D")
print(workspace.FetchBlob("d_loss_total_r"))
print("LOSS G")
print(workspace.FetchBlob("loss_g")) """
###############################################################################
# After the execution is done, let's plot the values.
print("Final D loss: {}".format(workspace.FetchBlob("d_loss_total_r")))
print("Final G loss: {}".format(workspace.FetchBlob("loss_g")))
pyplot.plot(loss_d, 'b')
pyplot.plot(loss_g, 'r')
pyplot.title("Summary of Training Run")
pyplot.xlabel("Iteration")
pyplot.legend(('Loss_d', 'Loss_g'), loc='upper right')
# Plot G Results
# Use visualize module to show the examples from the last batch that was fed to the model
tanh_out = workspace.FetchBlob('tanh')
pyplot.figure()
pyplot.title("Last Batch from G")
_ = visualize.NCHW.ShowMultiple(tanh_out)
# Commented out only for debugging. Re-enable later!
#pyplot.show()
###############################################################################
And here are the networks' blobs.
Thanks everyone! =)