training with nengo-dl - GitHub issues

victkid commented 5 years ago

Hi,

I'm trying to use nengo-dl to train on a custom dataset for a classification task. The input is a 224x224 greyscale image, and the output is one of 56 classes. I trained on my data with a VGG-like CNN architecture in Keras; it converges to 90% accuracy without any fine-tuning or data augmentation. I used the same architecture in Nengo, but it does not seem to converge. I'm new to this framework and only changed a few lines from the MNIST example. Could you help identify possible issues with my code?


# Read the JSON run configuration from the working directory.
with open('config.json', 'r') as config_file:
    cfg = json.load(config_file)

# Author's note on the loaded arrays:
# x_train shape: (n_samples, 224*224), y_train shape: (n_samples, 56) - onehot encoded
input_path = os.path.join(cfg['root_path'], "train_test_data")
x_train, y_train = load_data(input_path)

# Spatial size of one input image; these two names are updated further down
# as layers are stacked.
h = w = 224

with nengo.Network() as net:
    # Defaults applied to every Ensemble created in this network: a single
    # possible max firing rate (100) and a single possible intercept (0).
    net.config[nengo.Ensemble].max_rates = nengo.dists.Choice([100])
    net.config[nengo.Ensemble].intercepts = nengo.dists.Choice([0])
    neuron_type = nengo.LIF(amplitude=0.01)

    # Nengo objects default to non-trainable.
    # NOTE(review): the TensorFlow layers below presumably keep their own
    # trainable variables -- confirm against the nengo-dl docs.
    nengo_dl.configure_settings(trainable=False)

    # Number of filters in the first conv stage; later stages use multiples
    # of it. This is NOT the convolution kernel size, which is fixed at 3
    # in the conv2d calls below.
    base_filters = 32
    inp = nengo.Node([0] * h * w)

    # Filter count of each conv layer, grouped by stage. Every conv layer is
    # followed by a neuron layer, and every stage ends with 2x2 average
    # pooling.
    stages = [
        (base_filters, base_filters),
        (base_filters * 2, base_filters * 2),
        (base_filters * 4, base_filters * 4, base_filters * 4),
        (base_filters * 8, base_filters * 8, base_filters * 8),
    ]

    x = inp
    channels = 1  # the first conv layer is given shape_in=(h, w, 1)
    for stage_filters in stages:
        for n_filters in stage_filters:
            x = nengo_dl.tensor_layer(
                x, tf.layers.conv2d, shape_in=(h, w, channels),
                filters=n_filters, kernel_size=3)
            x = nengo_dl.tensor_layer(x, neuron_type)
            # Track the shape that the next layer's shape_in must declare:
            # each conv layer is assumed to trim 2 from both spatial dims.
            h, w = h - 2, w - 2
            channels = n_filters
        x = nengo_dl.tensor_layer(
            x, tf.layers.average_pooling2d, shape_in=(h, w, channels),
            pool_size=2, strides=2)
        # Pooling with size 2 / stride 2 is tracked as halving (floor).
        h, w = h // 2, w // 2

    # linear readout
    x = nengo_dl.tensor_layer(x, tf.layers.dense, units=56)

    # Two probes on the readout: one unfiltered, one with synapse=0.1.
    out_p = nengo.Probe(x)
    out_p_filt = nengo.Probe(x, synapse=0.1)

minibatch_size = 8
sim = nengo_dl.Simulator(net, minibatch_size=minibatch_size)

# Training feed: a length-1 axis is inserted at position 1 of each array
# (the single timestep).
train_data = {
    inp: x_train[:, None, :],
    out_p: y_train[:, None, :],
}

# Evaluation feed: the first n_test samples of the *training* arrays, each
# repeated n_steps times along the inserted axis. Targets are keyed on the
# filtered probe.
n_steps = 30
n_test = 1000
tile_reps = (1, n_steps, 1)
test_inputs = x_train[:n_test, None, :]
test_targets = y_train[:n_test, None, :]
test_data = {
    inp: np.tile(test_inputs, tile_reps),
    out_p_filt: np.tile(test_targets, tile_reps),
}

def objective(outputs, targets):
    """Return the softmax cross-entropy of ``outputs`` against ``targets``.

    ``outputs`` are passed as the logits and ``targets`` as the labels of
    ``tf.nn.softmax_cross_entropy_with_logits_v2``.
    """
    loss = tf.nn.softmax_cross_entropy_with_logits_v2(
        logits=outputs, labels=targets)
    return loss

# Optimizer used for training: plain gradient descent with a fixed step size.
# The RMSProp alternative is kept disabled for reference:
# opt = tf.train.RMSPropOptimizer(learning_rate=0.001)
learning_rate = 1e-4
opt = tf.train.GradientDescentOptimizer(learning_rate=learning_rate)

def classification_error(outputs, targets):
    """Return the percentage of mismatched class predictions.

    Only the last entry along axis 1 of ``outputs`` and ``targets`` is
    used; the class index is the argmax over the final axis of that slice.
    """
    predicted = tf.argmax(outputs[:, -1], axis=-1)
    expected = tf.argmax(targets[:, -1], axis=-1)
    mismatches = tf.cast(tf.not_equal(predicted, expected), tf.float32)
    return 100 * tf.reduce_mean(mismatches)

# print("error before training: %.2f%%" % sim.loss(
#     test_data, {out_p_filt: classification_error}))

do_training = True
epochs = 100
# Checkpoint path prefix; each epoch's parameters are written to
# "<weights_name>_<epoch index>".
weights_name = "./data/temp3"
weights_name_ep = weights_name
try:
    if do_training:
        # Train one epoch at a time so a checkpoint is written after each.
        for i in range(epochs):
            # On the first pass this picks up an existing "./data/temp3"
            # checkpoint, if any.
            # NOTE(review): on later passes it reloads the file saved at the
            # end of the previous iteration, which is presumably redundant
            # because the simulator already holds those parameters -- confirm
            # before removing.
            if os.path.exists(weights_name_ep + ".index"):
                sim.load_params(weights_name_ep)
                print("load", weights_name_ep)
            sim.train(train_data, opt, objective={out_p: objective}, n_epochs=1)
            # save the parameters to file
            weights_name_ep = weights_name + "_" + str(i)
            sim.save_params(weights_name_ep)
finally:
    # Always release the simulator, even if training, loading or saving
    # raised part-way through.
    sim.close()

drasmuss commented 5 years ago

I notice that you're using LIF neurons, which are probably significantly different than what you were using in Keras. I'd start out by using the same nonlinearity as you were using before (probably nengo.RectifiedLinear) and try to replicate the original performance, to make sure that the model has been accurately translated to the Nengo syntax. And then gradually start changing elements like the nonlinearity to see how that impacts performance.

victkid commented 5 years ago

Hi Daniel,

Thank you for your reply. Do you know why the training in Nengo takes more time than in DNN frameworks? The model I used in Keras takes about 2 minutes to train one epoch, the same model in Nengo takes up to 4 hours to finish one epoch. Now I changed the neuron type to Rectified Linear to make it match with the "Relu" activation I used in Keras. The softmax loss I observed in Nengo is still very different than the loss from Keras. Does the model still consider as SNN models after changing the neuron type to Relu?

drasmuss commented 5 years ago

> Do you know why the training in Nengo takes more time than in DNN frameworks? The model I used in Keras takes about 2 minutes to train one epoch, the same model in Nengo takes up to 4 hours to finish one epoch

We'd expect Nengo to be a little bit slower than a DNN framework when doing DNN tasks, since Nengo is optimized for simulating SNN networks (in particular, simulating networks over time), as opposed to DNN frameworks, which are optimized for simulating standard DNNs. However, that difference should be on the order of 1-2x, not like the times you're seeing.

When I run your code, it takes about 1.5 minutes per epoch, which seems more in line with what you were expecting.

If I had to guess, I'd say that you're accidentally running your training on the CPU, rather than GPU. Make sure that you have tensorflow-gpu installed in whatever environment you're using, not tensorflow.

> The softmax loss I observed in Nengo is still very different than the loss from Keras. Does the model still consider as SNN models after changing the neuron type to Relu?

When using relus the output should be exactly the same as in Keras (assuming your keras model was using relu's as well). If the output is not the same, that tells us that something about the original keras model has not been translated into nengo-dl. Perhaps some input pre-processing?

One thing you could try, to help with debugging your model, is to take your entire Keras model and embed it inside a TensorNode (like in this example https://www.nengo.ai/nengo-dl/examples/tensorflow-models.html). Make sure that you get the same performance there as in Keras, then you'll know that you've captured everything about the input/output processing in that model. Then you can gradually start separating the model out into multiple tensor_layers, and make sure that performance remains unchanged.

drasmuss commented 5 years ago

Closing this since the general questions are answered I think. Feel free to reopen if that is not the case!

nengo / nengo-dl

training with nengo-dl #102