deeplearning4j / deeplearning4j

Suite of tools for deploying and training deep learning models using the JVM. Highlights include model import for keras, tensorflow, and onnx/pytorch, a modular and tiny c++ library for running math code and a java based math library on top of the core c++ library. Also includes samediff: a pytorch/tensorflow like library for running deep learn...
http://deeplearning4j.konduit.ai
Apache License 2.0

gradient normalisation breaks cudnn #6539

Closed frakc closed 5 years ago

frakc commented 5 years ago

Training with gradient normalisation: http://dl3.joxi.net/drive/2018/10/04/0030/1076/1967156/56/9e7ab7f3d2.jpg
Training of the same model without gradient normalisation: http://joxi.ru/v29x444T3JM1am

version

  compile "org.deeplearning4j:deeplearning4j-core:${dl4j_version}"
  compile "org.deeplearning4j:deeplearning4j-ui_2.11:${dl4j_version}"
  compile "org.nd4j:nd4j-cuda-9.2-platform:${dl4j_version}"
  compile "org.deeplearning4j:deeplearning4j-cuda-9.2:${dl4j_version}"
  dl4j_version = "1.0.0-beta2"

model

package etl;

import java.io.File;
import java.io.IOException;
import java.util.Date;
import java.util.List;
import java.util.Objects;
import java.util.Random;
import org.datavec.api.io.filters.RandomPathFilter;
import org.datavec.api.io.labels.ParentPathLabelGenerator;
import org.datavec.api.split.FileSplit;
import org.datavec.api.split.InputSplit;
import org.datavec.image.loader.NativeImageLoader;
import org.datavec.image.recordreader.ImageRecordReader;
import org.deeplearning4j.api.storage.StatsStorage;
import org.deeplearning4j.datasets.datavec.RecordReaderDataSetIterator;
import org.deeplearning4j.eval.Evaluation;
import org.deeplearning4j.nn.conf.GradientNormalization;
import org.deeplearning4j.nn.conf.MultiLayerConfiguration;
import org.deeplearning4j.nn.conf.NeuralNetConfiguration;
import org.deeplearning4j.nn.conf.distribution.Distribution;
import org.deeplearning4j.nn.conf.distribution.GaussianDistribution;
import org.deeplearning4j.nn.conf.distribution.NormalDistribution;
import org.deeplearning4j.nn.conf.inputs.InputType;
import org.deeplearning4j.nn.conf.layers.ConvolutionLayer;
import org.deeplearning4j.nn.conf.layers.DenseLayer;
import org.deeplearning4j.nn.conf.layers.LocalResponseNormalization;
import org.deeplearning4j.nn.conf.layers.OutputLayer;
import org.deeplearning4j.nn.conf.layers.SubsamplingLayer;
import org.deeplearning4j.nn.multilayer.MultiLayerNetwork;
import org.deeplearning4j.nn.weights.WeightInit;
import org.deeplearning4j.ui.api.UIServer;
import org.deeplearning4j.ui.stats.StatsListener;
import org.deeplearning4j.ui.storage.InMemoryStatsStorage;
import org.deeplearning4j.util.ModelSerializer;
import org.nd4j.linalg.activations.Activation;
import org.nd4j.linalg.dataset.DataSet;
import org.nd4j.linalg.dataset.api.iterator.DataSetIterator;
import org.nd4j.linalg.dataset.api.preprocessor.DataNormalization;
import org.nd4j.linalg.dataset.api.preprocessor.ImagePreProcessingScaler;
import org.nd4j.linalg.learning.config.Adam;
import org.nd4j.linalg.lossfunctions.LossFunctions;
import org.nd4j.linalg.schedule.ScheduleType;
import org.nd4j.linalg.schedule.StepSchedule;

import static java.lang.Math.toIntExact;

public class ButtonNN {
  private static int height = 100;
  private static int width = 100;
  private static int channels = 1;
  private static int batchSize = 512;

  private static long seed = 123;
  private static Random rng = new Random(seed);
  private static final int epochs = 500;
  private static final double splitTrainTest = 0.9;
  private static final boolean save = true;
  private static int numLabels;

  private static void run() throws Exception {

    /*
      Data Setup -> organize and limit data file paths:
       - mainPath = path to image files
       - fileSplit = define basic dataset split with limits on format
       - pathFilter = define additional file load filter to limit size and balance batch content
     */
    ParentPathLabelGenerator labelMaker = new ParentPathLabelGenerator();
    File mainPath = new File("E:\\evestaar\\buttons_randomised");
    FileSplit fileSplit = new FileSplit(mainPath, NativeImageLoader.ALLOWED_FORMATS, rng);
    int numExamples = toIntExact(fileSplit.length());
    numLabels = Objects.requireNonNull(fileSplit.getRootDir()
        .listFiles(
            File::isDirectory)).length; //This only works if your root is clean: only label subdirs.
    RandomPathFilter
        pathFilter =
        new RandomPathFilter(rng, "png");

    /*
      Data Setup -> train test split
       - inputSplit = define train and test split
     */
    InputSplit[] inputSplit = fileSplit.sample(pathFilter, splitTrainTest, 1 - splitTrainTest);
    InputSplit trainData = inputSplit[0];
    InputSplit testData = inputSplit[1];

    /*
      Data Setup -> normalization
       - how to normalize images and generate large dataset to train on
     */
    DataNormalization scaler = new ImagePreProcessingScaler(0, 1);

    System.out.println("Build model....");

    /*
      Data Setup -> define how to load data into net:
       - recordReader = the reader that loads and converts image data pass in inputSplit to initialize
       - dataIter = a generator that only loads one batch at a time into memory to save memory
       - trainIter = uses MultipleEpochsIterator to ensure model runs through the data for all epochs
     */
    ImageRecordReader recordReader = new ImageRecordReader(height, width, channels, labelMaker);
    DataSetIterator dataIter;

    System.out.println("Train model....");
    recordReader.initialize(trainData);
    dataIter = new RecordReaderDataSetIterator(recordReader, batchSize, 1, numLabels);
    dataIter.setPreProcessor(scaler);
    int index = 0;
    MultiLayerNetwork network = alexnetModel();
    network.init();
    //MultiLayerNetwork network = ModelSerializer.restoreMultiLayerNetwork(" basePath + \"model_\" + index + \".bin\"");
    UIServer uiServer = UIServer.getInstance();
    StatsStorage statsStorage = new InMemoryStatsStorage();
    uiServer.attach(statsStorage);
    network.setListeners(new StatsListener(statsStorage, 10));
    while (index < epochs) {
      System.out.println("epoch " + index + " " + new Date());
      network.fit(dataIter);
      StepSchedule step = new StepSchedule(ScheduleType.ITERATION, 1e-2, 0.1, 20000);
      System.out.println(
          "score " + network.score() + " lr " + step.valueAt(network.getIterationCount(), 0));
      dataIter.reset();
      if (save) {
        String basePath = "E:\\evestaar\\buttonNN";
        ModelSerializer.writeModel(network, basePath + "model_" + index + ".bin", true);
      }
      index++;
    }

    _evaluate(testData, scaler, network, recordReader, index);
  }

  private static void _evaluate(InputSplit testData, DataNormalization scaler,
      MultiLayerNetwork network,
      ImageRecordReader recordReader, int index) throws IOException {
    DataSetIterator dataIter;
    System.out.println("Evaluate model....");
    recordReader.initialize(testData);
    dataIter = new RecordReaderDataSetIterator(recordReader, batchSize, 1, numLabels);
    scaler.fit(dataIter);
    dataIter.setPreProcessor(scaler);
    Evaluation eval = network.evaluate(dataIter);
    System.out.println(eval.stats(true));

    // Example on how to get predict results with trained model. Result for first example in minibatch is printed
    dataIter.reset();
    DataSet testDataSet = dataIter.next();
    List<String> allClassLabels = recordReader.getLabels();
    int labelIndex = testDataSet.getLabels().argMax(1).getInt(0);
    int[] predictedClasses = network.predict(testDataSet.getFeatures());
    String expectedResult = allClassLabels.get(labelIndex);
    String modelPrediction = allClassLabels.get(predictedClasses[0]);
    System.out.print("\nFor a single example that is labeled "
        + expectedResult
        + " the model predicted "
        + modelPrediction
        + "\n\n");

    System.out.println("****************Example finished********************");
  }

  private static ConvolutionLayer convInit(String name, int in, int out, int[] kernel, int[] stride,
      int[] pad, double bias) {
    return new ConvolutionLayer.Builder(kernel, stride, pad).name(name)
        .nIn(in)
        .nOut(out)
        .biasInit(bias)
        .build();
  }

  private static ConvolutionLayer conv3x3(String name, int out, double bias) {
    return new ConvolutionLayer.Builder(new int[] { 3, 3 }, new int[] { 1, 1 },
        new int[] { 1, 1 }).name(name).nOut(out).biasInit(bias).build();
  }

  private static ConvolutionLayer conv5x5(String name, int out, int[] stride, int[] pad,
      double bias) {
    return new ConvolutionLayer.Builder(new int[] { 5, 5 }, stride, pad).name(name)
        .nOut(out)
        .biasInit(bias)
        .build();
  }

  private static SubsamplingLayer maxPool(String name, int[] kernel) {
    return new SubsamplingLayer.Builder(kernel, new int[] { 2, 2 }).name(name).build();
  }

  private static DenseLayer fullyConnected(String name, int out, double bias, double dropOut,
      Distribution dist) {
    return new DenseLayer.Builder().name(name)
        .nOut(out)
        .biasInit(bias)
        .dropOut(dropOut)
        .dist(dist)
        .build();
  }

  public static MultiLayerNetwork alexnetModel() {
    /*
      AlexNet model interpretation based on the original paper ImageNet Classification with Deep Convolutional Neural Networks
      and the imagenetExample code referenced.
      http://papers.nips.cc/paper/4824-imagenet-classification-with-deep-convolutional-neural-networks.pdf
     */

    double nonZeroBias = 1;
    double dropOut = 0.5;

    MultiLayerConfiguration conf = new NeuralNetConfiguration.Builder()
        .seed(seed)
        .weightInit(WeightInit.DISTRIBUTION)
        .dist(new NormalDistribution(0.0, 0.01))
        .activation(Activation.RELU)
        .updater(new Adam(5e-3))
        //.biasUpdater(new Nesterovs(new StepSchedule(ScheduleType.ITERATION, 2e-2, 0.1, 20000), 0.9))
        //.gradientNormalization(
        //    GradientNormalization.RenormalizeL2PerLayer) // normalize to prevent vanishing or exploding gradients
        .l2(5 * 1e-4)
        .list()
        .layer(0, convInit("cnn1", channels, 96, new int[] { 11, 11 }, new int[] { 4, 4 },
            new int[] { 3, 3 }, 0))
        .layer(1, new LocalResponseNormalization.Builder().name("lrn1").build())
        .layer(2, maxPool("maxpool1", new int[] { 3, 3 }))
        .layer(3, conv5x5("cnn2", 256, new int[] { 1, 1 }, new int[] { 2, 2 }, nonZeroBias))
        .layer(4, new LocalResponseNormalization.Builder().name("lrn2").build())
        .layer(5, maxPool("maxpool2", new int[] { 3, 3 }))
        .layer(6, conv3x3("cnn3", 384, 0))
        .layer(7, conv3x3("cnn4", 384, nonZeroBias))
        .layer(8, conv3x3("cnn5", 256, nonZeroBias))
        .layer(9, maxPool("maxpool3", new int[] { 3, 3 }))
        .layer(10,
            fullyConnected("ffn1", 4096, nonZeroBias, dropOut, new GaussianDistribution(0, 0.005)))
        .layer(11,
            fullyConnected("ffn2", 4096, nonZeroBias, dropOut, new GaussianDistribution(0, 0.005)))
        .layer(12, new OutputLayer.Builder(LossFunctions.LossFunction.NEGATIVELOGLIKELIHOOD)
            .name("output")
            .nOut(numLabels)
            .activation(Activation.SOFTMAX)
            .build())
        .backprop(true)
        .pretrain(false)
        .setInputType(InputType.convolutional(height, width, channels))
        .build();

    return new MultiLayerNetwork(conf);
  }

  public static void main(String[] args) throws Exception {
    run();
  }
}
AlexDBlack commented 5 years ago

Thanks for the detailed bug report. I'll try to look at this in the next 1-3 days.

AlexDBlack commented 5 years ago

OK, so the problem wasn't actually CuDNN at all. The problem here is numerical underflow in the softmax that occurs only when gradient normalization is enabled. Specifically, after a couple of iterations the values going into the softmax differ by about 9 orders of magnitude (1e5 for the largest vs. 1e-4 for the rest). This underflows to probabilities of exactly 1.0 and 0.0, which leads to 0 gradients. Gradient normalization then does grad/l2(grad), which is 0.0/0.0 = NaN in this case.
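
Below is a minimal, standalone ND4J sketch of that failure mode (not the actual dl4j internals; the class name and array values are made up for illustration): extreme pre-softmax activations underflow to exact 0/1 probabilities, the resulting gradient is all zeros, and dividing by its L2 norm yields NaN.

import org.nd4j.linalg.api.ndarray.INDArray;
import org.nd4j.linalg.factory.Nd4j;
import org.nd4j.linalg.ops.transforms.Transforms;

public class SoftmaxUnderflowDemo {
  public static void main(String[] args) {
    // Pre-softmax values differing by ~9 orders of magnitude, as in this issue
    INDArray logits = Nd4j.create(new double[] { 1e5, 1e-4, 1e-4, 1e-4 }).reshape(1, 4);
    INDArray probs = Transforms.softmax(logits); // underflows to exactly [1.0, 0.0, 0.0, 0.0]
    System.out.println(probs);

    // With a one-hot label on the first class, dL/dz = probs - labels = all zeros
    INDArray labels = Nd4j.create(new double[] { 1, 0, 0, 0 }).reshape(1, 4);
    INDArray grad = probs.sub(labels);

    // A RenormalizeL2PerLayer-style step divides the gradient by its L2 norm:
    // 0.0 / 0.0 = NaN, which then poisons the whole update
    double l2 = grad.norm2Number().doubleValue();
    System.out.println("l2 = " + l2 + ", normalized grad = " + grad.div(l2));
  }
}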

Fixed here: https://github.com/deeplearning4j/deeplearning4j/pull/6546

lock[bot] commented 5 years ago

This thread has been automatically locked since there has not been any recent activity after it was closed. Please open a new issue for related bugs.