microsoft / CNTK

Microsoft Cognitive Toolkit (CNTK), an open source deep-learning toolkit
https://docs.microsoft.com/cognitive-toolkit/

Can't continue training (CURAND failure 201) #3567

Open Aslyamovt opened 5 years ago

Aslyamovt commented 5 years ago

Hello, I have a problem continuing training of my pretrained model. I am trying to restore the training progress from a checkpoint in two ways. If I use this:

    class LSTM
    {
        internal struct InputModel
        {
            public Variable InputSequenceForward;
            public Variable LabelSequence;
        }

        InputModel CreateInputs()
        {
            var fiaxis = Axis.NewUniqueDynamicAxis("finputAxis");
            var oaxis = new Axis("outputAxis", false); // unused here
            var finputSequence = Variable.InputVariable(new int[] { 500 }, DataType.Float, "features", new List<Axis> { fiaxis, Axis.DefaultBatchAxis() });
            var labels = Variable.InputVariable(new int[] { 2 }, DataType.Float, "labels", new List<Axis> { Axis.DefaultBatchAxis() });
            var inputModel = new InputModel
            {
                InputSequenceForward = finputSequence,
                LabelSequence = labels
            };
            return inputModel;
        }

        void train()
        {
            SetDevice(); // method for choosing the device (sets the 'device' field)
            var inputModel = CreateInputs(); // creates the inputs and labels (the same method I used when I created the model)
            var model = Function.Load("newmodels/http_epoch30.dnn", device);
            var crossEntropy = CNTKLib.CrossEntropyWithSoftmax(model, inputModel.LabelSequence);
            var errors = CNTKLib.ClassificationError(model, inputModel.LabelSequence);
            var learningRatePerSample = new TrainingParameterScheduleDouble(0.01, 1);
            var learner = CNTKLib.AdaGradLearner(new ParameterVector(model.Parameters().ToList()), learningRatePerSample);
            var trainer = Trainer.CreateTrainer(model, crossEntropy, errors, new List<Learner>() { learner });
            trainer.RestoreFromCheckpoint("checkpoints/http_epochCheck30.dnn");

            int epochs = 42;
            int minibatchesPerEpoch = 23000;
            for (int i = 30; i < epochs; i++)
            {
                var start = DateTime.Now;
                Console.WriteLine($"Running training on epoch {i + 1} of {epochs}");
                for (int j = 0; j < minibatchesPerEpoch; j++)
                {
                    var trainingData = GetData(j); // method for loading data (I'm sure the problem is not here because I trained the first 30 epochs with it)
                    var arguments = new Dictionary<Variable, Value>();
                    var features1 = Value.CreateSequence<float>(inputModel.InputSequenceForward.Shape,
                        trainingData.InputSequenceForward, device);
                    arguments.Add(inputModel.InputSequenceForward, features1);
                    var labels = Value.CreateSequence(inputModel.LabelSequence.Shape,
                        trainingData.OutputSequence, device);
                    arguments.Add(inputModel.LabelSequence, labels);
                    trainer.TrainMinibatch(arguments, true, device);
                }
            }
        }
    }

I have "Values for 1 required arguments 'Input('features', [500], [finputAxis, #])', that the requested output(s) 'Output('aggregateLoss', [], []), Output('aggregateEvalMetric', [], []), Output('Block67_Output_0', [1], [#])' depend on, have not been provided." exception. If I try this:

    internal struct InputModel
    {
        public Variable InputSequenceForward;
        public Variable LabelSequence;
    }

    InputModel CreateInputs()
    {
        var fiaxis = Axis.NewUniqueDynamicAxis("finputAxis");
        var oaxis = new Axis("outputAxis", false); // unused here
        var finputSequence = Variable.InputVariable(new int[] { 500 }, DataType.Float, "features", new List<Axis> { fiaxis, Axis.DefaultBatchAxis() });
        var labels = Variable.InputVariable(new int[] { 2 }, DataType.Float, "labels", new List<Axis> { Axis.DefaultBatchAxis() }); // oaxis removed from the axis list
        var inputModel = new InputModel
        {
            InputSequenceForward = finputSequence,
            LabelSequence = labels
        };
        return inputModel;
    }

    void train()
    {
        SetDevice(); // method for choosing the device (sets the 'device' field)
        var inputModel = CreateInputs(); // creates the inputs and labels (the same method I used when I created the model)
        var modelSequence = CreateModel(2, 1, 216);                  // the same methods I used
        var model = modelSequence(inputModel.InputSequenceForward);  // to create the model originally
        var crossEntropy = CNTKLib.CrossEntropyWithSoftmax(model, inputModel.LabelSequence);
        var errors = CNTKLib.ClassificationError(model, inputModel.LabelSequence);
        var learningRatePerSample = new TrainingParameterScheduleDouble(0.01, 1);
        var learner = CNTKLib.AdaGradLearner(new ParameterVector(model.Parameters().ToList()), learningRatePerSample);
        var trainer = Trainer.CreateTrainer(model, crossEntropy, errors, new List<Learner>() { learner });
        trainer.RestoreFromCheckpoint("checkpoints/http_epochCheck30.dnn");

        int epochs = 42;
        int minibatchesPerEpoch = 23000;
        for (int i = 30; i < epochs; i++)
        {
            var start = DateTime.Now;
            Console.WriteLine($"Running training on epoch {i + 1} of {epochs}");
            for (int j = 0; j < minibatchesPerEpoch; j++)
            {
                var trainingData = GetData(j); // method for loading data (I'm sure the problem is not here because I trained the first 30 epochs with it)
                var arguments = new Dictionary<Variable, Value>();
                var features1 = Value.CreateSequence<float>(inputModel.InputSequenceForward.Shape,
                    trainingData.InputSequenceForward, device);
                arguments.Add(inputModel.InputSequenceForward, features1);
                var labels = Value.CreateSequence(inputModel.LabelSequence.Shape,
                    trainingData.OutputSequence, device);
                arguments.Add(inputModel.LabelSequence, labels);
                trainer.TrainMinibatch(arguments, true, device);
            }
        }
    }

I have "CURAND failure 201: (see curand.h & look for curandStatus or CURAND_STATUS_xxx) ; GPU=0 ; hostname=DEXP ; expr=curandGenerateUniformHelper(gpuRNGHandle->Generator(), Data(), GetNumElements())". This way worked when tried to continue training my previos 20-30 networks. I think problem could be somewhere in BatchNormalization(), because it is the only one unit I added into the model.

My stack: CNTK 2.5.1, CUDA 9.0, cuDNN 7.4.1.5, VS 2015.

Device: NVIDIA GeForce 940M. OS: Windows 8.1. I will be grateful for any help.
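For reference, the resume protocol these snippets rely on pairs Trainer.SaveCheckpoint with Trainer.RestoreFromCheckpoint. A minimal sketch of the end-of-epoch bookkeeping, reusing the model, trainer, and file-name pattern from the code above (the paths match the poster's; the rest is an assumption):

    // At the end of epoch i: persist the weights and the full trainer state
    // (learner state included) so training can be resumed later.
    model.Save($"newmodels/http_epoch{i + 1}.dnn");
    trainer.SaveCheckpoint($"checkpoints/http_epochCheck{i + 1}.dnn");

    // On resume: rebuild the graph and the trainer exactly as before, then
    // restore the saved state before the first TrainMinibatch call.
    trainer.RestoreFromCheckpoint($"checkpoints/http_epochCheck{i + 1}.dnn");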

Aslyamovt commented 5 years ago

UPD. I read some issues and examples and understood that the second way is better. By elimination I found that the problem is in the Dropout layer. Could somebody explain how to resume training a network with a Dropout layer on the GPU? (I don't have this problem on the CPU.)
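Dropout is the one node here whose forward pass draws random numbers, and on GPU that mask generation goes through cuRAND, which matches the curandGenerateUniformHelper call in the failure message. One thing worth trying, sketched below under the assumption that this binding exposes the seed overload of CNTKLib.Dropout, is rebuilding the dropout node with an explicit seed so its RNG state is created fresh on resume; the rate, input, and name are placeholders, not the poster's actual values:

    // Hypothetical: rebuild the dropout node with a fixed seed instead of the
    // auto-selected one. 'previousLayerOutput' stands in for the output of the
    // layer that feeds the dropout node in the real model.
    Variable h = previousLayerOutput;   // placeholder
    const uint dropoutSeed = 4711;      // any fixed value
    Function dropped = CNTKLib.Dropout(h, 0.2, dropoutSeed, "drop1");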