tensorflow / swift-apis

Swift for TensorFlow Deep Learning Library
Apache License 2.0
795 stars 135 forks source link

adamax/AMSGrad not working with differentiableReduce #757

Open brettkoonce opened 4 years ago

brettkoonce commented 4 years ago

all credit to @s1ddok for reporting this in https://github.com/tensorflow/swift-models/pull/394!

Something is failing (I think in the block-based reduces) with the AdaMax/AMSGrad optimizers:

[edit: updated 2020-05-17 to match latest swift-models api changes, still failing against master]

import Datasets
import ImageClassificationModels
import TensorFlow

// Reproduction script: trains VGG16 on Imagewoof with the AdaMax optimizer and
// reports test accuracy and loss after every epoch.
let batchSize = 32

let dataset = Imagewoof(batchSize: batchSize, inputSize: .full, outputSize: 224)
var model = VGG16(classCount: 10)
let optimizer = AdaMax(for: model)

print("Starting training...")

for epoch in 1...90 {
    // Override the learning rate once training is past epochs 30 and 60.
    if epoch > 30 { optimizer.learningRate = 0.002 }
    if epoch > 60 { optimizer.learningRate = 0.0002 }

    // MARK: Training pass
    Context.local.learningPhase = .training
    var trainingLossSum: Float = 0
    var trainingBatchCount = 0
    for batch in dataset.training.sequenced() {
        let images = batch.first
        let labels = batch.second
        let (loss, gradients) = valueWithGradient(at: model) { classifier -> Tensor<Float> in
            let logits = classifier(images)
            return softmaxCrossEntropy(logits: logits, labels: labels)
        }
        trainingLossSum += loss.scalarized()
        trainingBatchCount += 1
        // The reported stack trace shows the precondition failure inside
        // `AdaMax.update(_:along:)`, i.e. this call.
        optimizer.update(&model, along: gradients)
    }

    // MARK: Evaluation pass
    Context.local.learningPhase = .inference
    var testLossSum: Float = 0
    var testBatchCount = 0
    var correctGuessCount = 0
    var totalGuessCount = 0
    for batch in dataset.test.sequenced() {
        let images = batch.first
        let labels = batch.second
        let logits = model(images)
        testLossSum += softmaxCrossEntropy(logits: logits, labels: labels).scalarized()
        testBatchCount += 1

        // Count the samples whose arg-max class equals the label.
        let correctPredictions = logits.argmax(squeezingAxis: 1) .== labels
        correctGuessCount += Int(Tensor<Int32>(correctPredictions).sum().scalarized())
        totalGuessCount += images.shape[0]
    }

    let accuracy = Float(correctGuessCount) / Float(totalGuessCount)
    let meanTestLoss = testLossSum / Float(testBatchCount)
    print(
        """
        [Epoch \(epoch)] \
        Accuracy: \(correctGuessCount)/\(totalGuessCount) (\(accuracy)) \
        Loss: \(meanTestLoss)
        """
    )
}

full trace:

Precondition failed: Count mismatch: 2 and 0: file /home/skoonce/swift/swift-source/tensorflow-swift-apis/Sources/TensorFlow/StdlibExtensions.swift, line 269
Current stack trace:                                                            
0    libswiftCore.so                    0x00007f91aeca6910 swift_reportError + $0
1    libswiftCore.so                    0x00007f91aed17b90 _swift_stdlib_reportFatalErrorInFile + 115
2    libswiftCore.so                    0x00007f91ae9f7e31 <unavailable> + 1461809
3    libswiftCore.so                    0x00007f91ae9f7a77 <unavailable> + 1460855
4    libswiftCore.so                    0x00007f91ae9f8012 <unavailable> + 1462290
5    libswiftCore.so                    0x00007f91ae9f6460 _assertionFailure(_:_:file:line:flags:) + 517
6    libswiftTensorFlow.so              0x00007f91af00e720 static Array.DifferentiableView<A>..* infix(_:_:) + 977
7    MobileNet-Imagenette               0x00005560555c06b4 <unavailable> + 4044468
8    MobileNet-Imagenette               0x00005560555c8365 <unavailable> + 4076389
9    MobileNet-Imagenette               0x00005560555d16d2 <unavailable> + 4114130
10   libswiftCore.so                    0x00007f91aec80980 dispatch thunk of static SignedInteger._maskingSubtract(_:_:) + 7
11   libswiftCore.so                    0x00007f91aec9a2e0 dispatch thunk of static PointwiseMultiplicative..* infix(_:_:) + 9
12   libswiftTensorFlow.so              0x00007f91af460e00 static PointwiseMultiplicative../ infix(_:_:) + 177
13   libswiftTensorFlow.so              0x00007f91af4f5a50 AdaMax.update(_:along:) + 3626
14   MobileNet-Imagenette               0x000055605564f987 <unavailable> + 4630919
15   libc.so.6                          0x00007f919666bab0 __libc_start_main + 231
16   MobileNet-Imagenette               0x000055605525292a <unavailable> + 448810
Illegal instruction (core dumped)
s1ddok commented 4 years ago

Thanks for opening the issue along with a repro case! I just want to add my 2 cents: I initially discovered this problem with AMSGrad, which is not working either.

brettkoonce commented 4 years ago

ugh you're right i copy pasted the wrong call, fixed!