Unable to run model update when neural networks contain LSTM layer

baicenxiao commented 12 months ago

❓Question

I converted a simple neural network containing an LSTM from PyTorch to an mlmodel and marked only the last fully-connected layer as updatable. With this updatable model, prediction works, but model training fails with the error `libc++abi: terminating due to uncaught exception of type Espresso::invalid_argument_error: Espresso exception: "Invalid argument": generic_expand_dims_kernel: Output rank cannot be more than 5 in expand_dims:transpose_1_expanded`. Does this mean that, once an LSTM is in the network, I cannot fine-tune the last layer even though it is just a fully-connected layer? Is there a temporary workaround? Thanks.

I use torch==1.13.1 and coremltools==7.0

Here is the network structure:

Here is the PyTorch code:

import torch
import torch.nn as nn
import coremltools as ct
import coremltools
import numpy as np
from coremltools.models.neural_network import NeuralNetworkBuilder, SgdParams, AdamParams
from coremltools.models import datatypes

class LSTMClassificationModel(nn.Module):
    """LSTM encoder followed by a linear classification head.

    The forward pass appends a trailing feature axis to its input, runs the
    result through the LSTM starting from all-zero hidden and cell states,
    and feeds the final time step's output to the linear layer.
    """

    def __init__(self, input_size, hidden_size, output_size, num_layers=1):
        super().__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers

        # Recurrent part; batch_first=True puts the batch axis first.
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True)

        # Linear head mapping the hidden features to the output scores.
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        # Append a trailing axis: (batch, seq_len) -> (batch, seq_len, 1).
        sequence = x.unsqueeze(-1)

        # Explicit all-zero (h0, c0) pair, created on the input's device.
        initial_state = tuple(
            torch.zeros(self.num_layers, sequence.size(0), self.hidden_size).to(sequence.device)
            for _ in range(2)
        )

        # Per-step outputs of the LSTM; the final states are not needed.
        per_step, _ = self.lstm(sequence, initial_state)

        # Classify from the output at the last time step only.
        return self.fc(per_step[:, -1, :])

# Example usage: build the PyTorch model, convert it to a Core ML neural
# network, then mark the final linear layer as updatable and attach a loss
# and an optimizer so the model can be updated on device.
input_size = 1    # Number of features in each time step (after reshaping)
hidden_size = 3  # Number of features in the hidden state
output_size = 5   # Number of output classes

model = LSTMClassificationModel(input_size, hidden_size, output_size)

# Create a sample input tensor
sample_input = torch.rand(1, 3)  # Adjust the shape according to your model's input

# Trace the model with a sample input
traced_model = torch.jit.trace(model, sample_input)

# Convert the traced model to Core ML format
input_features = [ct.TensorType(shape=(1, 3))]
mlmodel = ct.convert(
    traced_model,
    inputs=input_features,
    convert_to="neuralnetwork"
)
mlmodel.save("classification.mlmodel")

# Load the model specification once; the builder is constructed from this
# spec object, and the same object is saved below after the builder calls.
spec = coremltools.utils.load_spec('classification.mlmodel')
builder = NeuralNetworkBuilder(spec=spec)

# Make layers updatable
builder.make_updatable(['linear_0'])

# Append a softmax and use its output as the input of the training loss.
builder.add_softmax(name='output_prob', input_name='linear_0', output_name='output_prob')
builder.set_categorical_cross_entropy_loss(name='lossLayer', input='output_prob')

# Define the optimizer (Adam in this example)
adam_params = AdamParams(lr=0.01, beta1=0.9, beta2=0.999, eps=1e-8, batch=32)
builder.set_adam_optimizer(adam_params)

# Set the number of epochs
builder.set_epochs(100)

# Optionally, set descriptions for your training inputs
spec.description.trainingInput[0].shortDescription = 'Input data'
spec.description.trainingInput[1].shortDescription = 'Target output data'

# Expose the softmax output as the model's output.
spec.description.output[0].name = 'output_prob'
spec.description.output[0].shortDescription = 'Probability distribution over output classes'

# Save the updated model
updated_model = coremltools.models.MLModel(spec)
updated_model.save('updatable_classification11.mlmodel')

Here is my Swift prediction code, which works:

import CoreML

import GameKit

/// Builds a seeded synthetic data set for the classifier.
///
/// Each sample is a `[1, 3]` float32 feature array filled with integers drawn
/// from a `GKRandomDistribution` configured with bounds 0 and 10, paired with
/// a `[1]` int32 label computed as `Int32(sum / 6) % 5`.
///
/// - Parameters:
///   - numSamples: Number of samples to attempt to create.
///   - seed: Seed for the linear congruential random source.
/// - Returns: The feature arrays and the label arrays, in matching order.
///   A sample whose arrays cannot be allocated is logged and left out.
func generateSampleData(numSamples: Int, seed: UInt64) -> ([MLMultiArray], [MLMultiArray]) {
    let source = GKLinearCongruentialRandomSource(seed: seed)
    let distribution = GKRandomDistribution(randomSource: source, lowestValue: 0, highestValue: 10)

    // Creates one (features, label) pair; both arrays are allocated before
    // any random value is drawn.
    func makeSample() throws -> (features: MLMultiArray, label: MLMultiArray) {
        let features = try MLMultiArray(shape: [1, 3], dataType: .float32)
        let label = try MLMultiArray(shape: [1], dataType: .int32)

        var total: Float = 0
        for position in 0..<features.count {
            let draw = Float(distribution.nextInt())
            features[position] = NSNumber(value: draw)
            total += draw
        }

        // Assign a class based on the sum of inputs
        label[0] = NSNumber(value: Int32(total/6.0) % 5)
        return (features, label)
    }

    var featureArrays = [MLMultiArray]()
    var labelArrays = [MLMultiArray]()

    for _ in 0..<numSamples {
        do {
            let sample = try makeSample()
            featureArrays.append(sample.features)
            labelArrays.append(sample.label)
        } catch {
            print("Error occurred while creating MLMultiArrays: \(error)")
        }
    }

    return (featureArrays, labelArrays)
}

/// Returns the linear index of the largest element of `multiArray`.
///
/// Elements are read through `MLMultiArray`'s `NSNumber` subscript instead of
/// reinterpreting `dataPointer` as `Float`, so the result does not depend on
/// the array's `dataType` being float32. When several elements share the
/// maximum, the lowest index is returned.
///
/// - Parameter multiArray: The array to scan; must contain at least one element.
/// - Returns: The linear index of the maximum element.
func argmax(multiArray: MLMultiArray) -> Int {
    let length = multiArray.count
    // Fail with a clear message instead of an opaque range/pointer trap.
    precondition(length > 0, "argmax(multiArray:) requires a non-empty MLMultiArray")

    var maxValue = multiArray[0].doubleValue
    var maxIndex = 0

    for i in 1..<length {
        let candidate = multiArray[i].doubleValue
        // Strict comparison keeps the first occurrence of the maximum.
        if candidate > maxValue {
            maxValue = candidate
            maxIndex = i
        }
    }

    return maxIndex
}

/// Computes the mean cross-entropy loss and the accuracy of `model` on `data`.
///
/// Each input is fed to the model under the feature name `"x_1"`, and the
/// `"output_prob"` output is compared with the integer label stored at index 0
/// of the matching label array.
///
/// Samples that cannot be evaluated (prediction throws, the output is missing,
/// or the label is not a valid index into the output) are logged and add
/// nothing to the loss or to the correct count. Both results are still divided
/// by `inputData.count`, so an empty input list yields NaN for both values.
///
/// - Parameters:
///   - model: The model to evaluate.
///   - data: Input arrays and label arrays, in matching order.
/// - Returns: The mean loss and the fraction of correctly classified samples.
func computeMetrics(model: MLModel, data: ([MLMultiArray], [MLMultiArray])) -> (loss: Double, accuracy: Double) {
    let (inputData, outputData) = data
    var totalLoss: Double = 0
    var correctPredictions: Int = 0

    // zip stops at the shorter list, so a missing label cannot trap on an index.
    for (index, (input, output)) in zip(inputData, outputData).enumerated() {
        do {
            let provider = try MLDictionaryFeatureProvider(dictionary: ["x_1": MLFeatureValue(multiArray: input)])
            let prediction = try model.prediction(from: provider)

            guard let predictedOutputProb = prediction.featureValue(for: "output_prob")?.multiArrayValue else {
                print("computeMetrics: sample \(index) produced no \"output_prob\" multi-array")
                continue
            }

            let trueClass = output[0].intValue
            guard (0..<predictedOutputProb.count).contains(trueClass) else {
                print("computeMetrics: label \(trueClass) of sample \(index) is outside the model output")
                continue
            }

            if argmax(multiArray: predictedOutputProb) == trueClass {
                correctPredictions += 1
            }

            // Cross-entropy term; the probability is clamped so log never sees 0.
            totalLoss += -log(max(predictedOutputProb[trueClass].doubleValue, 1e-10))
        } catch {
            print("computeMetrics: prediction failed for sample \(index): \(error)")
        }
    }

    let sampleCount = Double(inputData.count)
    return (totalLoss / sampleCount, Double(correctPredictions) / sampleCount)
}

/// Runs the bundled model over a generated data set and prints, for every
/// sample, the predicted class next to the expected class.
func testModel() {
    // Generate test data
    let (inputData, outputData) = generateSampleData(numSamples: 500, seed: 8)

    // Load the compiled model from the main bundle
    guard let modelURL = Bundle.main.url(forResource: "updatable_classification11", withExtension: "mlmodelc"),
          let model = try? MLModel(contentsOf: modelURL) else {
        print("Failed to load the model from the bundle")
        return
    }

    // Predict sample by sample; a failure is reported and the loop moves on.
    for (index, features) in inputData.enumerated() {
        let sampleNumber = index + 1
        let expectedOutputClass = outputData[index][0].intValue // Expected class

        let prediction: MLFeatureProvider
        do {
            let provider = try MLDictionaryFeatureProvider(dictionary: ["x_1": MLFeatureValue(multiArray: features)])
            prediction = try model.prediction(from: provider)
        } catch {
            print("Failed to make prediction for sample \(sampleNumber): \(error.localizedDescription)")
            continue
        }

        guard let probabilities = prediction.featureValue(for: "output_prob")?.multiArrayValue else {
            print("Failed to extract prediction for sample \(sampleNumber)")
            continue
        }

        let predictedClass = argmax(multiArray: probabilities) // Use the custom argmax function
        print("Sample \(sampleNumber):")
        print("Predicted Class: \(predictedClass)")
        print("Expected Class: \(expectedOutputClass)")
    }
}

Here is the training code, which raises the error message `libc++abi: terminating due to uncaught exception of type Espresso::invalid_argument_error: Espresso exception: "Invalid argument": generic_expand_dims_kernel: Output rank cannot be more than 5 in expand_dims:expand_dims_0`:

/// Updates the bundled updatable model on generated training data.
///
/// The function builds a batch of feature providers, registers handlers that
/// log the loss and accuracy after every epoch, and starts an `MLUpdateTask`.
/// On success the updated model is written to the documents directory as
/// `updated_classification.mlmodelc`.
///
/// Swift errors (sample creation, task creation, saving) are logged rather
/// than trapping. NOTE(review): the failure quoted with this code is reported
/// as an uncaught C++ exception (`libc++abi: terminating ...`), so the
/// `do`/`catch` blocks here presumably cannot intercept it.
func trainModel() {
    // Locate the compiled updatable model
    guard let updatableModelURL = Bundle.main.url(forResource: "updatable_classification11", withExtension: "mlmodelc") else {
        print("Failed to load the updatable model")
        return
    }

    // Generate sample data
    let (inputData, outputData) = generateSampleData(numSamples: 500, seed: 8)
    let validationData = generateSampleData(numSamples: 100, seed: 18)

    // Create an MLArrayBatchProvider from the sample data
    var featureProviders = [MLFeatureProvider]()
    featureProviders.reserveCapacity(inputData.count)
    for (index, (input, output)) in zip(inputData, outputData).enumerated() {
        let dataPointFeatures: [String: MLFeatureValue] = [
            "x_1": MLFeatureValue(multiArray: input), // Must match the model's input feature name
            "output_prob_true": MLFeatureValue(multiArray: output) // Must match the model's training target feature name
        ]
        do {
            featureProviders.append(try MLDictionaryFeatureProvider(dictionary: dataPointFeatures))
        } catch {
            // Report the dropped sample instead of shrinking the batch silently.
            print("Skipping training sample \(index): \(error)")
        }
    }
    let batchProvider = MLArrayBatchProvider(array: featureProviders)

    // Per-epoch history collected by the progress handler
    var lossValues: [Double] = []
    var accuracyValues: [Double] = []
    var validationMetrics: [(loss: Double, accuracy: Double)] = []

    let progressHandlers = MLUpdateProgressHandlers(forEvents: [.trainingBegin, .epochEnd],
        progressHandler: { context in
            switch context.event {
                case .trainingBegin:
                    print("Training began.")
                case .epochEnd:
                    // The metrics dictionary is untyped; avoid trapping on a missing or mistyped entry.
                    guard let loss = context.metrics[.lossValue] as? Double else {
                        print("Epoch ended without a readable loss value; skipping metrics for this epoch.")
                        return
                    }
                    let epochLabel = context.metrics[.epochIndex].map { "\($0)" } ?? "?"
                    lossValues.append(loss)
                    let (validationLoss, validationAccuracy) = computeMetrics(model: context.model, data: validationData)
                    validationMetrics.append((validationLoss, validationAccuracy))
                    let (computedTrainLoss, computedTrainAccuracy) = computeMetrics(model: context.model, data: (inputData, outputData))
                    accuracyValues.append(computedTrainAccuracy)
//                    updateLoss(computedTrainLoss)
                    print("Epoch \(epochLabel) ended. Training Loss: \(loss), Computed Training loss: \(computedTrainLoss), Training Accuracy: \(computedTrainAccuracy), Validation Loss: \(validationLoss), Validation Accuracy: \(validationAccuracy)")
                default:
                    break
            }
        },
        completionHandler: { context in
            if let error = context.task.error {
                print("Update task failed with error: \(error)")
            } else {
                let updatedModel = context.model
                do {
                    let fileManager = FileManager.default
                    let documentDirectory = try fileManager.url(for: .documentDirectory, in: .userDomainMask, appropriateFor:nil, create:true)
                    let fileURL = documentDirectory.appendingPathComponent("updated_classification.mlmodelc")
                    try updatedModel.write(to: fileURL)
                    print("Model updated and saved successfully to \(fileURL)")
                } catch {
                    print("Failed to save the updated model: \(error)")
                }
            }
//        completion()
        }
    )

    // Create and start the update task; creation errors are logged, not trapped.
    do {
        let updateTask = try MLUpdateTask(forModelAt: updatableModelURL,
                                          trainingData: batchProvider,
                                          configuration: nil,
                                          progressHandlers: progressHandlers)
        updateTask.resume()
    } catch {
        print("Failed to create the update task: \(error)")
    }
}

TobyRoseman commented 12 months ago

I don't think this is an issue with the coremltools Python package. This looks like an issue with the on-device Core ML framework.

I don't see anything obviously wrong with your Python code. Unless you think this issue is somehow related to converting your PyTorch model to Core ML, you should submit this bug using the Feedback Assistant. Before doing that, I recommend verifying that it is still an issue on the most recent version of the OS.

baicenxiao commented 12 months ago

Thanks @TobyRoseman for the response.

It is still an issue even on the most recent version of iOS (17.1). Using the converted model for inference works fine, the problem happens only when trying to update the model.

Here is the zipped mlmodel file in case it is helpful for debugging. updatable_classification11.mlmodel.zip

TobyRoseman commented 12 months ago

Since the converted model works for inference, I think that's even more reason to suspect it's an issue with the Core ML Framework. Did you submit this issue using the Feedback Assistant?

baicenxiao commented 12 months ago

Just submitted this issue using the Feedback Assistant.

apple / coremltools

Unable to run model update when neural networks contain LSTM layer #2074

❓Question