MethodError in ForwardDiff-gradient calculation when using recordValues in a NeuralFMU which enters batch-loss

If you define an ME_NeuralFMU with recordValues, use it in batchDataSolution, and then build a loss from the resulting batch via FMIFlux.Losses.loss(neuralFMU, batch; ...), the ForwardDiff.gradient cannot be calculated. Instead, you receive a MethodError: no method matching Float64(::ForwardDiff.Dual{ForwardDiff.Tag{var"#128#129", Float64}, Float64, 22}).

The "MWE" is a reduced variant of the juliacon2023.ipynb:

using FMI           # import FMUs into Julia 
using FMIFlux       # for NeuralFMUs
using FMIZoo        # a collection of demo models, including the VLDM
using FMIFlux.Flux  # Machine Learning in Julia
import Random       # for fixing the random seed
using Plots         # plotting results
include(joinpath(@__DIR__, "juliacon_2023_helpers.jl"));

# Global switch that is forwarded as the `showProgress` keyword to
# `batchDataSolution`, `initialize!` and `FMIFlux.Losses.loss` further below.
showProgress=false;
# Passed as the `dt` keyword to both `VLDM` calls.
# NOTE(review): presumably the sampling step of the data set (the loss function
# comment below mentions 10Hz) - confirm against FMIZoo.
dt = 0.1 
data = VLDM(:train, dt=dt) 
# Validation data set; it is not referenced again in this snippet.
data_validation = VLDM(:validate, dt=dt)
# Only the first 81 samples are used; the commented-out alternative is the full length.
n = 81 # length(data.consumption_t)
# Simulation interval and save points, all taken from the first `n` time stamps.
tStart = data.consumption_t[1]
tStop = data.consumption_t[n]
tSave = data.consumption_t[1:n]
# State vector at `tStart`; it is not referenced again in this snippet.
x0 = FMIZoo.getStateVector(data,tStart) 

"""
    build_net(f::FMU2)

Assemble and return the `Chain` that serves as the NeuralFMU topology around the FMU `f`.

The chain evaluates `f` for all state derivatives, caches them, feeds entries 4 to 6 through
a small normalised `Dense(3, 2, tanh)` network, and finally merges the network output with
the cached FMU derivatives.
"""
function build_net(f::FMU2)
    # Input normalisation for the three derivatives handed to the ANN.
    inputScaler = ShiftScale{Float64}(
        [-5.363728491534626, -6.257465235165946e-6, -2444.0838108753733],
        [0.13987191723940867, 0.9502609504008034, 0.00013605664860124656],
    )
    # Shrink the scaling to a quarter to leave some additional "buffer".
    inputScaler.scale .*= 0.25
    # Output de-normalisation, built from the input scaler but restricted to entries 2 and 3
    # (entry 1, the vehicle velocity, is not needed).
    outputScaler = ScaleShift(inputScaler; indices=2:3)

    # A cache layer plus the retrieve layer that is linked to it.
    dxStore = CacheLayer()
    dxFetch = CacheRetrieveLayer(dxStore)

    # Gates with sum: four scale factors, summed in the index groups [1,3] and [2,4].
    # NOTE(review): with factors 1, 1, 0, 0 this presumably lets only the FMU part through
    # at the start of training - confirm against the ScaleSum documentation.
    mixer = ScaleSum([1.0, 1.0, 0.0, 0.0], [[1, 3], [2, 4]])

    # The NeuralFMU topology, layer by layer.
    return Chain(
        x -> f(; x=x, dx_refs=:all),    # evaluate the FMU at `x`, retrieve all derivatives
        v -> dxStore(v),                # remember the FMU derivatives
        v -> v[4:6],                    # pass on entries 4, 5 and 6 only
        inputScaler,                    # normalise
        Dense(3, 2, tanh),              # dense layer 3 -> 2 with `tanh` activation
        outputScaler,                   # de-normalise
        v -> dxFetch(5:6, v),           # cached entries 5:6 (FMU) | ANN output
        mixer,                          # combine FMU and ANN dynamics
        v -> dxFetch(1:4, v),           # cached entries 1:4 followed by the gate output
    )
end
# Prepare the training data: the first `n` time stamps (the same slice as `tSave`) ...
train_t = data.consumption_t[1:n] 
# ... and the matching cumulative consumption values, each wrapped in a one-element vector.
train_data = collect([d] for d in data.cumconsumption_val[1:n])

"""
    _lossFct(solution, data, LOSS, LASTWEIGHT=1.0/length(data.consumption_t))

Compare the sixth state of `solution` with the measured cumulative consumption in `data`
over the time span covered by `solution`, and return the value computed by
`FMIFlux.Losses.mse_last_element_rel_dev`.

`LOSS` is accepted for interface compatibility but is not used inside the function.
"""
function _lossFct(solution::FMU2Solution, data::VLDM_Data, LOSS::Symbol, LASTWEIGHT::Real=1.0/length(data.consumption_t) )
    # Map first and last solution time stamp to indices of the data arrays (sampled with 10Hz).
    timeStamps = solution.states.t
    firstIndex = dataIndexForTime(first(timeStamps))
    lastIndex = dataIndexForTime(last(timeStamps))
    window = firstIndex:lastIndex

    # "Where we are": the NeuralFMU state with index 6.
    simulated = fmiGetSolutionState(solution, 6; isIndex=true)

    # "Where we want to be" (measurement) and "what we are unsure about" (allowed deviation).
    measured = data.cumconsumption_val[window]
    tolerated = data.cumconsumption_dev[window]

    return FMIFlux.Losses.mse_last_element_rel_dev(simulated, measured, tolerated, LASTWEIGHT)
end

# Hyperparameters in the order ETA, BETA1, BETA2, BATCHDUR, LASTWEIGHT, LOSS (see the
# destructuring below). Mixing numbers and a Symbol makes this a Vector{Any}.
hyper_params = [0.0001,  0.9,  0.999,      4.0,        0.7, :MSE]
# Training duration is fixed to 8.0; the commented-out alternative is the full time span.
ressource = 8.0 # tStop-tStart
TRAINDUR = ressource
# Only BATCHDUR, LASTWEIGHT and LOSS are used further below in this snippet.
ETA, BETA1, BETA2, BATCHDUR, LASTWEIGHT, LOSS = hyper_params
# Number of steps: training duration divided by batch duration, at least 1 (here 8.0/4.0 = 2).
# `steps` is not referenced again in this snippet.
steps = max(round(Int, TRAINDUR/BATCHDUR), 1) 

# load our FMU (we take one from FMIZoo.jl, exported with Dymola 2020x)
fmu = fmiLoad("VLDM", "Dymola", "2020x"; type=:ME) 
fmiSingleInstanceMode(fmu, true)

# build the NeuralFMU on basis of the loaded FMU `fmu`
net = build_net(fmu)
# `recordValues=:derivatives` is the option this report is about: the gradient call
# further below fails when it is set and succeeds when it is left out.
neuralFMU = ME_NeuralFMU(fmu, net, (tStart, tStop), saveat=tSave ,recordValues=:derivatives) 
neuralFMU.modifiedState = false 
# `params[1]` is used below as the parameter vector for initialization, loss and gradient.
params = FMIFlux.params(neuralFMU)

batch = batchDataSolution(neuralFMU,                            # our NeuralFMU model
                        t -> FMIZoo.getStateVector(data, t),  # a function returning a start state for a given time point `t`, to determine start states for batch elements
                        train_t,                              # data time points
                        train_data;                           # data cumulative consumption 
                        batchDuration=BATCHDUR,               # duration of one batch element
                        indicesModel=6:6,                     # model indices to train on (6 equals the state `cumulative consumption`)
                        plot=true,                           # plotting is enabled here. NOTE(review): the original notebook comment read "don't show intermediate plots", which presumably belongs to `plot=false` - confirm
                        parameters=data.params,               # use the parameters (map file paths) from *FMIZoo.jl*
                        showProgress=showProgress)            # show or don't show progress bar, as specified at the very beginning
# Solver keywords that are splatted into the loss call below; with BATCHDUR = 4.0 this
# sets `maxiters` to 4000.
solverKwargsTrain = Dict{Symbol, Any}(:maxiters => round(Int, 1000*BATCHDUR)) 

# a smaller dispatch for our custom loss function, only taking the solution object
lossFct = (solution::FMU2Solution) -> _lossFct(solution, data, LOSS, LASTWEIGHT)
scheduler = RandomScheduler(neuralFMU, batch; applyStep=1, plotStep=0)

# initialize the scheduler, keywords are passed to the NeuralFMU
initialize!(scheduler; parameters=data.params, p=params[1], showProgress=showProgress)
# loss for training, do a simulation run on a batch element taken from the scheduler
loss = p -> FMIFlux.Losses.loss(neuralFMU,                          # the NeuralFMU to simulate
                                batch;                              # the batch to take an element from
                                p=p,                                # the NeuralFMU training parameters (given as input)
                                parameters=data.params,             # the FMU parameters
                                lossFct=lossFct,                    # our custom loss function
                                batchIndex=scheduler.elementIndex,  # the index of the batch element to take, determined by the chosen scheduler
                                logLoss=true,                       # log losses after every evaluation
                                showProgress=showProgress,          # show progress bar (or don't)
                                solverKwargsTrain...)               # the solver kwargs defined above
# Plain evaluation of the loss with the current parameters (no gradient involved yet).
loss(params[1])

###############################################################
# -> the interesting last bit:
# Buffer for the gradient, one entry per parameter in `params[1]`.
grads = zeros(Float64, length(params[1])) 
# Re-create the NeuralFMU WITH `recordValues=:derivatives`.
# NOTE(review): `loss` refers to the global `neuralFMU`, so it presumably picks up this new
# object, while `batch` and `scheduler` were built with the earlier one - confirm.
neuralFMU = ME_NeuralFMU(fmu, net, (tStart, tStop),saveat=tSave,recordValues=:derivatives)
FMIFlux.computeGradient!(grads, loss, params[1], :ForwardDiff, :auto_fmiflux, false)
# <- MethodError (the one quoted at the top of this report)

################################################################
# Redefine neuralFMU WITHOUT recordValues and repeat the identical gradient call:
neuralFMU = ME_NeuralFMU(fmu, net, (tStart, tStop),saveat=tSave)
FMIFlux.computeGradient!(grads, loss, params[1], :ForwardDiff, :auto_fmiflux, false)
# <- no error
###############################################################

From what I tested, this problem does not occur as long as you don't use batching. In my current toy problem (of course not the one above ;-)) I only need the values of an FMU state, which are returned anyway, so that's fine. However, I am not sure how to record the derivatives or a specific output of the FMU, if not via recordValues, which I will definitely need later.

ThummeTo / FMIFlux.jl

MethodError in ForwardDiff-gradient calculation when using recordValues in a NeuralFMU which enters batch-loss #120