Hello,
I am trying to replicate the results of the following article written by @skeydan:
https://blogs.rstudio.com/ai/posts/2020-07-31-fnn-vae-for-noisy-timeseries/
I would like some help understanding why the following code:
library(tidyverse)
library(tensorflow)
library(keras)
library(tfdatasets)
library(tfautograph)
library(reticulate)
library(purrr)
library(listarrays)
library(abind)
library(deSolve)
parameters <- c(a = .2,
                b = .2,
                c = 5.7)

initial_state <-
  c(x = 1,
    y = 1,
    z = 1.05)

roessler <- function(t, state, parameters) {
  with(as.list(c(state, parameters)), {
    dx <- -y - z
    dy <- x + a * y
    dz <- b + z * (x - c)
    list(c(dx, dy, dz))
  })
}

times <- seq(0, 2500, length.out = 20000)

roessler_ts <-
  ode(
    y = initial_state,
    times = times,
    func = roessler,
    parms = parameters,
    method = "lsoda"
  ) %>% unclass() %>% as_tibble()

n <- 10000
roessler <- roessler_ts$x[1:n]
roessler <- scale(roessler)

# add noise
noise <- 1 # also used 1.5, 2, 2.5
roessler <- roessler + rnorm(10000, mean = 0, sd = noise)
str(roessler)
vae_encoder_model <- function(n_timesteps,
                              n_features,
                              n_latent,
                              name = NULL) {
  keras_model_custom(name = name, function(self) {
    self$conv1 <- layer_conv_1d(kernel_size = 3,
                                filters = 16,
                                strides = 2)
    self$act1 <- layer_activation_leaky_relu()
    self$batchnorm1 <- layer_batch_normalization()
    self$conv2 <- layer_conv_1d(kernel_size = 7,
                                filters = 32,
                                strides = 2)
    self$act2 <- layer_activation_leaky_relu()
    self$batchnorm2 <- layer_batch_normalization()
    self$conv3 <- layer_conv_1d(kernel_size = 9,
                                filters = 64,
                                strides = 2)
    self$act3 <- layer_activation_leaky_relu()
    self$batchnorm3 <- layer_batch_normalization()
    self$conv4 <- layer_conv_1d(
      kernel_size = 9,
      filters = n_latent,
      strides = 2,
      activation = "linear"
    )
    self$batchnorm4 <- layer_batch_normalization()
    self$flat <- layer_flatten()

    function(x, mask = NULL) {
      x %>%
        self$conv1() %>%
        self$act1() %>%
        self$batchnorm1() %>%
        self$conv2() %>%
        self$act2() %>%
        self$batchnorm2() %>%
        self$conv3() %>%
        self$act3() %>%
        self$batchnorm3() %>%
        self$conv4() %>%
        self$batchnorm4() %>%
        self$flat()
    }
  })
}
vae_decoder_model <- function(n_timesteps,
                              n_features,
                              n_latent,
                              name = NULL) {
  keras_model_custom(name = name, function(self) {
    self$reshape <- layer_reshape(target_shape = c(1, n_latent))
    self$conv1 <- layer_conv_1d_transpose(kernel_size = 15,
                                          filters = 64,
                                          strides = 3)
    self$act1 <- layer_activation_leaky_relu()
    self$batchnorm1 <- layer_batch_normalization()
    self$conv2 <- layer_conv_1d_transpose(kernel_size = 11,
                                          filters = 32,
                                          strides = 3)
    self$act2 <- layer_activation_leaky_relu()
    self$batchnorm2 <- layer_batch_normalization()
    self$conv3 <- layer_conv_1d_transpose(
      kernel_size = 9,
      filters = 16,
      strides = 2,
      output_padding = 1
    )
    self$act3 <- layer_activation_leaky_relu()
    self$batchnorm3 <- layer_batch_normalization()
    self$conv4 <- layer_conv_1d_transpose(
      kernel_size = 7,
      filters = 1,
      strides = 1,
      activation = "linear"
    )
    self$batchnorm4 <- layer_batch_normalization()

    function(x, mask = NULL) {
      x %>%
        self$reshape() %>%
        self$conv1() %>%
        self$act1() %>%
        self$batchnorm1() %>%
        self$conv2() %>%
        self$act2() %>%
        self$batchnorm2() %>%
        self$conv3() %>%
        self$act3() %>%
        self$batchnorm3() %>%
        self$conv4() %>%
        self$batchnorm4()
    }
  })
}
# to reparameterize encoder output before calling decoder
reparameterize <- function(mean, logvar = 0) {
  eps <- k_random_normal(shape = n_latent)
  eps * k_exp(logvar * 0.5) + mean
}
# loss FNN
loss_false_nn <- function(x) {
  # changing these parameters is equivalent to
  # changing the strength of the regularizer, so we keep these fixed (these values
  # correspond to the original values used in Kennel et al 1992).
  rtol <- 10
  atol <- 2
  k_frac <- 0.01
  k <- max(1, floor(k_frac * batch_size))

  ## Vectorized version of distance matrix calculation
  tri_mask <-
    tf$linalg$band_part(
      tf$ones(
        shape = c(tf$cast(n_latent, tf$int32), tf$cast(n_latent, tf$int32)),
        dtype = tf$float32
      ),
      num_lower = -1L,
      num_upper = 0L
    )

  # latent x batch_size x latent
  batch_masked <-
    tf$multiply(tri_mask[, tf$newaxis, ], x[tf$newaxis, reticulate::py_ellipsis()])

  # latent x batch_size x 1
  x_squared <-
    tf$reduce_sum(batch_masked * batch_masked,
                  axis = 2L,
                  keepdims = TRUE)

  # latent x batch_size x batch_size
  pdist_vector <- x_squared + tf$transpose(x_squared, perm = c(0L, 2L, 1L)) -
    2 * tf$matmul(batch_masked, tf$transpose(batch_masked, perm = c(0L, 2L, 1L)))

  # (latent, batch_size, batch_size)
  all_dists <- pdist_vector

  # latent
  all_ra <-
    tf$sqrt((1 / (
      batch_size * tf$range(1, 1 + n_latent, dtype = tf$float32)
    )) *
      tf$reduce_sum(tf$square(
        batch_masked - tf$reduce_mean(batch_masked, axis = 1L, keepdims = TRUE)
      ), axis = c(1L, 2L)))

  # Avoid singularity in the case of zeros
  # (latent, batch_size, batch_size)
  all_dists <-
    tf$clip_by_value(all_dists, 1e-14, tf$reduce_max(all_dists))

  # inds = tf.argsort(all_dists, axis=-1)
  top_k <- tf$math$top_k(-all_dists, tf$cast(k + 1, tf$int32))
  # (latent, batch_size, k + 1)
  top_indices <- top_k[[1]]

  # (latent, batch_size, batch_size)
  neighbor_dists_d <-
    tf$gather(all_dists, top_indices, batch_dims = -1L)
  # (latent - 1, batch_size, batch_size)
  neighbor_new_dists <-
    tf$gather(all_dists[2:-1, , ],
              top_indices[1:-2, , ],
              batch_dims = -1L)

  # Eq. 4 of Kennel et al.
  # (latent - 1, batch_size, batch_size)
  scaled_dist <- tf$sqrt((
    tf$square(neighbor_new_dists) -
      # (9, 8, 2)
      tf$square(neighbor_dists_d[1:-2, , ])) /
      # (9, 8, 2)
      tf$square(neighbor_dists_d[1:-2, , ])
  )

  # Kennel condition #1
  # (latent - 1, batch_size, batch_size)
  is_false_change <- (scaled_dist > rtol)
  # Kennel condition 2
  # (latent - 1, batch_size, batch_size)
  is_large_jump <-
    (neighbor_new_dists > atol * all_ra[1:-2, tf$newaxis, tf$newaxis])
  is_false_neighbor <-
    tf$math$logical_or(is_false_change, is_large_jump)

  # (latent - 1, batch_size, 1)
  total_false_neighbors <-
    tf$cast(is_false_neighbor, tf$int32)[reticulate::py_ellipsis(), 2:(k + 2)]

  # Pad zero to match dimensionality of latent space
  # (latent - 1)
  reg_weights <-
    1 - tf$reduce_mean(tf$cast(total_false_neighbors, tf$float32), axis = c(1L, 2L))
  # (latent,)
  reg_weights <- tf$pad(reg_weights, list(list(1L, 0L)))

  # Find batch average activity
  # L2 Activity regularization
  activations_batch_averaged <-
    tf$sqrt(tf$reduce_mean(tf$square(x), axis = 0L))

  loss <- tf$reduce_sum(tf$multiply(reg_weights, activations_batch_averaged))
  loss
}
# loss has 3 components: NLL, KL, and FNN
# otherwise, this is just normal TF2-style training
train_step_vae <- function(batch) {
  with(tf$GradientTape(persistent = TRUE) %as% tape, {
    code <- encoder(batch[[1]])
    z <- reparameterize(code)
    prediction <- decoder(z)
    l_mse <- mse_loss(batch[[2]], prediction)
    # see loss_false_nn in 2 previous posts
    l_fnn <- loss_false_nn(code)
    # KL divergence to a standard normal
    l_kl <- -0.5 * k_mean(1 - k_square(z))
    # overall loss is a weighted sum of all 3 components
    loss <- l_mse + fnn_weight * l_fnn + kl_weight * l_kl
  })
  encoder_gradients <-
    tape$gradient(loss, encoder$trainable_variables)
  decoder_gradients <-
    tape$gradient(loss, decoder$trainable_variables)
  optimizer$apply_gradients(purrr::transpose(list(
    encoder_gradients, encoder$trainable_variables
  )))
  optimizer$apply_gradients(purrr::transpose(list(
    decoder_gradients, decoder$trainable_variables
  )))
  train_loss(loss)
  train_mse(l_mse)
  train_fnn(l_fnn)
  train_kl(l_kl)
}
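# note: the KeyError reported below names a conv1d_transpose kernel, so it
# presumably comes from the second apply_gradients() call above (the decoder
# update); the error text talks about updating "different parts of the model
# separately". An untested sketch of a single combined update would be:
#
# optimizer$apply_gradients(purrr::transpose(list(
#   c(encoder_gradients, decoder_gradients),
#   c(encoder$trainable_variables, decoder$trainable_variables)
# )))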
# wrap it all in autograph
training_loop_vae <- tf_function(autograph(function(ds_train) {
  for (batch in ds_train) {
    train_step_vae(batch)
    # str(batch)
  }
  tf$print("Loss: ", train_loss$result())
  tf$print("MSE: ", train_mse$result())
  tf$print("FNN loss: ", train_fnn$result())
  tf$print("KL loss: ", train_kl$result())
  train_loss$reset_states()
  train_mse$reset_states()
  train_fnn$reset_states()
  train_kl$reset_states()
}))
n_latent <- 10L
n_features <- 1
n_timesteps <- 120
batch_size <- 32

encoder <- vae_encoder_model(n_timesteps,
                             n_features,
                             n_latent)
decoder <- vae_decoder_model(n_timesteps,
                             n_features,
                             n_latent)

mse_loss <-
  tf$keras$losses$MeanSquaredError(reduction = tf$keras$losses$Reduction$SUM)

train_loss <- tf$keras$metrics$Mean(name = 'train_loss')
train_fnn <- tf$keras$metrics$Mean(name = 'train_fnn')
train_mse <- tf$keras$metrics$Mean(name = 'train_mse')
train_kl <- tf$keras$metrics$Mean(name = 'train_kl')

fnn_multiplier <- 1 # default value used in nearly all cases (see text)
kl_weight <- 1
# fnn_weight depends on nrow(x_train), so it is computed below,
# after the training windows have been created

optimizer <- optimizer_adam(learning_rate = 1e-3)
gen_timesteps <- function(x, n_timesteps) {
  do.call(rbind,
          purrr::map(seq_along(x),
                     function(i) {
                       start <- i
                       end <- i + n_timesteps - 1
                       out <- x[start:end]
                       out
                     })
  ) %>%
    na.omit()
}

train <- gen_timesteps(roessler[1:(n/2)], 2 * n_timesteps)
test <- gen_timesteps(roessler[(n/2):n], 2 * n_timesteps)

dim(train) <- c(dim(train), 1)
dim(test) <- c(dim(test), 1)

x_train <- train[ , 1:n_timesteps, , drop = FALSE]
y_train <- train[ , (n_timesteps + 1):(2 * n_timesteps), , drop = FALSE]

# now that x_train exists, the FNN loss weight can be computed
fnn_weight <- fnn_multiplier * nrow(x_train) / batch_size

ds_train <- tensor_slices_dataset(list(x_train, y_train)) %>%
  dataset_shuffle(nrow(x_train)) %>%
  dataset_batch(batch_size)

x_test <- test[ , 1:n_timesteps, , drop = FALSE]
y_test <- test[ , (n_timesteps + 1):(2 * n_timesteps), , drop = FALSE]

ds_test <- tensor_slices_dataset(list(x_test, y_test)) %>%
  dataset_batch(nrow(x_test))
for (epoch in 1:100) {
  cat("Epoch: ", epoch, " -----------\n")
  training_loop_vae(ds_train)
  test_batch <- as_iterator(ds_test) %>% iter_next()
  encoded <- encoder(test_batch[[1]][1:1000])
  test_var <- tf$math$reduce_variance(encoded, axis = 0L)
  print(test_var %>% as.numeric() %>% round(5))
}
fails with the following message:
Error in py_call_impl(callable, call_args$unnamed, call_args$named) :
KeyError: 'The optimizer cannot recognize variable conv1d_transpose_24/kernel:0. This usually means you are trying to call the optimizer to update different parts of the model separately. Please call `optimizer.build(variables)` with the full list of trainable variables before the training loop or use legacy optimizer `tf.keras.optimizers.legacy.Adam.'
Run `reticulate::py_last_error()` for details.
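From the error message, these seem to be the two suggested workarounds. Below is a minimal, untested sketch of both; it assumes a TF/Keras version where the new optimizer API and the legacy optimizers are both available, and note that for these subclassed models the trainable variables only exist after encoder and decoder have been called on a batch once:

# option 1: build the optimizer on the full variable list before the training loop
optimizer$build(c(encoder$trainable_variables, decoder$trainable_variables))

# option 2: fall back to the legacy Adam implementation
optimizer <- tf$keras$optimizers$legacy$Adam(learning_rate = 1e-3)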
Thanks.