rstudio / ai-blog

Repository for the RStudio AI Blog (formerly: TensorFlow for R Blog)
https://blogs.rstudio.com/ai/

VAE with FNN #224

Closed: mg64ve closed this issue 10 months ago

mg64ve commented 10 months ago

Hello,

I am trying to replicate the results of the following article written by @skeydan:

https://blogs.rstudio.com/ai/posts/2020-07-31-fnn-vae-for-noisy-timeseries/

I would like some help understanding why the following code:

library(tidyverse)
library(tensorflow)
library(keras)
library(tfdatasets)
library(tfautograph)
library(reticulate)
library(purrr)
library(listarrays)
library(abind)
library(deSolve)

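# Roessler attractor parameters and initial state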
parameters <- c(a = .2,
                b = .2,
                c = 5.7)

initial_state <-
  c(x = 1,
    y = 1,
    z = 1.05)

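# Roessler system: returns the state derivatives used by deSolve::ode()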
roessler <- function(t, state, parameters) {
  with(as.list(c(state, parameters)), {
    dx <- -y - z
    dy <- x + a * y
    dz <- b + z * (x - c)

    list(c(dx, dy, dz))
  })
}

times <- seq(0, 2500, length.out = 20000)

roessler_ts <-
  ode(
    y = initial_state,
    times = times,
    func = roessler,
    parms = parameters,
    method = "lsoda"
  ) %>% unclass() %>% as_tibble()

n <- 10000
roessler <- roessler_ts$x[1:n]

roessler <- scale(roessler)

# add noise
noise <- 1 # also used 1.5, 2, 2.5
roessler <- roessler + rnorm(10000, mean = 0, sd = noise)
str(roessler)

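# encoder: a stack of strided 1-d convolutions that maps a window of shape
# (batch_size, n_timesteps, n_features) to a flat code of size n_latent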
vae_encoder_model <- function(n_timesteps,
                              n_features,
                              n_latent,
                              name = NULL) {
  keras_model_custom(name = name, function(self) {
    self$conv1 <- layer_conv_1d(kernel_size = 3,
                                filters = 16,
                                strides = 2)
    self$act1 <- layer_activation_leaky_relu()
    self$batchnorm1 <- layer_batch_normalization()
    self$conv2 <- layer_conv_1d(kernel_size = 7,
                                filters = 32,
                                strides = 2)
    self$act2 <- layer_activation_leaky_relu()
    self$batchnorm2 <- layer_batch_normalization()
    self$conv3 <- layer_conv_1d(kernel_size = 9,
                                filters = 64,
                                strides = 2)
    self$act3 <- layer_activation_leaky_relu()
    self$batchnorm3 <- layer_batch_normalization()
    self$conv4 <- layer_conv_1d(
      kernel_size = 9,
      filters = n_latent,
      strides = 2,
      activation = "linear" 
    )
    self$batchnorm4 <- layer_batch_normalization()
    self$flat <- layer_flatten()

    function (x, mask = NULL) {
      x %>%
        self$conv1() %>%
        self$act1() %>%
        self$batchnorm1() %>%
        self$conv2() %>%
        self$act2() %>%
        self$batchnorm2() %>%
        self$conv3() %>%
        self$act3() %>%
        self$batchnorm3() %>%
        self$conv4() %>%
        self$batchnorm4() %>%
        self$flat()
    }
  })
}

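# decoder: 1-d transposed convolutions that map the latent code back to a
# sequence of length n_timesteps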
vae_decoder_model <- function(n_timesteps,
                              n_features,
                              n_latent,
                              name = NULL) {
  keras_model_custom(name = name, function(self) {
    self$reshape <- layer_reshape(target_shape = c(1, n_latent))
    self$conv1 <- layer_conv_1d_transpose(kernel_size = 15,
                                          filters = 64,
                                          strides = 3)
    self$act1 <- layer_activation_leaky_relu()
    self$batchnorm1 <- layer_batch_normalization()
    self$conv2 <- layer_conv_1d_transpose(kernel_size = 11,
                                          filters = 32,
                                          strides = 3)
    self$act2 <- layer_activation_leaky_relu()
    self$batchnorm2 <- layer_batch_normalization()
    self$conv3 <- layer_conv_1d_transpose(
      kernel_size = 9,
      filters = 16,
      strides = 2,
      output_padding = 1
    )
    self$act3 <- layer_activation_leaky_relu()
    self$batchnorm3 <- layer_batch_normalization()
    self$conv4 <- layer_conv_1d_transpose(
      kernel_size = 7,
      filters = 1,
      strides = 1,
      activation = "linear"
    )
    self$batchnorm4 <- layer_batch_normalization()

    function (x, mask = NULL) {
      x %>%
        self$reshape() %>%
        self$conv1() %>%
        self$act1() %>%
        self$batchnorm1() %>%
        self$conv2() %>%
        self$act2() %>%
        self$batchnorm2() %>%
        self$conv3() %>%
        self$act3() %>%
        self$batchnorm3() %>%
        self$conv4() %>%
        self$batchnorm4()
    }
  })
}

# to reparameterize encoder output before calling decoder
reparameterize <- function(mean, logvar = 0) {
  eps <- k_random_normal(shape = n_latent)
  eps * k_exp(logvar * 0.5) + mean
}

# FNN loss: false nearest neighbors regularizer on the latent code (cf. Kennel et al., 1992)

loss_false_nn <- function(x) {

  # changing these parameters is equivalent to
  # changing the strength of the regularizer, so we keep these fixed (these values
  # correspond to the original values used in Kennel et al 1992).
  rtol <- 10 
  atol <- 2
  k_frac <- 0.01

  k <- max(1, floor(k_frac * batch_size))

  ## Vectorized version of distance matrix calculation
  tri_mask <-
    tf$linalg$band_part(
      tf$ones(
        shape = c(tf$cast(n_latent, tf$int32), tf$cast(n_latent, tf$int32)),
        dtype = tf$float32
      ),
      num_lower = -1L,
      num_upper = 0L
    )

  # latent x batch_size x latent
  batch_masked <-
    tf$multiply(tri_mask[, tf$newaxis,], x[tf$newaxis, reticulate::py_ellipsis()])

  # latent x batch_size x 1
  x_squared <-
    tf$reduce_sum(batch_masked * batch_masked,
                  axis = 2L,
                  keepdims = TRUE)

  # latent x batch_size x batch_size
  pdist_vector <- x_squared + tf$transpose(x_squared, perm = c(0L, 2L, 1L)) -
    2 * tf$matmul(batch_masked, tf$transpose(batch_masked, perm = c(0L, 2L, 1L)))

  #(latent, batch_size, batch_size)
  all_dists <- pdist_vector
  # latent
  all_ra <-
    tf$sqrt((1 / (
      batch_size * tf$range(1, 1 + n_latent, dtype = tf$float32)
    )) *
      tf$reduce_sum(tf$square(
        batch_masked - tf$reduce_mean(batch_masked, axis = 1L, keepdims = TRUE)
      ), axis = c(1L, 2L)))

  # Avoid singularity in the case of zeros
  #(latent, batch_size, batch_size)
  all_dists <-
    tf$clip_by_value(all_dists, 1e-14, tf$reduce_max(all_dists))

  #inds = tf.argsort(all_dists, axis=-1)
  top_k <- tf$math$top_k(-all_dists, tf$cast(k + 1, tf$int32))
  #(latent, batch_size, batch_size)
  top_indices <- top_k[[1]]

  #(latent, batch_size, batch_size)
  neighbor_dists_d <-
    tf$gather(all_dists, top_indices, batch_dims = -1L)
  #(latent - 1, batch_size, batch_size)
  neighbor_new_dists <-
    tf$gather(all_dists[2:-1, , ],
              top_indices[1:-2, , ],
              batch_dims = -1L)

  # Eq. 4 of Kennel et al.
  #(latent - 1, batch_size, batch_size)
  scaled_dist <- tf$sqrt((
    tf$square(neighbor_new_dists) -
      # (9, 8, 2)
      tf$square(neighbor_dists_d[1:-2, , ])) /
      # (9, 8, 2)
      tf$square(neighbor_dists_d[1:-2, , ])
  )

  # Kennel condition #1
  #(latent - 1, batch_size, batch_size)
  is_false_change <- (scaled_dist > rtol)
  # Kennel condition 2
  #(latent - 1, batch_size, batch_size)
  is_large_jump <-
    (neighbor_new_dists > atol * all_ra[1:-2, tf$newaxis, tf$newaxis])

  is_false_neighbor <-
    tf$math$logical_or(is_false_change, is_large_jump)
  #(latent - 1, batch_size, 1)
  total_false_neighbors <-
    tf$cast(is_false_neighbor, tf$int32)[reticulate::py_ellipsis(), 2:(k + 2)]

  # Pad zero to match dimensionality of latent space
  # (latent - 1)
  reg_weights <-
    1 - tf$reduce_mean(tf$cast(total_false_neighbors, tf$float32), axis = c(1L, 2L))
  # (latent,)
  reg_weights <- tf$pad(reg_weights, list(list(1L, 0L)))

  # Find batch average activity

  # L2 Activity regularization
  activations_batch_averaged <-
    tf$sqrt(tf$reduce_mean(tf$square(x), axis = 0L))

  loss <- tf$reduce_sum(tf$multiply(reg_weights, activations_batch_averaged))
  loss

}

# loss has 3 components: NLL, KL, and FNN
# otherwise, this is just normal TF2-style training 
train_step_vae <- function(batch) {
  with (tf$GradientTape(persistent = TRUE) %as% tape, {
    code <- encoder(batch[[1]])
    z <- reparameterize(code)
    prediction <- decoder(z)

    l_mse <- mse_loss(batch[[2]], prediction)
    # see loss_false_nn in 2 previous posts
    l_fnn <- loss_false_nn(code)
    # KL divergence to a standard normal
    l_kl <- -0.5 * k_mean(1 - k_square(z))
    # overall loss is a weighted sum of all 3 components
    loss <- l_mse + fnn_weight * l_fnn + kl_weight * l_kl
  })

  encoder_gradients <-
    tape$gradient(loss, encoder$trainable_variables)
  decoder_gradients <-
    tape$gradient(loss, decoder$trainable_variables)

  optimizer$apply_gradients(purrr::transpose(list(
    encoder_gradients, encoder$trainable_variables
  )))
  optimizer$apply_gradients(purrr::transpose(list(
    decoder_gradients, decoder$trainable_variables
  )))

  train_loss(loss)
  train_mse(l_mse)
  train_fnn(l_fnn)
  train_kl(l_kl)
}

# wrap it all in autograph
training_loop_vae <- tf_function(autograph(function(ds_train) {

  for (batch in ds_train) {
    train_step_vae(batch) 
    #str(batch)
  }

  tf$print("Loss: ", train_loss$result())
  tf$print("MSE: ", train_mse$result())
  tf$print("FNN loss: ", train_fnn$result())
  tf$print("KL loss: ", train_kl$result())

  train_loss$reset_states()
  train_mse$reset_states()
  train_fnn$reset_states()
  train_kl$reset_states()

}))

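# model, loss, metric, and optimizer setup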
n_latent <- 10L
n_features <- 1

encoder <- vae_encoder_model(n_timesteps,
                             n_features,
                             n_latent)

decoder <- vae_decoder_model(n_timesteps,
                             n_features,
                             n_latent)
mse_loss <-
  tf$keras$losses$MeanSquaredError(reduction = tf$keras$losses$Reduction$SUM)

train_loss <- tf$keras$metrics$Mean(name = 'train_loss')
train_fnn <- tf$keras$metrics$Mean(name = 'train_fnn')
train_mse <-  tf$keras$metrics$Mean(name = 'train_mse')
train_kl <-  tf$keras$metrics$Mean(name = 'train_kl')

kl_weight <- 1

optimizer <- optimizer_adam(learning_rate = 1e-3)

n_timesteps <- 120
batch_size <- 32

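# slide a window of length n_timesteps over the series, one step at a time;
# incomplete trailing windows (containing NAs) are dropped by na.omit()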
gen_timesteps <- function(x, n_timesteps) {
  do.call(rbind,
          purrr::map(seq_along(x),
                     function(i) {
                       start <- i
                       end <- i + n_timesteps - 1
                       out <- x[start:end]
                       out
                     })
  ) %>%
    na.omit()
}

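# first half of the series for training, second half for testing;
# each window holds 2 * n_timesteps points (input followed by target)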
train <- gen_timesteps(roessler[1:(n/2)], 2 * n_timesteps)
test <- gen_timesteps(roessler[(n/2):n], 2 * n_timesteps) 

dim(train) <- c(dim(train), 1)
dim(test) <- c(dim(test), 1)

x_train <- train[ , 1:n_timesteps, , drop = FALSE]
y_train <- train[ , (n_timesteps + 1):(2*n_timesteps), , drop = FALSE]

# compute the FNN weight here, once x_train and batch_size exist
fnn_multiplier <- 1 # default value used in nearly all cases (see text)
fnn_weight <- fnn_multiplier * nrow(x_train) / batch_size

ds_train <- tensor_slices_dataset(list(x_train, y_train)) %>%
  dataset_shuffle(nrow(x_train)) %>%
  dataset_batch(batch_size)

x_test <- test[ , 1:n_timesteps, , drop = FALSE]
y_test <- test[ , (n_timesteps + 1):(2*n_timesteps), , drop = FALSE]

ds_test <- tensor_slices_dataset(list(x_test, y_test)) %>%
  dataset_batch(nrow(x_test))

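# train for 100 epochs; after each one, inspect the per-dimension variance
# of the latent code on (part of) the test set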
for (epoch in 1:100) {
  cat("Epoch: ", epoch, " -----------\n")
  training_loop_vae(ds_train)

  test_batch <- as_iterator(ds_test) %>% iter_next()
  encoded <- encoder(test_batch[[1]][1:1000])
  test_var <- tf$math$reduce_variance(encoded, axis = 0L)
  print(test_var %>% as.numeric() %>% round(5))
}

fails with the following message:

Error in py_call_impl(callable, call_args$unnamed, call_args$named) : 
  KeyError: 'The optimizer cannot recognize variable conv1d_transpose_24/kernel:0. This usually means you are trying to call the optimizer to update different parts of the model separately. Please call `optimizer.build(variables)` with the full list of trainable variables before the training loop or use legacy optimizer `tf.keras.optimizers.legacy.Adam.'
Run `reticulate::py_last_error()` for details.
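
From the message, it seems there are two possible workarounds; here is a minimal, untested sketch of both (for the second one, encoder and decoder need to have been called on a batch first, so that their weights exist):

# workaround 1: use the legacy Adam optimizer, as suggested by the message
optimizer <- tf$keras$optimizers$legacy$Adam(learning_rate = 1e-3)

# workaround 2: build the new optimizer with the full list of trainable
# variables before the training loop, also as suggested by the message
optimizer$build(c(encoder$trainable_variables,
                  decoder$trainable_variables))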

Thanks.

t-kalinowski commented 10 months ago

Tracked here: https://github.com/rstudio/keras/issues/1391