rstudio / tfprobability

R interface to TensorFlow Probability
https://rstudio.github.io/tfprobability/

Training of Bayesian neural network crashes R and RStudio, fails to process TF graph gradient tape #138

Open dkgaraujo opened 3 years ago

dkgaraujo commented 3 years ago

Hi all,

The following code runs fine for a standard neural network, but not for a Bayesian neural network. In the latter case, calling fit() causes R to abort, but not before printing: tensorflow/compiler/tf2mlcompute/kernels/mlc_subgraph_op.cc Compute: Failed in processing TF graph gradient tape.

I would appreciate it if you could share any pointers on how to make this work. Could this be an issue related to Apple's TensorFlow fork (tensorflow_macos)? Or simply a mistake in my code? Finally, could it be a bug in the R implementation of tfprobability?
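
(One diagnostic I can think of, though I have not confirmed it helps: forcing eager execution should bypass the ML Compute graph-compilation step named in the error message, assuming tf$config$run_functions_eagerly() is exposed in this build.)

# Diagnostic sketch, not a fix: run eagerly to skip graph compilation and,
# with luck, surface a clearer R-level error than the mlc_subgraph_op.cc abort.
library(tensorflow)
tf$config$run_functions_eagerly(TRUE)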

Best regards and thanks in advance.

### INTRO
###
### The code below is an implementation in R of the following python code:
### https://keras.io/examples/keras_recipes/bayesian_neural_networks/
###
### Further useful material for understanding the use of tfprobability in R:
### https://blogs.rstudio.com/ai/posts/2019-06-05-uncertainty-estimates-tfprobability/
### https://blogs.rstudio.com/ai/posts/2019-11-07-tfp-cran/

# Config the tensorflow backend -------------------------------------------
if (Sys.info()['sysname'] == "Darwin") {
  device <- c('gpu', 'cpu')[1] # change here between 1 and 2 to choose the device; restart the R session between changes

  library(reticulate)
  use_virtualenv(virtualenv = "~/tensorflow_macos_venv/", required = TRUE)
  mlcompute <- reticulate::import("tensorflow.python.compiler.mlcompute")
  mlcompute$set_mlc_device(device_name = device)

} else {
  library(reticulate)
}

library(tensorflow)
library(keras)
library(tfdatasets)
library(tfprobability)
library(zeallot)

# Get data ----------------------------------------------------------------

wine_data_url <- "https://archive.ics.uci.edu/ml/machine-learning-databases/wine/wine.data"
wine_names_url <- "https://archive.ics.uci.edu/ml/machine-learning-databases/wine/wine.names"

wine_data <- read.csv(wine_data_url, header = FALSE)
wine_names <- c("Grade",
                "Alcohol",
                "Malic_acid",
                "Ash",
                "Ash_alc",
                "Magnesium",
                "Phenols",
                "Flavanoids",
                "Nonflav_phenols",
                "Proanth",
                "Color",
                "Hue",
                "Diluted",
                "Proline")
colnames(wine_data) <- wine_names

# Aux functions -----------------------------------------------------------

input_data <- function(dataset, pct_valid = 0.3, target_col = 1) {
  n_obs <- nrow(dataset)
  n_features <- ncol(dataset) - 1
  obs_shuffle <- sample(1:n_obs)
  dataset_shuffled <- dataset[obs_shuffle,]
  n_train <- floor((1 - pct_valid) * n_obs)

  train_ds <- list(train_X = array_reshape(as.matrix(dataset_shuffled[1:n_train, -target_col]), dim = c(n_train, n_features)),
                   train_y = dataset_shuffled[1:n_train, target_col])
  valid_ds <- list(valid_X = array_reshape(as.matrix(dataset_shuffled[(n_train + 1):n_obs, -target_col]), dim = c(n_obs - n_train, n_features)),
                   valid_y = dataset_shuffled[(n_train + 1):n_obs, target_col])

  # standardize both splits using training-set statistics
  mean_train_X <- colMeans(train_ds$train_X)
  stdv_train_X <- apply(train_ds$train_X, 2, sd)

  train_ds$train_X <- scale(train_ds$train_X, center = mean_train_X, scale = stdv_train_X)
  valid_ds$valid_X <- scale(valid_ds$valid_X, center = mean_train_X, scale = stdv_train_X)

  return(list(
    train = train_ds,
    valid = valid_ds
  ))
}

create_standard_nn_model <- function(dataset) {
  # standard neural network model
  num_features <- ncol(dataset) - 1
  inputs <- layer_input(name = "features",
                        shape = c(num_features))
  features <- inputs %>% 
    layer_batch_normalization() %>% 
    layer_dense(units = 8, activation = "relu") %>% 
    layer_dense(units = 8, activation = "relu")

  output <- features %>% 
    layer_dense(units = 1)

  model <- keras_model(inputs = inputs,
                       outputs = output)
  return(model)
}

prior_func <- function(kernel_size, bias_size, dtype = NULL) {
  # prior: standard multivariate normal over all kernel and bias weights
  n <- as.integer(kernel_size + bias_size)
  prior_model <- keras_model_sequential() %>% 
    layer_variable(n, dtype = dtype, trainable = FALSE) %>% 
    layer_distribution_lambda(
      make_distribution_fn = function(t) tfd_multivariate_normal_diag(
        loc = tf$zeros(n),
        scale_diag = tf$ones(n)
      )
    )

  return(prior_model)
}

posterior_func <- function(kernel_size, bias_size, dtype = NULL) {
  # learnable posterior: multivariate normal with full (lower-triangular) covariance
  n <- as.integer(kernel_size + bias_size)
  posterior_model <- keras_model_sequential() %>% 
    layer_variable(params_size_multivariate_normal_tri_l(n), dtype = dtype) %>% 
    layer_multivariate_normal_tri_l(event_size = n)
  return(posterior_model)
}

create_bayesian_nn_model <- function(dataset, train_size) {
  # Bayesian neural network model
  num_features <- ncol(dataset) - 1
  inputs <- layer_input(name = "features",
                        shape = c(num_features))
  features <- inputs %>% 
    layer_batch_normalization() %>% 
    layer_dense_variational(
      units = 8,
      make_prior_fn = prior_func,
      make_posterior_fn = posterior_func,
      kl_weight = 1 / train_size,
      activation = "sigmoid"
    )

  outputs <- features %>% 
    layer_dense(units = 1)

  model <- keras_model(inputs = inputs,
                       outputs = outputs)
  return(model)
}

run_experiment <- function(model, params, dataset_lists) {
  model %>% compile(
    optimizer = params$optimizer(lr = params$learning_rate),
    loss = c(params$loss_func),
    metrics = c(params$metrics)
  )

  summary(model)

  print("Starting to train the model...")
  model %>% 
    fit(x = dataset_lists$train$train_X,
        y = dataset_lists$train$train_y,
        epochs = params$num_epochs,
        batch_size = params$batch_size,
        validation_data = list(dataset_lists$valid$valid_X, dataset_lists$valid$valid_y))
}

# Config models ---------------------------------------------------------------------

params <- list()
params[["num_epochs"]] <- 500
params[["optimizer"]] <- keras::optimizer_rmsprop
params[["loss_func"]] <- keras::loss_mean_squared_error
params[["batch_size"]] <- 32
params[["learning_rate"]] <- 0.0001

# Train models ------------------------------------------------------------
data_lists <- input_data(wine_data)

baseline_model <- create_standard_nn_model(dataset = wine_data)
run_experiment(model = baseline_model,
               params = params,
               dataset_lists = data_lists)

bayesian_model <- create_bayesian_nn_model(dataset = wine_data, train_size = length(data_lists$train$train_y))
run_experiment(model = bayesian_model,
               params = params,
               dataset_lists = data_lists)
dkgaraujo commented 3 years ago

Sorry, I forgot to mention that I am using a MacBook with an Intel chip and an AMD GPU.
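
(For completeness, a quick way to check which devices this TensorFlow build actually registers, assuming the standard tf$config API works through reticulate:)

# Sketch: list the physical devices TensorFlow detects; with tensorflow_macos
# the AMD GPU may be driven by ML Compute rather than appearing in this list.
library(tensorflow)
print(tf$config$list_physical_devices())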