rstudio / tfprobability

R interface to TensorFlow Probability
https://rstudio.github.io/tfprobability/

Training of Bayesian neural network crashes R and RStudio, fails to process TF graph gradient tape #138

Open dkgaraujo opened 3 years ago

dkgaraujo commented 3 years ago

Hi all,

The following code runs fine for a standard neural network, but not for a Bayesian neural network. In the latter case, calling fit() causes R to abort, but not before printing: tensorflow/compiler/tf2mlcompute/kernels/mlc_subgraph_op.cc Compute: Failed in processing TF graph gradient tape.

I would appreciate it if you could share any pointers on how to make this work. Could this be an issue related to Apple's TensorFlow fork (tensorflow_macos)? Or simply a mistake in my code? Finally, could it be a bug in the R implementation of tfprobability?
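
(One diagnostic I can think of, though I have not confirmed it helps: forcing eager execution should bypass the ML Compute graph-compilation step named in the error message, assuming tf$config$run_functions_eagerly() is exposed in this build.)

# Diagnostic sketch, not a fix: run eagerly to skip graph compilation and,
# with luck, surface a clearer R-level error than the mlc_subgraph_op.cc abort.
library(tensorflow)
tf$config$run_functions_eagerly(TRUE)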

Best regards and thanks in advance.

### INTRO
###
### The code below is an implementation in R of the following python code:
### https://keras.io/examples/keras_recipes/bayesian_neural_networks/
###
### Further useful material for understanding the use of tfprobability in R:
### https://blogs.rstudio.com/ai/posts/2019-06-05-uncertainty-estimates-tfprobability/
### https://blogs.rstudio.com/ai/posts/2019-11-07-tfp-cran/

# Config the tensorflow backend -------------------------------------------
if (Sys.info()['sysname'] == "Darwin") {
  device <- c('gpu', 'cpu')[1] # change here between 1 and 2 to choose the device; restart the R session between changes

  library(reticulate)
  use_virtualenv(virtualenv = "~/tensorflow_macos_venv/", required = TRUE)
  mlcompute <- reticulate::import("tensorflow.python.compiler.mlcompute")
  mlcompute$set_mlc_device(device_name = device)

} else {
  library(reticulate)
}

library(tensorflow)
library(keras)
library(tfdatasets)
library(tfprobability)
library(zeallot)

# Get data ----------------------------------------------------------------

wine_data_url <- "https://archive.ics.uci.edu/ml/machine-learning-databases/wine/wine.data"
wine_names_url <- "https://archive.ics.uci.edu/ml/machine-learning-databases/wine/wine.names"

wine_data <- read.csv(wine_data_url, header = FALSE)
wine_names <- c("Grade",
                "Alcohol",
                "Malic_acid",
                "Ash",
                "Ash_alc",
                "Magnesium",
                "Phenols",
                "Flavanoids",
                "Nonflav_phenols",
                "Proanth",
                "Color",
                "Hue",
                "Diluted",
                "Proline")
colnames(wine_data) <- wine_names

# Aux functions -----------------------------------------------------------

input_data <- function(dataset, pct_valid = 0.3, target_col = 1) {
  n_obs <- nrow(dataset)
  n_features <- ncol(dataset) - 1
  obs_shuffle <- sample(1:n_obs)
  dataset_shuffled <- dataset[obs_shuffle,]
  n_train <- floor((1 - pct_valid) * n_obs)

  train_ds <- list(train_X = array_reshape(as.matrix(dataset_shuffled[1:n_train, -target_col]), dim = c(n_train, n_features)),
                   train_y = dataset_shuffled[1:n_train, target_col])
  valid_ds <- list(valid_X = array_reshape(as.matrix(dataset_shuffled[(n_train + 1):n_obs, -target_col]), dim = c(n_obs - n_train, n_features)),
                   valid_y = dataset_shuffled[(n_train + 1):n_obs, target_col])

  # standardize both splits using training-set statistics
  mean_train_X <- colMeans(train_ds$train_X)
  stdv_train_X <- apply(train_ds$train_X, 2, sd)

  train_ds$train_X <- scale(train_ds$train_X, center = mean_train_X, scale = stdv_train_X)
  valid_ds$valid_X <- scale(valid_ds$valid_X, center = mean_train_X, scale = stdv_train_X)

  return(list(
    train = train_ds,
    valid = valid_ds
  ))
}

create_standard_nn_model <- function(dataset) {
  # standard neural network model
  num_features <- ncol(dataset) - 1
  inputs <- layer_input(name = "features",
                        shape = c(num_features))
  features <- inputs %>% 
    layer_batch_normalization() %>% 
    layer_dense(units = 8, activation = "relu") %>% 
    layer_dense(units = 8, activation = "relu")

  output <- features %>% 
    layer_dense(units = 1)

  model <- keras_model(inputs = inputs,
                       outputs = output)
  return(model)
}

prior_func <- function(kernel_size, bias_size, dtype = NULL) {
  # prior: standard multivariate normal over all kernel and bias weights
  n <- as.integer(kernel_size + bias_size)
  prior_model <- keras_model_sequential() %>% 
    layer_variable(n, dtype = dtype, trainable = FALSE) %>% 
    layer_distribution_lambda(
      make_distribution_fn = function(t) tfd_multivariate_normal_diag(
        loc = tf$zeros(n),
        scale_diag = tf$ones(n)
      )
    )

  return(prior_model)
}

posterior_func <- function(kernel_size, bias_size, dtype = NULL) {
  # learnable posterior: multivariate normal with full (lower-triangular) covariance
  n <- as.integer(kernel_size + bias_size)
  posterior_model <- keras_model_sequential() %>% 
    layer_variable(params_size_multivariate_normal_tri_l(n), dtype = dtype) %>% 
    layer_multivariate_normal_tri_l(event_size = n)
  return(posterior_model)
}

create_bayesian_nn_model <- function(dataset, train_size) {
  # Bayesian neural network model
  num_features <- ncol(dataset) - 1
  inputs <- layer_input(name = "features",
                        shape = c(num_features))
  features <- inputs %>% 
    layer_batch_normalization() %>% 
    layer_dense_variational(
      units = 8,
      make_prior_fn = prior_func,
      make_posterior_fn = posterior_func,
      kl_weight = 1 / train_size,
      activation = "sigmoid"
    )

  outputs <- features %>% 
    layer_dense(units = 1)

  model <- keras_model(inputs = inputs,
                       outputs = outputs)
  return(model)
}

run_experiment <- function(model, params, dataset_lists) {
  model %>% compile(
    optimizer = params$optimizer(lr = params$learning_rate),
    loss = c(params$loss_func),
    metrics = c(params$metrics)
  )

  summary(model)

  print("Starting to train the model...")
  model %>% 
    fit(x = dataset_lists$train$train_X,
        y = dataset_lists$train$train_y,
        epochs = params$num_epochs,
        batch_size = params$batch_size,
        validation_data = list(dataset_lists$valid$valid_X, dataset_lists$valid$valid_y))
}

# Config models ---------------------------------------------------------------------

params <- list()
params[["num_epochs"]] <- 500
params[["optimizer"]] <- keras::optimizer_rmsprop
params[["loss_func"]] <- keras::loss_mean_squared_error
params[["batch_size"]] <- 32
params[["learning_rate"]] <- 0.0001

# Train models ------------------------------------------------------------
data_lists <- input_data(wine_data)

baseline_model <- create_standard_nn_model(dataset = wine_data)
run_experiment(model = baseline_model,
               params = params,
               dataset_lists = data_lists)

bayesian_model <- create_bayesian_nn_model(dataset = wine_data, train_size = length(data_lists$train$train_y))
run_experiment(model = bayesian_model,
               params = params,
               dataset_lists = data_lists)
dkgaraujo commented 3 years ago

Sorry, I forgot to mention that I am using a MacBook with an Intel chip and an AMD GPU.
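
(For completeness, a quick way to check which devices this TensorFlow build actually registers, assuming the standard tf$config API works through reticulate:)

# Sketch: list the physical devices TensorFlow detects; with tensorflow_macos
# the AMD GPU may be driven by ML Compute rather than appearing in this list.
library(tensorflow)
print(tf$config$list_physical_devices())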