nicholasjclark / MRFcov

Markov random fields with covariates
23 stars 5 forks source link

Deviance not calculated in cv_MRF_diag #31

Open shirasal opened 3 years ago

shirasal commented 3 years ago

After encountering an odd problem with cv_MRF_diag with Poisson data, I found a simple solution (with the help of @hezibu). The problem I had was that NaNs were produced when dividing zero by zero while creating preds_log. Other numbers divided by zero were taken care of with is.infinite but NaNs weren't. I used the same procedure already implemented in the function, to set infinite values to zero (when numbers were divided by 0), but with is.nan instead if is.infinite.

Here is my suggested fix [my additions in rows 299 and 324:

function (data, symmetrise, n_nodes, n_cores, sample_seed, n_folds, 
                            n_fold_runs, n_covariates, compare_null, family, plot = TRUE, 
                            cached_model, cached_predictions, mod_labels = NULL) 
{
  if (!(family %in% c("gaussian", "poisson", "binomial"))) 
    stop("Please select one of the three family options:\n         \"gaussian\", \"poisson\", \"binomial\"")
  if (missing(symmetrise)) {
    symmetrise <- "mean"
  }
  if (missing(compare_null)) {
    compare_null <- FALSE
  }
  if (missing(n_folds)) {
    n_folds <- 10
  }
  else {
    if (sign(n_folds) == 1) {
      n_folds <- ceiling(n_folds)
    }
    else {
      stop("Please provide a positive integer for n_folds")
    }
  }
  if (missing(n_fold_runs)) {
    n_fold_runs <- n_folds
  }
  else {
    if (sign(n_fold_runs) == 1) {
      n_fold_runs <- ceiling(n_fold_runs)
    }
    else {
      stop("Please provide a positive integer for n_fold_runs")
    }
  }
  if (missing(n_cores)) {
    n_cores <- 1
  }
  else {
    if (sign(n_cores) != 1) {
      stop("Please provide a positive integer for n_cores")
    }
    else {
      if (sfsmisc::is.whole(n_cores) == FALSE) {
        stop("Please provide a positive integer for n_cores")
      }
    }
  }
  if (missing(n_nodes)) {
    warning("n_nodes not specified. using ncol(data) as default, assuming no covariates", 
            call. = FALSE)
    n_nodes <- ncol(data)
    n_covariates <- 0
  }
  else {
    if (sign(n_nodes) != 1) {
      stop("Please provide a positive integer for n_nodes")
    }
    else {
      if (sfsmisc::is.whole(n_nodes) == FALSE) {
        stop("Please provide a positive integer for n_nodes")
      }
    }
  }
  if (missing(n_covariates)) {
    n_covariates <- ncol(data) - n_nodes
  }
  else {
    if (sign(n_covariates) != 1) {
      stop("Please provide a positive integer for n_covariates")
    }
    else {
      if (sfsmisc::is.whole(n_covariates) == FALSE) {
        stop("Please provide a positive integer for n_covariates")
      }
    }
  }
  if (missing(sample_seed)) {
    sample_seed <- ceiling(runif(1, 0, 1e+05))
  }
  if (missing(cached_model)) {
    cat("Generating node-optimised Conditional Random Fields model", 
        "\n", sep = "")
    if (family == "binomial") {
      mrf <- MRFcov(data = data, symmetrise = symmetrise, 
                    n_nodes = n_nodes, n_cores = n_cores, family = "binomial")
      if (compare_null) {
        cat("\nGenerating Markov Random Fields model (no covariates)", 
            "\n", sep = "")
        mrf_null <- MRFcov(data = data[, 1:n_nodes], 
                           symmetrise = symmetrise, n_nodes = n_nodes, 
                           n_cores = n_cores, family = "binomial")
      }
    }
    if (family == "poisson") {
      mrf <- MRFcov(data = data, symmetrise = symmetrise, 
                    n_nodes = n_nodes, n_cores = n_cores, family = "poisson")
      if (compare_null) {
        cat("\nGenerating Markov Random Fields model (no covariates)", 
            "\n", sep = "")
        mrf_null <- MRFcov(data = data[, 1:n_nodes], 
                           symmetrise = symmetrise, n_nodes = n_nodes, 
                           n_cores = n_cores, family = "poisson")
      }
    }
    if (family == "gaussian") {
      mrf <- MRFcov(data = data, symmetrise = symmetrise, 
                    n_nodes = n_nodes, n_cores = n_cores, family = "gaussian")
      if (compare_null) {
        cat("\nGenerating Markov Random Fields model (no covariates)", 
            "\n", sep = "")
        mrf_null <- MRFcov(data = data[, 1:n_nodes], 
                           symmetrise = symmetrise, n_nodes = n_nodes, 
                           n_cores = n_cores, family = "gaussian")
      }
    }
  }
  else {
    mrf <- cached_model$mrf
    if (compare_null) {
      mrf_null <- cached_model$mrf_null
    }
  }
  if (family == "binomial") {
    folds <- caret::createFolds(rownames(data), n_folds)
    all_predictions <- if (missing(cached_predictions)) {
      predict_MRF(data, mrf)
    }
    else {
      cached_predictions$predictions
    }
    cv_predictions <- lapply(seq_len(n_folds), function(k) {
      test_data <- data[folds[[k]], ]
      predictions <- all_predictions[[2]][folds[[k]], 
      ]
      true_pos <- (predictions == test_data[, c(1:n_nodes)])[test_data[, 
                                                                       c(1:n_nodes)] == 1]
      false_pos <- (predictions == 1)[predictions != test_data[, 
                                                               c(1:n_nodes)]]
      true_neg <- (predictions == test_data[, c(1:n_nodes)])[test_data[, 
                                                                       c(1:n_nodes)] == 0]
      false_neg <- (predictions == 0)[predictions != test_data[, 
                                                               c(1:n_nodes)]]
      pos_pred <- sum(true_pos, na.rm = TRUE)/(sum(true_pos, 
                                                   na.rm = TRUE) + sum(false_pos, na.rm = TRUE))
      neg_pred <- sum(true_neg, na.rm = TRUE)/(sum(true_neg, 
                                                   na.rm = TRUE) + sum(false_neg, na.rm = TRUE))
      sensitivity <- sum(true_pos, na.rm = TRUE)/(sum(true_pos, 
                                                      na.rm = TRUE) + sum(false_neg, na.rm = TRUE))
      specificity <- sum(true_neg, na.rm = TRUE)/(sum(true_neg, 
                                                      na.rm = TRUE) + sum(false_pos, na.rm = TRUE))
      tot_pred <- (sum(true_pos, na.rm = TRUE) + sum(true_neg, 
                                                     na.rm = TRUE))/(length(as.matrix(test_data[, 
                                                                                                c(1:n_nodes)])))
      list(mean_pos_pred = mean(pos_pred, na.rm = TRUE), 
           mean_neg_pred = mean(neg_pred, na.rm = TRUE), 
           mean_tot_pred = mean(tot_pred, na.rm = TRUE), 
           mean_sensitivity = mean(sensitivity, na.rm = TRUE), 
           mean_specificity = mean(specificity, na.rm = TRUE))
    })
    plot_dat <- purrr::map_df(cv_predictions, magrittr::extract, 
                              c("mean_pos_pred", "mean_tot_pred", "mean_sensitivity", 
                                "mean_specificity"))
    if (compare_null) {
      all_predictions <- if (missing(cached_predictions)) {
        predict_MRF(data[, 1:n_nodes], mrf_null)
      }
      else {
        cached_predictions$null_predictions
      }
      cv_predictions_null <- lapply(seq_len(n_folds), 
                                    function(k) {
                                      test_data <- data[folds[[k]], 1:n_nodes]
                                      predictions <- all_predictions[[2]][folds[[k]], 
                                      ]
                                      true_pos <- (predictions == test_data)[test_data == 
                                                                               1]
                                      false_pos <- (predictions == 1)[predictions != 
                                                                        test_data]
                                      true_neg <- (predictions == test_data)[test_data == 
                                                                               0]
                                      false_neg <- (predictions == 0)[predictions != 
                                                                        test_data]
                                      pos_pred <- sum(true_pos, na.rm = TRUE)/(sum(true_pos, 
                                                                                   na.rm = TRUE) + sum(false_pos, na.rm = TRUE))
                                      neg_pred <- sum(true_neg, na.rm = TRUE)/(sum(true_neg, 
                                                                                   na.rm = TRUE) + sum(false_neg, na.rm = TRUE))
                                      sensitivity <- sum(true_pos, na.rm = TRUE)/(sum(true_pos, 
                                                                                      na.rm = TRUE) + sum(false_neg, na.rm = TRUE))
                                      specificity <- sum(true_neg, na.rm = TRUE)/(sum(true_neg, 
                                                                                      na.rm = TRUE) + sum(false_pos, na.rm = TRUE))
                                      tot_pred <- (sum(true_pos, na.rm = TRUE) + 
                                                     sum(true_neg, na.rm = TRUE))/(length(as.matrix(test_data)))
                                      list(mean_pos_pred = mean(pos_pred, na.rm = TRUE), 
                                           mean_neg_pred = mean(neg_pred, na.rm = TRUE), 
                                           mean_tot_pred = mean(tot_pred, na.rm = TRUE), 
                                           mean_sensitivity = mean(sensitivity, na.rm = TRUE), 
                                           mean_specificity = mean(specificity, na.rm = TRUE))
                                    })
      plot_dat_null <- purrr::map_df(cv_predictions_null, 
                                     magrittr::extract, c("mean_pos_pred", "mean_tot_pred", 
                                                          "mean_sensitivity", "mean_specificity"))
      if (is.null(mod_labels)) {
        plot_dat$model <- "CRF"
        plot_dat_null$model <- "MRF (no covariates)"
      }
      else {
        plot_dat$model <- mod_labels[1]
        plot_dat_null$model <- mod_labels[2]
      }
      plot_dat <- rbind(plot_dat, plot_dat_null)
      if (plot) {
        plot_binom_cv_diag_optim(plot_dat, compare_null = TRUE)
      }
      else {
        return(plot_dat)
      }
    }
    else {
      if (plot) {
        plot_binom_cv_diag_optim(plot_dat, compare_null = FALSE)
      }
      else {
        return(plot_dat)
      }
    }
  }
  if (family == "gaussian") {
    folds <- caret::createFolds(rownames(data), n_folds)
    cv_predictions <- lapply(seq_len(n_folds), function(k) {
      test_data <- data[folds[[k]], ]
      predictions <- predict_MRF(test_data, mrf)
      Rsquared <- vector()
      MSE <- vector()
      for (i in seq_len(ncol(predictions))) {
        Rsquared[i] <- cor.test(test_data[, i], predictions[, 
                                                            i])[[4]]
        MSE[i] <- mean((test_data[, i] - predictions[, 
                                                     i])^2)
      }
      list(Rsquared = mean(Rsquared, na.rm = T), MSE = mean(MSE, 
                                                            na.rm = T))
    })
    plot_dat <- purrr::map_df(cv_predictions, magrittr::extract, 
                              c("Rsquared", "MSE"))
    if (compare_null) {
      cv_predictions_null <- lapply(seq_len(n_folds), 
                                    function(k) {
                                      test_data <- data[folds[[k]], 1:n_nodes]
                                      predictions <- predict_MRF(test_data, mrf_null)
                                      Rsquared <- vector()
                                      MSE <- vector()
                                      for (i in seq_len(ncol(predictions))) {
                                        Rsquared[i] <- cor.test(test_data[, i], 
                                                                predictions[, i])[[4]]
                                        MSE[i] <- mean((test_data[, i] - predictions[, 
                                                                                     i])^2)
                                      }
                                      list(Rsquared = mean(Rsquared, na.rm = T), 
                                           MSE = mean(MSE, na.rm = T))
                                    })
      plot_dat_null <- purrr::map_df(cv_predictions_null, 
                                     magrittr::extract, c("Rsquared", "MSE"))
      if (is.null(mod_labels)) {
        plot_dat$model <- "CRF"
        plot_dat_null$model <- "MRF (no covariates)"
      }
      else {
        plot_dat$model <- mod_labels[1]
        plot_dat_null$model <- mod_labels[2]
      }
      plot_dat <- rbind(plot_dat, plot_dat_null)
      if (plot) {
        plot_gauss_cv_diag_optim(plot_dat, compare_null = TRUE)
      }
      else {
        return(plot_dat)
      }
    }
    else {
      if (plot) {
        plot_gauss_cv_diag_optim(plot_dat, compare_null = FALSE)
      }
      else {
        return(plot_dat)
      }
    }
  }
  if (family == "poisson") {
    folds <- caret::createFolds(rownames(data), n_folds)
    cv_predictions <- lapply(seq_len(n_folds), function(k) {
      test_data <- data[folds[[k]], ]
      predictions <- predict_MRF(test_data, mrf)
      Deviance <- vector()
      MSE <- vector()
      for (i in seq_len(ncol(predictions))) {
        preds_log <- log(test_data[, i]/predictions[, 
                                                    i])
        preds_log[is.infinite(preds_log)] <- 0
        preds_log[is.nan(preds_log)] <- 0
        test_data_wzeros <- test_data
        test_data_wzeros[predictions[, i] == 0, i] <- 0
        Deviance[i] <- mean(2 * sum(test_data_wzeros[, 
                                                     i] * preds_log - (test_data_wzeros[, i] - 
                                                                         predictions[, i])))
        MSE[i] <- mean((test_data[, i] - predictions[, 
                                                     i])^2)
      }
      list(Deviance = mean(Deviance, na.rm = T), MSE = mean(MSE, 
                                                            na.rm = T))
    })
    plot_dat <- purrr::map_df(cv_predictions, magrittr::extract, 
                              c("Deviance", "MSE"))
    if (compare_null) {
      cv_predictions_null <- lapply(seq_len(n_folds), 
                                    function(k) {
                                      test_data <- data[folds[[k]], 1:n_nodes]
                                      predictions <- predict_MRF(test_data, mrf_null)
                                      Deviance <- vector()
                                      MSE <- vector()
                                      for (i in seq_len(ncol(predictions))) {
                                        preds_log <- log(test_data[, i]/predictions[, 
                                                                                    i])
                                        preds_log[is.infinite(preds_log)] <- 0
                                        preds_log[is.nan(preds_log)] <- 0
                                        test_data_wzeros <- test_data
                                        test_data_wzeros[predictions[, i] == 0, 
                                                         i] <- 0
                                        Deviance[i] <- mean(2 * sum(test_data_wzeros[, 
                                                                                     i] * preds_log - (test_data_wzeros[, i] - 
                                                                                                         predictions[, i])))
                                        MSE[i] <- mean((test_data[, i] - predictions[, 
                                                                                     i])^2)
                                      }
                                      list(Deviance = mean(Deviance, na.rm = T), 
                                           MSE = mean(MSE, na.rm = T))
                                    })
      plot_dat_null <- purrr::map_df(cv_predictions_null, 
                                     magrittr::extract, c("Deviance", "MSE"))
      if (is.null(mod_labels)) {
        plot_dat$model <- "CRF"
        plot_dat_null$model <- "MRF (no covariates)"
      }
      else {
        plot_dat$model <- mod_labels[1]
        plot_dat_null$model <- mod_labels[2]
      }
      plot_dat <- rbind(plot_dat, plot_dat_null)
      if (plot) {
        plot_poiss_cv_diag_optim(plot_dat, compare_null = TRUE)
      }
      else {
        return(plot_dat)
      }
    }
    else {
      if (plot) {
        plot_poiss_cv_diag_optim(plot_dat, compare_null = FALSE)
      }
      else {
        return(plot_dat)
      }
    }
  }
}

Does this look adequate?