spsanderson / TidyDensity

Create tidy probability/density tibbles and plots of randomly generated and empirical data.
https://www.spsanderson.com/TidyDensity
Other
34 stars 1 forks source link

Add Normal Distribution and AIC in tidy_distribution_comparison #227

Closed datadrivensupplychain closed 2 years ago

datadrivensupplychain commented 2 years ago

Please add Normal Distribution and AIC in tidy_distribution_comparison.

Per Linkedin post comment

Thanks Ralph Asher

spsanderson commented 2 years ago

Fixes section where combining distributions for continuous and discrete. Add aic calculation as a lm function of the distribution against the empirical. Adds several attributes. Adds gaussian to output.

New Function:

#' Compare Empirical Data to Distributions
#'
#' @family Empirical
#'
#' @author Steven P. Sanderson II, MPH
#'
#' @details The purpose of this function is to take some data set provided and
#' to try to find a distribution that may fit the best. A parameter of
#' `.distribution_type` must be set to either `continuous` or `discrete` in order
#' for this the function to try the appropriate types of distributions.
#'
#' The following distributions are used:
#'
#' Continuous:
#' -  tidy_beta
#' -  tidy_cauchy
#' -  tidy_exponential
#' -  tidy_gamma
#' -  tidy_logistic
#' -  tidy_lognormal
#' -  tidy_normal
#' -  tidy_pareto
#' -  tidy_uniform
#' -  tidy_weibull
#'
#' Discrete:
#' -  tidy_binomial
#' -  tidy_geometric
#' -  tidy_hypergeometric
#' -  tidy_poisson
#'
#'
#' @description Compare some empirical data set against different distributions
#' to help find the distribution that could be the best fit.
#'
#' @param .x The data set being passed to the function
#' @param .distribution_type What kind of data is it, can be one of `continuous`
#' or `discrete`
#' @param .print_aic This is a boolean that is defaulted to TRUE. This will print
#' out the `aic_tbl`. When set to FALSE it will print out the `total_deviance_tbl`
#'
#' @examples
#' xc <- mtcars$mpg
#' tidy_distribution_comparison(xc, "continuous")
#'
#' xd <- trunc(xc)
#' tidy_distribution_comparison(xd, "discrete")
#'
#' @return
#' An invisible list object. A tibble is printed.
#'
#' @export
#'

tidy_distribution_comparison <- function(.x, .distribution_type = "continuous",
                                         .print_aic = TRUE){

  # Tidyeval ----
  x_term <- as.numeric(.x)
  n <- length(x_term)
  dist_type <- tolower(as.character(.distribution_type))
  print_aic <- as.logical(.print_aic)

  if (!dist_type %in% c("continuous","discrete")){
    rlang::abort(
      message = "The '.distribution_type' parameter must be either 'continuous'
      or 'discrete'.",
      use_cli_format = TRUE
    )
  }

  if (!is.logical(print_aic)){
    rlang::abort(
      message = "'.print_aic' must be either TRUE or FALSE."
    )
  }

  # Get parameter estimates for distributions
  if (dist_type == "continuous"){
    b <- try(util_beta_param_estimate(x_term)$parameter_tbl %>%
               dplyr::filter(method == "NIST_MME") %>%
               dplyr::select(dist_type, shape1, shape2) %>%
               purrr::set_names("dist_type", "param_1", "param_2"))

    if (!inherits(b, "try-error")){
      tb <- tidy_beta(.n = n, .shape1 = round(b[[2]], 2), .shape2 = round(b[[3]], 2))
    }

    c <- try(util_cauchy_param_estimate(x_term)$parameter_tbl %>%
               dplyr::select(dist_type, location, scale) %>%
               purrr::set_names("dist_type", "param_1", "param_2"))

    if (!inherits(c, "try-error")){
      tc <- tidy_cauchy(.n = n, .location = round(c[[2]], 2), .scale = round(c[[3]], 2))
    }

    e <- try(util_exponential_param_estimate(x_term)$parameter_tbl %>%
               dplyr::select(dist_type, rate) %>%
               dplyr::mutate(param_2 = NA) %>%
               purrr::set_names("dist_type", "param_1", "param_2"))

    if (!inherits(e, "try-error")){
      te <- tidy_exponential(.n = n, .rate = round(e[[2]], 2))
    }

    g <- try(util_gamma_param_estimate(x_term)$parameter_tbl %>%
               dplyr::filter(method == "NIST_MME") %>%
               dplyr::select(dist_type, shape, scale) %>%
               purrr::set_names("dist_type", "param_1", "param_2"))

    if (!inherits(g, "try-error")){
      tg <- tidy_gamma(.n = n, .shape = round(g[[2]], 2),  .scale = round(g[[3]], 2))
    }

    l <- try(util_logistic_param_estimate(x_term)$parameter_tbl %>%
               dplyr::filter(method == "EnvStats_MME") %>%
               dplyr::select(dist_type, location, scale) %>%
               purrr::set_names("dist_type", "param_1", "param_2"))

    if (!inherits(l, "try-error")){
      tl <- tidy_logistic(.n = n, .location = round(l[[2]], 2), .scale = round(l[[3]], 2))
    }

    ln <- try(util_lognormal_param_estimate(x_term)$parameter_tbl %>%
                dplyr::filter(method == "EnvStats_MME") %>%
                dplyr::select(dist_type, mean_log, sd_log) %>%
                purrr::set_names("dist_type", "param_1", "param_2"))

    if (!inherits(ln, "try-error")){
      tln <- tidy_lognormal(.n = n, .meanlog = round(ln[[2]], 2), .sdlog = round(ln[[3]], 2))
    }

    p <- try(util_pareto_param_estimate(x_term)$parameter_tbl %>%
               dplyr::filter(method == "MLE") %>%
               dplyr::select(dist_type, shape, scale) %>%
               purrr::set_names("dist_type", "param_1", "param_2"))

    if (!inherits(p, "try-error")){
      tp <- tidy_pareto(.n = n, .shape = round(p[[2]], 2), .scale = round(p[[3]], 2))
    }

    u <- try(util_uniform_param_estimate(x_term)$parameter_tbl %>%
               dplyr::filter(method == "NIST_MME") %>%
               dplyr::select(dist_type, min_est, max_est) %>%
               purrr::set_names("dist_type", "param_1", "param_2"))

    if (!inherits(u, "try-error")){
      tu <- tidy_uniform(.n = n, .min = round(u[[2]], 2), .max = round(u[[3]], 2))
    }

    w <- try(util_weibull_param_estimate(x_term)$parameter_tbl %>%
               dplyr::select(dist_type, shape, scale) %>%
               purrr::set_names("dist_type", "param_1", "param_2"))

    if (!inherits(w, "try-error")){
      tw <- tidy_weibull(.n = n, .shape = round(w[[2]], 2), .scale = round(w[[3]], 2))
    }

    nn <- try(util_normal_param_estimate(x_term)$parameter_tbl %>%
                dplyr::filter(method == "EnvStats_MME_MLE") %>%
                dplyr::select(dist_type, mu, stan_dev) %>%
                purrr::set_names("dist_type", "param_1", "param_2"))

    if (!inherits(n, "try-error")){
      tn <- tidy_normal(.n = n, .mean = round(nn[[2]], 2), .sd = round(nn[[3]], 2))
    }

    comp_tbl <- tidy_combine_distributions(
      tidy_empirical(x_term, .distribution_type = dist_type),
      if (exists("tb") && nrow(tb) > 0){tb},
      if (exists("tc") && nrow(tc) > 0){tc},
      if (exists("te") && nrow(te) > 0){te},
      if (exists("tg") && nrow(tg) > 0){tg},
      if (exists("tl") && nrow(tl) > 0){tl},
      if (exists("tln") && nrow(tln) > 0){tln},
      if (exists("tp") && nrow(tp) > 0){tp},
      if (exists("tu") && nrow(tu) > 0){tu},
      if (exists("tw") && nrow(tw) > 0){tw},
      if (exists("tn") && nrow(tn) > 0){tn}
    )

  } else {
    bn <- try(util_binomial_param_estimate(trunc(tidy_scale_zero_one_vec(x_term)))$parameter_tbl %>%
                dplyr::select(dist_type, size, prob) %>%
                purrr::set_names("dist_type", "param_1", "param_2"))

    if (!inherits(bn, "try-error")){
      tb <- tidy_binomial(.n = n, .size = round(bn[[2]], 2), .prob = round(bn[[3]], 2))
    }

    ge <- try(util_geometric_param_estimate(trunc(x_term))$parameter_tbl %>%
                dplyr::filter(method == "EnvStats_MME") %>%
                dplyr::select(dist_type, shape) %>%
                dplyr::mutate(param_2 = NA) %>%
                purrr::set_names("dist_type", "param_1", "param_2"))

    if (!inherits(ge, "try-error")){
      tg <- tidy_geometric(.n = n, .prob = round(ge[[2]], 2))
    }

    h <- try(util_hypergeometric_param_estimate(.x = trunc(x_term), .total = n, .k = n)$parameter_tbl %>%
               tidyr::drop_na() %>%
               dplyr::slice(1) %>%
               dplyr::select(dist_type, m, total) %>%
               purrr::set_names("dist_type", "param_1", "param_2"))

    if (!inherits(h, "try-error")){
      th <- tidy_hypergeometric(
        .n = n,
        .m = trunc(h[[2]]),
        .nn = n - trunc(h[[2]]),
        .k = trunc(h[[2]])
      )
    }

    po <- try(util_poisson_param_estimate(trunc(x_term))$parameter_tbl %>%
                dplyr::select(dist_type, lambda) %>%
                dplyr::mutate(param_2 = NA) %>%
                purrr::set_names("dist_type", "param_1", "param_2"))

    if (!inherits(po, "try-error")){
      tp <- tidy_poisson(.n = n, .lambda = round(po[[2]], 2))
    }

    comp_tbl <- tidy_combine_distributions(
      tidy_empirical(.x = x_term, .distribution_type = dist_type),
      if (exists("tb") && nrow(tb) > 0){tb},
      if (exists("tg") && nrow(tg) > 0){tg},
      if (exists("th") && nrow(th) > 0){th},
      if (exists("tp") && nrow(tp) > 0){tp}
    )

  }

  # Deviance calculations ----
  deviance_tbl <- comp_tbl %>%
    dplyr::select(dist_type, x, y) %>%
    dplyr::group_by(dist_type, x) %>%
    dplyr::mutate(y = stats::median(y)) %>%
    dplyr::ungroup() %>%
    dplyr::group_by(dist_type) %>%
    dplyr::mutate(y = tidy_scale_zero_one_vec(y)) %>%
    dplyr::ungroup() %>%
    tidyr::pivot_wider(
      id_cols = x,
      names_from = dist_type,
      values_from = y
    ) %>%
    dplyr::select(x, Empirical, dplyr::everything()) %>%
    dplyr::mutate(
      dplyr::across(
        .cols = -c(x, Empirical),
        .fns = ~ Empirical - .
      )
    ) %>%
    tidyr::drop_na() %>%
    tidyr::pivot_longer(
      cols = -x
    ) %>%
    dplyr::select(-x)

  total_deviance_tbl <- deviance_tbl %>%
    dplyr::filter(!name == "Empirical") %>%
    dplyr::group_by(name) %>%
    dplyr::summarise(total_deviance = sum(value)) %>%
    dplyr::ungroup() %>%
    dplyr::mutate(total_deviance = abs(total_deviance)) %>%
    dplyr::arrange(abs(total_deviance)) %>%
    dplyr::rename(dist_with_params = name,
                  abs_tot_deviance = total_deviance)

  # AIC Data ----
  emp_data_tbl <- comp_tbl %>%
    dplyr::select(dist_type, x, dy) %>%
    dplyr::filter(dist_type == "Empirical")

  aic_tbl <- comp_tbl %>%
    dplyr::filter(!dist_type == "Empirical") %>%
    dplyr::select(dist_type, dy) %>%
    tidyr::nest(data = dy) %>%
    dplyr::mutate(
      lm_model = purrr::map(
        data, 
        function(df) lm(dy ~ emp_data_tbl$dy, data = df)
      )
    ) %>%
    dplyr::mutate(aic_value = purrr::map(lm_model, stats::AIC)) %>%
    dplyr::mutate(aic_value = unlist(aic_value)) %>%
    dplyr::mutate(abs_aic = abs(aic_value)) %>%
    dplyr::arrange(abs_aic) %>%
    dplyr::select(dist_type, aic_value, abs_aic)

  multi_metric_tbl <- total_deviance_tbl %>%
    dplyr::mutate(dist_with_params = as.factor(dist_with_params)) %>%
    dplyr::inner_join(output$aic_tbl, by = c("dist_with_params"="dist_type"))

  # Return ----
  output <- list(
    comparison_tbl     = comp_tbl,
    deviance_tbl       = deviance_tbl,
    total_deviance_tbl = total_deviance_tbl,
    aic_tbl            = aic_tbl,
    multi_metric_tbl   = multi_metric_tbl
  )

  # Attributes ----
  attr(deviance_tbl, ".tibble_type") <- "deviance_comparison_tbl"
  attr(total_deviance_tbl, ".tibble_type") <- "deviance_results_tbl"
  attr(aic_tbl, ".tibble_type") <- "aic_tbl"
  attr(comp_tbl, ".tibble_type") <- "comparison_tbl"
  attr(output, ".x") <- x_term
  attr(output, ".n") <- n
  attr(output, ".print_aic") <- .print_aic

  # Print to console
  if (print_aic){
    print(aic_tbl)
  } else {
    print(total_deviance_tbl)
  }

  return(invisible(output))

}

Simple Example:

x <- mtcars$mpg

# Continuous
> tidy_distribution_comparison(x)
For the beta distribution, its mean 'mu' should be 0 < mu < 1. The data will
therefore be scaled to enforce this.
# A tibble: 10 × 3
   dist_type               aic_value abs_aic
   <fct>                       <dbl>   <dbl>
 1 Beta c(1.11, 1.58, 0)       -1.98    1.98
 2 Pareto c(10.4, 1.62)        82.7    82.7 
 3 Logistic c(20.09, 3.27)   -149.    149.  
 4 Gaussian c(20.09, 5.93)   -181.    181.  
 5 Cauchy c(19.2, 7.38)      -193.    193.  
 6 Weibull c(3.58, 22.29)    -207.    207.  
 7 Lognormal c(2.96, 0.29)   -210.    210.  
 8 Uniform c(8.34, 31.84)    -216.    216.  
 9 Exponential c(0.05)       -252.    252.  
10 Gamma c(11.47, 1.75)      -268.    268.

# Discrete
> tidy_distribution_comparison(trunc(x), "discrete")
# A tibble: 4 × 3
  dist_type                    aic_value abs_aic
  <fct>                            <dbl>   <dbl>
1 Binomial c(32, 0.03)             -33.3    33.3
2 Hypergeometric c(21, 11, 21)     -76.9    76.9
3 Poisson c(19.69)                -137.    137. 
4 Geometric c(0.05)               -240.    240. 

> output$multi_metric_tbl
# A tibble: 10 × 4
   dist_with_params        abs_tot_deviance aic_value abs_aic
   <fct>                              <dbl>     <dbl>   <dbl>
 1 Lognormal c(2.96, 0.29)           0.0804   -225.    225.  
 2 Beta c(1.11, 1.58, 0)             0.157       5.74    5.74
 3 Logistic c(20.09, 3.27)           0.455    -152.    152.  
 4 Gamma c(11.47, 1.75)              2.73     -213.    213.  
 5 Uniform c(8.34, 31.84)            3.25     -215.    215.  
 6 Cauchy c(19.2, 7.38)              4.84     -220.    220.  
 7 Gaussian c(20.09, 5.93)           5.08     -161.    161.  
 8 Pareto c(10.4, 1.62)              5.24       95.8    95.8 
 9 Weibull c(3.58, 22.29)            5.56     -142.    142.  
10 Exponential c(0.05)               5.69     -181.    181. 
datadrivensupplychain commented 2 years ago

I'm not sure if I'm using your function wrong, but I'm getting a different AIC from tidy_distribution_comparison than from fitdistrplus:

library(tidyverse)
library(TidyDensity)
rm(list=ls())

set.seed(42)
df1 <- data.frame( x= (rnorm(n=10000, mean=20, sd=3)))

output <- TidyDensity::tidy_distribution_comparison(.x= df1$x, .distribution_type='continuous')

gaussian_row <- which(stringr::str_detect(output$aic_tbl$dist_type,"Gaussian"))
tidydensity_gaussian_aic <- output$aic_tbl$aic_value[gaussian_row]
tidydensity_gaussian_aic
#[1] -84218.28

#from fitdistrplus
fit <- fitdistrplus::fitdist(data=df1$x, distr='norm')
fit$aic
#[1] 50476.34

Regards Ralph Asher