spsanderson / TidyDensity

Create tidy probability/density tibbles and plots of randomly generated and empirical data.
https://www.spsanderson.com/TidyDensity
Other
34 stars 1 forks source link

I'm not sure if I'm using your function wrong, but I'm getting a different AIC from `tidy_distribution_comparison` than from `fitdistrplus`: #240

Closed spsanderson closed 4 months ago

spsanderson commented 2 years ago

I'm not sure if I'm using your function wrong, but I'm getting a different AIC from tidy_distribution_comparison than from fitdistrplus:

library(tidyverse)
library(TidyDensity)
rm(list=ls())

set.seed(42)
df1 <- data.frame( x= (rnorm(n=10000, mean=20, sd=3)))

output <- TidyDensity::tidy_distribution_comparison(.x= df1$x, .distribution_type='continuous')

gaussian_row <- which(stringr::str_detect(output$aic_tbl$dist_type,"Gaussian"))
tidydensity_gaussian_aic <- output$aic_tbl$aic_value[gaussian_row]
tidydensity_gaussian_aic
#[1] -84218.28

#from fitdistrplus
fit <- fitdistrplus::fitdist(data=df1$x, distr='norm')
fit$aic
#[1] 50476.34

Regards Ralph Asher

Originally posted by @datadrivensupplychain in https://github.com/spsanderson/TidyDensity/issues/227#issuecomment-1213538985

spsanderson commented 2 years ago

Some work done:

> set.seed(42)
> x <- mtcars$mpg
> 
> output <- tidy_distribution_comparison(.x = x)
For the beta distribution, its mean 'mu' should be 0 < mu < 1. The data will
therefore be scaled to enforce this.
> gaussian_row <- which(stringr::str_detect(output$aic_tbl$dist_type,"Gaussian"))
> tidydensity_gaussian_aic <- output$aic_tbl$aic_value[gaussian_row]
> tidydensity_gaussian_aic
[1] -188.0386
> 
> fit <- fitdistrplus::fitdist(data = x, distr = 'norm')
> fit$aic
[1] 208.7555
> 
> pe <- util_normal_param_estimate(x)$parameter_tbl %>% head(1)
> pe
# A tibble: 1 × 8
  dist_type samp_size   min   max method              mu stan_dev shape_ratio
  <chr>         <int> <dbl> <dbl> <chr>            <dbl>    <dbl>       <dbl>
1 Gaussian         32  10.4  33.9 EnvStats_MME_MLE  20.1     5.93        3.39
> 
> neg_log_lik_gaussian <- function(mean, sd){
+   -sum(dnorm(x, mean = mean, sd = sd, log = TRUE))
+ }
> 
> mle_out <- mle(
+   neg_log_lik_gaussian, 
+   start = list(mean = pe$mu, sd = pe$stan_dev), 
+   method = "L-BFGS-B"
+ )
> 
> mu <- mle_out@coef[[1]]
> sd <- mle_out@coef[[2]]
> 
> AIC(mle_out)
[1] 208.7555
spsanderson commented 2 years ago

This may possibly be better suited as separate calculations inside each of the util_distribution_parameter_estimate functions

spsanderson commented 2 years ago

For normal, all others should follow the same convention

Function:

util_normal_mle_aic <- function(.x){

  # Tidyeval
  x <- as.numeric(.x)

  # Get parameters
  pe <- TidyDensity::util_normal_param_estimate(x)$parameter_tbl %>%
    head(1)

  # Neagative Log Likelihood Fx
  neg_log_lik_gaussian <- function(mean, sd){
    -sum(dnorm(x, mean = mean, sd = sd, log = TRUE))
  }

  # MLE
  mle_out <- stats4::mle(
    neg_log_lik_gaussian, 
    start = list(mean = pe$mu, sd = pe$stan_dev), 
    method = "L-BFGS-B"
  )

  ret <- stats::AIC(mle_out)

  # Return
  return(ret)

}

util_normal_mle_aic(x)

Example:

x <- mtcars$mpg

> util_normal_mle_aic(x)
[1] 208.7555
spsanderson commented 1 year ago

moving to next release.

bvancil commented 8 months ago

I saw this on https://mstdn.social/@stevensanderson/111739186445888629 and wanted to check, what do you need help with? Is this a case of authors not agreeing on constants defining the AIC (which often happens) or something else?

spsanderson commented 8 months ago

Hi @bvancil thanks for reaching out, so I think there is a fundamental issue with the way I get the AIC that is in of itself probably inherently wrong.

What is most likely needed is some sort of method to call that will generate the AIC based off of some negative log likelihood function.

Here is a link to the function: https://github.com/spsanderson/TidyDensity/blob/master/R/utils-distribution-comparison.R

You will see I am simply doing the following:

aic_tbl <- comp_tbl |>
        dplyr::filter(!dist_type == "Empirical") |>
        dplyr::select(dist_type, dy) |>
        tidyr::nest(data = dy) |>
        dplyr::mutate(
            lm_model = purrr::map(
                data,
                function(df) stats::lm(dy ~ emp_data_tbl$dy, data = df)
            )
        ) |>
        dplyr::mutate(aic_value = purrr::map(lm_model, stats::AIC)) |>
        dplyr::mutate(aic_value = unlist(aic_value)) |>
        dplyr::mutate(abs_aic = abs(aic_value)) |>
        dplyr::arrange(abs_aic) |>
        dplyr::select(dist_type, aic_value, abs_aic)
spsanderson commented 5 months ago
# Sample data (replace with your actual data)
population_data <- rnorm(100)  # Assuming rnorm is the population
sample_data <- rchisq(100, df = 2)  # Example with chi-square, df=2

# Negative log-likelihood function for normal distribution
neg_log_lik_norm <- function(par, data) {
  mu <- par[1]
  sigma <- par[2]
  n <- length(data)
  -sum(dnorm(data, mean = mu, sd = sigma, log = TRUE))
}

# Negative log-likelihood function for chi-square distribution
neg_log_lik_chisq <- function(par, data) {
  df <- par[1]
  n <- length(data)
  -sum(dchisq(data, df = df, log = TRUE))
}

# Fit normal distribution to population data (rnorm)
fit_norm <- optim(
  c(
    mean(population_data), 
    sd(population_data)
    ), 
  neg_log_lik_norm, 
  data = population_data
  )

# Fit chi-square distribution to sample data (rchisq)
fit_chisq <- optim(
  c(2), 
  neg_log_lik_chisq, 
  data = sample_data,
  method = "Brent",
  lower = 0,
  upper = mean(sample_data)
  )  # Initial df guess

# Extract log-likelihoods and number of parameters
logLik_norm <- -fit_norm$value
logLik_chisq <- -fit_chisq$value
k_norm <- 2  # Two parameters for normal (mean and sd)
k_chisq <- 1  # One parameter for chi-square (df)

# Calculate AIC
AIC_norm <- 2*k_norm - 2*logLik_norm
AIC_chisq <- 2*k_chisq - 2*logLik_chisq

# Print results
> cat("Normal Distribution AIC:", AIC_norm, "\n")
Normal Distribution AIC: 287.8883 
> cat("Chi-square Distribution AIC:", AIC_chisq)
Chi-square Distribution AIC: 324.051
spsanderson commented 4 months ago

New Function:

#' Compare Empirical Data to Distributions
#'
#' @family Empirical
#'
#' @author Steven P. Sanderson II, MPH
#'
#' @details The purpose of this function is to take some data set provided and
#' to try to find a distribution that may fit the best. A parameter of
#' `.distribution_type` must be set to either `continuous` or `discrete` in order
#' for this the function to try the appropriate types of distributions.
#'
#' The following distributions are used:
#'
#' Continuous:
#' -  tidy_beta
#' -  tidy_cauchy
#' -  tidy_chisquare
#' -  tidy_exponential
#' -  tidy_gamma
#' -  tidy_logistic
#' -  tidy_lognormal
#' -  tidy_normal
#' -  tidy_pareto
#' -  tidy_uniform
#' -  tidy_weibull
#'
#' Discrete:
#' -  tidy_binomial
#' -  tidy_geometric
#' -  tidy_hypergeometric
#' -  tidy_poisson
#'
#' The function itself returns a list output of tibbles. Here are the tibbles that
#' are returned:
#' -  comparison_tbl
#' -  deviance_tbl
#' -  total_deviance_tbl
#' -  aic_tbl
#' -  kolmogorov_smirnov_tbl
#' -  multi_metric_tbl
#'
#' The `comparison_tbl` is a long `tibble` that lists the values of the `density`
#' function against the given data.
#'
#' The `deviance_tbl` and the `total_deviance_tbl` just give the simple difference
#' from the actual density to the estimated density for the given estimated distribution.
#'
#' The `aic_tbl` will provide the `AIC` for liklehood of the distribution.
#'
#' The `kolmogorov_smirnov_tbl` for now provides a `two.sided` estimate of the
#' `ks.test` of the estimated density against the empirical.
#'
#' The `multi_metric_tbl` will summarise all of these metrics into a single tibble.
#'
#'
#' @description Compare some empirical data set against different distributions
#' to help find the distribution that could be the best fit.
#'
#' @param .x The data set being passed to the function
#' @param .distribution_type What kind of data is it, can be one of `continuous`
#' or `discrete`
#' @param .round_to_place How many decimal places should the parameter estimates
#' be rounded off to for distibution construction. The default is 3
#'
#' @examples
#' xc <- mtcars$mpg
#' output_c <- tidy_distribution_comparison(xc, "continuous")
#'
#' xd <- trunc(xc)
#' output_d <- tidy_distribution_comparison(xd, "discrete")
#'
#' output_c
#' output_d
#'
#' @return
#' An invisible list object. A tibble is printed.
#'
#' @export
#'

tidy_distribution_comparison <- function(.x, .distribution_type = "continuous",
                                         .round_to_place = 3) {

  # Tidyeval ----
  x_term <- as.numeric(.x)
  n <- length(x_term)
  dist_type <- tolower(as.character(.distribution_type))
  rt <- as.numeric(.round_to_place)

  if (!dist_type %in% c("continuous", "discrete")) {
    rlang::abort(
      message = "The '.distribution_type' parameter must be either 'continuous'
      or 'discrete'.",
      use_cli_format = TRUE
    )
  }

  # Get parameter estimates for distributions
  if (dist_type == "continuous") {
    b <- try(util_beta_param_estimate(x_term)$parameter_tbl |>
               dplyr::filter(method == "NIST_MME") |>
               dplyr::select(dist_type, shape1, shape2) |>
               purrr::set_names("dist_type", "param_1", "param_2"))

    if (!inherits(b, "try-error")) {
      tb <- tidy_beta(.n = n, .shape1 = round(b[[2]], rt), .shape2 = round(b[[3]], rt))
    }

    c <- try(util_cauchy_param_estimate(x_term)$parameter_tbl |>
               dplyr::select(dist_type, location, scale) |>
               purrr::set_names("dist_type", "param_1", "param_2"))

    if (!inherits(c, "try-error")) {
      tc <- tidy_cauchy(.n = n, .location = round(c[[2]], rt), .scale = round(c[[3]], rt))
    }

    chi <- try(util_chisquare_param_estimate(x_term)$parameter_tbl |>
                 dplyr::select(dist_type, dof, ncp) |>
                 purrr::set_names("dist_type", "param_1", "param_2"))

    if (!inherits(chi, "try-error")) {
      tchi <- tidy_chisquare(.n = n, .df = round(chi[[2]], rt), .ncp = round(chi[[3]], rt))
    }

    e <- try(util_exponential_param_estimate(x_term)$parameter_tbl |>
               dplyr::select(dist_type, rate) |>
               dplyr::mutate(param_2 = NA) |>
               purrr::set_names("dist_type", "param_1", "param_2"))

    if (!inherits(e, "try-error")) {
      te <- tidy_exponential(.n = n, .rate = round(e[[2]], rt))
    }

    g <- try(util_gamma_param_estimate(x_term)$parameter_tbl |>
               dplyr::filter(method == "NIST_MME") |>
               dplyr::select(dist_type, shape, scale) |>
               purrr::set_names("dist_type", "param_1", "param_2"))

    if (!inherits(g, "try-error")) {
      tg <- tidy_gamma(.n = n, .shape = round(g[[2]], rt), .scale = round(g[[3]], rt))
    }

    l <- try(util_logistic_param_estimate(x_term)$parameter_tbl |>
               dplyr::filter(method == "EnvStats_MME") |>
               dplyr::select(dist_type, location, scale) |>
               purrr::set_names("dist_type", "param_1", "param_2"))

    if (!inherits(l, "try-error")) {
      tl <- tidy_logistic(.n = n, .location = round(l[[2]], rt), .scale = round(l[[3]], rt))
    }

    ln <- try(util_lognormal_param_estimate(x_term)$parameter_tbl |>
                dplyr::filter(method == "EnvStats_MME") |>
                dplyr::select(dist_type, mean_log, sd_log) |>
                purrr::set_names("dist_type", "param_1", "param_2"))

    if (!inherits(ln, "try-error")) {
      tln <- tidy_lognormal(.n = n, .meanlog = round(ln[[2]], rt), .sdlog = round(ln[[3]], rt))
    }

    p <- try(util_pareto_param_estimate(x_term)$parameter_tbl |>
               dplyr::filter(method == "MLE") |>
               dplyr::select(dist_type, shape, scale) |>
               purrr::set_names("dist_type", "param_1", "param_2"))

    if (!inherits(p, "try-error")) {
      tp <- tidy_pareto(.n = n, .shape = round(p[[2]], rt), .scale = round(p[[3]], rt))
    }

    u <- try(util_uniform_param_estimate(x_term)$parameter_tbl |>
               dplyr::filter(method == "NIST_MME") |>
               dplyr::select(dist_type, min_est, max_est) |>
               purrr::set_names("dist_type", "param_1", "param_2"))

    if (!inherits(u, "try-error")) {
      tu <- tidy_uniform(.n = n, .min = round(u[[2]], rt), .max = round(u[[3]], rt))
    }

    w <- try(util_weibull_param_estimate(x_term)$parameter_tbl |>
               dplyr::select(dist_type, shape, scale) |>
               purrr::set_names("dist_type", "param_1", "param_2"))

    if (!inherits(w, "try-error")) {
      tw <- tidy_weibull(.n = n, .shape = round(w[[2]], rt), .scale = round(w[[3]], rt))
    }

    nn <- try(util_normal_param_estimate(x_term)$parameter_tbl |>
                dplyr::filter(method == "EnvStats_MME_MLE") |>
                dplyr::select(dist_type, mu, stan_dev) |>
                purrr::set_names("dist_type", "param_1", "param_2"))

    if (!inherits(n, "try-error")) {
      tn <- tidy_normal(.n = n, .mean = round(nn[[2]], rt), .sd = round(nn[[3]], rt))
    }

    comp_tbl <- tidy_combine_distributions(
      tidy_empirical(x_term, .distribution_type = dist_type),
      if (exists("tb") && nrow(tb) > 0) {
        tb
      },
      if (exists("tc") && nrow(tc) > 0) {
        tc
      },
      if (exists("tchi") && nrow(tchi) > 0) {
        tchi
      },
      if (exists("te") && nrow(te) > 0) {
        te
      },
      if (exists("tg") && nrow(tg) > 0) {
        tg
      },
      if (exists("tl") && nrow(tl) > 0) {
        tl
      },
      if (exists("tln") && nrow(tln) > 0) {
        tln
      },
      if (exists("tp") && nrow(tp) > 0) {
        tp
      },
      if (exists("tu") && nrow(tu) > 0) {
        tu
      },
      if (exists("tw") && nrow(tw) > 0) {
        tw
      },
      if (exists("tn") && nrow(tn) > 0) {
        tn
      }
    )
  } else {
    bn <- try(util_binomial_param_estimate(trunc(tidy_scale_zero_one_vec(x_term)))$parameter_tbl |>
                dplyr::select(dist_type, size, prob) |>
                purrr::set_names("dist_type", "param_1", "param_2"))

    if (!inherits(bn, "try-error")) {
      tb <- tidy_binomial(.n = n, .size = round(bn[[2]], rt), .prob = round(bn[[3]], rt))
    }

    ge <- try(util_geometric_param_estimate(trunc(x_term))$parameter_tbl |>
                dplyr::filter(method == "EnvStats_MME") |>
                dplyr::select(dist_type, shape) |>
                dplyr::mutate(param_2 = NA) |>
                purrr::set_names("dist_type", "param_1", "param_2"))

    if (!inherits(ge, "try-error")) {
      tg <- tidy_geometric(.n = n, .prob = round(ge[[2]], rt))
    }

    h <- try(util_hypergeometric_param_estimate(.x = trunc(x_term), .total = n, .k = n)$parameter_tbl |>
               tidyr::drop_na() |>
               dplyr::slice(1) |>
               dplyr::select(dist_type, m, total) |>
               purrr::set_names("dist_type", "param_1", "param_2"))

    if (!inherits(h, "try-error")) {
      th <- tidy_hypergeometric(
        .n = n,
        .m = trunc(h[[2]]),
        .nn = n - trunc(h[[2]]),
        .k = trunc(h[[2]])
      )
    }

    po <- try(util_poisson_param_estimate(trunc(x_term))$parameter_tbl |>
                dplyr::select(dist_type, lambda) |>
                dplyr::mutate(param_2 = NA) |>
                purrr::set_names("dist_type", "param_1", "param_2"))

    if (!inherits(po, "try-error")) {
      tp <- tidy_poisson(.n = n, .lambda = round(po[[2]], rt))
    }

    comp_tbl <- tidy_combine_distributions(
      tidy_empirical(.x = x_term, .distribution_type = dist_type),
      if (exists("tb") && nrow(tb) > 0) {
        tb
      },
      if (exists("tg") && nrow(tg) > 0) {
        tg
      },
      if (exists("th") && nrow(th) > 0) {
        th
      },
      if (exists("tp") && nrow(tp) > 0) {
        tp
      }
    )
  }

  # Deviance calculations ----
  deviance_tbl <- comp_tbl |>
    dplyr::select(dist_type, x, y) |>
    dplyr::group_by(dist_type, x) |>
    dplyr::mutate(y = stats::median(y)) |>
    dplyr::ungroup() |>
    dplyr::group_by(dist_type) |>
    dplyr::mutate(y = tidy_scale_zero_one_vec(y)) |>
    dplyr::ungroup() |>
    tidyr::pivot_wider(
      id_cols = x,
      names_from = dist_type,
      values_from = y
    ) |>
    dplyr::select(x, Empirical, dplyr::everything()) |>
    dplyr::mutate(
      dplyr::across(
        .cols = -c(x, Empirical),
        .fns = ~ Empirical - .
      )
    ) |>
    tidyr::drop_na() |>
    tidyr::pivot_longer(
      cols = -x
    ) |>
    dplyr::select(-x)

  total_deviance_tbl <- deviance_tbl |>
    dplyr::filter(!name == "Empirical") |>
    dplyr::group_by(name) |>
    dplyr::summarise(total_deviance = sum(value)) |>
    dplyr::ungroup() |>
    dplyr::mutate(total_deviance = abs(total_deviance)) |>
    dplyr::arrange(abs(total_deviance)) |>
    dplyr::rename(
      dist_with_params = name,
      abs_tot_deviance = total_deviance
    )

  # AIC Data ----
  emp_data_tbl <- comp_tbl |>
    dplyr::select(dist_type, x, dy) |>
    dplyr::filter(dist_type == "Empirical")

  aic_tbl <- comp_tbl |>
    dplyr::filter(!dist_type == "Empirical") |>
    dplyr::select(dist_type, dy) |>
    tidyr::nest(data = dy) |>
    dplyr::mutate(
      lm_model = purrr::map(
        data,
        function(df) stats::lm(dy ~ emp_data_tbl$dy, data = df)
      )
    ) |>
    dplyr::mutate(aic_value = purrr::map(lm_model, stats::AIC)) |>
    dplyr::mutate(aic_value = unlist(aic_value)) |>
    dplyr::mutate(abs_aic = abs(aic_value)) |>
    dplyr::arrange(abs_aic) |>
    dplyr::select(dist_type, aic_value, abs_aic)

  ks_tbl <- comp_tbl |>
    dplyr::filter(dist_type != "Empirical") |>
    dplyr::select(dist_type, dy) |>
    tidyr::nest(data = dy) |>
    dplyr::mutate(
      ks = purrr::map(
        .x = data,
        .f = ~ ks.test(
          x = .x$dy,
          y = emp_data_tbl$dy,
          alternative = "two.sided",
          simulate.p.value = TRUE
        )
      ),
      tidy_ks = purrr::map(ks, broom::tidy)
    ) |>
    tidyr::unnest(cols = tidy_ks) |>
    dplyr::select(-c(data, ks)) |>
    dplyr::mutate(dist_char = as.character(dist_type)) |>
    purrr::set_names(
      "dist_type", "ks_statistic", "ks_pvalue", "ks_method", "alternative",
      "dist_char"
    )

  multi_metric_tbl <- total_deviance_tbl |>
    dplyr::mutate(dist_with_params = as.factor(dist_with_params)) |>
    dplyr::inner_join(aic_tbl, by = c("dist_with_params" = "dist_type")) |>
    dplyr::inner_join(ks_tbl, by = c("dist_with_params" = "dist_char")) |>
    dplyr::select(dist_type, dplyr::everything(), -dist_with_params) |>
    dplyr::mutate(dist_type = as.factor(dist_type))

  # Return ----
  output <- list(
    comparison_tbl         = comp_tbl,
    deviance_tbl           = deviance_tbl,
    total_deviance_tbl     = total_deviance_tbl,
    aic_tbl                = aic_tbl,
    kolmogorov_smirnov_tbl = ks_tbl,
    multi_metric_tbl       = multi_metric_tbl
  )

  # Attributes ----
  attr(deviance_tbl, ".tibble_type") <- "deviance_comparison_tbl"
  attr(total_deviance_tbl, ".tibble_type") <- "deviance_results_tbl"
  attr(aic_tbl, ".tibble_type") <- "aic_tbl"
  attr(comp_tbl, ".tibble_type") <- "comparison_tbl"
  attr(ks_tbl, ".tibble_type") <- "kolmogorov_smirnov_tbl"
  attr(multi_metric_tbl, ".tibble_type") <- "full_metric_tbl"
  attr(output, ".x") <- x_term
  attr(output, ".n") <- n

  # Return ----
  return(invisible(output))
}