Closed spsanderson closed 2 years ago
Function:
#' Compare Empirical Data to Distributions
#'
#' @family Empirical
#'
#' @author Steven P. Sanderson II, MPH
#'
#' @details The purpose of this function is to take some data set provided and
#' to try to find a distribution that may fit the best. A parameter of
#' `.distribution_type` must be set to either `continuous` or `discrete` in order
#' for this the function to try the appropriate types of distributions.
#'
#' The following distributions are used:
#'
#' Continuous:
#' - tidy_beta
#' - tidy_cauchy
#' - tidy_exponential
#' - tidy_gamma
#' - tidy_logistic
#' - tidy_lognormal
#' - tidy_pareto
#' - tidy_uniform
#' - tidy_weibull
#'
#' Discrete:
#' - tidy_binomial
#' - tidy_geometric
#' - tidy_hypergeometric
#' - tidy_poisson
#'
#'
#' @description Compare some empirical data set against different distributions
#' to help find the distribution that could be the best fit.
#'
#' @param .x The data set being passed to the function
#' @param .distribution_type What kind of data is it, can be one of `continuous`
#' or `discrete`
#'
#' @examples
#' xc <- mtcars$mpg
#' tidy_distribution_comparison(xc, "continuous")
#'
#' xd <- trunc(xc)
#' tidy_distribution_comparison(xd, "discrete")
#'
#' @return
#' An invisible list object. A tibble is printed.
#'
#' @export
#'
tidy_distribution_comparison <- function(.x, .distribution_type = "continuous"){
# Tidyeval ----
x_term <- as.numeric(.x)
n <- length(x_term)
dist_type <- tolower(as.character(.distribution_type))
if (!dist_type %in% c("continuous","discrete")){
rlang::abort(
message = "The '.distribution_type' parameter must be either 'continuous'
or 'discrete'.",
use_cli_format = TRUE
)
}
# Get parameter estimates for distributions
if (dist_type == "continuous"){
b <- try(util_beta_param_estimate(x_term)$parameter_tbl %>%
dplyr::filter(method == "NIST_MME") %>%
dplyr::select(dist_type, shape1, shape2) %>%
purrr::set_names("dist_type", "param_1", "param_2"))
if (!inherits(b, "try-error")){
tb <- tidy_beta(.n = n, .shape1 = round(b[[2]], 2), .shape2 = round(b[[3]], 2))
}
c <- try(util_cauchy_param_estimate(x_term)$parameter_tbl %>%
dplyr::select(dist_type, location, scale) %>%
purrr::set_names("dist_type", "param_1", "param_2"))
if (!inherits(c, "try-error")){
tc <- tidy_cauchy(.n = n, .location = round(c[[2]], 2), .scale = round(c[[3]], 2))
}
e <- try(util_exponential_param_estimate(x_term)$parameter_tbl %>%
dplyr::select(dist_type, rate) %>%
dplyr::mutate(param_2 = NA) %>%
purrr::set_names("dist_type", "param_1", "param_2"))
if (!inherits(e, "try-error")){
te <- tidy_exponential(.n = n, .rate = round(e[[2]], 2))
}
g <- try(util_gamma_param_estimate(x_term)$parameter_tbl %>%
dplyr::filter(method == "NIST_MME") %>%
dplyr::select(dist_type, shape, scale) %>%
purrr::set_names("dist_type", "param_1", "param_2"))
if (!inherits(g, "try-error")){
tg <- tidy_gamma(.n = n, .shape = round(g[[2]], 2), .scale = round(g[[3]], 2))
}
l <- try(util_logistic_param_estimate(x_term)$parameter_tbl %>%
dplyr::filter(method == "EnvStats_MME") %>%
dplyr::select(dist_type, location, scale) %>%
purrr::set_names("dist_type", "param_1", "param_2"))
if (!inherits(l, "try-error")){
tl <- tidy_logistic(.n = n, .location = round(l[[2]], 2), .scale = round(l[[3]], 2))
}
ln <- try(util_lognormal_param_estimate(x_term)$parameter_tbl %>%
dplyr::filter(method == "EnvStats_MME") %>%
dplyr::select(dist_type, mean_log, sd_log) %>%
purrr::set_names("dist_type", "param_1", "param_2"))
if (!inherits(ln, "try-error")){
tln <- tidy_lognormal(.n = n, .meanlog = round(ln[[2]], 2), .sdlog = round(ln[[3]], 2))
}
p <- try(util_pareto_param_estimate(x_term)$parameter_tbl %>%
dplyr::filter(method == "MLE") %>%
dplyr::select(dist_type, shape, scale) %>%
purrr::set_names("dist_type", "param_1", "param_2"))
if (!inherits(p, "try-error")){
tp <- tidy_pareto(.n = n, .shape = round(p[[2]], 2), .scale = round(p[[3]], 2))
}
u <- try(util_uniform_param_estimate(x_term)$parameter_tbl %>%
dplyr::filter(method == "NIST_MME") %>%
dplyr::select(dist_type, min_est, max_est) %>%
purrr::set_names("dist_type", "param_1", "param_2"))
if (!inherits(u, "try-error")){
tu <- tidy_uniform(.n = n, .min = round(u[[2]], 2), .max = round(u[[3]], 2))
}
w <- try(util_weibull_param_estimate(x_term)$parameter_tbl %>%
dplyr::select(dist_type, shape, scale) %>%
purrr::set_names("dist_type", "param_1", "param_2"))
if (!inherits(w, "try-error")){
tw <- tidy_weibull(.n = n, .shape = round(w[[2]], 2), .scale = round(w[[3]], 2))
}
comp_tbl <- tidy_combine_distributions(
tidy_empirical(x_term, .distribution_type = dist_type),
if (exists("tb") && nrow(tb) > 0){tb},
if (exists("tc") && nrow(tb) > 0){tc},
if (exists("te") && nrow(tb) > 0){te},
if (exists("tg") && nrow(tb) > 0){tg},
if (exists("tl") && nrow(tb) > 0){tl},
if (exists("tln") && nrow(tb) > 0){tln},
if (exists("tp") && nrow(tb) > 0){tp},
if (exists("tu") && nrow(tb) > 0){tu},
if (exists("tw") && nrow(tb) > 0){tw}
)
# comp_tbl <- tidy_combine_distributions(
# tidy_empirical(x_term, .distribution_type = dist_type),
# tidy_beta(.n = n, .shape1 = round(b[[2]], 2), .shape2 = round(b[[3]], 2)),
# tidy_cauchy(.n = n, .location = round(c[[2]], 2), .scale = round(c[[3]], 2)),
# tidy_exponential(.n = n, .rate = round(e[[2]], 2)),
# tidy_gamma(.n = n, .shape = round(g[[2]], 2), .scale = round(g[[3]], 2)),
# tidy_logistic(.n = n, .location = round(l[[2]], 2), .scale = round(l[[3]], 2)),
# tidy_lognormal(.n = n, .meanlog = round(ln[[2]], 2), .sdlog = round(ln[[3]], 2)),
# tidy_pareto(.n = n, .shape = round(p[[2]], 2), .scale = round(p[[3]], 2)),
# tidy_uniform(.n = n, .min = round(u[[2]], 2), .max = round(u[[3]], 2)),
# tidy_weibull(.n = n, .shape = round(w[[2]], 2), .scale = round(w[[3]], 2))
# )
} else {
bn <- try(util_binomial_param_estimate(trunc(tidy_scale_zero_one_vec(x_term)))$parameter_tbl %>%
dplyr::select(dist_type, size, prob) %>%
purrr::set_names("dist_type", "param_1", "param_2"))
if (!inherits(bn, "try-error")){
tb <- tidy_binomial(.n = n, .size = round(bn[[2]], 2), .prob = round(bn[[3]], 2))
}
ge <- try(util_geometric_param_estimate(trunc(x_term))$parameter_tbl %>%
dplyr::filter(method == "EnvStats_MME") %>%
dplyr::select(dist_type, shape) %>%
dplyr::mutate(param_2 = NA) %>%
purrr::set_names("dist_type", "param_1", "param_2"))
if (!inherits(ge, "try-error")){
tg <- tidy_geometric(.n = n, .prob = round(ge[[2]], 2))
}
h <- try(util_hypergeometric_param_estimate(.x = trunc(x_term), .total = n, .k = n)$parameter_tbl %>%
tidyr::drop_na() %>%
dplyr::slice(1) %>%
dplyr::select(dist_type, m, total) %>%
purrr::set_names("dist_type", "param_1", "param_2"))
if (!inherits(h, "try-error")){
th <- tidy_hypergeometric(
.n = n,
.m = trunc(h[[2]]),
.nn = n - trunc(h[[2]]),
.k = trunc(h[[2]])
)
}
po <- try(util_poisson_param_estimate(trunc(x_term))$parameter_tbl %>%
dplyr::select(dist_type, lambda) %>%
dplyr::mutate(param_2 = NA) %>%
purrr::set_names("dist_type", "param_1", "param_2"))
if (!inherits(po, "try-error")){
tp <- tidy_poisson(.n = n, .lambda = round(po[[2]], 2))
}
comp_tbl <- tidy_combine_distributions(
tidy_empirical(.x = x_term, .distribution_type = dist_type),
if (exists("tb") && nrow(tb) > 0){tb},
if (exists("tg") && nrow(tb) > 0){tg},
if (exists("th") && nrow(tb) > 0){th},
if (exists("tp") && nrow(tb) > 0){tp}
)
# comp_tbl <- tidy_combine_distributions(
# tidy_empirical(.x = x_term, .distribution_type = dist_type),
# tidy_binomial(.n = n, .size = round(bn[[2]], 2), .prob = round(bn[[3]], 2)),
# tidy_geometric(.n = n, .prob = round(ge[[2]], 2)),
# tidy_hypergeometric(
# .n = n,
# .m = trunc(h[[2]]),
# .nn = n - trunc(h[[2]]),
# .k = trunc(h[[2]])
# ),
# tidy_poisson(.n = n, .lambda = round(po[[2]], 2))
# )
}
# Deviance calculations ----
deviance_tbl <- comp_tbl %>%
dplyr::select(dist_type, x, y) %>%
dplyr::group_by(dist_type, x) %>%
dplyr::mutate(y = stats::median(y)) %>%
dplyr::ungroup() %>%
dplyr::group_by(dist_type) %>%
dplyr::mutate(y = tidy_scale_zero_one_vec(y)) %>%
dplyr::ungroup() %>%
tidyr::pivot_wider(
id_cols = x,
names_from = dist_type,
values_from = y
) %>%
dplyr::select(x, Empirical, dplyr::everything()) %>%
dplyr::mutate(
dplyr::across(
.cols = -c(x, Empirical),
.fns = ~ Empirical - .
)
) %>%
tidyr::drop_na() %>%
tidyr::pivot_longer(
cols = -x
)
total_deviance_tbl <- deviance_tbl %>%
dplyr::filter(!name == "Empirical") %>%
dplyr::group_by(name) %>%
dplyr::summarise(total_deviance = sum(value)) %>%
dplyr::ungroup() %>%
dplyr::mutate(total_deviance = abs(total_deviance)) %>%
dplyr::arrange(abs(total_deviance)) %>%
dplyr::rename(dist_with_params = name,
abs_tot_deviance = total_deviance)
# Return ----
attr(deviance_tbl, ".tibble_type") <- "deviance_comparison_tbl"
attr(total_deviance_tbl, ".tibble_type") <- "deviance_results_tbl"
output <- list(
comparison_tbl = comp_tbl,
deviance_tbl = deviance_tbl,
total_deviance_tbl = total_deviance_tbl
)
print(total_deviance_tbl)
return(invisible(output))
}
Example:
> tidy_distribution_comparison(trunc(mtcars$mpg), "discrete")
# A tibble: 4 × 2
dist_with_params abs_tot_deviance
<chr> <dbl>
1 Poisson c(19.69) 1.52
2 Hypergeometric c(21, 11, 21) 3.19
3 Geometric c(0.05) 4.37
4 Binomial c(32, 0.03) 6.48
> tidy_distribution_comparison(mtcars$mpg, "continuous")
For the beta distribution, its mean 'mu' should be 0 < mu < 1. The data will
therefore be scaled to enforce this.
# A tibble: 9 × 2
dist_with_params abs_tot_deviance
<chr> <dbl>
1 Weibull c(3.58, 22.29) 0.678
2 Exponential c(0.05) 1.06
3 Uniform c(31.84, 8.34) 1.60
4 Beta c(1.11, 1.58, 0) 2.18
5 Lognormal c(2.96, 0.29) 2.66
6 Gamma c(11.47, 1.75) 3.23
7 Logistic c(20.09, 3.27) 3.54
8 Pareto c(10.4, 1.62) 6.06
9 Cauchy c(19.2, 7.38) 10.4
> tidy_distribution_comparison(hai_scale_zscore_vec(mtcars$mpg), "continuous")
For the beta distribution, its mean 'mu' should be 0 < mu < 1. The data will
therefore be scaled to enforce this.
Error in util_gamma_param_estimate(x_term) :
The numeric vector '.x' must contain at least two unique values greater
than 0
Error in util_lognormal_param_estimate(x_term) :
'.x' must contain at least two non-missing distict values. All non-missing
values must be positive.
Error in util_pareto_param_estimate(x_term) :
'.x' must contain at least two non-missing distinct values. All values of
'.x' must be positive.
Error in survival::survreg(x_surv ~ 1, dist = "weibull") :
Invalid survival times for this distribution
In addition: Warning messages:
1: In log(dlist$dtrans(Y[exactsurv, 1])) : NaNs produced
2: In log(y) : NaNs produced
# A tibble: 5 × 2
dist_with_params abs_tot_deviance
<chr> <dbl>
1 Beta c(1.11, 1.58, 0) 0.752
2 Cauchy c(-0.15, 1.22) 1.53
3 Uniform c(1.95, -1.95) 3.67
4 Exponential c(14060018348863988) 3.86
5 Logistic c(0, 0.54) 4.22
Make sure the function does not fail out if data provided does not fit the parameters of what is needed to estimate a particular distributions parameters.