Closed spsanderson closed 2 years ago
add class of "hai_dist_tbl" to output tibble for other functions
hai_distribution_comparison_tbl <- function(.x, .distributions = c("gamma","beta")){
# Tidyeval ----
x_term <- .x
dl <- as.vector(tolower(.distributions))
# Parameters ----
hskew <- healthyR.ai::hai_skewness_vec(x_term)
hkurt <- healthyR.ai::hai_kurtosis_vec(x_term)
mu <- mean(x_term, na.rm = TRUE)
std <- stats::sd(x_term, na.rm = TRUE)
minimum <- min(x_term, na.rm = TRUE)
maximum <- max(x_term, na.rm = TRUE)
med <- stats::median(x_term, na.rm = TRUE)
n <- length(x_term)
# Distribution Table and associate stats function
dist_df <- tibble::tibble(
distribution = dl,
stats_func = dplyr::case_when(
distribution == "normal" ~ "rnorm",
distribution == "uniform" ~ "runif",
distribution == "exponential" ~ "rexp",
distribution == "logisitic" ~ "rlogis",
distribution == "beta" ~ "rbeta",
distribution == "lognormal" ~ "rlnorm",
distribution == "gamma" ~ "rgamma",
distribution == "weibull" ~ "rweibull",
distribution == "chisquare" ~ "rchisq",
distribution == "cauchy" ~ "rcauchy",
distribution == "hypergeometric" ~ "rhyper",
distribution == "f" ~ "rf",
distribution == "poisson" ~ "rpois"
)
)
# Was a distribution chosen unsupported?
supported_distributions <- c("normal","uniform","exponential","logistic","beta",
"lognormal","gamma","weibull","chisquare","cauchy",
"hypergeometric","f","poisson")
dist_unsupported <- dist_df %>%
dplyr::mutate(dist_supported = distribution %in% supported_distributions) %>%
dplyr::filter(dist_supported == FALSE)
# Checks ----
if(!is.numeric(x_term)){
stop(call. = FALSE, ".x must be a numeric vector.")
}
if(exists("dist_unsupported") & nrow(dist_unsupported) > 1){
print(dist_unsupported)
rlang::abort("You entered a distribution that is unsupported")
}
# Make distributions ----
dist_tbl <- dist_df %>%
dplyr::mutate(
dist_data = dplyr::case_when(
stats_func == "rgamma" ~ list(stats::rgamma(n = n, shape = hskew, rate = hkurt)),
stats_func == "rbeta" ~ list(stats::rbeta(n = n, shape1 = hskew, shape2 = hkurt, ncp = med)),
stats_func == "rnorm" ~ list(stats::rnorm(n = n, mean = mu, sd = std)),
stats_func == "runif" ~ list(stats::runif(n = n, min = minimum, max = maximum)),
stats_func == "rexp" ~ list(stats::rexp(n = n, rate = hkurt)),
stats_func == "rlogis" ~ list(stats::rlogis(n = n, location = hskew, scale = hkurt)),
stats_func == "rlnorm" ~ list(stats::rlnorm(n = n, meanlog = log(mu), sdlog = log(std))),
stats_func == "rweibull" ~ list(stats::rweibull(n = n, shape = hskew, scale = hkurt)),
stats_func == "rchisq" ~ list(stats::rchisq(n = n, df = hskew)),
stats_func == "rcauchy" ~ list(stats::rcauchy(n = n, location = hskew, scale = hkurt)),
stats_func == "rhyper" ~ list(stats::rhyper(nn = n, m = n, n = n, k = n) %>%
healthyR.ai::hai_scale_zero_one_vec()),
stats_func == "rf" ~ list(stats::rf(n = n, df1 = hskew, df2 = hskew) %>%
healthyR.ai::hai_scale_zero_one_vec()),
stats_func == "rpois" ~ list(stats::rpois(n = n, lambda = hskew) %>%
healthyR.ai::hai_scale_zero_one_vec())
)
) %>%
dplyr::group_by(distribution) %>%
dplyr::mutate(density_data = list(density(unlist(dist_data)))) %>%
dplyr::ungroup() %>%
dplyr::select(-stats_func)
# Add empirical data and density to tibble
emp_dens_tbl <- tibble::tibble(
distribution = "empirical",
dist_data = list(x_term),
density_data = list(density(x_term))
)
dist_final_tbl <- rbind(dist_tbl, emp_dens_tbl)
# Add Class of hai_dist_tbl
class(dist_final_tbl) <- c("tbl_df","tbl","data.frame","hai_dist_tbl")
# Return ----
return(dist_final_tbl)
}
Change code to the following (favor the use of an attribute over the class):
hai_distribution_comparison_tbl <- function(.x, .distributions = c("gamma","beta")){
# Tidyeval ----
x_term <- .x
dl <- as.vector(tolower(.distributions))
# Parameters ----
hskew <- hai_skewness_vec(x_term)
hkurt <- hai_kurtosis_vec(x_term)
mu <- mean(x_term, na.rm = TRUE)
std <- stats::sd(x_term, na.rm = TRUE)
minimum <- min(x_term, na.rm = TRUE)
maximum <- max(x_term, na.rm = TRUE)
med <- stats::median(x_term, na.rm = TRUE)
n <- length(x_term)
# Distribution Table and associate stats function
dist_df <- tibble::tibble(
distribution = dl,
stats_func = dplyr::case_when(
distribution == "normal" ~ "rnorm",
distribution == "uniform" ~ "runif",
distribution == "exponential" ~ "rexp",
distribution == "logisitic" ~ "rlogis",
distribution == "beta" ~ "rbeta",
distribution == "lognormal" ~ "rlnorm",
distribution == "gamma" ~ "rgamma",
distribution == "weibull" ~ "rweibull",
distribution == "chisquare" ~ "rchisq",
distribution == "cauchy" ~ "rcauchy",
distribution == "hypergeometric" ~ "rhyper",
distribution == "f" ~ "rf",
distribution == "poisson" ~ "rpois"
)
)
# Was a distribution chosen unsupported?
supported_distributions <- c("normal","uniform","exponential","logistic","beta",
"lognormal","gamma","weibull","chisquare","cauchy",
"hypergeometric","f","poisson")
dist_unsupported <- dist_df %>%
dplyr::mutate(dist_supported = distribution %in% supported_distributions) %>%
dplyr::filter(dist_supported == FALSE)
# Checks ----
if(!is.numeric(x_term)){
stop(call. = FALSE, ".x must be a numeric vector.")
}
if(exists("dist_unsupported") & nrow(dist_unsupported) > 1){
print(dist_unsupported)
rlang::abort("You entered a distribution that is unsupported")
}
# Make distributions ----
dist_tbl <- dist_df %>%
dplyr::mutate(
dist_data = dplyr::case_when(
stats_func == "rgamma" ~ list(stats::rgamma(n = n, shape = hskew, rate = hkurt)),
stats_func == "rbeta" ~ list(stats::rbeta(n = n, shape1 = hskew, shape2 = hkurt, ncp = med)),
stats_func == "rnorm" ~ list(stats::rnorm(n = n, mean = mu, sd = std)),
stats_func == "runif" ~ list(stats::runif(n = n, min = minimum, max = maximum)),
stats_func == "rexp" ~ list(stats::rexp(n = n, rate = hkurt)),
stats_func == "rlogis" ~ list(stats::rlogis(n = n, location = hskew, scale = hkurt)),
stats_func == "rlnorm" ~ list(stats::rlnorm(n = n, meanlog = log(mu), sdlog = log(std))),
stats_func == "rweibull" ~ list(stats::rweibull(n = n, shape = hskew, scale = hkurt)),
stats_func == "rchisq" ~ list(stats::rchisq(n = n, df = hskew)),
stats_func == "rcauchy" ~ list(stats::rcauchy(n = n, location = hskew, scale = hkurt)),
stats_func == "rhyper" ~ list(stats::rhyper(nn = n, m = n, n = n, k = n) %>%
hai_scale_zero_one_vec()),
stats_func == "rf" ~ list(stats::rf(n = n, df1 = hskew, df2 = hskew) %>%
hai_scale_zero_one_vec()),
stats_func == "rpois" ~ list(stats::rpois(n = n, lambda = hskew) %>%
hai_scale_zero_one_vec())
)
) %>%
dplyr::group_by(distribution) %>%
dplyr::mutate(density_data = list(density(unlist(dist_data)))) %>%
dplyr::ungroup() %>%
dplyr::select(-stats_func)
# Add empirical data and density to tibble
emp_dens_tbl <- tibble::tibble(
distribution = "empirical",
dist_data = list(x_term),
density_data = list(density(x_term))
)
dist_final_tbl <- rbind(dist_tbl, emp_dens_tbl)
# Add attributes ----
attr(dist_final_tbl, ".x") <- .x
attr(dist_final_tbl, ".distributions") <- .distributions
attr(dist_final_tbl, "tibble_type") <- "hai_dist_compare_tbl"
# Return ----
return(dist_final_tbl)
}
Need to compare the following distributions to the empirical, all of which are on a cullen frey graph
Not on Cullen Frey