spsanderson / TidyDensity

Create tidy probability/density tibbles and plots of randomly generated and empirical data.
34 stars 1 forks source link

Add function `util_chisquare_param_estimate()` #414

Closed spsanderson closed 6 months ago

spsanderson commented 7 months ago

Add a function for util_chisquare_param_estimate()


#' Estimate Chisquare Parameters
#' @family Parameter Estimation
#' @family Chisquare
#' @author Steven P. Sanderson II, MPH
#' @details This function will see if the given vector `.x` is a numeric vector.
#' It will attempt to estimate the degrees of freedom and non-centrality
#' parameters of a Chisquare distribution.
#' The function first performs tidyeval on the input data to ensure it's a 
#' numeric vector. It then checks if there are at least two data points, as this 
#' is a requirement for parameter estimation.
#' The estimation of the chi-square distribution parameters is performed using 
#' maximum likelihood estimation (MLE) implemented with the `bbmle` package. 
#' The negative log-likelihood function is minimized to obtain the estimates for 
#' the degrees of freedom (\code{doff}) and the non-centrality parameter (\code{ncp}). 
#' Initial values for the optimization are set based on the sample variance and 
#' mean, but these can be adjusted if necessary.
#' If the estimation fails or encounters an error, the function returns \code{NA} 
#' for both \code{doff} and \code{ncp}.
#' Finally, the function returns a tibble containing the following information:
#' \describe{
#'   \item{dist_type}{The type of distribution, which is "Chisquare" in this case.}
#'   \item{samp_size}{The sample size, i.e., the number of data points in the input vector.}
#'   \item{min}{The minimum value of the data points.}
#'   \item{max}{The maximum value of the data points.}
#'   \item{mean}{The mean of the data points.}
#'   \item{degrees_of_freedom}{The estimated degrees of freedom (\code{doff}) for the chi-square distribution.}
#'   \item{ncp}{The estimated non-centrality parameter (\code{ncp}) for the chi-square distribution.}
#' }
#' Additionally, if the argument \code{.auto_gen_empirical} is set to \code{TRUE} 
#' (which is the default behavior), the function also returns a combined tibble 
#' containing both empirical and chi-square distribution data, obtained by 
#' calling \code{tidy_empirical} and \code{tidy_chisquare}, respectively.
#' @description This function will attempt to estimate the Chisquare degrees of
#' freedom and non-centrality parameters given some vector of values `.x`. The
#' function will return a list output by default,
#' and  if the parameter `.auto_gen_empirical` is set to `TRUE` then the empirical
#' data given to the parameter `.x` will be run through the `tidy_empirical()`
#' function and combined with the estimated Chisquare data.
#' @param .x The vector of data to be passed to the function. Must be a numeric
#' vector of non-negative values.
#' @param .auto_gen_empirical This is a boolean value of TRUE/FALSE with default
#' set to TRUE. This will automatically create the `tidy_empirical()` output
#' for the `.x` parameter and use the `tidy_combine_distributions()`. The user
#' can then plot out the data using `$combined_data_tbl` from the function output.
#' @examples
#' library(dplyr)
#' library(ggplot2)
#' tc <- tidy_chisquare(.n = 500, .df = 6, .ncp = 1) |> pull(y)
#' output <- util_chisquare_param_estimate(tc)
#' output$parameter_tbl
#' output$combined_data_tbl |>
#'   tidy_combined_autoplot()
#' @return
#' A tibble/list
#' @name util_chisquare_param_estimate

#' @export
#' @rdname util_chisquare_param_estimate

util_chisquare_param_estimate <- function(.x, .auto_gen_empirical = TRUE) {

  # Checks ----
  # Validate the raw input before coercing it: `as.numeric()` always yields a
  # bare numeric vector, so a check on the coerced value could never fail.
  if (!is.numeric(.x)) {
    rlang::abort(
      message = "The '.x' term must be a numeric vector.",
      use_cli_format = TRUE
    )
  }

  # Tidyeval ----
  x_term <- as.numeric(.x)
  n <- length(x_term)

  if (n < 2) {
    rlang::abort(
      message = "You must supply at least two data points for this function.",
      use_cli_format = TRUE
    )
  }

  # Summaries are taken only after validation so that empty input reaches the
  # abort above instead of triggering min()/max() warnings first.
  minx <- min(x_term)
  maxx <- max(x_term)

  # Parameters ----
  # Fit df and ncp jointly by minimising the negative log-likelihood.
  estimate_chisq_params <- function(data) {
    # Negative log-likelihood function
    negLogLik <- function(df, ncp) {
      -sum(stats::dchisq(data, df = df, ncp = ncp, log = TRUE))
    }

    # Initial values derived from the sample variance and mean
    start_vals <- list(
      df = trunc(stats::var(data) / 2),
      ncp = trunc(mean(data))
    )

    # MLE using bbmle
    mle_fit <- bbmle::mle2(negLogLik, start = start_vals)
    est <- bbmle::coef(mle_fit)

    # Return the estimates as a one-row tibble (names dropped from the values)
    dplyr::tibble(
      est_df = est[["df"]],
      est_ncp = est[["ncp"]]
    )
  }

  # The fallback has the same shape as a successful result so that the `$`
  # extraction below works when the fit fails; a bare NA_real_ would make
  # `estimates$est_df` error instead of producing the documented NA values.
  safe_estimates <- purrr::possibly(
    estimate_chisq_params,
    otherwise = dplyr::tibble(est_df = NA_real_, est_ncp = NA_real_),
    quiet = TRUE
  )

  estimates <- safe_estimates(x_term)
  doff <- estimates$est_df
  ncp <- estimates$est_ncp

  # Return Tibble ----
  # A comparison distribution cannot be generated from missing parameters, so
  # the combined table is only built when both estimates are available.
  build_combined <- .auto_gen_empirical && !anyNA(c(doff, ncp))

  if (.auto_gen_empirical && !build_combined) {
    rlang::warn(
      message = paste0(
        "Parameter estimation failed; returning NA estimates without ",
        "the combined data tibble."
      ),
      use_cli_format = TRUE
    )
  }

  if (build_combined) {
    te <- tidy_empirical(.x = x_term)
    tc <- tidy_chisquare(.n = n, .df = round(doff, 3), .ncp = round(ncp, 3))
    combined_tbl <- tidy_combine_distributions(te, tc)
  }

  ret <- dplyr::tibble(
    dist_type = "Chisquare",
    samp_size = n,
    min = minx,
    max = maxx,
    mean = mean(x_term),
    degrees_of_freedom = doff,
    ncp = ncp
  )

  # Return ----
  attr(ret, "tibble_type") <- "parameter_estimation"
  attr(ret, "family") <- "chisquare"
  attr(ret, "x_term") <- .x
  attr(ret, "n") <- n

  if (build_combined) {
    output <- list(
      combined_data_tbl = combined_tbl,
      parameter_tbl     = ret
    )
  } else {
    output <- list(
      parameter_tbl = ret
    )
  }

  output
}



> tc <- tidy_chisquare(.n = 500, .df = 6, .ncp = 1) |> pull(y)
> output <- util_chisquare_param_estimate(tc)
Warning messages:
1: In stats::dchisq(data, df = df, ncp = ncp, log = TRUE) : NaNs produced
2: In stats::dchisq(data, df = df, ncp = ncp, log = TRUE) : NaNs produced
3: In stats::dchisq(data, df = df, ncp = ncp, log = TRUE) : NaNs produced
> output$parameter_tbl
# A tibble: 1 × 7
  dist_type samp_size   min   max  mean degrees_of_freedom   ncp
  <chr>         <int> <dbl> <dbl> <dbl>              <dbl> <dbl>
1 Chisquare       500 0.935  20.6  7.13               6.65 0.488
> output$combined_data_tbl |>
+   tidy_combined_autoplot()


spsanderson commented 7 months ago

This might work well too, although finding a good starting point for ncp might still prove somewhat elusive:

# Load required package (provides mle2(); coef() is applied to its fit below)
library(bbmle)

# Sample data (replace with your actual data)
data <- rchisq(100, df = 5, ncp = 20)

# Define negative log-likelihood function for a non-central chi-square;
# `data` is read from the enclosing environment
negLogLik <- function(df, ncp) {
  -sum(dchisq(data, df = df, ncp = ncp, log = TRUE))
}

# Initial values for optimization (crucial for good convergence)
start_vals <- list(df = 3, ncp = 20)

# Maximum likelihood estimation
mle_fit <- mle2(negLogLik, start = start_vals)

# Extract estimated parameters (first coefficient is df, second is ncp)
df_est <- coef(mle_fit)[1]
ncp_est <- coef(mle_fit)[2]

# Print the results
cat("Estimated df:", df_est, "\n")
cat("Estimated ncp:", ncp_est)


> # Load required package
> library(bbmle)
> # Sample data (replace with your actual data)
> data <- rchisq(100, df = 5, ncp = 20)
> # Define negative log-likelihood function
> negLogLik <- function(df, ncp) {
+   -sum(dchisq(data, df = df, ncp = ncp, log = TRUE))
+ }
> # Initial values for optimization (crucial for good convergence)
> start_vals <- list(df = 3, ncp = 20)
> # Maximum likelihood estimation
> mle_fit <- mle2(negLogLik, start = start_vals)
> # Extract estimated parameters
> df_est <- coef(mle_fit)[1]
> ncp_est <- coef(mle_fit)[2]
> # Print the results
> cat("Estimated df:", df_est, "\n")
Estimated df: 10.84161 
> cat("Estimated ncp:", ncp_est)
Estimated ncp: 14.05248

# Draw a fresh sample using the fitted degrees of freedom and non-centrality
nd <- stats::rchisq(n = 100, df = df_est, ncp = ncp_est)

# Overlay the simulated sample on the histogram of the original data
graphics::hist(data, main = "Histogram of data", col = "lightblue")
graphics::hist(nd, add = TRUE, col = "lightgreen")


spsanderson commented 7 months ago


> estimate_chisq_params <- function(data) {
+   # Negative log-likelihood function
+   negLogLik <- function(df, ncp) {
+     -sum(dchisq(data, df = df, ncp = ncp, log = TRUE))
+   }
+   # Initial values (adjust based on your data if necessary)
+   start_vals <- list(df = trunc(var(data)/2), ncp = trunc(mean(data)))
+   # MLE using bbmle
+   mle_fit <- bbmle::mle2(negLogLik, start = start_vals)
+   # Return estimated parameters as a named vector
+   c(df = coef(mle_fit)[1], ncp = coef(mle_fit)[2])
+ }
> library(purrr)
> # List of data vectors (replace with your actual data)
> data_list <- list(rchisq(100, df = 5, ncp = 2), 
+                   rchisq(80, df = 3, ncp = 1), 
+                   rchisq(120, df = 7, ncp = 4))
> # Apply the estimation function to each data vector
> param_estimates <- map(data_list, estimate_chisq_params)
Warning messages:
1: In dchisq(data, df = df, ncp = ncp, log = TRUE) : NaNs produced
2: In dchisq(data, df = df, ncp = ncp, log = TRUE) : NaNs produced
3: In dchisq(data, df = df, ncp = ncp, log = TRUE) : NaNs produced
4: In dchisq(data, df = df, ncp = ncp, log = TRUE) : NaNs produced
5: In dchisq(data, df = df, ncp = ncp, log = TRUE) : NaNs produced
6: In dchisq(data, df = df, ncp = ncp, log = TRUE) : NaNs produced
> # Print results
> print(param_estimates)
    df.df   ncp.ncp 
5.3677154 0.9875087 

    df.df   ncp.ncp 
3.2668169 0.3253299 

   df.df  ncp.ncp 
6.559403 4.540669 
spsanderson commented 7 months ago

Mega Script test

# Lib Load ----
library(bbmle)
library(dplyr)
library(purrr)
library(tidyr)

# Data ----
# Make parameters and grid
# Candidate degrees of freedom and non-centrality values (1 through 10 each)
df <- seq_len(10)
ncp <- seq_len(10)
# Ten sample sizes drawn uniformly between 250 and 500, truncated to whole numbers
n <- trunc(runif(10, min = 250, max = 500))
# Every combination of sample size, df and ncp
param_grid <- expand_grid(n = n, df = df, ncp = ncp)

# Functions ----
# functions to estimate the parameters of a chisq distribution
# dof
# Sample mean.
mean_x <- function(x) {
  mean(x)
}

# Sample mean less one.
mean_minus_1 <- function(x) {
  mean(x) - 1
}

# Half of the sample variance.
var_div_2 <- function(x) {
  var(x) / 2
}

# Number of observations less one.
length_minus_1 <- function(x) {
  length(x) - 1
}

# ncp
# Sample mean minus (sample mean minus one).
mean_minus_mean_minus_1 <- function(x) {
  avg <- mean(x)
  avg - (avg - 1)
}

# Sample mean minus half the sample variance, floored at zero.
ie_mean_minus_var_div_2 <- function(x) {
  candidate <- mean(x) - (var(x) / 2)
  ifelse(candidate < 0, 0, candidate)
}
# Estimate ncp by one-dimensional maximum likelihood, holding the degrees of
# freedom fixed at half the sample variance. The search is bounded to
# [0, 10 * var(x) / 2] and uses optim()'s "Brent" method.
ie_optim <- function(x) {
  # Computed once: it is constant across objective evaluations
  half_var <- var(x) / 2

  # Negative log-likelihood as a function of ncp only
  neg_log_lik <- function(ncp) {
    -sum(dchisq(x, df = half_var, ncp = ncp, log = TRUE))
  }

  optim(
    par = 0,
    fn = neg_log_lik,
    method = "Brent",
    lower = 0,
    upper = 10 * half_var
  )$par
}
# both
# Jointly estimate df and ncp of a chi-square distribution by maximum
# likelihood. Returns a one-row tibble with columns `est_df` and `est_ncp`.
estimate_chisq_params <- function(data) {
  # Negative log-likelihood function
  negLogLik <- function(df, ncp) {
    -sum(dchisq(data, df = df, ncp = ncp, log = TRUE))
  }

  # Initial values (adjust based on your data if necessary)
  start_vals <- list(df = trunc(var(data) / 2), ncp = trunc(mean(data)))

  # MLE using bbmle
  mle_fit <- bbmle::mle2(negLogLik, start = start_vals)
  est <- bbmle::coef(mle_fit)

  # Return estimated parameters as a one-row tibble (first is df, second ncp)
  dplyr::tibble(
    est_df = est[1],
    est_ncp = est[2]
  )
}

# Wrap the estimator so that a failed fit yields NA_real_ rather than stopping
# the whole simulation; the pipeline later drops those NA results.
safe_estimates <- purrr::possibly(
  estimate_chisq_params,
  otherwise = NA_real_,
  quiet = TRUE
)

# Simulate data ----
dff <- param_grid |>
  # One simulated sample per grid row; the n/df/ncp columns are passed to
  # rchisq() as named arguments
  mutate(x = pmap(pick(everything()), match.fun("rchisq"))) |>
  # Apply every candidate estimator to each simulated sample
  mutate(
    safe_est_parms = map(x, safe_estimates),
    dfa = map_dbl(x, mean_minus_1),
    dfb = map_dbl(x, var_div_2),
    dfc = map_dbl(x, length_minus_1),
    ncpa = map_dbl(x, mean_minus_mean_minus_1),
    ncpb = map_dbl(x, ie_mean_minus_var_div_2),
    ncpc = map_dbl(x, ie_optim)
  ) |>
  select(-x) |>
  # Drop rows where the MLE fit failed (or produced NA estimates)
  filter(!map_lgl(safe_est_parms, \(est) any(is.na(est)))) |>
  unnest(cols = safe_est_parms) |>
  # Residuals of each estimate against the true generating parameter
  mutate(
    dfa_resid = dfa - df,
    dfb_resid = dfb - df,
    dfc_resid = dfc - df,
    dfd_resid = est_df - df,
    ncpa_resid = ncpa - ncp,
    ncpb_resid = ncpb - ncp,
    ncpc_resid = ncpc - ncp,
    ncpd_resid = est_ncp - ncp
  )

# Visuals ----
# Degrees-of-freedom estimates grouped by the true df
boxplot(dff$dfa ~ dff$df, main = "mean(x) -1 ~ df")
boxplot(dff$dfb ~ dff$df, main = "var(x) / 2 ~ df")
boxplot(dff$dfc ~ dff$df, main = "length(x) - 1 ~ df")
boxplot(dff$est_df ~ dff$df, main = "negloglik ~ df - Looks Good")
# Non-centrality estimates grouped by the true ncp
boxplot(dff$ncpa ~ dff$ncp, main = "mean(x) - (mean(x) - 1) ~ ncp")
boxplot(dff$ncpb ~ dff$ncp, main = "mean(x) - var(x)/2 ~ ncp")
boxplot(dff$ncpc ~ dff$ncp, main = "optim ~ ncp")
boxplot(dff$est_ncp ~ dff$ncp, main = "negloglik ~ ncp - Looks Good")
# Degrees-of-freedom residuals (estimate minus true df)
boxplot(dff$dfa_resid ~ dff$df, main = "mean(x) -1 ~ df Residuals")
boxplot(dff$dfb_resid ~ dff$df, main = "var(x) / 2 ~ df Residuals")
boxplot(dff$dfc_resid ~ dff$df, main = "length(x) - 1 ~ df Residuals")
boxplot(dff$dfd_resid ~ dff$df, main = "negloglik ~ df Residuals")
# Non-centrality residuals (estimate minus true ncp)
boxplot(dff$ncpa_resid ~ dff$ncp, main = "mean(x) - (mean(x) - 1) ~ ncp Residuals")
boxplot(dff$ncpb_resid ~ dff$ncp, main = "mean(x) - var(x)/2 ~ ncp Residuals")
boxplot(dff$ncpc_resid ~ dff$ncp, main = "optim ~ ncp Residuals")
boxplot(dff$ncpd_resid ~ dff$ncp, main = "negloglik ~ ncp Residuals")



