Add function `util_chisquare_param_estimate()`

spsanderson / TidyDensity

Create tidy probability/density tibbles and plots of randomly generated and empirical data.

Other

34 stars 1 forks source link

Add a function for util_chisquare_param_estimate()

Function:

#' Estimate Chisquare Parameters
#'
#' @family Parameter Estimation
#' @family Chisquare
#'
#' @author Steven P. Sanderson II, MPH
#'
#' @details This function will see if the given vector `.x` is a numeric vector.
#' It will attempt to estimate the prob parameter of a Chisquare distribution.
#' The function first performs tidyeval on the input data to ensure it's a 
#' numeric vector. It then checks if there are at least two data points, as this 
#' is a requirement for parameter estimation.
#' 
#' The estimation of the chi-square distribution parameters is performed using 
#' maximum likelihood estimation (MLE) implemented with the `bbmle` package. 
#' The negative log-likelihood function is minimized to obtain the estimates for 
#' the degrees of freedom (\code{doff}) and the non-centrality parameter (\code{ncp}). 
#' Initial values for the optimization are set based on the sample variance and 
#' mean, but these can be adjusted if necessary.
#' 
#' If the estimation fails or encounters an error, the function returns \code{NA} 
#' for both \code{doff} and \code{ncp}.
#' 
#' Finally, the function returns a tibble containing the following information:
#' \describe{
#'   \item{dist_type}{The type of distribution, which is "Chisquare" in this case.}
#'   \item{samp_size}{The sample size, i.e., the number of data points in the input vector.}
#'   \item{min}{The minimum value of the data points.}
#'   \item{max}{The maximum value of the data points.}
#'   \item{mean}{The mean of the data points.}
#'   \item{degrees_of_freedom}{The estimated degrees of freedom (\code{doff}) for the chi-square distribution.}
#'   \item{ncp}{The estimated non-centrality parameter (\code{ncp}) for the chi-square distribution.}
#' }
#' 
#' Additionally, if the argument \code{.auto_gen_empirical} is set to \code{TRUE} 
#' (which is the default behavior), the function also returns a combined tibble 
#' containing both empirical and chi-square distribution data, obtained by 
#' calling \code{tidy_empirical} and \code{tidy_chisquare}, respectively.
#' 
#' @description This function will attempt to estimate the Chisquare prob parameter
#' given some vector of values `.x`. The function will return a list output by default,
#' and  if the parameter `.auto_gen_empirical` is set to `TRUE` then the empirical
#' data given to the parameter `.x` will be run through the `tidy_empirical()`
#' function and combined with the estimated Chisquare data.
#'
#' @param .x The vector of data to be passed to the function. Must be non-negative
#' integers.
#' @param .auto_gen_empirical This is a boolean value of TRUE/FALSE with default
#' set to TRUE. This will automatically create the `tidy_empirical()` output
#' for the `.x` parameter and use the `tidy_combine_distributions()`. The user
#' can then plot out the data using `$combined_data_tbl` from the function output.
#'
#' @examples
#' library(dplyr)
#' library(ggplot2)
#'
#' tc <- tidy_chisquare(.n = 500, .df = 6, .ncp = 1) |> pull(y)
#' output <- util_chisquare_param_estimate(tc)
#'
#' output$parameter_tbl
#'
#' output$combined_data_tbl |>
#'   tidy_combined_autoplot()
#'
#' @return
#' A tibble/list
#'
#' @name util_chisquare_param_estimate
NULL

#' @export
#' @rdname util_chisquare_param_estimate

util_chisquare_param_estimate <- function(.x, .auto_gen_empirical = TRUE) {

  # Tidyeval ----
  x_term <- as.numeric(.x)
  n <- length(x_term)
  minx <- min(as.numeric(x_term))
  maxx <- max(as.numeric(x_term))

  # Checks ----
  if (!is.vector(x_term, mode = "numeric")) {
    rlang::abort(
      message = "The '.x' term must be a numeric vector.",
      use_cli_format = TRUE
    )
  }

  if (n < 2) {
    rlang::abort(
      message = "You must supply at least two data points for this function.",
      use_cli_format = TRUE
    )
  }

  # Parameters ----
  estimate_chisq_params <- function(data) {
    # Negative log-likelihood function
    negLogLik <- function(df, ncp) {
      -sum(stats::dchisq(data, df = df, ncp = ncp, log = TRUE))
    }

    # Initial values (adjust based on your data if necessary)
    start_vals <- list(df = trunc(var(data)/2), ncp = trunc(mean(data)))

    # MLE using bbmle
    mle_fit <- bbmle::mle2(negLogLik, start = start_vals)
    # Return estimated parameters as a named vector
    df <- dplyr::tibble(
      est_df = bbmle::coef(mle_fit)[1],
      est_ncp = bbmle::coef(mle_fit)[2]
    )
    return(df)
  }

  safe_estimates <- {
    purrr::possibly(
      estimate_chisq_params,
      otherwise = NA_real_,
      quiet = TRUE
    )
  }

  estimates <- safe_estimates(x_term)
  doff <- estimates$est_df
  ncp <- estimates$est_ncp

  # Return Tibble ----
  if (.auto_gen_empirical) {
    te <- tidy_empirical(.x = x_term)
    tc <- tidy_chisquare(.n = n, .df = round(doff, 3), .ncp = round(ncp, 3))
    combined_tbl <- tidy_combine_distributions(te, tc)
  }

  ret <- dplyr::tibble(
    dist_type = "Chisquare",
    samp_size = n,
    min = minx,
    max = maxx,
    mean = mean(x_term),
    degrees_of_freedom = doff,
    ncp = ncp
  )

  # Return ----
  attr(ret, "tibble_type") <- "parameter_estimation"
  attr(ret, "family") <- "chisquare"
  attr(ret, "x_term") <- .x
  attr(ret, "n") <- n

  if (.auto_gen_empirical) {
    output <- list(
      combined_data_tbl = combined_tbl,
      parameter_tbl     = ret
    )
  } else {
    output <- list(
      parameter_tbl = ret
    )
  }

  return(output)
}

Example:

> tc <- tidy_chisquare(.n = 500, .df = 6, .ncp = 1) |> pull(y)
> output <- util_chisquare_param_estimate(tc)
Warning messages:
1: In stats::dchisq(data, df = df, ncp = ncp, log = TRUE) : NaNs produced
2: In stats::dchisq(data, df = df, ncp = ncp, log = TRUE) : NaNs produced
3: In stats::dchisq(data, df = df, ncp = ncp, log = TRUE) : NaNs produced
> 
> output$parameter_tbl
# A tibble: 1 × 7
  dist_type samp_size   min   max  mean degrees_of_freedom   ncp
  <chr>         <int> <dbl> <dbl> <dbl>              <dbl> <dbl>
1 Chisquare       500 0.935  20.6  7.13               6.65 0.488
> 
> output$combined_data_tbl |>
+   tidy_combined_autoplot()

# Load required package library(bbmle) # Sample data (replace with your actual data) data <- rchisq(100, df = 5, ncp = 20) # Define negative log-likelihood function negLogLik <- function(df, ncp) { -sum(dchisq(data, df = df, ncp = ncp, log = TRUE)) } # Initial values for optimization (crucial for good convergence) start_vals <- list(df = 3, ncp = 20) # Maximum likelihood estimation mle_fit <- mle2(negLogLik, start = start_vals) # Extract estimated parameters df_est <- coef(mle_fit)[1] ncp_est <- coef(mle_fit)[2] # Print the results cat("Estimated df:", df_est, "\n") cat("Estimated ncp:", ncp_est)

> # Load required package > library(bbmle) > > # Sample data (replace with your actual data) > data <- rchisq(100, df = 5, ncp = 20) > > # Define negative log-likelihood function > negLogLik <- function(df, ncp) { + -sum(dchisq(data, df = df, ncp = ncp, log = TRUE)) + } > > # Initial values for optimization (crucial for good convergence) > start_vals <- list(df = 3, ncp = 20) > > # Maximum likelihood estimation > mle_fit <- mle2(negLogLik, start = start_vals) > > # Extract estimated parameters > df_est <- coef(mle_fit)[1] > ncp_est <- coef(mle_fit)[2] > > # Print the results > cat("Estimated df:", df_est, "\n") Estimated df: 10.84161 > cat("Estimated ncp:", ncp_est) Estimated ncp: 14.05248 nd <- rchisq(100, df = df_est, ncp = ncp_est) hist(data, col = "lightblue", main = "Histogram of data") hist(nd, col = "lightgreen", add = TRUE)

> estimate_chisq_params <- function(data) { + # Negative log-likelihood function + negLogLik <- function(df, ncp) { + -sum(dchisq(data, df = df, ncp = ncp, log = TRUE)) + } + + # Initial values (adjust based on your data if necessary) + start_vals <- list(df = trunc(var(data)/2), ncp = trunc(mean(data))) + + # MLE using bbmle + mle_fit <- bbmle::mle2(negLogLik, start = start_vals) + + # Return estimated parameters as a named vector + c(df = coef(mle_fit)[1], ncp = coef(mle_fit)[2]) + } > > library(purrr) > > # List of data vectors (replace with your actual data) > data_list <- list(rchisq(100, df = 5, ncp = 2), + rchisq(80, df = 3, ncp = 1), + rchisq(120, df = 7, ncp = 4)) > > # Apply the estimation function to each data vector > param_estimates <- map(data_list, estimate_chisq_params) Warning messages: 1: In dchisq(data, df = df, ncp = ncp, log = TRUE) : NaNs produced 2: In dchisq(data, df = df, ncp = ncp, log = TRUE) : NaNs produced 3: In dchisq(data, df = df, ncp = ncp, log = TRUE) : NaNs produced 4: In dchisq(data, df = df, ncp = ncp, log = TRUE) : NaNs produced 5: In dchisq(data, df = df, ncp = ncp, log = TRUE) : NaNs produced 6: In dchisq(data, df = df, ncp = ncp, log = TRUE) : NaNs produced > > # Print results > print(param_estimates) [[1]] df.df ncp.ncp 5.3677154 0.9875087 [[2]] df.df ncp.ncp 3.2668169 0.3253299 [[3]] df.df ncp.ncp 6.559403 4.540669

Mega Script test

# Lib Load ---- library(tidyverse) library(bbmle) # Data ---- # Make parameters and grid df <- 1:10 ncp <- 1:10 n <- runif(10, 250, 500) |> trunc() param_grid <- expand_grid(n = n, df = df, ncp = ncp) # Functions ---- # functions to estimate the parameters of a chisq distribution # dof mean_x <- function(x) mean(x) mean_minus_1 <- function(x) mean(x) - 1 var_div_2 <- function(x) var(x) / 2 length_minus_1 <- function(x) length(x) - 1 # ncp mean_minus_mean_minus_1 <- function(x) mean(x) - (mean(x) - 1) ie_mean_minus_var_div_2 <- function(x) ifelse((mean(x) - (var(x) / 2)) < 0, 0, mean(x) - var(x)/2) ie_optim <- function(x) optim(par = 0, fn = function(ncp) { -sum(dchisq(x, df = var(x)/2, ncp = ncp, log = TRUE)) }, method = "Brent", lower = 0, upper = 10 * var(x)/2)$par # both estimate_chisq_params <- function(data) { # Negative log-likelihood function negLogLik <- function(df, ncp) { -sum(dchisq(data, df = df, ncp = ncp, log = TRUE)) } # Initial values (adjust based on your data if necessary) start_vals <- list(df = trunc(var(data)/2), ncp = trunc(mean(data))) # MLE using bbmle mle_fit <- bbmle::mle2(negLogLik, start = start_vals) # Return estimated parameters as a named vector df <- dplyr::tibble( est_df = coef(mle_fit)[1], est_ncp = coef(mle_fit)[2] ) return(df) } safe_estimates <- { purrr::possibly( estimate_chisq_params, otherwise = NA_real_, quiet = TRUE ) } # Simulate data ---- set.seed(123) dff <- param_grid |> mutate(x = pmap(pick(everything()), match.fun("rchisq"))) |> mutate( safe_est_parms = map(x, safe_estimates), dfa = map_dbl(x, mean_minus_1), dfb = map_dbl(x, var_div_2), dfc = map_dbl(x, length_minus_1), ncpa = map_dbl(x, mean_minus_mean_minus_1), ncpb = map_dbl(x, ie_mean_minus_var_div_2), ncpc = map_dbl(x, ie_optim) ) |> select(-x) |> filter(map_lgl(safe_est_parms, ~ any(is.na(.x))) == FALSE) |> unnest(cols = safe_est_parms) |> mutate( dfa_resid = dfa - df, dfb_resid = dfb - df, dfc_resid = dfc - df, dfd_resid = est_df - df, ncpa_resid = ncpa - ncp, ncpb_resid = ncpb - ncp, ncpc_resid = ncpc - ncp, ncpd_resid = est_ncp - ncp ) # Visuals ---- boxplot(dff$dfa ~ dff$df, main = "mean(x) -1 ~ df") boxplot(dff$dfb ~ dff$df, main = "var(x) / 2 ~ df") boxplot(dff$dfc ~ dff$df, main = "length(x) - 1 ~ df") boxplot(dff$est_df ~ dff$df, main = "negloglik ~ df - Looks Good") boxplot(dff$ncpa ~ dff$ncp, main = "mean(x) - (mean(x) - 1) ~ ncp") boxplot(dff$ncpb ~ dff$ncp, main = "mean(x) - var(x)/2 ~ nc") boxplot(dff$ncpc ~ dff$ncp, main = "optim ~ ncp") boxplot(dff$est_ncp ~ dff$ncp, main = "negloglik ~ ncp - Looks Good") boxplot(dff$dfa_resid ~ dff$df, main = "mean(x) -1 ~ df Residuals") boxplot(dff$dfb_resid ~ dff$df, main = "var(x) / 2 ~ df Residuals") boxplot(dff$dfc_resid ~ dff$df, main = "length(x) - 1 ~ df Residuals") boxplot(dff$dfd_resid ~ dff$df, main = "negloglik ~ df Residuals") boxplot(dff$ncpa_resid ~ dff$ncp, main = "mean(x) - (mean(x) - 1) ~ ncp Residuals") boxplot(dff$ncpb_resid ~ dff$ncp, main = "mean(x) - var(x)/2 ~ ncp Residuals") boxplot(dff$ncpc_resid ~ dff$ncp, main = "optim ~ ncp Residuals") boxplot(dff$ncpd_resid ~ dff$ncp, main = "negloglik ~ ncp Residuals")

spsanderson / TidyDensity

Add function `util_chisquare_param_estimate()` #414

Mega Script test