spsanderson / TidyDensity

Create tidy probability/density tibbles and plots of randomly generated and empirical data.
https://www.spsanderson.com/TidyDensity
Other
34 stars 1 forks source link

Prepare Package for rOpenSci #513

Open spsanderson opened 4 days ago

spsanderson commented 4 days ago

Need to prepare package for rOpensci

spsanderson commented 4 days ago

The autotest_package() function stops at util_chisquare_param_estimate() which will currently produce a warning of NaN's produced. Possible fix to exponentiate the params in the neg log liklihood func and log the initial params as the input to the func to optimize, this seems to work to force a constrained problem. It however produces sub-optimal results as seen here:

util_chisquare_param_estimate <- function(.x, .auto_gen_empirical = TRUE) {

  # Tidyeval ----
  x_term <- as.numeric(.x)
  n <- length(x_term)
  minx <- min(as.numeric(x_term))
  maxx <- max(as.numeric(x_term))

  # Checks ----
  if (!is.vector(x_term, mode = "numeric")) {
    rlang::abort(
      message = "The '.x' term must be a numeric vector.",
      use_cli_format = TRUE
    )
  }

  if (n < 2) {
    rlang::abort(
      message = "You must supply at least two data points for this function.",
      use_cli_format = TRUE
    )
  }

  # Parameters ----
  # estimate_chisq_params <- function(data) {
  #   # Negative log-likelihood function
  #   negLogLik <- function(df, ncp) {
  #     -sum(stats::dchisq(data, df = df, ncp = ncp, log = TRUE))
  #   }
  #
  #   # Initial values (adjust based on your data if necessary)
  #   start_vals <- list(df = trunc(var(data)/2), ncp = trunc(mean(data)))
  #
  #   # MLE using bbmle
  #   mle_fit <- bbmle::mle2(negLogLik, start = start_vals)
  #   # Return estimated parameters as a named vector
  #   df <- dplyr::tibble(
  #     est_df = bbmle::coef(mle_fit)[1],
  #     est_ncp = bbmle::coef(mle_fit)[2]
  #   )
  #   return(df)
  # }
  #
  # safe_estimates <- {
  #   purrr::possibly(
  #     estimate_chisq_params,
  #     otherwise = NA_real_,
  #     quiet = TRUE
  #   )
  # }
  #
  # estimates <- safe_estimates(x_term)
  # Define negative log-likelihood function
  neg_log_likelihood <- function(params) {
    params <- exp(params) * 10^-10
    df <- params[1]
    ncp <- params[2]
    sum_densities <- sum(stats::dchisq(x_term, df = df, ncp = ncp, log = TRUE))
    return(-sum_densities)
  }

  # Initial guess for parameters
  initial_params <- c(trunc(var(x_term)/2), trunc(mean(x_term)))

  # Optimize parameters using optim() function
  opt_result <- stats::optim(par = log(initial_params), fn = neg_log_likelihood)

  # Extract estimated parameters
  doff <- opt_result$par[1]
  ncp <- opt_result$par[2]

  # Return Tibble ----
  if (.auto_gen_empirical) {
    te <- tidy_empirical(.x = x_term)
    tc <- tidy_chisquare(.n = n, .df = round(doff, 3), .ncp = round(ncp, 3))
    combined_tbl <- tidy_combine_distributions(te, tc)
  }

  ret <- dplyr::tibble(
    dist_type = "Chisquare",
    samp_size = n,
    min = minx,
    max = maxx,
    mean = mean(x_term),
    dof = doff,
    ncp = ncp
  )

  # Return ----
  attr(ret, "tibble_type") <- "parameter_estimation"
  attr(ret, "family") <- "chisquare"
  attr(ret, "x_term") <- .x
  attr(ret, "n") <- n

  if (.auto_gen_empirical) {
    output <- list(
      combined_data_tbl = combined_tbl,
      parameter_tbl     = ret
    )
  } else {
    output <- list(
      parameter_tbl = ret
    )
  }

  return(output)
}

# Test ----
library(TidyDensity)
tc <- tidy_chisquare(.n = 500, .df = 6, .ncp = 1)$y
util_chisquare_param_estimate(.x = rchisq(n = 100, df = 5))
util_chisquare_param_estimate(.x = tc)

util_chisquare_param_estimate(.x = tc)$combined_data_tbl |>
  tidy_combined_autoplot()

TidyDensity::util_chisquare_param_estimate(.x = tc)$combined_data_tbl |>
  TidyDensity::tidy_combined_autoplot()
Warning messages:
1: In stats::dchisq(x_term, df = df, ncp = ncp, log = TRUE) :
  NaNs produced
2: In stats::dchisq(x_term, df = df, ncp = ncp, log = TRUE) :
  NaNs produced
3: In stats::dchisq(x_term, df = df, ncp = ncp, log = TRUE) :
  NaNs produced
4: In stats::dchisq(x_term, df = df, ncp = ncp, log = TRUE) :
  NaNs produced
5: In stats::dchisq(x_term, df = df, ncp = ncp, log = TRUE) :
  NaNs produced
6: In stats::dchisq(x_term, df = df, ncp = ncp, log = TRUE) :
  NaNs produced

Modified function: image

Original function: image

> x_term <- tc
> c(trunc(var(x_term)/2), trunc(mean(x_term)))
[1] 7 6
> log(c(trunc(var(x_term)/2), trunc(mean(x_term))))
[1] 1.945910 1.791759
> exp(log(c(trunc(var(x_term)/2), trunc(mean(x_term))))) * 10^-10
[1] 7e-10 6e-10