Prepare Package for rOpenSci

The autotest_package() function stops at util_chisquare_param_estimate() which will currently produce a warning of NaN's produced. Possible fix to exponentiate the params in the neg log liklihood func and log the initial params as the input to the func to optimize, this seems to work to force a constrained problem. It however produces sub-optimal results as seen here:

util_chisquare_param_estimate <- function(.x, .auto_gen_empirical = TRUE) {

  # Tidyeval ----
  x_term <- as.numeric(.x)
  n <- length(x_term)
  minx <- min(as.numeric(x_term))
  maxx <- max(as.numeric(x_term))

  # Checks ----
  if (!is.vector(x_term, mode = "numeric")) {
    rlang::abort(
      message = "The '.x' term must be a numeric vector.",
      use_cli_format = TRUE
    )
  }

  if (n < 2) {
    rlang::abort(
      message = "You must supply at least two data points for this function.",
      use_cli_format = TRUE
    )
  }

  # Parameters ----
  # estimate_chisq_params <- function(data) {
  #   # Negative log-likelihood function
  #   negLogLik <- function(df, ncp) {
  #     -sum(stats::dchisq(data, df = df, ncp = ncp, log = TRUE))
  #   }
  #
  #   # Initial values (adjust based on your data if necessary)
  #   start_vals <- list(df = trunc(var(data)/2), ncp = trunc(mean(data)))
  #
  #   # MLE using bbmle
  #   mle_fit <- bbmle::mle2(negLogLik, start = start_vals)
  #   # Return estimated parameters as a named vector
  #   df <- dplyr::tibble(
  #     est_df = bbmle::coef(mle_fit)[1],
  #     est_ncp = bbmle::coef(mle_fit)[2]
  #   )
  #   return(df)
  # }
  #
  # safe_estimates <- {
  #   purrr::possibly(
  #     estimate_chisq_params,
  #     otherwise = NA_real_,
  #     quiet = TRUE
  #   )
  # }
  #
  # estimates <- safe_estimates(x_term)
  # Define negative log-likelihood function
  neg_log_likelihood <- function(params) {
    params <- exp(params) * 10^-10
    df <- params[1]
    ncp <- params[2]
    sum_densities <- sum(stats::dchisq(x_term, df = df, ncp = ncp, log = TRUE))
    return(-sum_densities)
  }

  # Initial guess for parameters
  initial_params <- c(trunc(var(x_term)/2), trunc(mean(x_term)))

  # Optimize parameters using optim() function
  opt_result <- stats::optim(par = log(initial_params), fn = neg_log_likelihood)

  # Extract estimated parameters
  doff <- opt_result$par[1]
  ncp <- opt_result$par[2]

  # Return Tibble ----
  if (.auto_gen_empirical) {
    te <- tidy_empirical(.x = x_term)
    tc <- tidy_chisquare(.n = n, .df = round(doff, 3), .ncp = round(ncp, 3))
    combined_tbl <- tidy_combine_distributions(te, tc)
  }

  ret <- dplyr::tibble(
    dist_type = "Chisquare",
    samp_size = n,
    min = minx,
    max = maxx,
    mean = mean(x_term),
    dof = doff,
    ncp = ncp
  )

  # Return ----
  attr(ret, "tibble_type") <- "parameter_estimation"
  attr(ret, "family") <- "chisquare"
  attr(ret, "x_term") <- .x
  attr(ret, "n") <- n

  if (.auto_gen_empirical) {
    output <- list(
      combined_data_tbl = combined_tbl,
      parameter_tbl     = ret
    )
  } else {
    output <- list(
      parameter_tbl = ret
    )
  }

  return(output)
}

# Test ----
library(TidyDensity)
tc <- tidy_chisquare(.n = 500, .df = 6, .ncp = 1)$y
util_chisquare_param_estimate(.x = rchisq(n = 100, df = 5))
util_chisquare_param_estimate(.x = tc)

util_chisquare_param_estimate(.x = tc)$combined_data_tbl |>
  tidy_combined_autoplot()

TidyDensity::util_chisquare_param_estimate(.x = tc)$combined_data_tbl |>
  TidyDensity::tidy_combined_autoplot()
Warning messages:
1: In stats::dchisq(x_term, df = df, ncp = ncp, log = TRUE) :
  NaNs produced
2: In stats::dchisq(x_term, df = df, ncp = ncp, log = TRUE) :
  NaNs produced
3: In stats::dchisq(x_term, df = df, ncp = ncp, log = TRUE) :
  NaNs produced
4: In stats::dchisq(x_term, df = df, ncp = ncp, log = TRUE) :
  NaNs produced
5: In stats::dchisq(x_term, df = df, ncp = ncp, log = TRUE) :
  NaNs produced
6: In stats::dchisq(x_term, df = df, ncp = ncp, log = TRUE) :
  NaNs produced

Modified function:

Original function:

> x_term <- tc
> c(trunc(var(x_term)/2), trunc(mean(x_term)))
[1] 7 6
> log(c(trunc(var(x_term)/2), trunc(mean(x_term))))
[1] 1.945910 1.791759
> exp(log(c(trunc(var(x_term)/2), trunc(mean(x_term))))) * 10^-10
[1] 7e-10 6e-10

spsanderson / TidyDensity

Prepare Package for rOpenSci #513