easystats / bayestestR

:ghost: Utilities for analyzing Bayesian models and posterior distributions
https://easystats.github.io/bayestestR/
GNU General Public License v3.0
561 stars 55 forks source link

return `NA`s for `point_estimate` if sample too sparse #429

Closed IndrajeetPatil closed 3 days ago

IndrajeetPatil commented 3 years ago

This will make describe_distribution function in parameters more robust to failures.

It's especially difficult to find out which grouping level doesn't have enough observations when the function fails in grouped_ context.

library(tidyverse)
library(parameters)

set.seed(123)
mtcars %>%
  group_by(cyl) %>%
  group_modify(~ describe_distribution(.$wt, centrality = "map"))
#> # A tibble: 3 x 9
#> # Groups:   cyl [3]
#>     cyl   MAP   IQR   Min   Max Skewness Kurtosis     n n_Missing
#>   <dbl> <dbl> <dbl> <dbl> <dbl>    <dbl>    <dbl> <int>     <int>
#> 1     4  2.09 0.945  1.51  3.19    0.404  -0.851     11         0
#> 2     6  3.43 0.67   2.62  3.46   -0.363  -2.08       7         0
#> 3     8  3.63 0.865  3.17  5.42    1.24    0.0780    14         0

set.seed(123)
mtcars %>%
  group_by(cyl) %>%
  group_modify(~ describe_distribution(.$wt, centrality = "map", ci = 0.95))
#> Error in bw.SJ(x, method = "ste"): sample is too sparse to find TD

Created on 2021-06-04 by the reprex package (v2.0.0)

Trace:

> traceback()
32: stop("sample is too sparse to find TD", domain = NA)
31: bw.SJ(x, method = "ste")
30: density.default(x, n = precision, bw = bw, from = x_range[1], 
        to = x_range[2], ...)
29: stats::density(x, n = precision, bw = bw, from = x_range[1], 
        to = x_range[2], ...)
28: .estimate_density_kernel(x, x_range, precision, bw, ci, ...)
27: .estimate_density(x, method = method, precision = precision, 
        extend = extend, extend_scale = extend_scale, bw = bw, ci = ci, 
        ...)
26: estimate_density.numeric(x, precision = precision, method = method, 
        ...)
25: estimate_density(x, precision = precision, method = method, ...)
24: map_estimate.numeric(x)
23: map_estimate(x)
22: point_estimate.numeric(x, centrality = centrality, dispersion = dispersion, 
        threshold = threshold, ...)
21: bayestestR::point_estimate(x, centrality = centrality, dispersion = dispersion, 
        threshold = threshold, ...) at describe_distribution.R#61
20: data.frame(..., check.names = FALSE)
19: cbind(deparse.level, ...)
18: cbind(out, bayestestR::point_estimate(x, centrality = centrality, 
        dispersion = dispersion, threshold = threshold, ...)) at describe_distribution.R#61
17: describe_distribution.numeric(data[indices], centrality = centrality, 
        dispersion = FALSE, iqr = FALSE, range = FALSE, quartiles = FALSE, 
        ci = NULL) at describe_distribution.R#37
16: parameters::describe_distribution(data[indices], centrality = centrality, 
        dispersion = FALSE, iqr = FALSE, range = FALSE, quartiles = FALSE, 
        ci = NULL) at describe_distribution.R#364
15: statistic(data, i[r, ], ...)
14: FUN(X[[i]], ...)
13: lapply(seq_len(RR), fn)
12: boot::boot(data = x, statistic = .boot_distribution, R = iterations, 
        centrality = centrality) at describe_distribution.R#84
11: describe_distribution.numeric(.$wt, centrality = "map", ci = 0.95) at describe_distribution.R#37
10: describe_distribution(.$wt, centrality = "map", ci = 0.95)
9: .f(.x, .y, ...)
8: (function (.x, .y) 
   {
       res <- .f(.x, .y, ...)
       if (!inherits(res, "data.frame")) {
           abort("The result of .f should be a data frame.")
       }
       if (any(bad <- names(res) %in% tbl_group_vars)) {
           abort(glue("The returned data frame cannot contain the original grouping variables: {names}.", 
               names = paste(names(res)[bad], collapse = ", ")))
       }
       bind_cols(.y[rep(1L, nrow(res)), , drop = FALSE], res)
   })(dots[[1L]][[2L]], dots[[2L]][[2L]])
7: mapply(.f, .x, .y, MoreArgs = list(...), SIMPLIFY = FALSE)
6: map2(chunks, group_keys, .f, ...)
5: group_map.data.frame(.data, fun, .keep = .keep)
4: group_map(.data, fun, .keep = .keep)
3: group_modify.grouped_df(., ~describe_distribution(.$wt, centrality = "map", 
       ci = 0.95))
2: group_modify(., ~describe_distribution(.$wt, centrality = "map", 
       ci = 0.95))
1: mtcars %>% group_by(cyl) %>% group_modify(~describe_distribution(.$wt, 
       centrality = "map", ci = 0.95))
ramiromagno commented 2 years ago

I am also having this issue:

Error in bw.SJ(x, method = "ste") : sample is too sparse to find TD
18.
stop("sample is too sparse to find TD", domain = NA)
17.
bw.SJ(x, method = "ste")
16.
density.default(x, n = precision, bw = bw, from = x_range[1], 
to = x_range[2], ...)
15.
stats::density(x, n = precision, bw = bw, from = x_range[1], 
to = x_range[2], ...)
14.
.estimate_density_kernel(x, x_range, precision, bw, ci, ...)
13.
.estimate_density(x, method = method, precision = precision, 
extend = extend, extend_scale = extend_scale, bw = bw, ci = ci, 
...)
12.
estimate_density.numeric(x, precision = precision, method = method, 
...)
11.
estimate_density(x, precision = precision, method = method, ...)
10.
map_estimate.numeric(x)
9.
bayestestR::map_estimate(x) at credible_interval.R#5
8.
credible_interval(rbeta(alt_counts = alt_counts[i], ref_counts = ref_counts[i], 
n = n), ci = ci) at hdi_beta.R#17
7.
obelus::hdi_beta(alt_counts = tcga_tp53$rna_alt_counts_tumor, 
ref_counts = tcga_tp53$rna_ref_counts_tumor, ci = ci_level)
6.
dplyr::mutate(., imbalance = !(0 >= ci_low & 0 <= ci_high), direction = case_when(ci_high < 
0 ~ "wt", ci_low > 0 ~ "mut", TRUE ~ "neutral"))
5.
paste0("beta_", names(.))
4.
setNames(., paste0("beta_", names(.)))
3.
list2(...)
2.
dplyr::bind_cols(tcga_tp53, .)
1.
obelus::hdi_beta(alt_counts = tcga_tp53$rna_alt_counts_tumor, 
ref_counts = tcga_tp53$rna_ref_counts_tumor, ci = ci_level) %>% 
dplyr::mutate(imbalance = !(0 >= ci_low & 0 <= ci_high), 
direction = case_when(ci_high < 0 ~ "wt", ci_low > 0 ~ ...
IndrajeetPatil commented 1 week ago

@mattansb Is it possible to tackle this in the next CRAN release?

mattansb commented 6 days ago

@IndrajeetPatil - in you're example you're trying to compute a bootstrap CI for some estimate and failing because the bootstrap samples are too sparse. Returning NA (=not failing) will result in a biased bootstrap CI.

(I'm not sure what @ramiromagno's example is doing, but since it is failing in a function called credible_interval() I'm guessing it's also doing something similar?)

So really, this issue should be "resolved" in parameters - if the bootstrap generally fails, it should catch it and return an [NA, NA] CI.

IndrajeetPatil commented 3 days ago

Closing in favour of https://github.com/easystats/datawizard/issues/540