tidymodels / infer

An R package for tidyverse-friendly statistical inference
https://infer.tidymodels.org
Other
726 stars 80 forks source link

How about add a function to run all workflow? #519

Closed Yunuuuu closed 10 months ago

Yunuuuu commented 11 months ago

Such as this, estimate the pvalue and confidence interval:

infer <- function(
    data, formula, stat, reps = 2000L, level = 0.95,
    direction = "two-sided", null = NULL, type = NULL,
    null_reps = reps, ci_reps = reps, ...,
    variables = NULL, response = NULL, explanatory = NULL, success = NULL,
    p = NULL, mu = NULL, med = NULL, sigma = NULL) {
    response <- rlang::enquo(response)
    explanatory <- rlang::enquo(explanatory)
    if (missing(formula)) {
        model <- infer::specify(data,
            response = !!response,
            explanatory = !!explanatory,
            success = success
        )
    } else {
        model <- infer::specify(data,
            formula = formula,
            response = !!response,
            explanatory = !!explanatory,
            success = success
        )
    }
    explanatory <- infer:::explanatory_name(model)
    if (length(explanatory) > 1L) {
    } else {
        stat <- match.arg(stat, infer:::implemented_stats)
    }
    # check if we need Declare a null hypothesis ---------------
    null_hypo <- quote(infer::hypothesize(
        model,
        null = null,
        p = p, mu = mu, med = med, sigma = sigma
    ))
    if (length(explanatory)) {
        null <- null %||% "independence"
        null_type <- "permute"
    } else {
        null <- null %||% "point"
        if (identical(attr(model, "response_type"), "factor")) {
            null_type <- "draw"
        } else {
            null_type <- "bootstrap"
        }
    }
    # Calculating the observed statistic
    if (length(explanatory) > 1L) {
        obs_stat <- infer::fit(model)
        model <- eval(null_hypo)
        null_dist <- model
    } else {
        if (!any(stat == infer:::untheorized_stats)) {
            model <- eval(null_hypo)
        }
        obs_stat <- infer::calculate(x = model, stat = stat, ...)
        if (any(stat == infer:::untheorized_stats)) {
            null_dist <- eval(null_hypo)
        } else {
            null_dist <- model
        }
    }
    # generating the null distribution
    if (null_type == "permute") {
        variables <- rlang::enquo(variables)
        if (rlang::quo_is_null(variables)) {
            variables <- attr(model, "response")
        }
        null_dist <- infer::generate(null_dist,
            reps = null_reps, type = null_type,
            variables = !!variables
        )
    } else {
        null_dist <- infer::generate(null_dist,
            reps = null_reps, type = null_type
        )
    }
    if (length(explanatory) > 1L) {
        null_dist <- infer::fit(null_dist)
        # calculate Confidence intervals
        ci <- infer::get_ci(
            x = null_dist,
            level = level, type = type,
            point_estimate = obs_stat
        )
    } else {
        null_dist <- infer::calculate(x = null_dist, stat = stat, ...)
        boot_dist <- infer::generate(model,
            reps = ci_reps, type = attr(model, "type")
        )
        boot_dist <- infer::calculate(x = boot_dist, stat = stat, ...)
        # calculate Confidence intervals
        ci <- infer::get_ci(x = boot_dist, level = level, type = type)
    }
    # calculate P-value
    pvalue <- infer::get_pvalue(null_dist, obs_stat, direction = direction)
    cbind(obs_stat, pvalue, ci)
}

Everything works fine

data(gss, package = "infer")
infer(gss, response = hours, stat = "mean", mu = 40)
#>     stat p_value lower_ci upper_ci
#> 1 41.382   0.029 40.07785  42.7121
infer(gss, response = hours, stat = "t", mu = 40)
#>       stat p_value  lower_ci upper_ci
#> 1 2.085191   0.044 -1.956919 1.983355
infer(gss, response = hours, stat = "median", med = 40)
#>   stat p_value lower_ci upper_ci
#> 1   40       1       40       40
infer(gss, response = sex, success = "female", stat = "prop", p = .5)
#>    stat p_value lower_ci upper_ci
#> 1 0.474   0.244     0.43    0.518
infer(gss, response = sex, success = "female", stat = "z", p = .5)
#>        stat p_value  lower_ci upper_ci
#> 1 -1.162755   0.258 -2.057183 1.969976
infer(gss, college ~ sex,
  success = "no degree",
  stat = "diff in props",
  order = c("female", "male")
)
#>           stat p_value    lower_ci   upper_ci
#> 1 -0.004203366    0.99 -0.09598178 0.08155747
infer(gss, hours ~ age + college, variables = c(age, college))
#>            term     estimate          term p_value          term    lower_ci
#> 1     intercept 40.608594852           age   0.909           age -0.09617401
#> 2           age  0.005959914 collegedegree   0.261 collegedegree -2.72587577
#> 3 collegedegree  1.532825452     intercept   0.707     intercept 37.21169268
#>     upper_ci
#> 1  0.1064425
#> 2  2.8049642
#> 3 45.2996399

Created on 2023-12-22 with reprex v2.0.2

simonpcouch commented 10 months ago

Thanks for the issue, @Yunuuuu! Glad you've documented that this is a possibility.

Given where the package is at in its lifecycle, and its otherwise emphasis on teaching the juxtaposition of a test statistic with a null distribution, I'm going to go ahead and close.

github-actions[bot] commented 10 months ago

This issue has been automatically locked. If you believe you have found a related problem, please file a new issue (with a reprex: https://reprex.tidyverse.org) and link to this issue.