dicook / nullabor

Tools for doing statistical inference using data plots
http://dicook.github.io/nullabor/
56 stars 10 forks source link

Add convenience functions for residual plots #24

Open mthulin opened 1 day ago

mthulin commented 1 day ago

I'd like to be able to generate lineup versions of some of the standard residual plots (residuals vs fitted, QQ-plot, scale-location, residuals vs leverage) in a single step. As far as I can tell, there aren't currently any convenience functions for this.

The current process for generating a residuals vs fitted-plot involves quite a few steps:

library(nullabor)
library(ggplot2)
x <- lm(tip ~ total_bill, data = tips)
tips.reg <- data.frame(tips, .resid = residuals(x), .fitted = fitted(x))
ggplot(lineup(null_lm(tip ~ total_bill, method = 'rotate'), tips.reg)) +
    geom_point(aes(x = total_bill, y = .resid)) +
    facet_wrap(~ .sample)

What I'm looking for are convenience functions similar to those that already exist for regular residual plots:

x <- lm(tip ~ total_bill, data = tips)
plot(x)
ggfortify::autoplot(x)

Example ggfortify::autoplot(x) output: image

A rough sketch of what these could look like are:

# Helper function for standardized residuals:
dropInf <- function(x, h) {
    if (any(isInf <- h >= 1)) {
        warning(gettextf("not plotting observations with leverage one:\n  %s", 
                         paste(which(isInf), collapse = ", ")), call. = FALSE, 
                domain = NA)
        x[isInf] <- NaN
    }
    x
}

# Updated null_lm that also computes standardized residuals and leverage:
null_lm <- function(f, method = "rotate", ...) 
{
    n <- NULL
    if (is.character(method)) {
        method <- match.fun(paste("resid", method, sep = "_"))
    }
    function(df) {
        model <- eval(substitute(lm(formula, data = df), list(formula = f)))
        resp_var <- all.vars(f[[2]])
        resid <- method(model, df, ...)
        fitted <- stats::predict(model, df)
        s <- sqrt(deviance(model)/df.residual(model))
        hii <- lm.influence(model, do.coef = FALSE)$hat
        df[".resid"] <- resid
        df[".fitted"] <- fitted
        df[".leverage"] <- hii
        df[".stdresid"] <- dropInf(resid/(s * sqrt(1 - hii)), hii)
        df[[resp_var]] <- fitted + resid
        df
    }
}

# Convenience function for plotting:
lineupplot <- function(model, type = 1, method = "rotate")
{
    model.reg <- data.frame(model$model,
                            .resid = residuals(model),
                            .fitted = fitted(model)
                            )

    s <- sqrt(deviance(model)/df.residual(model))
    hii <- lm.influence(model, do.coef = FALSE)$hat
    model.reg$.stdresid <- dropInf(model.reg$.resid/(s * sqrt(1 - hii)), hii)
    model.reg$.leverage <- hii

    if(type == 1) {
        p <- ggplot(lineup(null_lm(formula(model), method = method), model.reg)) +
            geom_point(aes(x = .fitted, y = .resid), alpha = 0.5) +
            geom_abline(aes(intercept = 0, slope = 0), colour = "red", linetype = "dashed") +
            geom_smooth(aes(x = .fitted, y = .resid), se = FALSE) +
            facet_wrap(~ .sample) +
            labs(x = "Fitted values", y = "Residuals", title = "Residuals vs Fitted")
    }
    if(type == 2) {
        p <- ggplot(lineup(null_lm(formula(model), method = method), model.reg)) +
            geom_qq(aes(sample = .stdresid), colour = "blue", alpha = 0.5) + geom_qq_line(aes(sample = .resid)) +
            facet_wrap(~ .sample) +
            labs(x = "Theoretical quantiles", y = "Standardized residuals", title = "Normal Q-Q plot")
    }
    if(type == 3) {
        p <- ggplot(lineup(null_lm(formula(model), method = method), model.reg)) +
            geom_point(aes(x = .fitted, y = sqrt(.stdresid)), alpha = 0.5) +
            geom_smooth(aes(x = .fitted, y = sqrt(.stdresid)), se = FALSE) +
            facet_wrap(~ .sample) +
            labs(x = "Fitted values", y = "Square-root of standardized residuals", title = "Scale-Location")
    }
    if(type == 4) {
        p <- ggplot(lineup(null_lm(formula(model), method = method), model.reg)) +
            geom_point(aes(x = .leverage, y = .stdresid), alpha = 0.5) +
            geom_smooth(aes(x = .leverage, y = .stdresid), se = FALSE) +
            facet_wrap(~ .sample) +
            labs(x = "Leverage", y = "Square-root of standardized residuals", title = "Residuals vs Leverage")
    }
    p
}

Getting lineup versions of the plots from plot.lm and ggfortify::autoplot is then straightforward:

x <- lm(tip ~ total_bill, data = tips)
lineupplot(x, type = 1)
lineupplot(x, type = 2)
lineupplot(x, type = 3)
lineupplot(x, type = 4)

Example lineupplot(x, type = 3) output: image

Does this make sense? If so, I'd be happy to work a bit more on the above code and then create a PR.

dicook commented 1 day ago

Very happy if you create a pull request with code to do this.