singmann / afex

Analysis of Factorial EXperiments (R package)
120 stars 32 forks source link

add residuals function for afex_aov objects #64

Closed singmann closed 3 years ago

singmann commented 5 years ago

afex_aov objects should have a residuals function that works for both between and within models. The following code could be the starting point:

library("afex")
data(md_12.1)
a1 <- aov_ez("id", "rt", md_12.1, within = c("angle", "noise"))

residuals.afex_aov <- function(object, ...) {
  if (length(attr(object, "within")) > 0) {
    str(object, 1)
    dw <- object$data$wide
    dw[,-1] <- residuals(object$lm)
    NULL # [...]
  } else {
    return(residuals(object$lm))
  }
}

md_12.1$residuals <- residuals(a1)
mattansb commented 5 years ago

I'm continuing our discussion from #69 about residuals.

I found a way to get the residuals without using aov (but also without using the mlm lm):

residuals.afex_aov <- function(object, model = "multivariate", ...){
  if (length(attr(object, "within")) == 0 || model == "multivariate") {
    return(residuals(object$lm))
  } else {
    data <- object$data$long
    dv <- attr(object,'dv')
    id <- attr(object,'id')
    between <- names(attr(object,'between'))
    within <- names(attr(object,'within'))

    # within
    combs <- expand.grid(lapply(within, function(x) c(x,NA)))
    combs$id <- id
    combs <- head(combs,-1)
    within_res <- list()
    for (i in seq_len(nrow(combs))) {
      tem_fs <- as.vector(na.omit(t(combs[i,])))
      ag_data <- aggregate(data[,dv],data[,tem_fs],mean)

      temp_name <- paste0(head(tem_fs,-1),collapse = '*')
      form <- formula(paste0("x~",temp_name))
      within_res[[temp_name]] <- residuals(lm(form,ag_data))
    }

    all_residuals <- within_res

    # between
    if (!is.null(between)) {
      ag_data <- aggregate(data[,dv],data[,c(between,id)],mean)
      form <- formula(paste0("x~",paste0(c(between),collapse = '*')))
      all_residuals[[id]] <- residuals(lm(form,ag_data))
    }
    return(all_residuals)
  }
}

residuals_qqplot <- function(object){
  if (!requireNamespace("ggplot2", quietly = TRUE)) {
    stop("package ggplot2 is required.", call. = FALSE)
  }
  all_residuals <- residuals(object,model = "univariate")

  if (is.list(all_residuals)) {
    all_residuals <- lapply(names(all_residuals), function(x) data.frame(residuals = all_residuals[[x]],proj = x))

    plot_data <- do.call("rbind",all_residuals)
  } else {
    plot_data <- data.frame(residuals = all_residuals,
                            proj = "Error")
  }

  ggplot2::ggplot(plot_data,ggplot2::aes(sample = residuals)) +
    ggplot2::geom_qq() +
    ggplot2::geom_qq_line() +
    ggplot2::facet_wrap(~proj)
}

library(afex)
#> Loading required package: lme4
#> Loading required package: Matrix
#> ************
#> Welcome to afex. For support visit: http://afex.singmann.science/
#> - Functions for ANOVAs: aov_car(), aov_ez(), and aov_4()
#> - Methods for calculating p-values with mixed(): 'KR', 'S', 'LRT', and 'PB'
#> - 'afex_aov' and 'mixed' objects can be passed to emmeans() for follow-up tests
#> - NEWS: library('emmeans') now needs to be called explicitly!
#> - Get and set global package options with: afex_options()
#> - Set orthogonal sum-to-zero contrasts globally: set_sum_contrasts()
#> - For example analyses see: browseVignettes("afex")
#> ************
#> 
#> Attaching package: 'afex'
#> The following object is masked from 'package:lme4':
#> 
#>     lmer
data(obk.long, package = "afex")

# mixed
afex_mix <- aov_ez('id','value',obk.long,
                   between=c('treatment',"gender"),
                   within = c('phase','hour'))
#> Contrasts set to contr.sum for the following variables: treatment, gender
residuals_qqplot(afex_mix)


# with covariate
afex_cov <- aov_ez('id','value',obk.long,
                   between=c('treatment',"gender"),
                   within = c('phase','hour'),
                   covariate = 'age', factorize = FALSE)
#> Contrasts set to contr.sum for the following variables: treatment, gender
residuals_qqplot(afex_cov)

# within only
afex_within <- aov_ez('id','value',obk.long,
                      within = c('phase'), factorize = FALSE)
#> Warning: More than one observation per cell, aggregating the data using
#> mean (i.e, fun_aggregate = mean)!
residuals_qqplot(afex_within)


# between only
afex_between <- aov_ez('id','value',obk.long,
                       between=c('treatment'))
#> Warning: More than one observation per cell, aggregating the data using
#> mean (i.e, fun_aggregate = mean)!
#> Contrasts set to contr.sum for the following variables: treatment
residuals_qqplot(afex_between)

Created on 2019-05-03 by the reprex package (v0.2.1)

This is actually better than the previous version that did fit an aov, as the residuals for each error term is of the correct length.

JeffreyRStevens commented 4 years ago

Will the fix from @mattansb get accepted into the package? I find this functionality very useful.

crsh commented 4 years ago

Sorry to barge in, I just came across this discussion. How do you feel about adding an option to detrend the QQ-Plot and add a confidence interval?

residuals_qqplot <- function(object, detrend = FALSE){
  if (!requireNamespace("ggplot2", quietly = TRUE)) {
    stop("package ggplot2 is required.", call. = FALSE)
  }
  all_residuals <- residuals(object,model = "univariate")

  if (is.list(all_residuals)) {
    all_residuals <- lapply(names(all_residuals), function(x) data.frame(residuals = all_residuals[[x]],proj = x))

    plot_data <- do.call("rbind",all_residuals)
  } else {
    plot_data <- data.frame(residuals = all_residuals,
                            proj = "Error")
  }

  if(detrend) {
    n_residuals <- aggregate(residuals ~ proj, plot_data, length)
    ci_range <- qnorm(range(ppoints(max(n$residuals))))

    # Adapted from Buuren & Fredriks (2001; doi: 10.1002/sim.746)
    qq_ci <- function(n, level = 0.95, lz = -4, hz = 4, length.out = 100) {
      # adds confidence interval to Q–Q plot panel
      z <- seq(lz, hz, length.out = length.out)
      p <- pnorm(z)
      se <- (1/dnorm(z)) * (sqrt(p * (1-p)/n))
      low <- qnorm((1 - level)/2) * se
      data.frame(x = z, ymin = low, ymax = -low)
    }

    qq_ci_data <- apply(
      n_residuals,
      1,
      function(x) {
        data.frame(
          proj = x["proj"],
          qq_ci(as.numeric(x["residuals"]), lz = ci_range[1], hz = ci_range[2], length.out = 100),
          row.names = NULL
        )
      }
    )

    qq_ci_data <- do.call("rbind", qq_ci_data)

    ggplot2::ggplot(plot_data, ggplot2::aes(sample = residuals)) +
      ggplot2::geom_ribbon(
        data = qq_ci_data
        , ggplot2::aes(x = x, ymin = ymin, ymax = ymax), inherit.aes = FALSE
        , alpha = 0.25
      ) +
      ggplot2::geom_qq(ggplot2::aes(y = ..sample.. - ..theoretical..)) +
      ggplot2::geom_hline(yintercept = 0) +
      labs(x = "Theoretical", y = "Deviation") +
      ggplot2::facet_wrap(~proj, scales = "free")
  } else {
    ggplot2::ggplot(plot_data,ggplot2::aes(sample = residuals)) +
      ggplot2::geom_qq() +
      ggplot2::geom_qq_line() + 
      labs(x = "Theoretical", y = "Observed") +
      ggplot2::facet_wrap(~proj)
  }
}

This should work with all the examples above. Here's an example:

residuals_qqplot(afex_mix, detrend = TRUE)

image

singmann commented 4 years ago

Sorry for the delay in reacting to this. The pull request with the residuals method (#70) will be added hopefully at some point next week.

I am still not a big fan of having a dedicated qq_plot function and would hope for something more generic. For example, adding support for a qqplot function from a different package.

mattansb commented 4 years ago

For example, adding support for a qqplot function from a different package.

Once the residual methods will be implemented here, I plan to do just that with ggResidpanel.

singmann commented 4 years ago

Residuals function is added now in #70. Let me know if I should wait with pushing to CRAN for the ggResidpanel method.

mattansb commented 4 years ago

Thanks Henrik!

I'm still thinking of the best way to implement plotting of the residuals (ggResidpanel or some other ggplot implementation). No need to wait for that for the push to CRAN, imo.

Thanks!

-- Mattan S. Ben-Shachar, PhD student Department of Psychology & Zlotowski Center for Neuroscience Ben-Gurion University of the Negev The Developmental ERP Lab

On Tue, Jun 9, 2020, 22:30 Henrik Singmann notifications@github.com wrote:

Residuals function is added now in #70 https://github.com/singmann/afex/pull/70. Let me know if I should wait with pushing to CRAN for the ggResidpanel method.

— You are receiving this because you were mentioned. Reply to this email directly, view it on GitHub https://github.com/singmann/afex/issues/64#issuecomment-641524546, or unsubscribe https://github.com/notifications/unsubscribe-auth/AINRP6EZVPNAEYRSELJTPUTRV2EWNANCNFSM4GFGGDPA .

mattansb commented 4 years ago

Actually...

ggResidpanel

it seems that ggResidpanel can be used now like this:

library(afex)

data(obk.long, package = "afex")
fit <- aov_ez('id', 'value', obk.long,
              between = c('treatment', 'gender'),
              within = c('phase', 'hour'))
#> Contrasts set to contr.sum for the following variables: treatment, gender

ggResidpanel::resid_auxpanel(residuals = residuals(fit),
                             predicted = fitted(fit))
#> Warning: Data was changed during ANOVA calculation. Thus, residuals cannot be added to original data.
#> residuals(..., append = TRUE) will return data and residuals.
#> Warning: Data was changed during ANOVA calculation. Thus, fitted values cannot be added to original data.
#> fitted(..., append = TRUE) will return data and fitted values.

residuals_qqplot

Additionally, I have the residuals_qqplot() function (and updated version from what we discussed last year):

residuals_qqplot(fit)

residuals_qqplot(fit, by_term = TRUE) # for within-subject error terms only

residuals_qqplot(fit, by_term = TRUE, model = "multivariate") # for within-subject error terms only

residuals_qqplot <- function(object, by_term = FALSE, 
                             model = c("univariate", "multivariate"),
                             qqbands = TRUE, detrend = FALSE, return = c("plot", "data")){
  stopifnot(requireNamespace("ggplot2"))

  if (qqbands && requireNamespace("qqplotr")) {
    qq_stuff <- list(qqplotr::stat_qq_band(detrend = detrend),
                     qqplotr::stat_qq_line(detrend = detrend),
                     qqplotr::stat_qq_point(detrend = detrend))

  } else {
    qq_stuff <- list(ggplot2::stat_qq(),
                     ggplot2::stat_qq_line())
  }

  model <- match.arg(model)
  return <- match.arg(return)

  within <- names(attr(object, "within"))
  id <- attr(object, 'id')

  e <- residuals(object, append = TRUE)

  if (!by_term) {
    e$term <- "residuals"
  } else if (length(within) > 0L & model == "univariate") {
    wf <- lapply(within, function(x) c(NA, x))
    wf <- do.call(expand.grid, wf)
    wl <- apply(wf, 1, function(x) c(na.omit(x)))

    fin_e <- vector("list", length = length(wl))
    for (i in seq_along(wl)) {
      temp_f <- c(id, wl[[i]])

      temp_e <- aggregate(e$.residuals, e[, temp_f, drop = F], FUN = mean)

      fin_e[[i]] <- data.frame(
        term = paste0(temp_f, collapse = ":"),
        .residuals = temp_e[[ncol(temp_e)]]
      )
    }
    e <- do.call(rbind, fin_e)
  } else if (length(within) > 0L) {
    e$term <- apply(e[, within], 1, function(x) paste0(x, collapse = "/"))
  } else {
    e$term <- id
  }

  if (return == "data") {
    return(e[,c("term",".residuals")])
  }

  ggplot2::ggplot(e, ggplot2::aes(sample = .data$.residuals)) +
    qq_stuff +
    ggplot2::facet_wrap(~ .data$term, scales = "free") +
    ggplot2::theme_minimal() +
    ggplot2::theme(
      axis.text.x = ggplot2::element_blank(),
      axis.ticks.x = ggplot2::element_blank(),
      axis.text.y = ggplot2::element_blank(),
      axis.ticks.y = ggplot2::element_blank()
    ) +
    ggplot2::labs(x = "Theoretical",
                  y = "Sample")
}

@singmann Let me know if you'd like the latter in afex / any suggested changes.

singmann commented 4 years ago

I have added the code of how to use ggResidpanel with the two methods to the examples of the methods.

However, I am still a bit wary of adding the residuals_qqplot function. Is there no existing function that does this in a package we could support via a method? Also, it would be really great to have something that works for both aNOVA and mixed models (i.e., function mixed). But for this, I first need to add a residuals method for the latter.

Anyway, I am also considering changing the warning to a message if the data is changed. A warning is maybe a bit much. With the plotting example it is clear that in many cases one might use it in a way in which this behaviour is totally fine. So a message may be appropriate.

mattansb commented 4 years ago

Is there no existing function that does this in a package we could support via a method?

I am not aware of any functions for diagnostics on residuals by term in a repeated-measures context (or in a hierarchical context where it is more appropriate to look at the distribution of random effects)... That's actually how I got to this issue in the first place, when teaching how to check the assumptions of within-subject ANOVAs, I couldn't find any tools to allow for error-term-wise diagnostics. From what I can tell assumptions related to residuals in such models are performed on the residuals as a whole. 🤷‍♂️

I am also considering changing the warning to a message

Yeah, that sounds more appropriate for a broader context of situations.

singmann commented 4 years ago

I see. Do you have an actual real-life example data set where this matters? In other words, I would be willing to accept this, but I think it would then be great to have a vignette actually showing how this should/could be used. Maybe this vignette could also include the other assumption tests.

mattansb commented 4 years ago

"real-life example data set where this matters" - do you mean a dataset were this assumption is violated? I guess any RT data is an easy target...

library(afex)

data(stroop, package = "afex")

stroop <- na.omit(stroop)

fit <-
  aov_ez("pno", "rt", stroop,
         within = c("congruency", "condition"),
         between = "study",
         include_aov = FALSE)
#> Warning: More than one observation per cell, aggregating the data using mean
#> (i.e, fun_aggregate = mean)!
#> Contrasts set to contr.sum for the following variables: study
residuals_qqplot(fit, by_term = TRUE)
#> Loading required namespace: qqplotr

Created on 2020-06-10 by the reprex package (v0.3.0)

singmann commented 4 years ago

But what is the consequence of that? What should the user do in this case? Does it look better with log(RT)?

mattansb commented 4 years ago

Personally not a fan of transformations, so I would bootstrap / use a permutation test instead, but since log is common for RTs, in this case I guess I approve 😊

So here is an example when log(RT) works for this assumption:


library(afex)
#> Loading required package: lme4
#> Loading required package: Matrix
#> Registered S3 methods overwritten by 'car':
#>   method                          from
#>   influence.merMod                lme4
#>   cooks.distance.influence.merMod lme4
#>   dfbeta.influence.merMod         lme4
#>   dfbetas.influence.merMod        lme4
#> ************
#> Welcome to afex. For support visit: http://afex.singmann.science/
#> - Functions for ANOVAs: aov_car(), aov_ez(), and aov_4()
#> - Methods for calculating p-values with mixed(): 'KR', 'S', 'LRT', and 'PB'
#> - 'afex_aov' and 'mixed' objects can be passed to emmeans() for follow-up tests
#> - NEWS: library('emmeans') now needs to be called explicitly!
#> - Get and set global package options with: afex_options()
#> - Set orthogonal sum-to-zero contrasts globally: set_sum_contrasts()
#> - For example analyses see: browseVignettes("afex")
#> ************
#> 
#> Attaching package: 'afex'
#> The following object is masked from 'package:lme4':
#> 
#>     lmer
data(stroop, package = "afex")

stroop <- na.omit(stroop)
stroop <- subset(stroop, study == "1")

fit <- aov_ez("pno", "rt", stroop,
              within = c("congruency", "condition"),
              include_aov = FALSE)
#> Warning: More than one observation per cell, aggregating the data using mean
#> (i.e, fun_aggregate = mean)!
residuals_qqplot(fit, by_term = TRUE)
#> Loading required namespace: qqplotr


fit <- aov_ez("pno", "rt", stroop,
              transformation = "log",
              within = c("congruency", "condition"),
              include_aov = FALSE)
#> Warning: More than one observation per cell, aggregating the data using mean
#> (i.e, fun_aggregate = mean)!
residuals_qqplot(fit, by_term = TRUE)

Created on 2020-06-10 by the reprex package (v0.3.0)

singmann commented 4 years ago

I see. I am willing to accept this function then.

It still would be good to have a full example in terms of a vignette. Also discussing the other assumption tests. Are you willing to provide this, too? Feel free to use the stroop data and copy from the other vignette if it would make sense.

mattansb commented 4 years ago

I could write a short vignette, sure. I'll try to get to it sometime next week.

mattansb commented 3 years ago

I think this issue can be closed.