Closed singmann closed 3 years ago
I'm continuing our discussion from #69 about residuals.
I found a way to get the residuals without using aov
(but also without using the mlm
lm
):
residuals.afex_aov <- function(object, model = "multivariate", ...){
if (length(attr(object, "within")) == 0 || model == "multivariate") {
return(residuals(object$lm))
} else {
data <- object$data$long
dv <- attr(object,'dv')
id <- attr(object,'id')
between <- names(attr(object,'between'))
within <- names(attr(object,'within'))
# within
combs <- expand.grid(lapply(within, function(x) c(x,NA)))
combs$id <- id
combs <- head(combs,-1)
within_res <- list()
for (i in seq_len(nrow(combs))) {
tem_fs <- as.vector(na.omit(t(combs[i,])))
ag_data <- aggregate(data[,dv],data[,tem_fs],mean)
temp_name <- paste0(head(tem_fs,-1),collapse = '*')
form <- formula(paste0("x~",temp_name))
within_res[[temp_name]] <- residuals(lm(form,ag_data))
}
all_residuals <- within_res
# between
if (!is.null(between)) {
ag_data <- aggregate(data[,dv],data[,c(between,id)],mean)
form <- formula(paste0("x~",paste0(c(between),collapse = '*')))
all_residuals[[id]] <- residuals(lm(form,ag_data))
}
return(all_residuals)
}
}
residuals_qqplot <- function(object){
if (!requireNamespace("ggplot2", quietly = TRUE)) {
stop("package ggplot2 is required.", call. = FALSE)
}
all_residuals <- residuals(object,model = "univariate")
if (is.list(all_residuals)) {
all_residuals <- lapply(names(all_residuals), function(x) data.frame(residuals = all_residuals[[x]],proj = x))
plot_data <- do.call("rbind",all_residuals)
} else {
plot_data <- data.frame(residuals = all_residuals,
proj = "Error")
}
ggplot2::ggplot(plot_data,ggplot2::aes(sample = residuals)) +
ggplot2::geom_qq() +
ggplot2::geom_qq_line() +
ggplot2::facet_wrap(~proj)
}
library(afex)
#> Loading required package: lme4
#> Loading required package: Matrix
#> ************
#> Welcome to afex. For support visit: http://afex.singmann.science/
#> - Functions for ANOVAs: aov_car(), aov_ez(), and aov_4()
#> - Methods for calculating p-values with mixed(): 'KR', 'S', 'LRT', and 'PB'
#> - 'afex_aov' and 'mixed' objects can be passed to emmeans() for follow-up tests
#> - NEWS: library('emmeans') now needs to be called explicitly!
#> - Get and set global package options with: afex_options()
#> - Set orthogonal sum-to-zero contrasts globally: set_sum_contrasts()
#> - For example analyses see: browseVignettes("afex")
#> ************
#>
#> Attaching package: 'afex'
#> The following object is masked from 'package:lme4':
#>
#> lmer
data(obk.long, package = "afex")
# mixed
afex_mix <- aov_ez('id','value',obk.long,
between=c('treatment',"gender"),
within = c('phase','hour'))
#> Contrasts set to contr.sum for the following variables: treatment, gender
residuals_qqplot(afex_mix)
# with covariate
afex_cov <- aov_ez('id','value',obk.long,
between=c('treatment',"gender"),
within = c('phase','hour'),
covariate = 'age', factorize = FALSE)
#> Contrasts set to contr.sum for the following variables: treatment, gender
residuals_qqplot(afex_cov)
# within only
afex_within <- aov_ez('id','value',obk.long,
within = c('phase'), factorize = FALSE)
#> Warning: More than one observation per cell, aggregating the data using
#> mean (i.e, fun_aggregate = mean)!
residuals_qqplot(afex_within)
# between only
afex_between <- aov_ez('id','value',obk.long,
between=c('treatment'))
#> Warning: More than one observation per cell, aggregating the data using
#> mean (i.e, fun_aggregate = mean)!
#> Contrasts set to contr.sum for the following variables: treatment
residuals_qqplot(afex_between)
Created on 2019-05-03 by the reprex package (v0.2.1)
This is actually better than the previous version that did fit an aov
, as the residuals for each error term is of the correct length.
Will the fix from @mattansb get accepted into the package? I find this functionality very useful.
Sorry to barge in, I just came across this discussion. How do you feel about adding an option to detrend the QQ-Plot and add a confidence interval?
residuals_qqplot <- function(object, detrend = FALSE){
if (!requireNamespace("ggplot2", quietly = TRUE)) {
stop("package ggplot2 is required.", call. = FALSE)
}
all_residuals <- residuals(object,model = "univariate")
if (is.list(all_residuals)) {
all_residuals <- lapply(names(all_residuals), function(x) data.frame(residuals = all_residuals[[x]],proj = x))
plot_data <- do.call("rbind",all_residuals)
} else {
plot_data <- data.frame(residuals = all_residuals,
proj = "Error")
}
if(detrend) {
n_residuals <- aggregate(residuals ~ proj, plot_data, length)
ci_range <- qnorm(range(ppoints(max(n$residuals))))
# Adapted from Buuren & Fredriks (2001; doi: 10.1002/sim.746)
qq_ci <- function(n, level = 0.95, lz = -4, hz = 4, length.out = 100) {
# adds confidence interval to Q–Q plot panel
z <- seq(lz, hz, length.out = length.out)
p <- pnorm(z)
se <- (1/dnorm(z)) * (sqrt(p * (1-p)/n))
low <- qnorm((1 - level)/2) * se
data.frame(x = z, ymin = low, ymax = -low)
}
qq_ci_data <- apply(
n_residuals,
1,
function(x) {
data.frame(
proj = x["proj"],
qq_ci(as.numeric(x["residuals"]), lz = ci_range[1], hz = ci_range[2], length.out = 100),
row.names = NULL
)
}
)
qq_ci_data <- do.call("rbind", qq_ci_data)
ggplot2::ggplot(plot_data, ggplot2::aes(sample = residuals)) +
ggplot2::geom_ribbon(
data = qq_ci_data
, ggplot2::aes(x = x, ymin = ymin, ymax = ymax), inherit.aes = FALSE
, alpha = 0.25
) +
ggplot2::geom_qq(ggplot2::aes(y = ..sample.. - ..theoretical..)) +
ggplot2::geom_hline(yintercept = 0) +
labs(x = "Theoretical", y = "Deviation") +
ggplot2::facet_wrap(~proj, scales = "free")
} else {
ggplot2::ggplot(plot_data,ggplot2::aes(sample = residuals)) +
ggplot2::geom_qq() +
ggplot2::geom_qq_line() +
labs(x = "Theoretical", y = "Observed") +
ggplot2::facet_wrap(~proj)
}
}
This should work with all the examples above. Here's an example:
residuals_qqplot(afex_mix, detrend = TRUE)
Sorry for the delay in reacting to this. The pull request with the residuals
method (#70) will be added hopefully at some point next week.
I am still not a big fan of having a dedicated qq_plot
function and would hope for something more generic. For example, adding support for a qqplot
function from a different package.
For example, adding support for a qqplot function from a different package.
Once the residual methods will be implemented here, I plan to do just that with ggResidpanel
.
Residuals function is added now in #70. Let me know if I should wait with pushing to CRAN for the ggResidpanel
method.
Thanks Henrik!
I'm still thinking of the best way to implement plotting of the residuals (ggResidpanel or some other ggplot implementation). No need to wait for that for the push to CRAN, imo.
Thanks!
-- Mattan S. Ben-Shachar, PhD student Department of Psychology & Zlotowski Center for Neuroscience Ben-Gurion University of the Negev The Developmental ERP Lab
On Tue, Jun 9, 2020, 22:30 Henrik Singmann notifications@github.com wrote:
Residuals function is added now in #70 https://github.com/singmann/afex/pull/70. Let me know if I should wait with pushing to CRAN for the ggResidpanel method.
— You are receiving this because you were mentioned. Reply to this email directly, view it on GitHub https://github.com/singmann/afex/issues/64#issuecomment-641524546, or unsubscribe https://github.com/notifications/unsubscribe-auth/AINRP6EZVPNAEYRSELJTPUTRV2EWNANCNFSM4GFGGDPA .
Actually...
it seems that ggResidpanel
can be used now like this:
library(afex)
data(obk.long, package = "afex")
fit <- aov_ez('id', 'value', obk.long,
between = c('treatment', 'gender'),
within = c('phase', 'hour'))
#> Contrasts set to contr.sum for the following variables: treatment, gender
ggResidpanel::resid_auxpanel(residuals = residuals(fit),
predicted = fitted(fit))
#> Warning: Data was changed during ANOVA calculation. Thus, residuals cannot be added to original data.
#> residuals(..., append = TRUE) will return data and residuals.
#> Warning: Data was changed during ANOVA calculation. Thus, fitted values cannot be added to original data.
#> fitted(..., append = TRUE) will return data and fitted values.
Additionally, I have the residuals_qqplot()
function (and updated version from what we discussed last year):
residuals_qqplot(fit)
residuals_qqplot(fit, by_term = TRUE) # for within-subject error terms only
residuals_qqplot(fit, by_term = TRUE, model = "multivariate") # for within-subject error terms only
residuals_qqplot <- function(object, by_term = FALSE,
model = c("univariate", "multivariate"),
qqbands = TRUE, detrend = FALSE, return = c("plot", "data")){
stopifnot(requireNamespace("ggplot2"))
if (qqbands && requireNamespace("qqplotr")) {
qq_stuff <- list(qqplotr::stat_qq_band(detrend = detrend),
qqplotr::stat_qq_line(detrend = detrend),
qqplotr::stat_qq_point(detrend = detrend))
} else {
qq_stuff <- list(ggplot2::stat_qq(),
ggplot2::stat_qq_line())
}
model <- match.arg(model)
return <- match.arg(return)
within <- names(attr(object, "within"))
id <- attr(object, 'id')
e <- residuals(object, append = TRUE)
if (!by_term) {
e$term <- "residuals"
} else if (length(within) > 0L & model == "univariate") {
wf <- lapply(within, function(x) c(NA, x))
wf <- do.call(expand.grid, wf)
wl <- apply(wf, 1, function(x) c(na.omit(x)))
fin_e <- vector("list", length = length(wl))
for (i in seq_along(wl)) {
temp_f <- c(id, wl[[i]])
temp_e <- aggregate(e$.residuals, e[, temp_f, drop = F], FUN = mean)
fin_e[[i]] <- data.frame(
term = paste0(temp_f, collapse = ":"),
.residuals = temp_e[[ncol(temp_e)]]
)
}
e <- do.call(rbind, fin_e)
} else if (length(within) > 0L) {
e$term <- apply(e[, within], 1, function(x) paste0(x, collapse = "/"))
} else {
e$term <- id
}
if (return == "data") {
return(e[,c("term",".residuals")])
}
ggplot2::ggplot(e, ggplot2::aes(sample = .data$.residuals)) +
qq_stuff +
ggplot2::facet_wrap(~ .data$term, scales = "free") +
ggplot2::theme_minimal() +
ggplot2::theme(
axis.text.x = ggplot2::element_blank(),
axis.ticks.x = ggplot2::element_blank(),
axis.text.y = ggplot2::element_blank(),
axis.ticks.y = ggplot2::element_blank()
) +
ggplot2::labs(x = "Theoretical",
y = "Sample")
}
@singmann Let me know if you'd like the latter in afex
/ any suggested changes.
I have added the code of how to use ggResidpanel
with the two methods to the examples of the methods.
However, I am still a bit wary of adding the residuals_qqplot
function. Is there no existing function that does this in a package we could support via a method? Also, it would be really great to have something that works for both aNOVA and mixed models (i.e., function mixed). But for this, I first need to add a residuals
method for the latter.
Anyway, I am also considering changing the warning
to a message
if the data is changed. A warning is maybe a bit much. With the plotting example it is clear that in many cases one might use it in a way in which this behaviour is totally fine. So a message may be appropriate.
Is there no existing function that does this in a package we could support via a method?
I am not aware of any functions for diagnostics on residuals by term in a repeated-measures context (or in a hierarchical context where it is more appropriate to look at the distribution of random effects)... That's actually how I got to this issue in the first place, when teaching how to check the assumptions of within-subject ANOVAs, I couldn't find any tools to allow for error-term-wise diagnostics. From what I can tell assumptions related to residuals in such models are performed on the residuals as a whole. 🤷♂️
I am also considering changing the warning to a message
Yeah, that sounds more appropriate for a broader context of situations.
I see. Do you have an actual real-life example data set where this matters? In other words, I would be willing to accept this, but I think it would then be great to have a vignette actually showing how this should/could be used. Maybe this vignette could also include the other assumption tests.
"real-life example data set where this matters" - do you mean a dataset were this assumption is violated? I guess any RT data is an easy target...
library(afex)
data(stroop, package = "afex")
stroop <- na.omit(stroop)
fit <-
aov_ez("pno", "rt", stroop,
within = c("congruency", "condition"),
between = "study",
include_aov = FALSE)
#> Warning: More than one observation per cell, aggregating the data using mean
#> (i.e, fun_aggregate = mean)!
#> Contrasts set to contr.sum for the following variables: study
residuals_qqplot(fit, by_term = TRUE)
#> Loading required namespace: qqplotr
Created on 2020-06-10 by the reprex package (v0.3.0)
But what is the consequence of that? What should the user do in this case? Does it look better with log(RT)
?
Personally not a fan of transformations, so I would bootstrap / use a permutation test instead, but since log is common for RTs, in this case I guess I approve 😊
So here is an example when log(RT) works for this assumption:
library(afex)
#> Loading required package: lme4
#> Loading required package: Matrix
#> Registered S3 methods overwritten by 'car':
#> method from
#> influence.merMod lme4
#> cooks.distance.influence.merMod lme4
#> dfbeta.influence.merMod lme4
#> dfbetas.influence.merMod lme4
#> ************
#> Welcome to afex. For support visit: http://afex.singmann.science/
#> - Functions for ANOVAs: aov_car(), aov_ez(), and aov_4()
#> - Methods for calculating p-values with mixed(): 'KR', 'S', 'LRT', and 'PB'
#> - 'afex_aov' and 'mixed' objects can be passed to emmeans() for follow-up tests
#> - NEWS: library('emmeans') now needs to be called explicitly!
#> - Get and set global package options with: afex_options()
#> - Set orthogonal sum-to-zero contrasts globally: set_sum_contrasts()
#> - For example analyses see: browseVignettes("afex")
#> ************
#>
#> Attaching package: 'afex'
#> The following object is masked from 'package:lme4':
#>
#> lmer
data(stroop, package = "afex")
stroop <- na.omit(stroop)
stroop <- subset(stroop, study == "1")
fit <- aov_ez("pno", "rt", stroop,
within = c("congruency", "condition"),
include_aov = FALSE)
#> Warning: More than one observation per cell, aggregating the data using mean
#> (i.e, fun_aggregate = mean)!
residuals_qqplot(fit, by_term = TRUE)
#> Loading required namespace: qqplotr
fit <- aov_ez("pno", "rt", stroop,
transformation = "log",
within = c("congruency", "condition"),
include_aov = FALSE)
#> Warning: More than one observation per cell, aggregating the data using mean
#> (i.e, fun_aggregate = mean)!
residuals_qqplot(fit, by_term = TRUE)
Created on 2020-06-10 by the reprex package (v0.3.0)
I see. I am willing to accept this function then.
It still would be good to have a full example in terms of a vignette. Also discussing the other assumption tests. Are you willing to provide this, too? Feel free to use the stroop data and copy from the other vignette if it would make sense.
I could write a short vignette, sure. I'll try to get to it sometime next week.
I think this issue can be closed.
afex_aov
objects should have a residuals function that works for both between and within models. The following code could be the starting point: