Open nahorp opened 4 years ago
For the time being, here's how to do it in R:
rm(list = ls())
library(ggplot2)
logSumExp <- matrixStats::logSumExp
# some functions
.BANOVAas.character.formula <- function(x, ...) {
Reduce(paste, trimws(deparse(x)))
}
.BANOVAfinalizeInternalTable <- function(options, internalTable) {
# function that actually fills in the table created by .BANOVAinitModelComparisonTable
footnotes <- NULL
if (anyNA(internalTable[, "P(M|data)"])) {
# if TRUE, called from analysis
logSumExp <- matrixStats::logSumExp
logbfs <- internalTable[, "BF10"]
if (!anyNA(internalTable[, "BF10"])) {
# no errors, proceed normally and complete the table
logsumbfs <- logSumExp(logbfs)
internalTable[, "P(M|data)"] <- exp(logbfs - logsumbfs)
nmodels <- nrow(internalTable)
mm <- max(logbfs)
for (i in seq_len(nmodels)) {
internalTable[i, "BFM"] <- logbfs[i] - logSumExp(logbfs[-i]) + log(nmodels - 1L)
}
} else {
# create table excluding failed models
idxGood <- !is.na(logbfs)
widxGood <- which(idxGood)
logsumbfs <- logSumExp(logbfs[idxGood])
internalTable[, "P(M|data)"] <- exp(logbfs - logsumbfs)
nmodels <- sum(idxGood)
mm <- max(logbfs, na.rm = TRUE)
widxBad <- which(!idxGood)
for (i in widxGood) {
internalTable[i, "BFM"] <- logbfs[i] - logSumExp(logbfs[-c(i, widxBad)]) + log(nmodels - 1L)
}
internalTable[widxBad, "P(M|data)"] <- NaN
internalTable[widxBad, "BFM"] <- NaN
internalTable[widxBad, "BF10"] <- NaN
footnotes <- list(
message = gettext("<b><em>Warning.</em></b> Some Bayes factors could not be calculated. Multi model inference is carried out while excluding these models. The results may be uninterpretable!")
)
}
} # else: results already computed
# create the output table
table <- as.data.frame(internalTable)
table[["Models"]] <- rownames(internalTable)
if (options[["bayesFactorType"]] == "LogBF10") {
table[["BFM"]] <- internalTable[, "BFM"]
} else {
table[["BFM"]] <- exp(internalTable[, "BFM"])
}
o <- order(table[["BF10"]], decreasing = TRUE)
table <- table[o, ]
idxNull <- which(o == 1L)
if (options[["bayesFactorOrder"]] == "nullModelTop") {
table[idxNull, "error %"] <- NA
table <- table[c(idxNull, seq_len(nrow(table))[-idxNull]), ]
} else {
table[["BF10"]] <- table[["BF10"]] - table[1L, "BF10"]
# recompute error (see BayesFactor:::`.__T__/:base`$`BFBayesFactor#BFBayesFactor`)
table[idxNull, "error %"] <- 0
table[["error %"]] <- sqrt(table[["error %"]]^2 + table[["error %"]][1L]^2)
table[1L, "error %"] <- NA
}
# table[["BF10"]] <- .recodeBFtype(table[["BF10"]], newBFtype = options[["bayesFactorType"]], oldBFtype = "LogBF10")
table[["error %"]] <- 100 * table[["error %"]]
if (!is.null(footnotes)) {
idxNan <- which(is.nan(as.matrix(table[-ncol(table)])), arr.ind = TRUE)
footnotes[["rows"]] <- idxNan[, "row"]
footnotes[["cols"]] <- idxNan[, "col"]
}
return(list(table = table, internalTable = internalTable, footnotes = footnotes))
}
makePredictions <- function(modelFormula, dataset, posteriorSamples) {
# get dependent and independent variabels
dvs <- all.vars(modelFormula)[1L]
ivs <- all.vars(modelFormula)[-1L]
idx <- sapply(dataset[ivs], is.factor)
# do one-hot encoding (thank base R for calling a function `contrast` and giving it the argument `contrast`...)
datOneHot <- model.matrix(modelFormula, data = dataset[c(dvs, ivs)],
contrasts.arg = lapply(dataset[ivs][idx], contrasts, contrasts = FALSE))
# here we need to do some more data set specific ordering of the names.
# We want colnames(datOneHot) == colnames(samplesModelAveraged)
# in the example this goes wrong but only for the interactions effects.
# this is cody is kinda messy because stats made their naming scheme hard to reverse engineer into variable:factorlevel
# luckily the naming scheme of BayesFactor is more sensible (variable1:variable2:...-factorLevel1.&.factorLevel2.&. ....)
colnames(datOneHot)[1L] <- "mu" # BayesFactor calls the intercept "mu"
# drop g_ and residual variance
posteriorSamples <- posteriorSamples[, !startsWith(colnames(posteriorSamples), "g_") & colnames(posteriorSamples) != "sig2"]
newNms <- colnames(posteriorSamples)
interactions <- grepl(".&.", newNms)
newNms[!interactions] <- gsub("-", "", newNms[!interactions])
# first element contains variables, second contains factor levels
tmp <- strsplit(newNms[interactions], "-")
newNms[interactions] <- sapply(tmp, function(x) {
varNms <- strsplit(x[[1L]], ":", fixed = TRUE)[[1L]]
levelNms <- strsplit(x[[2L]], ".&.", fixed = TRUE)[[1L]]
paste0(varNms, levelNms, collapse = ":")
})
colnames(posteriorSamples) <- newNms
# reorder datOneHot to have the columns in the same order as samplesModelAveraged
newOrder <- match(colnames(datOneHot), colnames(posteriorSamples))
datOneHot <- datOneHot[, newOrder]
if (!all(colnames(datOneHot) == colnames(posteriorSamples)))
stop("something went wrong with matching the names of the parameters with those of the data set!")
predsMA <- tcrossprod(datOneHot, posteriorSamples)
return(predsMA)
}
replicateJASP <- function(dataset, formulaFullModel, neverExclude, whichRandom, iterations = 1e3L,
bfIterations = 1e4L) {
# generate all formulas, optionally use neverExclude to add additional nuisance terms
modelFormulas <- BayesFactor::enumerateGeneralModels(fmla = formulaFullModel, whichModels = "withmain", neverExclude = neverExclude)
# some empty objects for bf objects and posterior samples.
# note that we do NOT take this approach in JASP because here we allocate the posterior samples for all models at once.
# If the model space is large then this may consume all available RAM on your machine.
nModels <- length(modelFormulas)
bfResults <- vector("list", nModels)
posteriors <- vector("list", nModels)
logbfs <- numeric(nModels)
error <- numeric(nModels)
modelNames <- sapply(modelFormulas, .BANOVAas.character.formula)
names(error) <- names(logbfs) <- names(modelFormulas) <- names(bfResults) <- names(posteriors) <-
modelNames
# loop over all models, compute Bayes factor and sample from the posterior distribution
for (model in modelNames) {
bfResults[[model]] <- BayesFactor::lmBF(modelFormulas[[model]], data = dataset, whichRandom = whichRandom, progress=FALSE,
iterations = bfIterations)
logbfs[[model]] <- bfResults[[model]]@bayesFactor$bf
error[[model]] <- bfResults[[model]]@bayesFactor$error
posteriors[[model]] <- BayesFactor::posterior(bfResults[[model]], iterations = iterations)
# here you need to adjust things because sometimes BayesFactor mangles names.
# In JASP we handle this differently because we also do name mangling to deal with non UTF characters (e.g., Chinese, Greek or Hebrew characters)
# I'd suggest using browser() to step through the code in case something errors
# this needs adjustment for other datasets if Bayesfactor mangles the names.
if (model == modelNames[[nModels]]) {
colnames(posteriors[[model]]) <- c("mu", paste("ID", 1:12, sep = "-"), "sig2", "g_ID")
}
}
logsumbfs <- logSumExp(logbfs)
# partially recreate model comparison table from JASP.
bfTable <- data.frame(
"model" = modelNames,
"BF10" = exp(logbfs - logbfs[length(logbfs)]), # the last one is always the null model
"P(M|data)" = exp(logbfs - logsumbfs),
"error %" = error,
check.names = FALSE
)
# Using the posterior model probabilities, we now resample from the posterior distribution.
# This is similar to how BAS:::confint.coef.bas does it.
# parameter names for all models
parameterNames <- sapply(posteriors, colnames)
# most parameters implies the full model, this is how many columns we need.
idxFullModel <- which.max(lengths(parameterNames))
allParameters <- parameterNames[idxFullModel][[1L]]
# initialize matrix with all samples set to 0.
samplesModelAveraged <- matrix(0, iterations, length(allParameters), dimnames = list(NULL, allParameters))
# these models will be sampled from.
resampledModels <- sample(seq_len(nModels), size = iterations, replace = TRUE, prob = bfTable[["P(M|data)"]])
for (i in seq_len(iterations)) {
model <- resampledModels[i]
iter <- sample(iterations, 1L)
# fill parameters samples in this particular model
samplesModelAveraged[i, parameterNames[[model]]] <- posteriors[[model]][iter, parameterNames[[model]]]
}
fullModelFormula <- modelFormulas[[idxFullModel]]
predsMA <- makePredictions(fullModelFormula, dataset, samplesModelAveraged)
# sort from highest posterior probability to lowest for presentation
bfTable <- bfTable[order(bfTable[["P(M|data)"]], decreasing = TRUE), ]
rownames(bfTable) <- NULL
return(list(
bfResults = bfResults,
posteriors = posteriors,
bfTable = bfTable,
# MA is short for model averaged
samplesMA = samplesModelAveraged,
modelFormulas = modelFormulas,
predictionsMA = predsMA,
dvs = all.vars(modelFormulas[[1L]])[1L]
))
}
plotDensityPrintCRI <- function(results, dataset, index, nBestModels = 4) {
dvs <- results$dvs
samplesIndex <- c(results$predictionsMA[index, ])
cri <- quantile(samplesIndex, probs = c(0.025, 0.975))
sampleMean <- mean(dataset[index, dvs])
meanMA <- mean(samplesIndex)
nBestModels <- min(nBestModels, nrow(results$bfTable))
idxBestModels <- as.character(results$bfTable$model[seq_len(nBestModels)])
predsBestModels <- sapply(idxBestModels, function(model) {
samples <- makePredictions(results$modelFormulas[[model]], dataset[index, ], results$posteriors[[model]])
mean <- mean(samples)
d <- as.data.frame(density(samples)[c("x", "y")])
list(mean = mean, d = d)
})
df <- do.call(rbind, predsBestModels[2, ])
rownames(df) <- NULL
df$group <- rep(names(predsBestModels[2, ]), sapply(predsBestModels[2, ], nrow))
dfMA <- as.data.frame(density(samplesIndex)[c("x", "y")])
dfMA$group <- "Model Averaged"
dfAll <- rbind(df, dfMA)
dfPoint <- data.frame(
x = c(sampleMean, meanMA, cri),
y = rep(0, 4),
g = c("Sample mean", "MA mean", "MA cri lower", "MA cri upper"),
cc = c("Sample mean", "MA mean", "MA cri", "MA cri")
)
g <- ggplot(data = dfAll, mapping = aes(x = x, y = y, group = group, color = group)) +
geom_ribbon(data = subset(dfAll, group == "Model Averaged"), mapping = aes(x = x, ymax = y),
ymin = 0, inherit.aes = FALSE, alpha = .55, fill = "grey60") +
geom_line() +
geom_point(data = dfPoint, aes(x = x, y = y, shape = cc), size = 2, inherit.aes = FALSE) +
labs(color = "Model", y = "density", shape = NULL) +
# viridis::scale_color_viridis(discrete = TRUE) +
theme_bw(24)
return(g)
}
data("puzzles", package = "BayesFactor")
formulaFullModel <- RT ~ shape*color + ID
whichRandom <- neverExclude <- "ID"
iterations <- 1e4
results <- replicateJASP(puzzles, formulaFullModel, neverExclude, whichRandom, iterations)
print(results$bfTable, digits = 4) # note that error is not recalculated, unlike in JASP
# to approximately replicate this in JASP:
# rerun the example above with bfIterations = 1e6
# write.csv(puzzles, "puzzles.csv")
# 1. open Bayesian ANCOVA:
# 2. Random Factors: ID
# 3. Fixed Factors: shape + color
# 4. Order: "Compare to null model"
# 5. Under Additional Options, set "Numerical Accuracy" to "Manual" and fill in 100000 (1e6)
# 6. Dependent Variable: RT
# for each observations a vector of model averaged posterior samples
dim(results$predictionsMA)
# plot for particular subset of observations the posterior density across all observations
index <- puzzles$shape == "round"
plotDensityPrintCRI(results, puzzles, index = index, nBestModels = 3)
# note that the bimodal shape is also present in the raw data:
plot(density(puzzles[index, "RT"]))
index <- puzzles$shape == "square" & puzzles$color == "color"
plotDensityPrintCRI(results, puzzles, index = index)
# the bimodal shape is again present in the raw data:
plot(density(puzzles[index, "RT"]))
The code makes some plots to illustrate the influence of model averaging. It's not completely generic but should be easily adaptable to other data sets. I hope that helps until we implement this in JASP.
Let me know if anything is unclear or if you have any more questions!
Wow, thanks @vandenman! :) I'll have a play with this over the next week and get back to you if I have any questions. Thank you once again.
Hi @vandenman, just touching base again.
I found the weighted_posteriors
function in the bayestestR package that seems to do what we want and since it accepts a BayesFactor model object, decided to go down that route.
I quite appreciate your efforts with the above code but the above route was slightly easier and quicker. I hope to see it in implemented in JASP one day and will keep it on my radar :)
Hi @vandenman, I've returned to this paper again (and hence this analysis).
With the code you attached, is there a way I can get the means and credible intervals (i.e., not in the graph form) for each combination of shape (round, square) and color (monochromatic, color), averaged across all models (based on their posterior model probabilities)?
Hi @nahorp, if you run the code above and then the code below you should get something like this:
# all possible combinations - mean + 95% cri
ans_post <- tapply(as.numeric(rownames(puzzles)), list(puzzles$shape, puzzles$color), function(x) {
x <- c(results$predictionsMA[x, ])
c("mean" = mean(x), quantile(x, probs = c(0.025, 0.975)))
})
ans_post2 <- do.call(rbind, ans_post)
rownames(ans_post2) <- paste(rep(rownames(ans_post), 2), "-", colnames(ans_post))
ans_post2
# the same but for the raw data - mean + 95% quantiles
ans_data <- tapply(puzzles$RT, list(puzzles$shape, puzzles$color), function(x) {
c("mean" = mean(x), quantile(x, probs = c(0.025, 0.975)))
})
ans_data2 <- do.call(rbind, ans_data)
rownames(ans_data2) <- paste(rep(rownames(ans_data), 2), "-", colnames(ans_data))
ans_data2
posterior mean + 95% credible interval
ans_post2
mean 2.5% 97.5%
round - color 44.99457 40.91183 48.65437
square - monochromatic 44.28898 40.19771 47.96045
round - color 45.71288 41.62080 49.39830
square - monochromatic 45.00622 40.93272 48.67154
observed mean + 95% quantiles
ans_data2
mean 2.5% 97.5%
round - color 45 40.275 49.000
square - monochromatic 44 40.000 47.725
round - color 46 41.550 48.725
square - monochromatic 45 40.550 48.725
Thanks for that @vandenman! ans_post2
looks like exactly what I want. However, I tried running the above code to my dataset but get the below when saving results <- replicateJASP(...
Error in dimnames(x) <- dn :
length of 'dimnames' [2] not equal to array extent
My dataframe has 4 columns - participant_code, age_group, version, y - and 47 rows. Both age_group and version are categorical predictors with 2 levels each. I am just changing these lines in the above code,
formulaFullModel <- y ~ age_group*version + participant_code whichRandom <- neverExclude <- "participant_code" iterations <- 1e4
I am happy to email you my dataframe as a .csv if it helps... Thank you
Hi @nahorp a dataframe would help a lot! Can you email it to me at <d.vandenbergh at jasp-stats.org>?
Hi @nahorp a dataframe would help a lot! Can you email it to me at <d.vandenbergh at jasp-stats.org>?
Hi @vandenman, just checking that there were no issues with you receiving the dataframe (emails with attachments from our university email address are sometimes blocked by the recipients spam filter)?
@nahorp This seems to be a duplicate of https://github.com/jasp-stats/jasp-issues/issues/2444 Do you agree?
Hello :)
I have outlined the request (an 'issue' at the time of posting it at the forum but I've since been told that it is not possible in the current version of JASP) at the link below,
https://forum.cogsci.nl/discussion/5962/credible-interval-for-repeated-measures-anova
Thank you.