Open fweber144 opened 1 year ago
I think I was now able to reproduce this with a "standalone" example, i.e., without requiring the specific dataset used above:
# Data --------------------------------------------------------------------
# Generate outcome based on `df_gaussian` and several group-level terms:
data("df_gaussian", package = "projpred")
dat <- data.frame(y = df_gaussian$y, df_gaussian$x)
dat$grpvar1 <- gl(n = 5, k = nrow(dat) %/% 5, length = nrow(dat),
labels = paste0("gr", seq_len(5)))
dat$grpvar2 <- gl(n = 6, k = nrow(dat) %/% 6, length = nrow(dat),
labels = paste0("agr", seq_len(6)))
dat$grpvar3 <- gl(n = 3, k = nrow(dat) %/% 3, length = nrow(dat),
labels = paste0("tgr", seq_len(3)))
set.seed(457211)
dat$grpvar2 <- sample(dat$grpvar2)
dat$grpvar3 <- sample(dat$grpvar3)
grpvar1_icpts_truth <- rnorm(nlevels(dat$grpvar1), sd = 6)
grpvar1_X1_truth <- rnorm(nlevels(dat$grpvar1), sd = 6)
icpt <- -4.2
dat$y <- icpt +
grpvar1_icpts_truth[dat$grpvar1] +
grpvar1_X1_truth[dat$grpvar1] * dat$X1
dat$y <- rnorm(nrow(dat), mean = dat$y, sd = 4)
# Split up into training and test (hold-out) dataset:
idcs_test <- sample.int(nrow(dat), size = nrow(dat) %/% 3)
dat_train <- dat[-idcs_test, , drop = FALSE]
dat_test <- dat[idcs_test, , drop = FALSE]
# Reference model fit -----------------------------------------------------
suppressPackageStartupMessages(library(rstanarm))
rfit <- stan_glmer(
y ~ X1 + X2 + X3 + (1 | grpvar1) + (1 | grpvar1:grpvar2) + (1 | grpvar1:grpvar2:grpvar3),
data = dat_train,
chains = 1,
iter = 500,
refresh = 0,
seed = 1140350788
)
# projpred ----------------------------------------------------------------
devtools::load_all() # requires at least commit a8b25c178223e2cd607070962fa0732f9abc3d85
options(projpred.extra_verbose = TRUE)
refmodel_obj <- get_refmodel(rfit)
sep_char <- if (inherits(refmodel_obj$fit, "stanreg")) ":" else "_"
dat_test$`grpvar1:grpvar2` <- paste(dat_test$grpvar1,
dat_test$grpvar2,
sep = sep_char)
dat_test$`grpvar1:grpvar2:grpvar3` <- paste(dat_test$grpvar1,
dat_test$grpvar2,
dat_test$grpvar3,
sep = sep_char)
d_test_list <- list(
data = dat_test[, names(dat_test) != "y"],
offset = rep(0, nrow(dat_test)),
weights = rep(1, nrow(dat_test)),
y = dat_test[["y"]]
)
vs <- varsel(refmodel_obj,
d_test = d_test_list,
nclusters = 1,
refit_prj = FALSE,
seed = 46782345)
print(plot(vs))
cvvs <- cv_varsel(refmodel_obj,
cv_method = "kfold",
K = 2,
nclusters = 1,
refit_prj = FALSE,
seed = 46782345)
print(plot(cvvs, ranking_nterms_max = NA))
print(plot(cv_proportions(cvvs)))
# Forcing the group-level terms to be selected first:
get_search_terms_forced <- function(forced_terms, optional_terms) {
forced_terms <- paste(forced_terms, collapse = " + ")
return(c(forced_terms, paste0(forced_terms, " + ", optional_terms)))
}
forced_predictors <- c("(1 | grpvar1)", "(1 | grpvar1:grpvar2)",
"(1 | grpvar1:grpvar2:grpvar3)")
optional_predictors <- paste0("X", seq_len(3))
search_terms_forcedGL <- get_search_terms_forced(forced_predictors,
optional_predictors)
cvvs_forcedGL <- cv_varsel(refmodel_obj,
cv_method = "kfold",
K = 2,
nclusters = 1,
refit_prj = FALSE,
search_terms = search_terms_forcedGL,
seed = 46782345)
print(plot(cvvs_forcedGL, ranking_nterms_max = NA))
print(plot(cv_proportions(cvvs_forcedGL)))
In this Stan Discourse reply (reference model: Gaussian, multilevel), we observed a gap in predictive performance between the submodels and the reference model when
search_terms
wasNULL
(with a "jump" towards the reference model's performance at the full model size), but not when forcing both group-level terms to be selected first. Reprex copied (and reduced to the relevant part) from that reply (the example dataset may be found here; also note that the reprex writes some files to the current working directory):Details may be found in the Stan Discourse reply.