Seems fine by me. `ry` is already logical in the mice sampler.
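As a minimal standalone check (my own illustration, not from the thread): mice builds the response indicator from the missingness of the target column, so the `ry` handed to univariate methods is already a logical vector.

ry <- !is.na(c(1, NA, 2))  # how the response indicator is built, as far as I can tell
is.logical(ry)
#> [1] TRUE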
library(mice)
# custom imputation function that also checks whether ry is logical
mice.impute.ry <- function(y, ry, x, wy = NULL, ...) {
  if (is.null(wy)) wy <- !ry
  print(is.logical(ry))
  yry <<- y[ry]                 # save y[ry] to the global environment
  xry <<- x[ry, , drop = FALSE] # save x[ry, ] to the global environment
  print(cbind(y, ry))
  print(yry)
  y[wy] <- -1
  y[wy]                         # return the imputations for the missing cases
}
# custom method vector
meth <- make.method(nhanes)
meth["hyp"] <- "ry"
# run mice
imp <- mice(nhanes,
            m = 1,
            maxit = 1,
            printFlag = FALSE,
            method = meth)
#> [1] TRUE
#> y ry
#> 1 1 0
#> 2 1 1
#> 3 1 1
#> 4 1 0
#> 5 1 1
#> 6 1 0
#> 7 1 1
#> 8 1 1
#> 9 1 1
#> 10 1 0
#> 11 1 0
#> 12 1 0
#> 13 1 1
#> 14 2 1
#> 15 1 1
#> 16 1 0
#> 17 2 1
#> 18 2 1
#> 19 1 1
#> 20 2 1
#> 21 1 0
#> 22 1 1
#> 23 1 1
#> 24 1 1
#> 25 1 1
#> [1] 1 1 1 1 1 1 1 2 1 2 2 1 2 1 1 1 1
# combining incomplete hyp and imputed hyp
# imputations should all be -1
cbind(nhanes$hyp,
      complete(imp)$hyp)
#> [,1] [,2]
#> [1,] NA -1
#> [2,] 1 1
#> [3,] 1 1
#> [4,] NA -1
#> [5,] 1 1
#> [6,] NA -1
#> [7,] 1 1
#> [8,] 1 1
#> [9,] 1 1
#> [10,] NA -1
#> [11,] NA -1
#> [12,] NA -1
#> [13,] 1 1
#> [14,] 2 2
#> [15,] 1 1
#> [16,] NA -1
#> [17,] 2 2
#> [18,] 2 2
#> [19,] 1 1
#> [20,] 2 2
#> [21,] NA -1
#> [22,] 1 1
#> [23,] 1 1
#> [24,] 1 1
#> [25,] 1 1
# the yry vector used in the sampler
yry
#> [1] 1 1 1 1 1 1 1 2 1 2 2 1 2 1 1 1 1
# the xry matrix used in the sampler
xry
#> age bmi chl
#> 2 2 22.7 187
#> 3 1 29.6 187
#> 5 1 20.4 113
#> 7 1 22.5 118
#> 8 1 30.1 187
#> 9 2 22.0 238
#> 13 3 21.7 206
#> 14 2 28.7 204
#> 15 1 29.6 229
#> 17 3 27.2 284
#> 18 2 26.3 199
#> 19 1 35.3 218
#> 20 3 25.5 284
#> 22 1 33.2 229
#> 23 1 27.5 131
#> 24 3 24.9 131
#> 25 2 27.4 186
# versions and more
sessionInfo()
#> R version 4.2.2 (2022-10-31)
#> Platform: aarch64-apple-darwin20 (64-bit)
#> Running under: macOS Ventura 13.1
#>
#> Matrix products: default
#> BLAS: /Library/Frameworks/R.framework/Versions/4.2-arm64/Resources/lib/libRblas.0.dylib
#> LAPACK: /Library/Frameworks/R.framework/Versions/4.2-arm64/Resources/lib/libRlapack.dylib
#>
#> locale:
#> [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
#>
#> attached base packages:
#> [1] stats graphics grDevices utils datasets methods base
#>
#> other attached packages:
#> [1] mice_3.15.3
#>
#> loaded via a namespace (and not attached):
#> [1] Rcpp_1.0.10 rstudioapi_0.14 knitr_1.42 magrittr_2.0.3
#> [5] tidyselect_1.2.0 lattice_0.20-45 R6_2.5.1 rlang_1.1.1
#> [9] fastmap_1.1.1 fansi_1.0.4 dplyr_1.1.2 tools_4.2.2
#> [13] grid_4.2.2 broom_1.0.4 xfun_0.37 utf8_1.2.3
#> [17] cli_3.6.1 withr_2.5.0 htmltools_0.5.4 yaml_2.3.7
#> [21] digest_0.6.31 tibble_3.2.1 lifecycle_1.0.3 tidyr_1.3.0
#> [25] purrr_1.0.1 vctrs_0.6.2 fs_1.6.1 glue_1.6.2
#> [29] evaluate_0.20 rmarkdown_2.20 reprex_2.0.2 compiler_4.2.2
#> [33] pillar_1.9.0 backports_1.4.1 generics_0.1.3 pkgconfig_2.0.3
Created on 2023-05-19 with reprex v2.0.2
Ah sorry! I was barking up the wrong tree, it seems. Here is the root of where this (potential!) bug comes in. I am getting unexpected (to me) behavior with `mice.impute.norm`, as in the reprex below:
library(tidyverse)
suppressPackageStartupMessages(library(mice))
set.seed(1)
n <- 10000
data <- tibble(
  z = rnorm(n, sd = 1),
  x = 10 * z + rnorm(n),
  y = x + rnorm(n),
  x_miss = ifelse(z, rbinom(n, 1, 0.5), rbinom(n, 1, 0.25)),
  x_obs = ifelse(x_miss, NA, x)
)
## deterministic imputation
data$imp <- ifelse(data$x_miss,
                   predict(lm(x_obs ~ z, data = data), data),
                   data$x_obs)
cov(data$imp, data$y)
#> [1] 103.029
lm(y ~ imp, data) #unbiased
#>
#> Call:
#> lm(formula = y ~ imp, data = data)
#>
#> Coefficients:
#> (Intercept) imp
#> 0.006653 1.000305
## when I do it "by hand" (trying to mimic mice's inner workings) I get the same answer (as expected):
# design matrix and response restricted to the observed cases
x <- cbind(1, as.matrix(data$z[data$x_miss == 0]))
y <- data$x[data$x_miss == 0]
# least-squares fit, mirroring mice:::.norm.draw()
qr <- lm.fit(x = x, y = y)
c <- qr$coef
f <- qr$fitted.values
r <- t(qr$residuals)
v <- try(solve(as.matrix(crossprod(qr.R(qr$qr)))), silent = TRUE)
# draw sigma and beta, then impute the missing x with added noise
sigma <- sqrt(sum(r^2) / rchisq(1, qr$df))
beta <- c + t(chol(mice:::sym(v))) %*% rnorm(2, 0, sigma)
e <- rnorm(n, 0, sigma)
x_p <- beta[1] + beta[2] * data$z + e
data$byhand_imp <- ifelse(data$x_miss, x_p, data$x_obs)
cov(data$byhand_imp, data$y)
#> [1] 102.9739
lm(y ~ byhand_imp, data) #unbiased
#>
#> Call:
#> lm(formula = y ~ byhand_imp, data = data)
#>
#> Coefficients:
#> (Intercept) byhand_imp
#> 0.02286 0.99614
## do it with mice using method = "norm", covariance is not what I would have expected:
data$mice_imp <- complete(mice(data, m = 1, method = "norm", formulas = list(x_obs ~ z), print = FALSE))$x_obs
#> Warning: Number of logged events: 5
cov(data$mice_imp, data$y)
#> [1] 51.24754
lm(y ~ mice_imp, data) #uh oh biased!
#>
#> Call:
#> lm(formula = y ~ mice_imp, data = data)
#>
#> Coefficients:
#> (Intercept) mice_imp
#> -0.1282 0.4822
Created on 2023-05-19 with reprex v2.0.2
Basically, I would expect the covariance between the imputed value and `y` to be the same as when I did deterministic imputation, and that is the case when I use the inner workings of mice to do the imputation by hand; but when I run the `mice()` function it is about 1/2 of what I would expect. I'm sorry I couldn't track down the cause better!
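As a back-of-the-envelope check (my own addition, not part of the reprex), the covariance one should expect under this data-generating model is:

# z ~ N(0, 1), x = 10 * z + e1, y = x + e2, with e1 and e2 independent N(0, 1),
# so cov(x, y) = var(x) = 10^2 * 1 + 1 = 101; the ~103 estimates above match
# this, and the mice result of ~51 is indeed about half
10^2 * 1 + 1
#> [1] 101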
Oh no, this might be a user error. For some reason, when I run it locally I see:
library(tidyverse)
suppressPackageStartupMessages(library(mice))
set.seed(1)
n <- 10000
data <- tibble(
  z = rnorm(n, sd = 1),
  x = 10 * z + rnorm(n),
  y = x + rnorm(n),
  x_miss = ifelse(z, rbinom(n, 1, 0.5), rbinom(n, 1, 0.25)),
  x_obs = ifelse(x_miss, NA, x)
)
m <- mice(data, m = 1, method = "norm", formulas = list(x_obs ~ z))
#>
#> iter imp variable
#> 1 1 x_obs
#> 2 1 x_obs
#> 3 1 x_obs
#> 4 1 x_obs
#> 5 1 x_obs
#> Warning: Number of logged events: 5
m$loggedEvents
#> NULL
(and still see the strange result), but when I run it in a new session I see "All predictors are constant or have too high correlation." So that is likely the issue. Closing this! Thank you for looking into it.
The generated data are multicollinear, and mice removes predictors via `mice:::remove.lindep()`. This can be bypassed, see below.
library(tibble)
library(dplyr)
library(mice, warn.conflicts = FALSE)
# fix seed
set.seed(1)
# data generation
n <- 10000
data <- tibble(
  z = rnorm(n, sd = 1),
  x = 10 * z + rnorm(n),
  y = x + rnorm(n),
  x_miss = ifelse(z, rbinom(n, 1, 0.5), rbinom(n, 1, 0.25)),
  x_obs = ifelse(x_miss, NA, x)
)
imp <- mice(data,
            m = 1,
            method = "norm",
            formulas = list(x_obs ~ z),
            print = FALSE)
#> Warning: Number of logged events: 5
imp$loggedEvents
#> it im dep meth out
#> 1 1 1 x_obs norm All predictors are constant or have too high correlation.
#> 2 2 1 x_obs norm All predictors are constant or have too high correlation.
#> 3 3 1 x_obs norm All predictors are constant or have too high correlation.
#> 4 4 1 x_obs norm All predictors are constant or have too high correlation.
#> 5 5 1 x_obs norm All predictors are constant or have too high correlation.
imp
#> Class: mids
#> Number of multiple imputations: 1
#> Imputation methods:
#> x_obs
#> "norm"
#> PredictorMatrix:
#> z x y x_miss x_obs
#> x_obs 1 1 1 1 0
#> Number of logged events: 5
#> it im dep meth out
#> 1 1 1 x_obs norm All predictors are constant or have too high correlation.
#> 2 2 1 x_obs norm All predictors are constant or have too high correlation.
#> 3 3 1 x_obs norm All predictors are constant or have too high correlation.
#> 4 4 1 x_obs norm All predictors are constant or have too high correlation.
#> 5 5 1 x_obs norm All predictors are constant or have too high correlation.
# bypass mice:::remove.lindep()
imp <- mice(data,
            m = 1,
            method = "norm",
            formulas = list(x_obs ~ z),
            print = FALSE,
            eps = 0)
imp$loggedEvents
#> NULL
imp
#> Class: mids
#> Number of multiple imputations: 1
#> Imputation methods:
#> x_obs
#> "norm"
#> PredictorMatrix:
#> z x y x_miss x_obs
#> x_obs 1 1 1 1 0
# estimates are fine when bypassing remove.lindep()
x <- complete(imp) %>%
  select(x_obs)
y <- data$y
cov(x, y) #unbiased
#> [,1]
#> x_obs 103.1067
lm(y ~ x_obs, data = cbind(y, x))
#>
#> Call:
#> lm(formula = y ~ x_obs, data = cbind(y, x))
#>
#> Coefficients:
#> (Intercept) x_obs
#> -0.001807 0.995021
Created on 2023-05-19 with reprex v2.0.2
Again, this is expected behaviour.
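To make the "too high correlation" message concrete, here is a small check of my own (not part of the thread), reusing `data` as generated in the reprex above:

# restrict to rows where x_obs is observed and look at the pairwise correlations:
# every candidate predictor is correlated almost perfectly (roughly 0.99 or
# higher) with the target x_obs, which is what the logged message refers to
obs <- data[data$x_miss == 0, ]
round(cor(obs[, c("z", "x", "y", "x_obs")]), 3)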
Your last response crossed mine. Thanks for closing @LucyMcGowan.
Sorry again (and thank you for your patience!). I was trying to recreate the issue and created the `ry` bug myself in my recreation (and inadvertently suppressed mice logging in the process 🤦‍♀️).
The behavior still surprises me a bit: if I have a single very good predictor of `x`, I would expect to be able to use it without issue (with the multicollinearity fixes only kicking in if I had multiple variables that were collinear?), but it's possible I haven't thought through this enough (and also, clearly, I have a toy example here).
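One way to probe that question (my own sketch, not from the thread; it assumes mice is attached and `data` is as generated above, and the names `pred` and `imp_z` are mine) is to hand mice a predictor matrix that contains only `z` and inspect `loggedEvents`:

# start from an empty predictor matrix and allow only z to predict x_obs
pred <- make.predictorMatrix(data)
pred[, ] <- 0
pred["x_obs", "z"] <- 1
imp_z <- mice(data, m = 1, method = "norm",
              predictorMatrix = pred, print = FALSE)
imp_z$loggedEvents  # non-NULL here would mean even a single strong predictor gets screened out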
I noticed a potential bug with `.norm.draw` when passing a binary variable to use for imputation. Right now this is the line: https://github.com/amices/mice/blob/ea6d83453b39f489c4991662565dc3c8d32cfd4b/R/mice.impute.norm.R#L75
I believe `ry` needs to be replaced with `as.logical(ry)`. Otherwise, it is treating all of the values of the binary variable as the same. Here is a simple reprex:
Created on 2023-05-19 with reprex v2.0.2
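For context, a standalone illustration of my own (the original reprex is not shown above) of why `as.logical()` would matter if `ry` ever arrived as a 0/1 numeric vector:

y  <- c(10, 20, 30)
ry <- c(1, 0, 1)          # 0/1 response indicator stored as numeric
y[ry]                     # positional indexing: element 1 twice, the 0 is dropped
#> [1] 10 10
y[as.logical(ry)]         # logical indexing: the intended observed cases
#> [1] 10 30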