amices / mice

Multivariate Imputation by Chained Equations
https://amices.org/mice/
GNU General Public License v2.0

Behaviour of `mice.impute.norm` #558

Closed LucyMcGowan closed 1 year ago

LucyMcGowan commented 1 year ago

I noticed a potential bug in .norm.draw() when a binary (0/1) variable is passed as ry. Right now this is the relevant line:

https://github.com/amices/mice/blob/ea6d83453b39f489c4991662565dc3c8d32cfd4b/R/mice.impute.norm.R#L75

I believe ry needs to be replaced with as.logical(ry). Otherwise, the subsetting treats the 0/1 values as row positions and keeps selecting the same row. Here is a simple reprex:

set.seed(1)
n <- 10
x <- cbind(1, as.matrix(rbinom(n, 1, 0.5)))
ry <- rbinom(n, 1, 0.5)

x[ry, , drop = FALSE]
#>      [,1] [,2]
#> [1,]    1    0
#> [2,]    1    0
#> [3,]    1    0
#> [4,]    1    0
#> [5,]    1    0
x[as.logical(ry), , drop = FALSE]
#>      [,1] [,2]
#> [1,]    1    1
#> [2,]    1    0
#> [3,]    1    1
#> [4,]    1    1
#> [5,]    1    0

Created on 2023-05-19 with reprex v2.0.2
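
To spell out the mechanism, here is a tiny standalone illustration (plain base R, nothing mice-specific; m and idx are made-up names): a numeric 0/1 index is read as row positions, so each 1 selects row 1 and each 0 selects nothing, whereas a logical index selects the TRUE positions.

m <- cbind(id = 1:4, val = c(10, 20, 30, 40))
idx <- c(1, 0, 1, 0)

m[idx, , drop = FALSE]              # numeric index: row 1 twice, zeros dropped
m[as.logical(idx), , drop = FALSE]  # logical index: rows 1 and 3, as intended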

gerkovink commented 1 year ago

Seems fine to me. ry is already logical in the mice sampler.

library(mice)

# custom imputation function that also checks if ry is logical
mice.impute.ry <- function(y, ry, x, wy = NULL, ...){
  if (is.null(wy)) wy <- !ry
  print(is.logical(ry))
  yry <<- y[ry] # save y[ry] to environment
  xry <<- x[ry, , drop = FALSE] # save x[ry, ] to environment
  print(cbind(y, ry))
  print(yry)
  y[wy] <- -1 # impute -1 for every missing entry; this value is what gets returned to the sampler
}

# custom method vector
meth <- make.method(nhanes2)
meth["hyp"] <- "ry"

# run mice
imp <- mice(nhanes, 
     m = 1, 
     maxit = 1, 
     printFlag = FALSE, 
     method = meth)
#> [1] TRUE
#>    y ry
#> 1  1  0
#> 2  1  1
#> 3  1  1
#> 4  1  0
#> 5  1  1
#> 6  1  0
#> 7  1  1
#> 8  1  1
#> 9  1  1
#> 10 1  0
#> 11 1  0
#> 12 1  0
#> 13 1  1
#> 14 2  1
#> 15 1  1
#> 16 1  0
#> 17 2  1
#> 18 2  1
#> 19 1  1
#> 20 2  1
#> 21 1  0
#> 22 1  1
#> 23 1  1
#> 24 1  1
#> 25 1  1
#>  [1] 1 1 1 1 1 1 1 2 1 2 2 1 2 1 1 1 1

# combining incomplete hyp and imputed hyp
# imputations should all be -1
cbind(nhanes$hyp, 
      complete(imp)$hyp)
#>       [,1] [,2]
#>  [1,]   NA   -1
#>  [2,]    1    1
#>  [3,]    1    1
#>  [4,]   NA   -1
#>  [5,]    1    1
#>  [6,]   NA   -1
#>  [7,]    1    1
#>  [8,]    1    1
#>  [9,]    1    1
#> [10,]   NA   -1
#> [11,]   NA   -1
#> [12,]   NA   -1
#> [13,]    1    1
#> [14,]    2    2
#> [15,]    1    1
#> [16,]   NA   -1
#> [17,]    2    2
#> [18,]    2    2
#> [19,]    1    1
#> [20,]    2    2
#> [21,]   NA   -1
#> [22,]    1    1
#> [23,]    1    1
#> [24,]    1    1
#> [25,]    1    1

# the yry vector used in the sampler
yry
#>  [1] 1 1 1 1 1 1 1 2 1 2 2 1 2 1 1 1 1

# the xry matrix used in the sampler
xry
#>    age  bmi chl
#> 2    2 22.7 187
#> 3    1 29.6 187
#> 5    1 20.4 113
#> 7    1 22.5 118
#> 8    1 30.1 187
#> 9    2 22.0 238
#> 13   3 21.7 206
#> 14   2 28.7 204
#> 15   1 29.6 229
#> 17   3 27.2 284
#> 18   2 26.3 199
#> 19   1 35.3 218
#> 20   3 25.5 284
#> 22   1 33.2 229
#> 23   1 27.5 131
#> 24   3 24.9 131
#> 25   2 27.4 186

# versions and more
sessionInfo()
#> R version 4.2.2 (2022-10-31)
#> Platform: aarch64-apple-darwin20 (64-bit)
#> Running under: macOS Ventura 13.1
#> 
#> Matrix products: default
#> BLAS:   /Library/Frameworks/R.framework/Versions/4.2-arm64/Resources/lib/libRblas.0.dylib
#> LAPACK: /Library/Frameworks/R.framework/Versions/4.2-arm64/Resources/lib/libRlapack.dylib
#> 
#> locale:
#> [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
#> 
#> attached base packages:
#> [1] stats     graphics  grDevices utils     datasets  methods   base     
#> 
#> other attached packages:
#> [1] mice_3.15.3
#> 
#> loaded via a namespace (and not attached):
#>  [1] Rcpp_1.0.10      rstudioapi_0.14  knitr_1.42       magrittr_2.0.3  
#>  [5] tidyselect_1.2.0 lattice_0.20-45  R6_2.5.1         rlang_1.1.1     
#>  [9] fastmap_1.1.1    fansi_1.0.4      dplyr_1.1.2      tools_4.2.2     
#> [13] grid_4.2.2       broom_1.0.4      xfun_0.37        utf8_1.2.3      
#> [17] cli_3.6.1        withr_2.5.0      htmltools_0.5.4  yaml_2.3.7      
#> [21] digest_0.6.31    tibble_3.2.1     lifecycle_1.0.3  tidyr_1.3.0     
#> [25] purrr_1.0.1      vctrs_0.6.2      fs_1.6.1         glue_1.6.2      
#> [29] evaluate_0.20    rmarkdown_2.20   reprex_2.0.2     compiler_4.2.2  
#> [33] pillar_1.9.0     backports_1.4.1  generics_0.1.3   pkgconfig_2.0.3

Created on 2023-05-19 with reprex v2.0.2

gerkovink commented 1 year ago

See https://github.com/amices/mice/blob/ea6d83453b39f489c4991662565dc3c8d32cfd4b/R/sampler.R#L9 and https://github.com/amices/mice/blob/ea6d83453b39f489c4991662565dc3c8d32cfd4b/R/sampler.R#L34
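
Roughly speaking, the response indicator is derived from the missingness pattern, so it is logical by construction. A toy sketch of that idea (my illustration, not the actual sampler code):

y  <- c(1, NA, 3, NA, 5)
ry <- !is.na(y)      # how a response indicator is typically built

is.logical(ry)       # TRUE
y[ry]                # 1 3 5 -- logical subsetting, no as.logical() needed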

LucyMcGowan commented 1 year ago

Ah sorry! It seems I was barking up the wrong tree. Here is where this (potential!) bug actually comes in: I am getting unexpected (to me) behavior from mice.impute.norm, as in the reprex below:

library(tidyverse)
suppressPackageStartupMessages(library(mice))

set.seed(1)
n <- 10000
data <- tibble(
  z = rnorm(n, sd = 1),
  x = 10 * z + rnorm(n),
  y = x + rnorm(n),
  x_miss = ifelse(z, rbinom(n, 1, 0.5), rbinom(n, 1, 0.25)),
  x_obs = ifelse(x_miss, NA, x)
)

## deterministic imputation
data$imp <- ifelse(data$x_miss, 
                   predict(lm(x_obs ~ z, data = data), data),
                   data$x_obs)
cov(data$imp, data$y)
#> [1] 103.029
lm(y ~ imp, data) #unbiased
#> 
#> Call:
#> lm(formula = y ~ imp, data = data)
#> 
#> Coefficients:
#> (Intercept)          imp  
#>    0.006653     1.000305

## when I do it "by hand" (trying to mimic mice's inner workings) I get the same answer (as expected):
x <- cbind(1, as.matrix(data$z[data$x_miss==0]))
y <- data$x[data$x_miss==0]
qr <- lm.fit(x = x, y = y)
c <- qr$coef
f <- qr$fitted.values
r <- t(qr$residuals)
v <- try(solve(as.matrix(crossprod(qr.R(qr$qr)))), silent = TRUE)

sigma <- sqrt(sum(r^2) / rchisq(1, qr$df))
beta <-  c + t(chol(mice:::sym(v))) %*% rnorm(2, 0, sigma) 
e <- rnorm(n, 0, sigma)
x_p <- beta[1] + beta[2] * data$z + e
data$byhand_imp <- ifelse(data$x_miss, x_p, data$x_obs)
cov(data$byhand_imp, data$y)
#> [1] 102.9739
lm(y ~ byhand_imp, data) #unbiased
#> 
#> Call:
#> lm(formula = y ~ byhand_imp, data = data)
#> 
#> Coefficients:
#> (Intercept)   byhand_imp  
#>     0.02286      0.99614

## do it with mice using method = "norm", covariance is not what I would have expected:
data$mice_imp <- complete(mice(data, m = 1, method = "norm", formulas = list(x_obs ~ z), print = FALSE))$x_obs
#> Warning: Number of logged events: 5
cov(data$mice_imp, data$y)
#> [1] 51.24754
lm(y ~ mice_imp, data) #uh oh biased!
#> 
#> Call:
#> lm(formula = y ~ mice_imp, data = data)
#> 
#> Coefficients:
#> (Intercept)     mice_imp  
#>     -0.1282       0.4822

Created on 2023-05-19 with reprex v2.0.2

Basically, I would expect the covariance between the imputed variable and y to be the same as with deterministic imputation. That is what I get when I mimic mice's inner workings by hand, but when I run the mice() function the covariance is about half of what I would expect. I'm sorry I couldn't track down the cause better!
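
If it helps with tracking it down: calling the elementary routine directly should show whether the draw itself is off. This is only a sketch on my part; it assumes the exported signature mice.impute.norm(y, ry, x, wy = NULL, ...) and reuses the data tibble from the reprex above. If the draw is fine, the covariance should land near the ~103 from the by-hand version.

## sketch: bypass the mice() wrapper and call the elementary routine directly
y_norm  <- data$x_obs
ry_norm <- !is.na(y_norm)
x_norm  <- as.matrix(data["z"])  # predictor column only; mice.impute.norm() adds the intercept itself

set.seed(1)
data$direct_imp <- y_norm
data$direct_imp[!ry_norm] <- mice.impute.norm(y = y_norm, ry = ry_norm, x = x_norm)
cov(data$direct_imp, data$y)     # expected to be close to the ~103 above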

LucyMcGowan commented 1 year ago

Oh no, this might be user error. For some reason, when I run it locally I see:

library(tidyverse)
suppressPackageStartupMessages(library(mice))

set.seed(1)
n <- 10000
data <- tibble(
  z = rnorm(n, sd = 1),
  x = 10 * z + rnorm(n),
  y = x + rnorm(n),
  x_miss = ifelse(z, rbinom(n, 1, 0.5), rbinom(n, 1, 0.25)),
  x_obs = ifelse(x_miss, NA, x)
)

m <- mice(data, m = 1, method = "norm", formulas = list(x_obs ~ z))
#> 
#>  iter imp variable
#>   1   1  x_obs
#>   2   1  x_obs
#>   3   1  x_obs
#>   4   1  x_obs
#>   5   1  x_obs
#> Warning: Number of logged events: 5
m$loggedEvents
#> NULL

(and still see the strange result), but when I run it in a new session I see: "All predictors are constant or have too high correlation." So that is likely the issue. Closing this! Thank you for looking into it.

gerkovink commented 1 year ago

The generated data are multicollinear, so mice removes predictors via mice:::remove.lindep(). This can be bypassed; see below.

library(tibble)
library(dplyr)
library(mice, warn.conflicts = FALSE)

# fix seed
set.seed(1)

# data generation
n <- 10000
data <- tibble(
  z = rnorm(n, sd = 1),
  x = 10 * z + rnorm(n),
  y = x + rnorm(n),
  x_miss = ifelse(z, rbinom(n, 1, 0.5), rbinom(n, 1, 0.25)),
  x_obs = ifelse(x_miss, NA, x)
)

imp <- mice(data, 
            m = 1, 
            method = "norm", 
            formulas = list(x_obs ~ z), 
            print = FALSE)
#> Warning: Number of logged events: 5
imp$loggedEvents
#>   it im   dep meth                                                       out
#> 1  1  1 x_obs norm All predictors are constant or have too high correlation.
#> 2  2  1 x_obs norm All predictors are constant or have too high correlation.
#> 3  3  1 x_obs norm All predictors are constant or have too high correlation.
#> 4  4  1 x_obs norm All predictors are constant or have too high correlation.
#> 5  5  1 x_obs norm All predictors are constant or have too high correlation.
imp
#> Class: mids
#> Number of multiple imputations:  1 
#> Imputation methods:
#>  x_obs 
#> "norm" 
#> PredictorMatrix:
#>       z x y x_miss x_obs
#> x_obs 1 1 1      1     0
#> Number of logged events:  5 
#>   it im   dep meth                                                       out
#> 1  1  1 x_obs norm All predictors are constant or have too high correlation.
#> 2  2  1 x_obs norm All predictors are constant or have too high correlation.
#> 3  3  1 x_obs norm All predictors are constant or have too high correlation.
#> 4  4  1 x_obs norm All predictors are constant or have too high correlation.
#> 5  5  1 x_obs norm All predictors are constant or have too high correlation.

# bypass mice:::remove.lindep()
imp <- mice(data, 
            m = 1, 
            method = "norm", 
            formulas = list(x_obs ~ z), 
            print = FALSE, 
            eps = 0)
imp$loggedEvents
#> NULL
imp
#> Class: mids
#> Number of multiple imputations:  1 
#> Imputation methods:
#>  x_obs 
#> "norm" 
#> PredictorMatrix:
#>       z x y x_miss x_obs
#> x_obs 1 1 1      1     0

# estimates are fine when bypassing remove.lindep()
x <- complete(imp) %>% 
  select(x_obs)
y <- data$y
cov(x, y) #unbiased
#>           [,1]
#> x_obs 103.1067
lm(y ~ x_obs, data = cbind(y, x)) 
#> 
#> Call:
#> lm(formula = y ~ x_obs, data = cbind(y, x))
#> 
#> Coefficients:
#> (Intercept)        x_obs  
#>   -0.001807     0.995021

Created on 2023-05-19 with reprex v2.0.2

Again, this is expected behaviour.

gerkovink commented 1 year ago

Your last response crossed mine. Thanks for closing @LucyMcGowan.

LucyMcGowan commented 1 year ago

Sorry again (and thank you for your patience!). While trying to recreate the issue I created the ry bug myself, and inadvertently suppressed the mice logging in the process 🤦‍♀️.

The behavior still surprises me a bit: if I have a single very good predictor of x, I would expect to be able to use it without issue, with the multicollinearity fixes only kicking in if I had multiple collinear predictors? But it's possible I haven't thought this through enough (and clearly this is just a toy example).
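
For what it's worth, in this toy example z is an almost perfect predictor of x_obs, which is presumably what trips the check. A quick calculation (mine; I think the default correlation cutoff in mice:::remove.lindep() is around 0.99, but that is worth verifying in the source):

# correlation between the single predictor and the incomplete variable,
# computed on the observed cases only
cor(data$z, data$x_obs, use = "complete.obs")
# roughly 10 / sqrt(101) ~ 0.995 under the data-generating model above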