amices / mice

Multivariate Imputation by Chained Equations
https://amices.org/mice/
GNU General Public License v2.0
427 stars 107 forks source link

Conditional PMM routine that excludes (a vector of) observed values from the donor pool #519

Closed gerkovink closed 1 year ago

gerkovink commented 1 year ago

Further development of #392.

Proof of behaviour with blots:

# Use blots
library(mice)
#> 
#> Attaching package: 'mice'
#> The following object is masked from 'package:stats':
#> 
#>     filter
#> The following objects are masked from 'package:base':
#> 
#>     cbind, rbind
library(magrittr)
blots <- make.blots(boys)
blots$tv$exclude <- c(1:5)
blots$hgt$exclude <- unique(boys$hgt) %>% sample(100)
blots # inspect blots
#> $age
#> list()
#> 
#> $hgt
#> $hgt$exclude
#>   [1] 105.1 151.1 176.6 134.5 192.0  99.0 190.7  76.1 128.1 181.1 158.1 162.1
#>  [13] 182.1 157.5  98.7  69.5 161.5  95.5 104.2 193.6 183.2 151.2 192.3 144.7
#>  [25] 144.8 132.0 105.2 177.3 195.0  53.0 156.2 186.1 170.8 156.7 174.5 148.2
#>  [37] 137.2 116.3 172.4 128.7 148.7 195.5 176.7 164.1 180.1 187.6 157.0 176.4
#>  [49]  82.5  92.6 146.9 192.2  59.9 160.5 145.9 171.5 192.7 141.4 175.4 189.0
#>  [61] 172.5  84.0 167.2  55.5 194.3 191.0 196.7 100.0 177.9 180.8  69.8 179.6
#>  [73]  74.5  91.1 174.3 141.1 185.5 184.0 184.3  61.5 179.7 148.4 179.0 185.4
#>  [85] 151.0 181.5  77.2 178.2 184.4 102.3 177.1 119.6 155.5  79.0 178.6 146.3
#>  [97] 176.0  87.5 192.8 127.5
#> 
#> 
#> $wgt
#> list()
#> 
#> $bmi
#> list()
#> 
#> $hc
#> list()
#> 
#> $gen
#> list()
#> 
#> $phb
#> list()
#> 
#> $tv
#> $tv$exclude
#> [1] 1 2 3 4 5
#> 
#> 
#> $reg
#> list()
set.seed(123); imp.pmm <- mice(boys, method = "pmm", print = FALSE)
set.seed(123); imp <- mice(boys, method = "pmm", print = FALSE, blots = blots)
identical(imp, imp.pmm) # should be FALSE
#> [1] FALSE
# test tv imputations in regular pmm
blots$tv$exclude %in% unlist(c(imp.pmm$imp$tv))# MAY be all TRUE
#> [1] TRUE TRUE TRUE TRUE TRUE
# test tv imputations in regular pmm
blots$tv$exclude %in% unlist(c(imp$imp$tv)) # MUST be all FALSE 
#> [1] FALSE FALSE FALSE FALSE FALSE
# test hgt imputations in regular pmm
blots$hgt$exclude %in% unlist(c(imp.pmm$imp$hgt))# MAY be all TRUE
#>   [1]  TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
#>  [13] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
#>  [25] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE  TRUE
#>  [37] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
#>  [49]  TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
#>  [61] FALSE  TRUE FALSE FALSE  TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
#>  [73] FALSE  TRUE FALSE FALSE FALSE  TRUE FALSE FALSE FALSE FALSE FALSE FALSE
#>  [85]  TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE  TRUE FALSE FALSE
#>  [97] FALSE  TRUE FALSE FALSE
# test hgt imputations in exclude pmm
blots$hgt$exclude %in% unlist(c(imp$imp$hgt)) # MUST be all FALSE 
#>   [1] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
#>  [13] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
#>  [25] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
#>  [37] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
#>  [49] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
#>  [61] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
#>  [73] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
#>  [85] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
#>  [97] FALSE FALSE FALSE FALSE

Created on 2022-11-10 with reprex v2.0.2

Proof of behaviour without blots

library(mice)

# TEST 1
# impute without exclude
imp <- mice(nhanes, 
            seed = 123, 
            printFlag = FALSE)
A <- imp$imp$chl

# impute with exclude
imp <- mice(nhanes, meth = "pmm", exclude = c(218, 187), 
            seed = 123, 
            printFlag = FALSE)
B <- imp$imp$chl

any(A == 187 | A == 218) # May be TRUE
#> [1] TRUE
any(B == 187 | B == 218) # Must be FALSE
#> [1] FALSE

# TEST 2 - copied from mice.impute.pmm
set.seed(53177)
xname <- c("age", "hgt", "wgt")
r <- stats::complete.cases(boys[, xname])
x <- boys[r, xname]
y <- boys[r, "tv"]
ry <- !is.na(y)

# Impute missing tv data with original pmm
set.seed(123); yimp.pmm <- mice.impute.pmm(y, ry, x)
set.seed(123); yimp <- mice.impute.pmm(y, ry, x)
identical(yimp, yimp.pmm) #should be TRUE
#> [1] TRUE

set.seed(123); yimp.pmm <- mice.impute.pmm(y, ry, x)
set.seed(123); yimp <- mice.impute.pmm(y, ry, x, exclude = c(20, 25))
identical(yimp, yimp.pmm) # should be FALSE
#> [1] FALSE
c(20, 25) %in% yimp # should be FALSE twice
#> [1] FALSE FALSE

Created on 2022-11-10 with reprex v2.0.2

Closes #392.

gerkovink commented 1 year ago

Done.

stefvanbuuren commented 1 year ago

Thanks. A useful addition for the advanced imputer.