ncn-foreigners / nonprobsvy

An R package for modern methods for non-probability surveys
https://ncn-foreigners.github.io/nonprobsvy/
Other
29 stars 4 forks source link

Default `epsilon` value in mass imputation #25

Closed Kertoo closed 11 months ago

Kertoo commented 11 months ago

By default when calling controlOut epsilon value is set to 1e-6 which is a sensible choice FOR stats::glm but this epsilon value is also used for fitting RANN::nn2(). The default eps in RANN::nn2() is set to 0, this is because when eps = 0 RANN::nn2 performs exact nearest neighbour search but when eps > 0 and appropriate solution is used.

This leads to worse results (as presented in code bellow) in predictive mean matching (ditto for kNN imputation) and can be confusing and users would probably expect the default to be set to an exact solution

library(nonprobsvy)

set.seed(1234567890)
N <- 1e5 ## 1000000
n <- 100
x1 <- rnorm(n = N, mean = 1, sd = 1)
x2 <- rexp(n = N, rate = 1)
epsilon <- rnorm(n = N) # rnorm(N)
population <- data.frame(
  x1,
  x2,
  y1 = 1 + x1 + x2 + epsilon,
  y2 = 0.5*(x1 - 0.5)^2 + x2 + epsilon,
  p1 = exp(x2)/(1+exp(x2)),
  p2 = exp(-0.5+0.5*(x2-2)^2)/(1+exp(-0.5+0.5*(x2-2)^2)),
  base_w_srs = N/n, 
  flag_bd1 = rbinom(n = N, size = 1, prob = p1), 
  flag_srs = as.numeric(1:N %in% sample(1:N, size = n))
)
base_w_bd <- N/sum(population$flag_bd1)
sample_prob <- svydesign(ids= ~1, weights = ~ base_w_srs,
                         data = subset(population, flag_srs == 1))

pmm1 <- nonprob(
  outcome = y1 ~ x1 + x2,
  data = population[population$flag_bd1 == 1, , drop = FALSE],
  svydesign = sample_prob,
  method_outcome = "pmm",
  family_outcome = "gaussian",
  control_outcome = controlOut(k=1)
)

pmm2 <- nonprob(
  outcome = y2 ~ x1 + x2,
  data = population[population$flag_bd1 == 1, , drop = FALSE],
  svydesign = sample_prob,
  method_outcome = "pmm",
  family_outcome = "gaussian",
  control_outcome = controlOut(k=1)
)

# manually

df_prob <- population[population$flag_srs == 1, ]
df_non  <- population[population$flag_bd1 == 1, ]

lm1 <- lm(y1 ~ x1 + x2, data = df_non)
lm2 <- lm(y2 ~ x1 + x2, data = df_non)

pred1 <- predict(lm1, newdata = df_prob)
pred2 <- predict(lm2, newdata = df_prob)

nn1 <- sapply(pred1, FUN = function(x) {which.min(abs(df_non$y1 - x))})
nn2 <- sapply(pred2, FUN = function(x) {which.min(abs(df_non$y2 - x))})

(est1 <- weighted.mean(x = df_non$y1[nn1],
                       w = df_prob$base_w_srs))
pmm1$output$mean

(est2 <- weighted.mean(x = df_non$y2[nn2],
                       w = df_prob$base_w_srs))
pmm2$output$mean
population[, c("y1", "y2"), drop = FALSE] |> colMeans()

xx1 <- dbscan::kNN(k = 1, x = cbind(df_non$y1), query = cbind(pred1))
# the same
xx11 <- dbscan::kNN(k = 1, x = cbind(df_non$y1), query = cbind(pred1), search = "linear")

xx111 <- RANN::nn2(data = cbind(df_non$y1), query = cbind(pred1), k = 1, eps = 0)

cbind(nn1, xx1$id, pmm1$outcome$y1$model_rand$nn.idx, xx111$nn.idx)