ajdamico / convey

variance of distribution measures estimation of survey data
GNU General Public License v3.0
17 stars 7 forks source link

svywatts missingness handling #394

Closed ajdamico closed 1 year ago

ajdamico commented 1 year ago
library(convey)
library(survey)
library(laeken)
data(eusilc) ; names( eusilc ) <- tolower( names( eusilc ) )

# add a missing and a zero
eusilc[ 1 , 'eqincome' ] <- NA
eusilc[ 2 , 'eqincome' ] <- 0

# linearized design

des_eusilc <- svydesign( ids = ~rb030 , strata = ~db040 ,  weights = ~rb050 , data = eusilc )
des_eusilc <- convey_prep( des_eusilc )

# replicate-weighted design
des_eusilc_rep <- as.svrepdesign( des_eusilc , type = "bootstrap" )
des_eusilc_rep <- convey_prep( des_eusilc_rep )

# # # # # #

# remove both missings and zeroes
des_eusilc_positive <- subset( des_eusilc , eqincome > 0 )
des_eusilc_positive_rep <- subset( des_eusilc_rep , eqincome > 0 )

# seems OK
svywatts(~eqincome, des_eusilc_positive ,  abs_thresh=10000)

# incorrectly prints NAs
svywatts(~eqincome, des_eusilc_positive , type_thresh= "relq", thresh = TRUE)
# incorrectly prints NAs
svywatts(~eqincome, des_eusilc_positive , type_thresh= "relm" , thresh = TRUE)

# seems OK
svywatts(~eqincome, des_eusilc_positive_rep  ,  abs_thresh=10000)

# incorrectly prints NAs
svywatts(~eqincome, des_eusilc_positive_rep  , type_thresh= "relq", thresh = TRUE)
# incorrectly prints NAs
svywatts(~eqincome, des_eusilc_positive_rep  , type_thresh= "relm" , thresh = TRUE)

# seems OK
svywatts(~eqincome, des_eusilc_positive ,  abs_thresh=10000,na.rm=TRUE)
# incorrectly gives error
svywatts(~eqincome, des_eusilc_positive , type_thresh= "relq", thresh = TRUE,na.rm=TRUE)
# incorrectly prints NAs
svywatts(~eqincome, des_eusilc_positive , type_thresh= "relm" , thresh = TRUE,na.rm=TRUE)
# seems OK
svywatts(~eqincome, des_eusilc_positive_rep  ,  abs_thresh=10000,na.rm=TRUE)
# seems OK
svywatts(~eqincome, des_eusilc_positive_rep  , type_thresh= "relq", thresh = TRUE,na.rm=TRUE)
# seems OK
svywatts(~eqincome, des_eusilc_positive_rep  , type_thresh= "relm" , thresh = TRUE,na.rm=TRUE)

# # # # # #

# remove zeroes but not missings
des_eusilc_nozero <- subset( des_eusilc , eqincome > 0 | is.na( eqincome ) )
des_eusilc_nozero_rep <- subset( des_eusilc_rep , eqincome > 0 | is.na( eqincome ) )

# seems OK
svywatts(~eqincome, des_eusilc_nozero ,  abs_thresh=10000,na.rm=TRUE)
# incorrectly gives error
svywatts(~eqincome, des_eusilc_nozero , type_thresh= "relq", thresh = TRUE,na.rm=TRUE)
# incorrectly prints NA
svywatts(~eqincome, des_eusilc_nozero , type_thresh= "relm" , thresh = TRUE,na.rm=TRUE)

# seems OK
svywatts(~eqincome, des_eusilc_nozero_rep  ,  abs_thresh=10000,na.rm=TRUE)
# seems OK
svywatts(~eqincome, des_eusilc_nozero_rep  , type_thresh= "relq", thresh = TRUE,na.rm=TRUE)
# seems OK
svywatts(~eqincome, des_eusilc_nozero_rep  , type_thresh= "relm" , thresh = TRUE,na.rm=TRUE)
guilhermejacob commented 1 year ago

Remember that when you do subset, you change the domain. However, for relm and relq, you need the full sample to estimate the thresholds and subset will not drop NAs of the full sample. Therefore, it should still result in NA if there is no NA in the domain, but there is in any other domain. This is correct:

library(convey)
library(survey)
library(laeken)
data(eusilc) ; names( eusilc ) <- tolower( names( eusilc ) )

# add a missing and a zero
eusilc[ 1 , 'eqincome' ] <- NA
eusilc[ 2 , 'eqincome' ] <- 0

# linearized design

des_eusilc <- svydesign( ids = ~rb030 , strata = ~db040 ,  weights = ~rb050 , data = eusilc )
des_eusilc <- convey_prep( des_eusilc )

# replicate-weighted design
des_eusilc_rep <- as.svrepdesign( des_eusilc , type = "bootstrap" )
des_eusilc_rep <- convey_prep( des_eusilc_rep )

# # # # # #

# remove both missings and zeroes
des_eusilc_positive <- subset( des_eusilc , eqincome > 0 )
des_eusilc_positive_rep <- subset( des_eusilc_rep , eqincome > 0 )

# seems OK
svywatts(~eqincome, des_eusilc_positive ,  abs_thresh=10000)
# correctly prints NAs
svywatts(~eqincome, des_eusilc_positive , type_thresh= "relq", thresh = TRUE)
# correctly prints NAs
svywatts(~eqincome, des_eusilc_positive , type_thresh= "relm" , thresh = TRUE)

# seems OK
svywatts(~eqincome, des_eusilc_positive_rep  ,  abs_thresh=10000)
# correctly prints NAs
svywatts(~eqincome, des_eusilc_positive_rep  , type_thresh= "relq", thresh = TRUE)
# correctly prints NAs
svywatts(~eqincome, des_eusilc_positive_rep  , type_thresh= "relm" , thresh = TRUE)

# seems OK
svywatts(~eqincome, des_eusilc_positive ,  abs_thresh=10000,na.rm=TRUE)
# seems OK
svywatts(~eqincome, des_eusilc_positive , type_thresh= "relq", thresh = TRUE,na.rm=TRUE)
# seems OK
svywatts(~eqincome, des_eusilc_positive , type_thresh= "relm" , thresh = TRUE,na.rm=TRUE)

# seems OK
svywatts(~eqincome, des_eusilc_positive_rep  ,  abs_thresh=10000,na.rm=TRUE)
# seems OK
svywatts(~eqincome, des_eusilc_positive_rep  , type_thresh= "relq", thresh = TRUE,na.rm=TRUE)
# seems OK
svywatts(~eqincome, des_eusilc_positive_rep  , type_thresh= "relm" , thresh = TRUE,na.rm=TRUE)

# # # # # #

# remove zeroes but not missings
des_eusilc_nozero <- subset( des_eusilc , eqincome > 0 | is.na( eqincome ) )
des_eusilc_nozero_rep <- subset( des_eusilc_rep , eqincome > 0 | is.na( eqincome ) )

# seems OK
svywatts(~eqincome, des_eusilc_nozero ,  abs_thresh=10000,na.rm=TRUE)
# seems OK
svywatts(~eqincome, des_eusilc_nozero , type_thresh= "relq", thresh = TRUE,na.rm=TRUE)
# seems OK
svywatts(~eqincome, des_eusilc_nozero , type_thresh= "relm" , thresh = TRUE,na.rm=TRUE)

# seems OK
svywatts(~eqincome, des_eusilc_nozero_rep  ,  abs_thresh=10000,na.rm=TRUE)
# seems OK
svywatts(~eqincome, des_eusilc_nozero_rep  , type_thresh= "relq", thresh = TRUE,na.rm=TRUE)
# seems OK
svywatts(~eqincome, des_eusilc_nozero_rep  , type_thresh= "relm" , thresh = TRUE,na.rm=TRUE)
ajdamico commented 1 year ago

i'm still thinking about this behavior: do we want an NA in any other domain to return NA while a negative or a zero in any other domain does get subsetted out?