ajdamico / convey

variance of distribution measures estimation of survey data
GNU General Public License v3.0
17 stars 7 forks source link

svywattsdec missingness handling #395

Closed ajdamico closed 1 year ago

ajdamico commented 1 year ago
library(convey)
library(survey)
library(laeken)
data(eusilc) ; names( eusilc ) <- tolower( names( eusilc ) )

# add a missing and a zero
eusilc[ 1 , 'eqincome' ] <- NA
eusilc[ 2 , 'eqincome' ] <- 0

# linearized design

des_eusilc <- svydesign( ids = ~rb030 , strata = ~db040 ,  weights = ~rb050 , data = eusilc )
des_eusilc <- convey_prep( des_eusilc )

# replicate-weighted design
des_eusilc_rep <- as.svrepdesign( des_eusilc , type = "bootstrap" )
des_eusilc_rep <- convey_prep( des_eusilc_rep )

# # # # # #

# remove both missings and zeroes
des_eusilc_positive <- subset( des_eusilc , eqincome > 0 )
des_eusilc_positive_rep <- subset( des_eusilc_rep , eqincome > 0 )

# seems OK
svywattsdec(~eqincome, des_eusilc_positive ,  abs_thresh=10000)
# incorrectly shows NAs
svywattsdec(~eqincome, des_eusilc_positive , type_thresh= "relq", thresh = TRUE)
# incorrectly shows NAs
svywattsdec(~eqincome, des_eusilc_positive , type_thresh= "relm" , thresh = TRUE)
# incorrectly shows NAs in SE
svywattsdec(~eqincome, des_eusilc_positive_rep  ,  abs_thresh=10000)
# incorrectly shows NAs
svywattsdec(~eqincome, des_eusilc_positive_rep  , type_thresh= "relq", thresh = TRUE)
# incorrectly shows NAs
svywattsdec(~eqincome, des_eusilc_positive_rep  , type_thresh= "relm" , thresh = TRUE)

# seems OK
svywattsdec(~eqincome, des_eusilc_positive ,  abs_thresh=10000,na.rm=TRUE)
# incorrectly gives error
svywattsdec(~eqincome, des_eusilc_positive , type_thresh= "relq", thresh = TRUE,na.rm=TRUE)
# incorrectly shows missing
svywattsdec(~eqincome, des_eusilc_positive , type_thresh= "relm" , thresh = TRUE,na.rm=TRUE)
# seems OK
svywattsdec(~eqincome, des_eusilc_positive_rep  ,  abs_thresh=10000,na.rm=TRUE)
# seems OK
svywattsdec(~eqincome, des_eusilc_positive_rep  , type_thresh= "relq", thresh = TRUE,na.rm=TRUE)
# seems OK
svywattsdec(~eqincome, des_eusilc_positive_rep  , type_thresh= "relm" , thresh = TRUE,na.rm=TRUE)

# # # # # #

# remove missings but not zeroes
des_eusilc_nomiss <- subset( des_eusilc , !is.na( eqincome ) )
des_eusilc_nomiss_rep <- subset( des_eusilc_rep , !is.na( eqincome ) )

# correctly gives "zeroes not allowed" error
svywattsdec(~eqincome, des_eusilc_nomiss ,  abs_thresh=10000)
# correctly gives "zeroes not allowed" error
svywattsdec(~eqincome, des_eusilc_nomiss , type_thresh= "relq", thresh = TRUE)
# correctly gives "zeroes not allowed" error
svywattsdec(~eqincome, des_eusilc_nomiss , type_thresh= "relm" , thresh = TRUE)

# incorrectly prints results (should give error)
svywattsdec(~eqincome, des_eusilc_nomiss_rep  ,  abs_thresh=10000)
# incorrectly prints results (should give error)
svywattsdec(~eqincome, des_eusilc_nomiss_rep  , type_thresh= "relq", thresh = TRUE)
# incorrectly prints results (should give error)
svywattsdec(~eqincome, des_eusilc_nomiss_rep  , type_thresh= "relm" , thresh = TRUE)

# correctly gives "zeroes not allowed" error
svywattsdec(~eqincome, des_eusilc_nomiss ,  abs_thresh=10000,na.rm=TRUE)

# incorrectly gives attribute error
svywattsdec(~eqincome, des_eusilc_nomiss , type_thresh= "relq", thresh = TRUE,na.rm=TRUE)

# correctly gives "zeroes not allowed" error
svywattsdec(~eqincome, des_eusilc_nomiss , type_thresh= "relm" , thresh = TRUE,na.rm=TRUE)

# incorrectly prints results (should give error)
svywattsdec(~eqincome, des_eusilc_nomiss_rep  ,  abs_thresh=10000,na.rm=TRUE)

# incorrectly prints results (should give error)
svywattsdec(~eqincome, des_eusilc_nomiss_rep  , type_thresh= "relq", thresh = TRUE,na.rm=TRUE)

# incorrectly prints results (should give error)
svywattsdec(~eqincome, des_eusilc_nomiss_rep  , type_thresh= "relm" , thresh = TRUE,na.rm=TRUE)

# # # # # #

# remove zeroes but not missings
des_eusilc_nozero <- subset( des_eusilc , eqincome > 0 | is.na( eqincome ) )
des_eusilc_nozero_rep <- subset( des_eusilc_rep , eqincome > 0 | is.na( eqincome ) )

# these six correctly return NAs
svywattsdec(~eqincome, des_eusilc_nozero ,  abs_thresh=10000)
svywattsdec(~eqincome, des_eusilc_nozero , type_thresh= "relq", thresh = TRUE)
svywattsdec(~eqincome, des_eusilc_nozero , type_thresh= "relm" , thresh = TRUE)
svywattsdec(~eqincome, des_eusilc_nozero_rep  ,  abs_thresh=10000)
svywattsdec(~eqincome, des_eusilc_nozero_rep  , type_thresh= "relq", thresh = TRUE)
svywattsdec(~eqincome, des_eusilc_nozero_rep  , type_thresh= "relm" , thresh = TRUE)

# seems OK
svywattsdec(~eqincome, des_eusilc_nozero ,  abs_thresh=10000,na.rm=TRUE)
# incorrectly gives attribute error
svywattsdec(~eqincome, des_eusilc_nozero , type_thresh= "relq", thresh = TRUE,na.rm=TRUE)
# incorrectly prints NAs
svywattsdec(~eqincome, des_eusilc_nozero , type_thresh= "relm" , thresh = TRUE,na.rm=TRUE)
# seems OK
svywattsdec(~eqincome, des_eusilc_nozero_rep  ,  abs_thresh=10000,na.rm=TRUE)
# seems OK
svywattsdec(~eqincome, des_eusilc_nozero_rep  , type_thresh= "relq", thresh = TRUE,na.rm=TRUE)
# seems OK
svywattsdec(~eqincome, des_eusilc_nozero_rep  , type_thresh= "relm" , thresh = TRUE,na.rm=TRUE)
guilhermejacob commented 1 year ago

Remember that when you do subset, you change the domain. However, for relm and relq, you need the full sample to estimate the thresholds and subset will not drop NAs of the full sample. Therefore, it should still result in NA if there is no NA in the domain, but there is in any other domain. This is correct:

library(convey)
library(survey)
library(laeken)
data(eusilc) ; names( eusilc ) <- tolower( names( eusilc ) )

# add a missing and a zero
eusilc[ 1 , 'eqincome' ] <- NA
eusilc[ 2 , 'eqincome' ] <- 0

# linearized design

des_eusilc <- svydesign( ids = ~rb030 , strata = ~db040 ,  weights = ~rb050 , data = eusilc )
des_eusilc <- convey_prep( des_eusilc )

# replicate-weighted design
des_eusilc_rep <- as.svrepdesign( des_eusilc , type = "bootstrap" )
des_eusilc_rep <- convey_prep( des_eusilc_rep )

# # # # # #

# remove both missings and zeroes
des_eusilc_positive <- subset( des_eusilc , eqincome > 0 )
des_eusilc_positive_rep <- subset( des_eusilc_rep , eqincome > 0 )

# seems OK
svywattsdec(~eqincome, des_eusilc_positive ,  abs_thresh=10000)
# correctly shows NAs
svywattsdec(~eqincome, des_eusilc_positive , type_thresh= "relq", thresh = TRUE)
# correctly shows NAs
svywattsdec(~eqincome, des_eusilc_positive , type_thresh= "relm" , thresh = TRUE)

# seems OK
svywattsdec(~eqincome, des_eusilc_positive_rep  ,  abs_thresh=10000)
# correctly shows NAs
svywattsdec(~eqincome, des_eusilc_positive_rep  , type_thresh= "relq", thresh = TRUE)
# correctly shows NAs
svywattsdec(~eqincome, des_eusilc_positive_rep  , type_thresh= "relm" , thresh = TRUE)

# seems OK
svywattsdec(~eqincome, des_eusilc_positive ,  abs_thresh=10000,na.rm=TRUE)
# seems OK
svywattsdec(~eqincome, des_eusilc_positive , type_thresh= "relq", thresh = TRUE,na.rm=TRUE)
# incorrectly shows missing
svywattsdec(~eqincome, des_eusilc_positive , type_thresh= "relm" , thresh = TRUE,na.rm=TRUE)

svyfgt(~eqincome, des_eusilc_positive , type_thresh= "relq" , thresh = TRUE,na.rm=TRUE , g=0)

# seems OK
svywattsdec(~eqincome, des_eusilc_positive_rep  ,  abs_thresh=10000,na.rm=TRUE)
# seems OK
svywattsdec(~eqincome, des_eusilc_positive_rep  , type_thresh= "relq", thresh = TRUE,na.rm=TRUE)
# seems OK
svywattsdec(~eqincome, des_eusilc_positive_rep  , type_thresh= "relm" , thresh = TRUE,na.rm=TRUE)

# # # # # #

# remove missings but not zeroes
des_eusilc_nomiss <- subset( des_eusilc , !is.na( eqincome ) )
des_eusilc_nomiss_rep <- subset( des_eusilc_rep , !is.na( eqincome ) )

# correctly gives "zeroes not allowed" error
svywattsdec(~eqincome, des_eusilc_nomiss ,  abs_thresh=10000)
# correctly gives "zeroes not allowed" error
svywattsdec(~eqincome, des_eusilc_nomiss , type_thresh= "relq", thresh = TRUE)
# correctly gives "zeroes not allowed" error
svywattsdec(~eqincome, des_eusilc_nomiss , type_thresh= "relm" , thresh = TRUE)

# correctly gives "zeroes not allowed" error
svywattsdec(~eqincome, des_eusilc_nomiss_rep  ,  abs_thresh=10000)
# correctly gives "zeroes not allowed" error
svywattsdec(~eqincome, des_eusilc_nomiss_rep  , type_thresh= "relq", thresh = TRUE)
# correctly gives "zeroes not allowed" error
svywattsdec(~eqincome, des_eusilc_nomiss_rep  , type_thresh= "relm" , thresh = TRUE)

# correctly gives "zeroes not allowed" error
svywattsdec(~eqincome, des_eusilc_nomiss ,  abs_thresh=10000,na.rm=TRUE)
# correctly gives "zeroes not allowed" error
svywattsdec(~eqincome, des_eusilc_nomiss , type_thresh= "relq", thresh = TRUE,na.rm=TRUE)
# correctly gives "zeroes not allowed" error
svywattsdec(~eqincome, des_eusilc_nomiss , type_thresh= "relm" , thresh = TRUE,na.rm=TRUE)

# correctly gives "zeroes not allowed" error
svywattsdec(~eqincome, des_eusilc_nomiss_rep  ,  abs_thresh=10000,na.rm=TRUE)
# correctly gives "zeroes not allowed" error
svywattsdec(~eqincome, des_eusilc_nomiss_rep  , type_thresh= "relq", thresh = TRUE,na.rm=TRUE)
# correctly gives "zeroes not allowed" error
svywattsdec(~eqincome, des_eusilc_nomiss_rep  , type_thresh= "relm" , thresh = TRUE,na.rm=TRUE)

# # # # # #

# remove zeroes but not missings
des_eusilc_nozero <- subset( des_eusilc , eqincome > 0 | is.na( eqincome ) )
des_eusilc_nozero_rep <- subset( des_eusilc_rep , eqincome > 0 | is.na( eqincome ) )

# these six correctly return NAs
svywattsdec(~eqincome, des_eusilc_nozero ,  abs_thresh=10000)
svywattsdec(~eqincome, des_eusilc_nozero , type_thresh= "relq", thresh = TRUE)
svywattsdec(~eqincome, des_eusilc_nozero , type_thresh= "relm" , thresh = TRUE)
svywattsdec(~eqincome, des_eusilc_nozero_rep  ,  abs_thresh=10000)
svywattsdec(~eqincome, des_eusilc_nozero_rep  , type_thresh= "relq", thresh = TRUE)
svywattsdec(~eqincome, des_eusilc_nozero_rep  , type_thresh= "relm" , thresh = TRUE)

# seems OK
svywattsdec(~eqincome, des_eusilc_nozero ,  abs_thresh=10000,na.rm=TRUE)
# seems OK
svywattsdec(~eqincome, des_eusilc_nozero , type_thresh= "relq", thresh = TRUE,na.rm=TRUE)
# seems OK
svywattsdec(~eqincome, des_eusilc_nozero , type_thresh= "relm" , thresh = TRUE,na.rm=TRUE)

# seems OK
svywattsdec(~eqincome, des_eusilc_nozero_rep  ,  abs_thresh=10000,na.rm=TRUE)
# seems OK
svywattsdec(~eqincome, des_eusilc_nozero_rep  , type_thresh= "relq", thresh = TRUE,na.rm=TRUE)
# seems OK
svywattsdec(~eqincome, des_eusilc_nozero_rep  , type_thresh= "relm" , thresh = TRUE,na.rm=TRUE)
ajdamico commented 1 year ago

i'm still thinking about this behavior: do we want an NA in any other domain to return NA while a negative or a zero in any other domain does get subsetted out?

guilhermejacob commented 1 year ago

I think so. It reinforces the importance of domain vs full sample when estimating the poverty threshold. As long as it is clear in the documentation, I do not think this is a problem.

What would be the alternative? You could add a flag on the subset function to subset the full sample or the domain, but it would look weird. Right?

ajdamico commented 1 year ago

the removal of negatives and zeroes but not missings feels arbitrary to me -- what's your justification for treating them differently?

guilhermejacob commented 1 year ago

My justification is that negatives and zeroes are not a problem for the estimation of the poverty thresholds, but are a problem for the estimation of poverty measures

ajdamico commented 1 year ago

could you modify as you see fit and fill in the two blanks? does this apply to any other functions?

For the svywatts and svywattsdec functions, zeroes and negative numbers in the analysis domain cause an error because [explanation]. However, zeroes and negative values in the full survey design that are outside of the domain of analysis are valid to calculate the poverty threshold because [explanation]. Missing values are treated differently. NA values anywhere in the full survey design (not only the subset, or the domain of analysis) will cause these two functions to return NA results: to ignore NA values, use na.rm = TRUE.

guilhermejacob commented 1 year ago

Sure. Like this:

For the svywatts and svywattsdec functions, zeroes and negative numbers in the analysis domain cause an error because of the logarithm function in the definition of this poverty measure. However, zeroes and negative values in the full survey design that are outside of the domain of analysis are valid to calculate the poverty threshold because zeroes and negatives are not a problem for computing quantiles (used when type_thresh = "relq") or means (used when type_thresh = "relm") . Missing values are treated differently. NA values anywhere in the full survey design (not only the subset, or the domain of analysis) will cause these quantiles and means to return NA results. Then, the user should indicate whether the function should ignore NA values; to do this, the user can set na.rm = TRUE.