Open ajdamico opened 7 years ago
@guilhermejacob any chance you could match some unicef-published number off of mics that appropriately uses a survey design? not urgent at all. thanks
This matches some results of this, although the SE for literacy rate among men is different.
library( haven )
load( "~\\mics\\Guyana\\2014\\wm.rda" )
wm <- x
wm$hl1 <- wm$ln
rm( x )
load( "~\\mics\\Guyana\\2014\\mn.rda" )
mn <- x
rm( x )
# create codebook
wm_codebook <- lapply( wm, function( x ) attributes( x ) )
wm <- zap_labels( wm )
mn_codebook <- lapply( mn, function( x ) attributes( x ) )
mn <- zap_labels( mn )
# Can read a short sentence or went to secondary school
wm$literate <- wm$wb7 == 3 | wm$wb4 %in% 2:4
wm$literate[ is.na( wm$wb4 ) ] = (wm$wb7 == 3) [ is.na( wm$wb4 ) ]
wm$literate[ is.na( wm$wb7 ) ] = ( wm$wb4 %in% 2:4 ) [ is.na( wm$wb7 ) ]
mn$literate <- mn$mwb7 == 3 | mn$mwb4 %in% 2:4
mn$literate[ is.na( mn$mwb4 ) ] = (mn$mwb7 == 3) [ is.na( mn$mwb4 ) ]
mn$literate[ is.na( mn$mwb7 ) ] = ( mn$mwb4 %in% 2:4 ) [ is.na( mn$mwb7 ) ]
library(survey)
design_wm <- svydesign(ids=~hh1+hh2, strata=~hh6+hh7, weights=~wmweight, data=wm )
design_mn <- svydesign(ids=~hh1+hh2, strata=~hh6+hh7, weights=~mnweight, data=mn )
# Women
svymean( ~factor(literate), subset( design_wm , wb2 >= 15 & wb2 <= 24 ), na.rm = TRUE )
svyby( ~factor(literate), ~hh6, subset( design_wm , wb2 >= 15 & wb2 <= 24 ), svymean, na.rm = TRUE )
svytotal( ~factor(literate), subset( design_wm , wb2 >= 15 & wb2 <= 24 ), na.rm = TRUE )
sum( coef( svytotal( ~factor(literate), subset( design_wm , wb2 >= 15 & wb2 <= 24 ), na.rm = TRUE ) ) )
# Men : SEs are different!
svymean( ~factor(literate), subset( design_mn , mwb2 >= 15 & mwb2 <= 24 ), na.rm = TRUE )
svyby( ~factor(literate), ~hh6, subset( design_mn , mwb2 >= 15 & mwb2 <= 24 ), svymean, na.rm = TRUE )
svytotal( ~factor(literate), subset( design_mn , mwb2 >= 15 & mwb2 <= 24 ), na.rm = TRUE )
sum( coef( svytotal( ~factor(literate), subset( design_mn , mwb2 >= 15 & mwb2 <= 24 ), na.rm = TRUE ) ) )
wow, thank you! that's annoying that it doesn't match. any idea why? maybe they're using a fpc=
or something else that might throw off the design slightly? or an older version of the microdata? thanks for this, guilherme
@ajdamico , I don't really know. They used SPSS for this. Since our SE estimates are slightly higher, I'd like to try some fpc=
approach. Can you help me with that?
Also, check this. They provide all SPSS syntaxes to reproduce results.
I think they use fpc or different strata.
This SPSS file shows how the plan is set. This one shows how they create new strata. SE03 Calculate.txt: this calculates the statistics.
This is better. Thanks, @ajdamico!
library(lodown)
mics_catalog <- get_catalog( data_name = "mics" , your_email = "email@address.com" , your_password = "password" )
mics_catalog <- mics_catalog[ grepl( "Guyana", mics_catalog$country, ignore.case = T ) , ]
lodown( data_name = "mics", catalog = mics_catalog , your_email = "email@address.com" , your_password = "password" )
# Literacy rate
library( haven )
load( "~\\mics\\Guyana\\2014\\wm.rda" )
wm <- x
wm <- wm[ order( wm$hh1 , wm$hh2 , wm$ln ) , ]
rm( x )
load( "~\\mics\\Guyana\\2014\\mn.rda" )
mn <- x
mn <- mn[ order( mn$hh1 , mn$hh2 , mn$ln ) , ]
rm( x )
# create codebook
wm_codebook <- lapply( wm, function( x ) attributes( x ) )
wm <- zap_labels( wm )
mn_codebook <- lapply( mn, function( x ) attributes( x ) )
mn <- zap_labels( mn )
###############################
# * Sort according to sampling domains (urban/rural within region) and cluster within domains.
# * The program assumes that the sampling domains are urban and rural areas (HH6) within 4 regions (HH7).
# sort cases HH7 HH6 HH1.
x <- mn[ with( mn , order( hh7 , hh6 , hh1 ) ) , ]
# * Compute stratum variable as pairs of clusters within same domains.
# compute strat = 1.
# compute strpair = 0.
x$strat <- 1
x$strpair <- 0
# do if (hh7 <> lag(hh7) or hh6 <> lag(hh6)).
# + compute strat = lag(strat)+1.
# + compute strpair = 0.
# else if (hh1 = lag(hh1)).
# + compute strat = lag(strat).
# + compute strpair = lag(strpair).
# else if (lag(strpair) = 0).
# + compute strat = lag(strat).
# + compute strpair = 1.
# else.
# + compute strat = lag(strat)+1.
# + compute strpair = 0.
# end if.
for( this_rec in seq( 2 , nrow( x ) ) ){
first_condition <- ( x[ this_rec , 'hh7' ] != x[ this_rec - 1 , 'hh7' ] ) | ( x[ this_rec , 'hh6' ] != x[ this_rec - 1 , 'hh6' ] )
second_condition <- x[ this_rec , 'hh1' ] == x[ this_rec - 1 , 'hh1' ]
third_condition <- x[ this_rec - 1 , 'strpair' ] == 0
if( first_condition ){
x[ this_rec , 'strat' ] <- x[ this_rec - 1 , 'strat' ] + 1
x[ this_rec , 'strpair' ] <- 0
} else if( second_condition ){
x[ this_rec , 'strat' ] <- x[ this_rec - 1 , 'strat' ]
x[ this_rec , 'strpair' ] <- x[ this_rec - 1 , 'strpair' ]
} else if( third_condition ){
x[ this_rec , 'strat' ] <- x[ this_rec - 1 , 'strat' ]
x[ this_rec , 'strpair' ] <- 1
} else {
x[ this_rec , 'strat' ] <- x[ this_rec - 1 , 'strat' ] + 1
x[ this_rec , 'strpair' ] <- 0
}
}
# * Sort in reverse order of cluster to check for single clusters in a domain.
# sort cases hh7 hh6 hh1 (D).
x <- x[ order( x$hh7 , x$hh6 , x$hh1 , decreasing = TRUE ) , ]
# * Recode single clusters in a domain into the prior stratum group.
# do if (sysmis(lag(hh7)) or hh7 <> lag(hh7) or hh6 <> lag(hh6)).
# + do if (strpair = 0).
# + compute strat = strat - 1.
# + end if.
# else if (hh1 = lag(hh1)).
# + compute strat = lag(strat).
# end if.
for( this_rec in seq( 2 , nrow( x ) ) ){
first_condition <- is.na( x[ this_rec - 1 , 'hh7' ] ) | ( x[ this_rec , 'hh7' ] != x[ this_rec - 1 , 'hh7' ] ) | ( x[ this_rec , 'hh6' ] != x[ this_rec - 1 , 'hh6' ] )
second_condition <- x[ this_rec , 'strpair' ] == 0
third_condition <- x[ this_rec , 'hh1' ] == x[ this_rec - 1 , 'hh1' ]
if( first_condition ){
if( second_condition ){
x[ this_rec , 'strat' ] <- x[ this_rec , 'strat' ] - 1
}
} else if( third_condition ){
x[ this_rec , 'strat' ] <- x[ this_rec - 1 , 'strat' ]
}
}
# * Re-sort according to sampling domains (urban/rural within region) and cluster within domains.
# sort cases hh7 hh6 hh1.
x <- x[ order( x$hh7 , x$hh6 , x$hh1 ) , ]
# * Drop strpair variable.
# delete variables strpair.
x$strpair <- NULL
mn$strat <- x$strat
#############################
# Can read a short sentence or went to secondary school
wm$literate <- wm$wb7 == 3 | wm$wb4 %in% 2:4
wm$literate[ is.na( wm$wb4 ) ] = (wm$wb7 == 3) [ is.na( wm$wb4 ) ]
wm$literate[ is.na( wm$wb7 ) ] = ( wm$wb4 %in% 2:4 ) [ is.na( wm$wb7 ) ]
mn$literate <- mn$mwb7 == 3 | mn$mwb4 %in% 2:4
mn$literate[ is.na( mn$mwb4 ) ] = (mn$mwb7 == 3) [ is.na( mn$mwb4 ) ]
mn$literate[ is.na( mn$mwb7 ) ] = ( mn$mwb4 %in% 2:4 ) [ is.na( mn$mwb7 ) ]
#
library(survey)
options(survey.lonely.psu="adjust")
#options(survey.lonely.psu="average")
#options(survey.lonely.psu="remove")
#options(survey.lonely.psu="certainty")
design_mn <- svydesign( ids=~hh1 , strata=~strat , weights=~mnweight , data=mn , nest = TRUE )
# Men
svymean( ~factor(literate), subset( design_mn , mwb2 >= 15 & mwb2 <= 24 ), na.rm = TRUE )
svyby( ~factor(literate), ~hh6, subset( design_mn , mwb2 >= 15 & mwb2 <= 24 ), svymean, na.rm = TRUE )
cc @guilhermejacob