ajdamico / asdfree

analyze survey data for free
http://asdfree.com/
GNU General Public License v3.0
609 stars 450 forks source link

complete the unicef mics #158

Open ajdamico opened 7 years ago

ajdamico commented 7 years ago

cc @guilhermejacob

ajdamico commented 7 years ago

@guilhermejacob any chance you could match some unicef-published number off of mics that appropriately uses a survey design? not urgent at all. thanks

guilhermejacob commented 7 years ago

This matches some results of this, although the SE for literacy rate among men is different.

library( haven )
load(  "~\\mics\\Guyana\\2014\\wm.rda" )
wm <- x
wm$hl1 <- wm$ln
rm( x )

load(  "~\\mics\\Guyana\\2014\\mn.rda" )
mn <- x
rm( x )

# create codebook
wm_codebook <- lapply( wm, function( x ) attributes( x ) )
wm <- zap_labels( wm )

mn_codebook <- lapply( mn, function( x ) attributes( x ) )
mn <- zap_labels( mn )

# Can read a short sentence or went to secondary school
wm$literate <- wm$wb7 == 3 | wm$wb4 %in% 2:4
wm$literate[ is.na( wm$wb4 ) ] = (wm$wb7 == 3) [ is.na( wm$wb4 ) ]
wm$literate[ is.na( wm$wb7 ) ] = ( wm$wb4 %in% 2:4 ) [ is.na( wm$wb7 ) ]

mn$literate <- mn$mwb7 == 3 | mn$mwb4 %in% 2:4
mn$literate[ is.na( mn$mwb4 ) ] = (mn$mwb7 == 3) [ is.na( mn$mwb4 ) ]
mn$literate[ is.na( mn$mwb7 ) ] = ( mn$mwb4 %in% 2:4 ) [ is.na( mn$mwb7 ) ]

library(survey)
design_wm <- svydesign(ids=~hh1+hh2, strata=~hh6+hh7, weights=~wmweight, data=wm )
design_mn <- svydesign(ids=~hh1+hh2, strata=~hh6+hh7, weights=~mnweight, data=mn )

# Women
svymean( ~factor(literate), subset( design_wm , wb2 >= 15 & wb2 <= 24 ), na.rm = TRUE )
svyby( ~factor(literate), ~hh6, subset( design_wm , wb2 >= 15 & wb2 <= 24 ), svymean, na.rm = TRUE )

svytotal( ~factor(literate), subset( design_wm , wb2 >= 15 & wb2 <= 24 ), na.rm = TRUE )
sum( coef( svytotal( ~factor(literate), subset( design_wm , wb2 >= 15 & wb2 <= 24 ), na.rm = TRUE ) ) )

# Men :  SEs are different!
svymean( ~factor(literate), subset( design_mn , mwb2 >= 15 & mwb2 <= 24 ), na.rm = TRUE )
svyby( ~factor(literate), ~hh6, subset( design_mn , mwb2 >= 15 & mwb2 <= 24 ), svymean, na.rm = TRUE )

svytotal( ~factor(literate), subset( design_mn , mwb2 >= 15 & mwb2 <= 24 ), na.rm = TRUE )
sum( coef( svytotal( ~factor(literate), subset( design_mn , mwb2 >= 15 & mwb2 <= 24 ), na.rm = TRUE ) ) )
ajdamico commented 7 years ago

wow, thank you! that's annoying that it doesn't match. any idea why? maybe they're using a fpc= or something else that might throw off the design slightly? or an older version of the microdata? thanks for this, guilherme

guilhermejacob commented 7 years ago

@ajdamico , I don't really know. They used SPSS for this. Since our SE estimates are slightly higher, I'd like to try some fpc= approach. Can you help me with that?

Also, check this. They provide all SPSS syntaxes to reproduce results.

guilhermejacob commented 7 years ago

I think they use fpc or different strata.

This SPSS file shows how the plan is set. This one shows how they create new strata. SE03 Calculate.txt: this calculates the statistics.

guilhermejacob commented 7 years ago

This is better. Thanks, @ajdamico!

library(lodown)

mics_catalog <- get_catalog( data_name = "mics" , your_email = "email@address.com" , your_password = "password" )

mics_catalog <- mics_catalog[ grepl( "Guyana", mics_catalog$country, ignore.case = T ) , ]

lodown( data_name = "mics", catalog = mics_catalog , your_email = "email@address.com" , your_password = "password" )

# Literacy rate

library( haven )
load(  "~\\mics\\Guyana\\2014\\wm.rda" )
wm <- x
wm <- wm[ order( wm$hh1 , wm$hh2 , wm$ln ) , ]
rm( x )

load(  "~\\mics\\Guyana\\2014\\mn.rda" )
mn <- x
mn <- mn[ order( mn$hh1 , mn$hh2 , mn$ln ) , ]
rm( x )

# create codebook
wm_codebook <- lapply( wm, function( x ) attributes( x ) )
wm <- zap_labels( wm )

mn_codebook <- lapply( mn, function( x ) attributes( x ) )
mn <- zap_labels( mn )

###############################

# * Sort according to sampling domains (urban/rural within region) and cluster within domains.
# * The program assumes that the sampling domains are urban and rural areas (HH6) within 4 regions (HH7).

# sort cases HH7 HH6 HH1.
x <- mn[ with( mn , order( hh7 , hh6 , hh1 ) ) , ]

# * Compute stratum variable as pairs of clusters within same domains.
# compute strat = 1.
# compute strpair = 0.

x$strat <- 1
x$strpair <- 0

# do if (hh7 <> lag(hh7) or hh6 <> lag(hh6)).
# + compute strat = lag(strat)+1.
# + compute strpair = 0.
# else if (hh1 = lag(hh1)).
# + compute strat = lag(strat).
# + compute strpair = lag(strpair).
# else if (lag(strpair) = 0).
# + compute strat = lag(strat).
# + compute strpair = 1.
# else.
# + compute strat = lag(strat)+1.
# + compute strpair = 0.
# end if.

for( this_rec in seq( 2 , nrow( x ) ) ){

  first_condition <- ( x[ this_rec , 'hh7' ] != x[ this_rec - 1 , 'hh7' ] ) | ( x[ this_rec , 'hh6' ] != x[ this_rec - 1 , 'hh6' ] )

  second_condition <- x[ this_rec , 'hh1' ] == x[ this_rec - 1 , 'hh1' ]

  third_condition <- x[ this_rec - 1 , 'strpair' ] == 0

  if( first_condition ){

    x[ this_rec , 'strat' ] <- x[ this_rec - 1 , 'strat' ] + 1

    x[ this_rec , 'strpair' ] <- 0

  } else if( second_condition ){

    x[ this_rec , 'strat' ] <- x[ this_rec - 1 , 'strat' ]
    x[ this_rec , 'strpair' ] <- x[ this_rec - 1 , 'strpair' ]

  } else if( third_condition ){

    x[ this_rec , 'strat' ] <- x[ this_rec - 1 , 'strat' ]
    x[ this_rec , 'strpair' ] <- 1

  } else {

    x[ this_rec , 'strat' ] <- x[ this_rec - 1 , 'strat' ] + 1
    x[ this_rec , 'strpair' ] <- 0

  }

}

# * Sort in reverse order of cluster to check for single clusters in a domain.
# sort cases hh7 hh6 hh1 (D).

x <- x[ order( x$hh7 , x$hh6 , x$hh1 , decreasing = TRUE ) , ]

# * Recode single clusters in a domain into the prior stratum group.
# do if (sysmis(lag(hh7)) or hh7 <> lag(hh7) or hh6 <> lag(hh6)).
# + do if (strpair = 0).
# +   compute strat = strat - 1.
# + end if.
# else if (hh1 = lag(hh1)).
# + compute strat = lag(strat).
# end if.

for( this_rec in seq( 2 , nrow( x ) ) ){

  first_condition <- is.na( x[ this_rec - 1 , 'hh7' ] ) | ( x[ this_rec , 'hh7' ] != x[ this_rec - 1 , 'hh7' ] ) | ( x[ this_rec , 'hh6' ] != x[ this_rec - 1 , 'hh6' ] )

  second_condition <- x[ this_rec , 'strpair' ] == 0

  third_condition <- x[ this_rec , 'hh1' ] == x[ this_rec - 1 , 'hh1' ]

  if( first_condition ){

    if( second_condition ){

      x[ this_rec , 'strat' ] <- x[ this_rec , 'strat' ] - 1

    }

  } else if( third_condition ){

    x[ this_rec , 'strat' ] <- x[ this_rec - 1 , 'strat' ]

  }

}

# * Re-sort according to sampling domains (urban/rural within region) and cluster within domains.
# sort cases hh7 hh6 hh1.

x <- x[ order( x$hh7 , x$hh6 , x$hh1 ) , ]

# * Drop strpair variable.
# delete variables strpair.

x$strpair <- NULL

mn$strat <- x$strat

#############################

# Can read a short sentence or went to secondary school
wm$literate <- wm$wb7 == 3 | wm$wb4 %in% 2:4
wm$literate[ is.na( wm$wb4 ) ] = (wm$wb7 == 3) [ is.na( wm$wb4 ) ]
wm$literate[ is.na( wm$wb7 ) ] = ( wm$wb4 %in% 2:4 ) [ is.na( wm$wb7 ) ]

mn$literate <- mn$mwb7 == 3 | mn$mwb4 %in% 2:4
mn$literate[ is.na( mn$mwb4 ) ] = (mn$mwb7 == 3) [ is.na( mn$mwb4 ) ]
mn$literate[ is.na( mn$mwb7 ) ] = ( mn$mwb4 %in% 2:4 ) [ is.na( mn$mwb7 ) ]

#

library(survey)
options(survey.lonely.psu="adjust")
#options(survey.lonely.psu="average")
#options(survey.lonely.psu="remove")
#options(survey.lonely.psu="certainty")

design_mn <- svydesign( ids=~hh1 , strata=~strat , weights=~mnweight , data=mn , nest = TRUE )

# Men
svymean( ~factor(literate), subset( design_mn , mwb2 >= 15 & mwb2 <= 24 ), na.rm = TRUE )
svyby( ~factor(literate), ~hh6, subset( design_mn , mwb2 >= 15 & mwb2 <= 24 ), svymean, na.rm = TRUE )