ajdamico / convey

variance of distribution measures estimation of survey data
GNU General Public License v3.0
17 stars 7 forks source link

how to reproduce any 2022 or 2023 ibge-published gini with pnadc? #439

Closed ajdamico closed 10 months ago

ajdamico commented 10 months ago

===

https://cidades.ibge.gov.br/brasil/pesquisa/45/62590 says 2022 gini 0.544

https://agenciadenoticias.ibge.gov.br/agencia-noticias/2012-agencia-de-noticias/noticias/36857-em-2022-mercado-de-trabalho-e-auxilio-brasil-permitem-recuperacao-dos-rendimentos

says 2022 gini 0.544 and 2022 earnings gini 0.499: A desigualdade de rendimentos diminuiu no conjunto da população e também na população ocupada: o Índice de Gini do rendimento domiciliar per capita caiu de 0,544 para 0,518 e o Gini do rendimento de todos os trabalhos caiu de 0,499 para 0,486, ambos os menores da série.

===

library(PNADcIBGE)
library(survey)
library(convey)

four_quarters <-
    lapply(
        1:4 ,
        function( w ) get_pnadc( year = 2022 , quarter = w , design = FALSE )
    )

stacked_year <- Reduce( rbind , four_quarters )

names( stacked_year ) <- tolower( names( stacked_year ) )

pnad_design <-
    svrepdesign(
        data = stacked_year , 
        weight = ~v1028, 
        type = "bootstrap", 
        repweights = "v1028[0-9]+", 
        mse = TRUE, 
        replicates = length(sprintf("v1028%03d", seq(1:200))), 
        df = length(sprintf("v1028%03d", seq(1:200)))
    )

pnad_design <- convey_prep( pnad_design )

pnad_design <-
    update(
        pnad_design ,

        pia = as.numeric( v2009 >= 14 )

    )

pnad_design <-
    update(
        pnad_design ,

        # calculate usual income from all jobs
        # (variavel rendimento habitual de todos os trabalhos)
        vd4019n = ifelse( pia %in% 1 & vd4015 %in% 'Remuneração em dinheiro, produtos ou mercadorias no trabalho principal' , vd4019 , NA ) ,

        # calculate effective income from all jobs
        # (rendimento efetivo do todos os trabalhos) 
        vd4020n = ifelse( pia %in% 1 & vd4015 %in% 'Remuneração em dinheiro, produtos ou mercadorias no trabalho principal' , vd4020 , NA )
    )

domicilio_design <- subset( pnad_design , v2005 == 'Pessoa responsável pelo domicílio' )

svygini( ~ vd4020n , pnad_design , na.rm = TRUE )
#          gini     SE
# vd4020n 0.51259 0.0015

svygini( ~ vd4020n , domicilio_design , na.rm = TRUE )
#          gini     SE
# vd4020n 0.53555 0.0017
guilhermejacob commented 10 months ago

It is somewhat less complicated:

# load libraries
library(PNADcIBGE)
library(survey)
library(convey)
library(data.table)

# get dataset
pnadc.df <- get_pnadc( 2022 , interview = 5 , design = FALSE )

# fixes column names
colnames( pnadc.df ) <- tolower( colnames( pnadc.df ) )

# transforms into data.table
pnadc.df <- data.table( pnadc.df )

# household identfier
pnadc.df[ , id.domicilio := do.call( paste , c( .SD , sep = "" ) ) , .SDcols = c( "upa" , "v1008" , "v1014" ) ]

# deflation procedure
pnadc.df[ , def.vd4019 := vd4019 * co2 ] # labor incomes
pnadc.df[ , def.vd4048 := vd4048 * co2e ] # incomes from other sources

# create household totals
pnadc.df[ !(vd2002 %in% 17:19) , rtrab := sum( def.vd4019 , na.rm = TRUE ) , by = .( id.domicilio ) ]
pnadc.df[ !(vd2002 %in% 17:19) , routr := sum( def.vd4048 , na.rm = TRUE ) , by = .( id.domicilio ) ]

# compute per capita income
pnadc.df[ !(vd2002 %in% 17:19) , def.rdpc := ( rtrab + routr ) / vd2003 ]

# create survey design object
pnadc.design <- 
  svrepdesign( data = pnadc.df ,
               weight = ~v1032,
               type = "bootstrap" , 
               repweights = "v1032[0-9]{3}" ,
               mse = TRUE )

# prepare for convey functions
pnadc.design <- convey_prep( pnadc.design )

# gini for household income per capita
( gini1 <- svygini( ~ def.rdpc , pnadc.design , na.rm = TRUE ) )
# gini     SE
# def.rdpc 0.51846 0.0032

# gini for labor incomes
( gini2 <- svygini( ~ def.vd4019 , pnadc.design , na.rm = TRUE ) )
# gini     SE
# def.vd4019 0.48606 0.0036

You can also check their CVs:

first one should be around 0.6% (as per this):

cv( gini1 )
# def.rdpc 
# 0.006076709 

... and the second should be around 0.7% (as per this):

cv( gini2 )
# def.vd4019 
# 0.007401191