Open njtierney opened 2 years ago
demonstrated by this figure
# Scatter `population` against `population_interpolated`, one panel per state.
# geom_abline() with its defaults draws the identity line (slope 1,
# intercept 0), so points on the line are where the two columns agree.
library(conmat)
library(ggplot2)

ggplot(
  data = abs_education_state_2020,
  mapping = aes(x = population, y = population_interpolated)
) +
  geom_point() +
  geom_abline() +
  theme(aspect.ratio = 1) +
  facet_wrap(facets = vars(state), ncol = 4)
Created on 2021-09-08 by the reprex package (v2.0.1)
Mentioned in #15
Discussing with Aarathy some notes on this:
- [ ] Calculate a proportion summary similar to these numbers (2-4, 5-16, etc.): https://github.com/njtierney/conmat/blob/713f08bed70715bc6fa9ee0030e245acc3b551e5/R/add_school_work_participation.R#L24-L47
- [ ] Then compare two approaches using
complete(age = 0:100)
vs
complete(0:24)
and see if those numbers are different
@njtierney Here is the proportion of school goers in each age group in 2020. I am not using complete(0:24),
as that would exclude the interpolated population for ages 24+.
# Proportion of the interpolated population counted in education, summarised
# by school age group, using the 2020 rows of `abs_education_state`.
abs_education_state %>%
  filter(year == 2020) %>%
  # Total the full- and part-time counts for each state/age pair.
  group_by(state, age) %>%
  summarise(population_educated = sum(n_full_and_part_time)) %>%
  ungroup() %>%
  # Make sure every state has a row for each age 0-100, using 0 where no
  # count exists.
  complete(state, age = 0:100, fill = list(population_educated = 0)) %>%
  # Bucket ages into school age groups; anything not matched by an explicit
  # range falls into "21+". The factor levels fix the display order.
  mutate(
    school_age_group = factor(
      case_when(
        between(age, 0, 1) ~ "0-1",
        between(age, 2, 4) ~ "2-4",
        between(age, 5, 16) ~ "5-16",
        between(age, 17, 18) ~ "17-18",
        between(age, 19, 20) ~ "19-20",
        TRUE ~ "21+"
      ),
      levels = c("0-1", "2-4", "5-16", "17-18", "19-20", "21+")
    )
  ) %>%
  # Bring in the columns of `abs_state_age_lookup` (including
  # `population_interpolated`, used below) for each state/age pair.
  left_join(abs_state_age_lookup, by = c("state", "age")) %>%
  group_by(school_age_group) %>%
  summarise(
    population_educated = sum(population_educated, na.rm = TRUE),
    population_interpolated = sum(population_interpolated, na.rm = TRUE)
  ) %>%
  mutate(prop = population_educated / population_interpolated)
#> # A tibble: 6 x 4
#> school_age_group population_educated population_interpolated prop
#> <fct> <dbl> <dbl> <dbl>
#> 1 0-1 0 622638. 0
#> 2 2-4 3224 940352. 0.00343
#> 3 5-16 3696425 3766882. 0.981
#> 4 17-18 299641 630120. 0.476
#> 5 19-20 5166 644791. 0.00801
#> 6 21+ 2518 19086747. 0.000132
Plotting the school goer population against the interpolated population for the given age groups. The outlier (21+) is likely caused by the small population of school goers in that age group.
# Scatter `population_educated` against `population_interpolated`, coloured by
# school age group, with one panel per state.
# NOTE(review): `school_prop` is not defined in the snippets shown here; it
# needs `state` and `school_age_group` columns for this plot - confirm how it
# is built.
library(ggplot2)

# A large `scipen` penalty makes R prefer fixed over scientific notation.
options(scipen = 999)

ggplot(
  data = school_prop,
  mapping = aes(
    x = population_educated,
    y = population_interpolated,
    color = school_age_group
  )
) +
  geom_point() +
  geom_abline() +
  theme(aspect.ratio = 1) +
  # Each panel gets its own x-axis range; the y-axis stays shared.
  facet_wrap(facets = vars(state), ncol = 4, scales = "free_x")
We were running into issues getting the education data cleaned up; see https://github.com/njtierney/conmat/blob/master/data-raw/clean-education.R#L120