walkerke / tidycensus

Load US Census boundary and attribute data as 'tidyverse' and 'sf'-ready data frames in R
https://walker-data.com/tidycensus
Other
639 stars 100 forks source link

Unexpected results from interpolate_pw() #483

Closed dmoul closed 1 year ago

dmoul commented 1 year ago

Hi Kyle. First, many thanks for making it so easy to work with US Census data in R!

It seems there are cases in which I am not getting correct values from interpolate_pw(). I have been unable to confirm whether I am making a mistake (surely the most likely case) or perhaps there is some unexpected pattern in the geometry that the algorithm is struggling with. I'm using tidycensus_1.2.3 and sf_1.0-8.

Below I consider only NC Orange County tracts 107.* in 2000 and 2010. Tract 107.02 was about twice the desired tract population in the 2000 decennial census and was split into tracts 107.05 and 107.06 for 2010. The interpolated values for tracts 01, 03, and 04 are sensible, however 05 is about the size of 02 and 06 is exactly the size of 02. Note that this pathology exists in other NC counties (not included below). I did not see this problem when recreating your interpolate_pw() example at https://walker-data.com/census-r/spatial-analysis-with-us-census-data.html#small-area-time-series-analysis

Would you take a look? Thanks in advance.

  suppressMessages({
    library(tidyverse)
    library(cowplot) # for theme_map()
    library(janitor)
    library(glue)
    library(sf)
    library(tidycensus)
    library(tigris)
  })

  #|# options(dplyr.summarise.inform = FALSE)
  #|# options(tigris_use_cache = TRUE)
  sf_use_s2(FALSE)
#> Spherical geometry (s2) switched off

  my_proj <- my_proj <- 6542
  #|# NAD83(2011) / North Carolina            projected    6318 m

  county_subset <- "Orange"

  d_pop_2000_tract <- get_decennial(
    geography = c("tract"),
    variables = c(total = "P001001"),
    state = "NC",
    county = county_subset,
    year = 2000,
    geometry = TRUE,
    output = "wide"
  ) |>
    clean_names() |>
    separate(name, into = c("tract_name", "county", "state"), sep = ", ") |>
    mutate(tract_name = str_extract(tract_name, "\\d+(\\.\\d+)?"),
           year = 2000
    ) |>
    filter(str_detect(tract_name, "^107")) |>
    st_transform(crs = my_proj)
#> Getting data from the 2000 decennial Census
#> Downloading feature geometry from the Census website.  To cache shapefiles for use in future sessions, set `options(tigris_use_cache = TRUE)`.
#> Using Census Summary File 1

  d_pop_2010_tract <- get_decennial(
    geography = c("tract"),
    variables = c(total = "P001001"),
    state = "NC",
    county = county_subset,
    year = 2010,
    geometry = TRUE,
    output = "wide"
  ) |>
    clean_names() |>
    separate(name, into = c("tract_name", "county", "state"), sep = ", ") |>
    mutate(tract_name = str_extract(tract_name, "\\d+(\\.\\d+)?"),
           year = 2010
    ) |>
    filter(str_detect(tract_name, "^107")) |>
    st_transform(crs = my_proj)
#> Getting data from the 2010 decennial Census
#> Downloading feature geometry from the Census website.  To cache shapefiles for use in future sessions, set `options(tigris_use_cache = TRUE)`.
#> Using Census Summary File 1

  debug_tracts <- bind_rows(
    d_pop_2000_tract,
    d_pop_2010_tract
    )

#|# get block-level data to use for weightings
debug_orange_blocks <-  get_decennial(geography = c("block"),
                                      variables = c(total = "P001001"),
                                      #sumfile = "sf1",
                                      state = "NC",
                                      county = county_subset,
                                      year = 2010,
                                      geometry = TRUE,
                                      output = "wide"
) |>
  clean_names() |>
  mutate(year = 2010) |>
  relocate(year, .after = name) |>
  filter(str_detect(name, "Census Tract 107")) |>
  mutate(tract_name = str_extract(name, '(?<=Tract )(\\d+)\\.(\\d+)')) |>
  rename(pop10 = total)
#> Getting data from the 2010 decennial Census
#> Downloading feature geometry from the Census website.  To cache shapefiles for use in future sessions, set `options(tigris_use_cache = TRUE)`.
#> Using Census Summary File 1

data_for_plot <- debug_tracts |>
  mutate(year = factor(year)) |>
  st_as_sf() |>
  st_transform(crs = my_proj)

med_value = data_for_plot |>
  filter(year == 2010) |>
  pull(total) |>
  median(na.rm = TRUE)

data_for_plot |>
  ggplot() +
  geom_sf(aes(fill = total), 
          color = "black", linewidth = 0.2) + 
  geom_sf_label(aes(label = glue("tract: {tract_name}\npop: {total}")), #
                color = "firebrick", fill = "white", size = 2) +
  scale_fill_gradient2(low = "navyblue", high = "lightskyblue",
                       midpoint = log10(med_value),
                       trans = "log10") +
  facet_grid(. ~ year) +
  theme_map() +
  labs(title = "NC Orange County population (107.* census tracts)",
       subtitle = glue("\n2010 median: {med_value}"),
       fill = "Population")


#|# 2000 and 2010 data in their natural census tracts
debug_tracts |>
  st_drop_geometry() |>
  arrange(year, tract_name)
#> # A tibble: 9 × 6
#>   geoid       tract_name county        state          total  year
#>   <chr>       <chr>      <chr>         <chr>          <dbl> <dbl>
#> 1 37135010701 107.01     Orange County North Carolina  1938  2000
#> 2 37135010702 107.02     Orange County North Carolina  8510  2000
#> 3 37135010703 107.03     Orange County North Carolina  5170  2000
#> 4 37135010704 107.04     Orange County North Carolina  4614  2000
#> 5 37135010701 107.01     Orange County North Carolina  1973  2010
#> 6 37135010703 107.03     Orange County North Carolina  6064  2010
#> 7 37135010704 107.04     Orange County North Carolina  5134  2010
#> 8 37135010705 107.05     Orange County North Carolina  4573  2010
#> 9 37135010706 107.06     Orange County North Carolina  3203  2010

##  This is exactly the case that interpolate_pw() is designed to address. So let's do that.

#|# transfer 2000 census data to 2010 census tract boundaries to simplify 2000-2010 comparisons
debug_interpolate_pw_2000 <- interpolate_pw(  
  from = debug_tracts |> filter(year == 2000) |> select(geoid, total),
  to = debug_tracts |> filter(year == 2010) |> select(geoid, total),
  to_id = "geoid",
  extensive = TRUE,
  weights = debug_orange_blocks,
  weight_column = "pop10",
  crs = my_proj
)

#|# We would expect the population of geoid 37135010702 (total = 8510) to be apportioned across 37135010705 and 37135010706. But that's not what happened:
debug_interpolate_pw_2000 |>
  st_drop_geometry() |>
  arrange(geoid)
#> # A tibble: 5 × 2
#>   geoid       total
#>   <chr>       <dbl>
#> 1 37135010701 1938 
#> 2 37135010703 5170 
#> 3 37135010704 4614 
#> 4 37135010705 8497.
#> 5 37135010706 8510

## Might there be a problem with the weightings? It doesn't seem like it: while there are some blocks with zero population, there are no cases here in which a whole tract has zero population or there is missing data.

debug_orange_blocks |>
  filter(pop10 > 0) |>
  ggplot() +
  geom_sf(data = d_pop_2010_tract,
          color = NA, fill = "firebrick", linewidth = 0.5) +
  geom_sf(aes(fill = pop10), 
          color = NA) +
  geom_sf(data = d_pop_2010_tract,
          color = "black", fill = NA, linewidth = 0.5) +
  geom_sf_label(data = d_pop_2010_tract,
                aes(label = tract_name),
                color = "firebrick", fill = "white", size = 2) + 
  scale_fill_gradient2(low = "navyblue", high = "lightskyblue",
                       midpoint = log10(med_value),
                       trans = "log10") +
  facet_grid(. ~ year) +
  theme_map() +
  labs(title = "NC Orange County population",
       subtitle = glue("Census blocks in 107.* census tracts",
                       "\nRed blocks have zero population"),
       fill = "Population")
#> Warning in st_point_on_surface.sfc(sf::st_zm(x)): st_point_on_surface may not
#> give correct results for longitude/latitude data


#|# 2010 tract population (sum of block pop)
debug_orange_blocks |>
  st_drop_geometry() |>
  group_by(tract_name) |>
  summarize(n_blocks = n(),
            tract_pop = sum(pop10)) |>
  ungroup()
#> # A tibble: 5 × 3
#>   tract_name n_blocks tract_pop
#>   <chr>         <int>     <dbl>
#> 1 107.01           46      1973
#> 2 107.03           44      6064
#> 3 107.04           78      5134
#> 4 107.05           71      4573
#> 5 107.06           50      3203

#|# end of document

Created on 2022-11-12 with reprex v2.0.2

walkerke commented 1 year ago

Thank for filing this detailed, reproducible issue! I will have to do some more digging. I can reproduce your error, but what's perplexing me at the moment is that when I run the process for all of Orange County then subset down, I get the correct answer. Try running this and let me know if you see this too:

library(tidycensus)
library(tidyverse)

orange00 <- get_decennial(
  geography = "tract",
  variables = "P001001",
  year = 2000,
  state = "NC", 
  county = "Orange",
  geometry = TRUE
)

orange10 <- get_decennial(
  geography = "tract",
  variables = "P001001",
  year = 2010,
  state = "NC", 
  county = "Orange",
  geometry = TRUE
) %>%
  select(GEOID)

orange_00_to_10 <- orange00 %>%
  interpolate_pw(
    to = orange10,
    to_id = "GEOID",
    extensive = TRUE,
    weights = orange_blocks,
    weight_column = "total",
    crs = my_proj
  )

filter(orange_00_to_10, str_sub(GEOID, 7, 9) == "107")
Simple feature collection with 5 features and 2 fields
Geometry type: MULTIPOLYGON
Dimension:     XY
Bounding box:  xmin: 595274.1 ymin: 234144.7 xmax: 603836 ymax: 244269.5
Projected CRS: NAD83(2011) / North Carolina
# A tibble: 5 × 3
  GEOID                                                    geometry value
* <chr>                                          <MULTIPOLYGON [m]> <dbl>
1 37135010703 (((601003.3 240530, 601934.4 240212.1, 602282.7 2400… 5170 
2 37135010701 (((600383.9 242309.3, 600484.7 241938, 600541.5 2416… 1938 
3 37135010704 (((598798.1 234538.8, 598658.6 234556.2, 598630.6 23… 4614 
4 37135010705 (((603016.9 241210.8, 603442.2 240702.8, 603447.6 24… 5006.
5 37135010706 (((602113.8 243794.4, 602367.5 244179.4, 602771.1 24… 3505.
walkerke commented 1 year ago

I figured it out. It is related to #476. I use the name total local to the function and so when you try to interpolate a column named total, it doesn't work right. Naming your column anything else will work. Here is your exact code, but with the name pop instead of total for total population:

> debug_interpolate_pw_2000 |>
+   st_drop_geometry() |>
+   arrange(geoid)
# A tibble: 5 × 2
  geoid         pop
  <chr>       <dbl>
1 37135010701 1938 
2 37135010703 5170 
3 37135010704 4614 
4 37135010705 5002.
5 37135010706 3508.

As with #476, this is on me for not thinking through what names should be reserved internally. I'll push a fix and close this issue when I do, but in the meantime just don't use total and it'll work correctly.

walkerke commented 1 year ago

Fixed now - thanks again for the heads up!

dmoul commented 1 year ago

Confirmed: it works. Thanks for the fast turn-around.