Closed elliewhite-usgs closed 9 months ago
I've been chasing this down and think I understand what's going on now. gdptools is producing a normalized weight
-- which is the fractional intersection as produced by ncdfgeom divided by the target polygon area. This leverages the fact that the sum(w * areasqkm[source]) == areasqkm[target] by definition.
I think I have this figured out.
library(ncdfgeom)
x1 = sf::st_polygon(list(rbind(c(-1,-1), c(1,-1),
c(1,1), c(-1,1),
c(-1,-1))))
x2 = x1 + 2
x3 = x1 + c(0, 2)
x4 = x1 + c(2, 0)
x = sf::st_sfc(x1, x2, x3, x4)
y1 = x1 * 0.75 + c(-.25, -.25)
y2 = y1 + 1.5
y3 = y1 + c(0, 1.5)
y4 = y1 + c(1.5, 0)
y = sf::st_sfc(y1,y2,y3, y4)
plot(x, border = 'red', lwd = 3)
plot(y, border = 'green', add = TRUE)
sf::st_crs(x) <- sf::st_crs(y) <- sf::st_crs(5070)
x <- sf::st_sf(x, data.frame(idx = c(1, 2, 3, 4)))
y <- sf::st_sf(y, data.frame(idy = c("a", "b", "c", "d")))
sf::st_agr(y) <- sf::st_agr(x) <- "constant"
# say we have data from `x` that we want sampled to `y`.
# this gives the percent of each `x` that intersects each `y`
(x_y <- calculate_area_intersection_weights(x, y, normalize = FALSE))
#> Loading required namespace: areal
#> # A tibble: 9 × 3
#> idx idy w
#> <dbl> <chr> <dbl>
#> 1 1 a 0.562
#> 2 1 b 0.0625
#> 3 2 b 0.25
#> 4 3 b 0.125
#> 5 4 b 0.125
#> 6 1 c 0.188
#> 7 3 c 0.375
#> 8 1 d 0.188
#> 9 4 d 0.375
# note that `w` sums to 1 where `y` completely covers `x`.
# w does not sum to 1 where `y` partially covers `x`
dplyr::summarize(dplyr::group_by(x_y, idx), w = sum(w))
#> # A tibble: 4 × 2
#> idx w
#> <dbl> <dbl>
#> 1 1 1
#> 2 2 0.25
#> 3 3 0.5
#> 4 4 0.5
# and to use these in an area weighted sum we need the area of each polygon in
# the source data (x)
(z_x_y <- dplyr::tibble(idx = unique(x_y$idx),
val = c(1, 2, 3, 4)) |>
dplyr::left_join(x_y, by = "idx") |>
dplyr::mutate(val = ifelse(is.na(w), NA, val),
areasqkm = 2 ^ 2) |> # area of each polygon in `y`
dplyr::group_by(idy) |> # group so we get one row per `y`
# now we weight by the percent of the area from each polygon in `y` per polygon in `x`
dplyr::mutate(new_val = sum( (val * w * areasqkm), na.rm = TRUE ) / sum(w * areasqkm)))
#> # A tibble: 9 × 6
#> # Groups: idy [4]
#> idx val idy w areasqkm new_val
#> <dbl> <dbl> <chr> <dbl> <dbl> <dbl>
#> 1 1 1 a 0.562 4 1
#> 2 1 1 b 0.0625 4 2.56
#> 3 1 1 c 0.188 4 2.33
#> 4 1 1 d 0.188 4 3
#> 5 2 2 b 0.25 4 2.56
#> 6 3 3 b 0.125 4 2.56
#> 7 3 3 c 0.375 4 2.33
#> 8 4 4 b 0.125 4 2.56
#> 9 4 4 d 0.375 4 3
# summarize to show the result
(z_x_y <- dplyr::summarise(z_x_y, new_val = unique(new_val)))
#> # A tibble: 4 × 2
#> idy new_val
#> <chr> <dbl>
#> 1 a 1
#> 2 b 2.56
#> 3 c 2.33
#> 4 d 3
x$val <- c(1, 2, 3, 4)
y$val <- z_x_y$new_val
plot(x["val"], border = 'red', lwd = 3, breaks = c(0, 1, 2, 3, 4))
plot(x["val"], border = 'red', lwd = 3, reset = FALSE, breaks = c(0, 1, 2, 3, 4))
plot(y["val"], border = 'green', add = TRUE, breaks = c(0, 1, 2, 3, 4))
# we can go in reverse if we had data from x that we want sampled to y
x <- dplyr::select(x, -val)
y <- dplyr::select(y, -val)
# if we reverse the above, we take data from y that we want to sample to x
(y_x <- calculate_area_intersection_weights(y, x, normalize = FALSE))
#> # A tibble: 9 × 3
#> idy idx w
#> <chr> <dbl> <dbl>
#> 1 a 1 1
#> 2 b 1 0.111
#> 3 c 1 0.333
#> 4 d 1 0.333
#> 5 b 2 0.444
#> 6 b 3 0.222
#> 7 c 3 0.667
#> 8 b 4 0.222
#> 9 d 4 0.667
# note that `w` sums to 1 only where `x` completely covers `y`
# in this case all x are vully covered by y
dplyr::summarize(dplyr::group_by(y_x, idy), w = sum(w))
#> # A tibble: 4 × 2
#> idy w
#> <chr> <dbl>
#> 1 a 1
#> 2 b 1
#> 3 c 1
#> 4 d 1
# and to use these in an area weighted sum we need the area of each polygon in
# the source data (x)
(z_y_x <- dplyr::tibble(idy = unique(y_x$idy),
val = c(1, 2, 3, 4)) |>
dplyr::left_join(y_x, by = "idy") |>
dplyr::mutate(val = ifelse(is.na(w), NA, val),
areasqkm = 1.5 ^ 2) |> # area of each polygon in `x`
dplyr::group_by(idx) |> # group so we get one row per `x`
# now we weight by the percent of the area from each polygon in `x` per polygon in `y`
dplyr::mutate(new_val = sum( (val * w * areasqkm), na.rm = TRUE ) / sum(w * areasqkm)))
#> # A tibble: 9 × 6
#> # Groups: idx [4]
#> idy val idx w areasqkm new_val
#> <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 a 1 1 1 2.25 2
#> 2 b 2 1 0.111 2.25 2
#> 3 b 2 2 0.444 2.25 2
#> 4 b 2 3 0.222 2.25 2.75
#> 5 b 2 4 0.222 2.25 3.5
#> 6 c 3 1 0.333 2.25 2
#> 7 c 3 3 0.667 2.25 2.75
#> 8 d 4 1 0.333 2.25 2
#> 9 d 4 4 0.667 2.25 3.5
# summarize to show the result
(z_y_x <- dplyr::summarise(z_y_x, new_val = unique(new_val)))
#> # A tibble: 4 × 2
#> idx new_val
#> <dbl> <dbl>
#> 1 1 2
#> 2 2 2
#> 3 3 2.75
#> 4 4 3.5
x$val <- z_y_x$new_val
y$val <- c(1, 2, 3, 4)
plot(x["val"], border = 'red', lwd = 3, breaks = c(0, 1, 2, 3, 4))
plot(x["val"], border = 'red', lwd = 3, reset = FALSE, breaks = c(0, 1, 2, 3, 4))
plot(y["val"], border = 'green', add = TRUE, breaks = c(0, 1, 2, 3, 4))
x <- dplyr::select(x, -val)
y <- dplyr::select(y, -val)
# with `normalize = TRUE`, `w` will sum to 1 when the target
# completely covers the source rather than when the source completly
# covers the target.
(x_y_norm <- calculate_area_intersection_weights(x, y, normalize = TRUE))
#> # A tibble: 9 × 3
#> idx idy w
#> <dbl> <chr> <dbl>
#> 1 1 a 1
#> 2 1 b 0.111
#> 3 2 b 0.444
#> 4 3 b 0.222
#> 5 4 b 0.222
#> 6 1 c 0.333
#> 7 3 c 0.667
#> 8 1 d 0.333
#> 9 4 d 0.667
dplyr::summarize(dplyr::group_by(x_y_norm, idy), w = sum(w))
#> # A tibble: 4 × 2
#> idy w
#> <chr> <dbl>
#> 1 a 1
#> 2 b 1
#> 3 c 1
#> 4 d 1
# and to use these in an area weighted sum we need the area of each polygon in
# the source data (x)
(z_x_y_norm <- dplyr::tibble(idx = unique(x_y_norm$idx),
val = c(1, 2, 3, 4)) |>
dplyr::left_join(x_y_norm, by = "idx") |>
dplyr::mutate(val = ifelse(is.na(w), NA, val)) |> # area of each polygon in `y`
dplyr::group_by(idy) |> # group so we get one row per `y`
# now we weight by the percent of the area from each polygon in `y` per polygon in `x`
dplyr::mutate(new_val = sum( (val * w), na.rm = TRUE )))
#> # A tibble: 9 × 5
#> # Groups: idy [4]
#> idx val idy w new_val
#> <dbl> <dbl> <chr> <dbl> <dbl>
#> 1 1 1 a 1 1
#> 2 1 1 b 0.111 2.56
#> 3 1 1 c 0.333 2.33
#> 4 1 1 d 0.333 3
#> 5 2 2 b 0.444 2.56
#> 6 3 3 b 0.222 2.56
#> 7 3 3 c 0.667 2.33
#> 8 4 4 b 0.222 2.56
#> 9 4 4 d 0.667 3
# summarize to show the result
(z_x_y_norm <- dplyr::summarise(z_x_y_norm, new_val = unique(new_val)))
#> # A tibble: 4 × 2
#> idy new_val
#> <chr> <dbl>
#> 1 a 1
#> 2 b 2.56
#> 3 c 2.33
#> 4 d 3
all.equal(z_x_y, z_x_y_norm)
#> [1] TRUE
x$val <- c(1, 2, 3, 4)
y$val <- z_x_y_norm$new_val
plot(x["val"], border = 'red', lwd = 3, breaks = c(0, 1, 2, 3, 4))
plot(x["val"], border = 'red', lwd = 3, reset = FALSE, breaks = c(0, 1, 2, 3, 4))
plot(y["val"], border = 'green', add = TRUE, breaks = c(0, 1, 2, 3, 4))
x <- dplyr::select(x, -val)
y <- dplyr::select(y, -val)
(y_x_norm <- calculate_area_intersection_weights(y, x, normalize = TRUE))
#> # A tibble: 9 × 3
#> idy idx w
#> <chr> <dbl> <dbl>
#> 1 a 1 0.562
#> 2 b 1 0.0625
#> 3 c 1 0.188
#> 4 d 1 0.188
#> 5 b 2 1
#> 6 b 3 0.25
#> 7 c 3 0.75
#> 8 b 4 0.25
#> 9 d 4 0.75
dplyr::summarize(dplyr::group_by(y_x_norm, idx), w = sum(w))
#> # A tibble: 4 × 2
#> idx w
#> <dbl> <dbl>
#> 1 1 1
#> 2 2 1
#> 3 3 1
#> 4 4 1
# and to use these in an area weighted sum we need the area of each polygon in
# the source data (x)
(z_y_x_norm <- dplyr::tibble(idy = unique(y_x_norm$idy),
val = c(1, 2, 3, 4)) |>
dplyr::left_join(y_x_norm, by = "idy") |>
dplyr::mutate(val = ifelse(is.na(w), NA, val)) |>
dplyr::group_by(idx) |> # group so we get one row per `x`
# now we weight by the percent of the area from each polygon in `x` per polygon in `y`
dplyr::mutate(new_val = sum( (val * w), na.rm = TRUE )))
#> # A tibble: 9 × 5
#> # Groups: idx [4]
#> idy val idx w new_val
#> <chr> <dbl> <dbl> <dbl> <dbl>
#> 1 a 1 1 0.562 2
#> 2 b 2 1 0.0625 2
#> 3 b 2 2 1 2
#> 4 b 2 3 0.25 2.75
#> 5 b 2 4 0.25 3.5
#> 6 c 3 1 0.188 2
#> 7 c 3 3 0.75 2.75
#> 8 d 4 1 0.188 2
#> 9 d 4 4 0.75 3.5
# summarize to show the result
(z_y_x_norm <- dplyr::summarise(z_y_x_norm, new_val = unique(new_val)))
#> # A tibble: 4 × 2
#> idx new_val
#> <dbl> <dbl>
#> 1 1 2
#> 2 2 2
#> 3 3 2.75
#> 4 4 3.5
x$val <- z_y_x_norm$new_val
y$val <- c(1, 2, 3, 4)
plot(x["val"], border = 'red', lwd = 3, breaks = c(0, 1, 2, 3, 4))
plot(x["val"], border = 'red', lwd = 3, reset = FALSE, breaks = c(0, 1, 2, 3, 4))
plot(y["val"], border = 'green', add = TRUE, breaks = c(0, 1, 2, 3, 4))
all.equal(z_y_x$new_val, z_y_x_norm$new_val)
#> [1] TRUE
Created on 2024-02-03 with reprex v2.0.2
The major difference between the two methods is what goes in the denominator when calculating weights; in the case of ncdfgeom it is source geometry area and in the case of gdptools it is target geometry area.
This caused a bit of confusion, which would be nice to fix by: