Reconcile how ncdfgeom::calculate_area_intersection_weights() calculates weights with gdptools

elliewhite-usgs commented 10 months ago

The major difference between the two methods is what goes in the denominator when calculating weights; in the case of ncdfgeom it is source geometry area and in the case of gdptools it is target geometry area.

This caused a bit of confusion, which would be nice to fix by:

either adding a parameter to the function that allows the user to pick which sort of weight they want to calculate or
explaining in the documentation that you can flip between he two with a simple multiplication gdptools(w) = ncdfgeom(w)* Ax/Ay

dblodgett-usgs commented 9 months ago

I've been chasing this down and think I understand what's going on now. gdptools is producing a normalized weight -- which is the fractional intersection as produced by ncdfgeom divided by the target polygon area. This leverages the fact that the sum(w * areasqkm[source]) == areasqkm[target] by definition.

dblodgett-usgs commented 9 months ago

I think I have this figured out.

library(ncdfgeom)

x1 = sf::st_polygon(list(rbind(c(-1,-1), c(1,-1),
                           c(1,1), c(-1,1),
                           c(-1,-1))))
x2 = x1 + 2
x3 = x1 + c(0, 2)
x4 = x1 + c(2, 0)
x = sf::st_sfc(x1, x2, x3, x4)
y1 = x1 * 0.75 + c(-.25, -.25)
y2 = y1 + 1.5
y3 = y1 + c(0, 1.5)
y4 = y1 + c(1.5, 0)
y = sf::st_sfc(y1,y2,y3, y4)
plot(x, border = 'red', lwd = 3)
plot(y, border = 'green', add = TRUE)


sf::st_crs(x) <- sf::st_crs(y) <- sf::st_crs(5070)

x <- sf::st_sf(x, data.frame(idx = c(1, 2, 3, 4)))
y <- sf::st_sf(y, data.frame(idy = c("a", "b", "c", "d")))

sf::st_agr(y) <- sf::st_agr(x) <- "constant"

# say we have data from `x` that we want sampled to `y`.
# this gives the percent of each `x` that intersects each `y`

(x_y <- calculate_area_intersection_weights(x, y, normalize = FALSE))
#> Loading required namespace: areal
#> # A tibble: 9 × 3
#>     idx idy        w
#>   <dbl> <chr>  <dbl>
#> 1     1 a     0.562 
#> 2     1 b     0.0625
#> 3     2 b     0.25  
#> 4     3 b     0.125 
#> 5     4 b     0.125 
#> 6     1 c     0.188 
#> 7     3 c     0.375 
#> 8     1 d     0.188 
#> 9     4 d     0.375

# note that `w` sums to 1 where `y` completely covers `x`.
# w does not sum to 1 where `y` partially covers `x`
dplyr::summarize(dplyr::group_by(x_y, idx), w = sum(w))
#> # A tibble: 4 × 2
#>     idx     w
#>   <dbl> <dbl>
#> 1     1  1   
#> 2     2  0.25
#> 3     3  0.5 
#> 4     4  0.5

# and to use these in an area weighted sum we need the area of each polygon in 
# the source data (x)

(z_x_y <- dplyr::tibble(idx = unique(x_y$idx),
                   val = c(1, 2, 3, 4)) |>
  dplyr::left_join(x_y, by = "idx") |>
  dplyr::mutate(val = ifelse(is.na(w), NA, val),
                areasqkm = 2 ^ 2) |> # area of each polygon in `y`
  dplyr::group_by(idy) |> # group so we get one row per `y`
  # now we weight by the percent of the area from each polygon in `y` per polygon in `x`
  dplyr::mutate(new_val = sum( (val * w * areasqkm), na.rm = TRUE ) / sum(w * areasqkm)))
#> # A tibble: 9 × 6
#> # Groups:   idy [4]
#>     idx   val idy        w areasqkm new_val
#>   <dbl> <dbl> <chr>  <dbl>    <dbl>   <dbl>
#> 1     1     1 a     0.562         4    1   
#> 2     1     1 b     0.0625        4    2.56
#> 3     1     1 c     0.188         4    2.33
#> 4     1     1 d     0.188         4    3   
#> 5     2     2 b     0.25          4    2.56
#> 6     3     3 b     0.125         4    2.56
#> 7     3     3 c     0.375         4    2.33
#> 8     4     4 b     0.125         4    2.56
#> 9     4     4 d     0.375         4    3

# summarize to show the result
(z_x_y <- dplyr::summarise(z_x_y, new_val = unique(new_val)))
#> # A tibble: 4 × 2
#>   idy   new_val
#>   <chr>   <dbl>
#> 1 a        1   
#> 2 b        2.56
#> 3 c        2.33
#> 4 d        3

x$val <- c(1, 2, 3, 4)
y$val <- z_x_y$new_val

plot(x["val"], border = 'red', lwd = 3, breaks = c(0, 1, 2, 3, 4))


plot(x["val"], border = 'red', lwd = 3, reset = FALSE, breaks = c(0, 1, 2, 3, 4))
plot(y["val"], border = 'green', add = TRUE, breaks = c(0, 1, 2, 3, 4))


# we can go in reverse if we had data from x that we want sampled to y

x <- dplyr::select(x, -val)
y <- dplyr::select(y, -val)

# if we reverse the above, we take data from y that we want to sample to x
(y_x <- calculate_area_intersection_weights(y, x, normalize = FALSE))
#> # A tibble: 9 × 3
#>   idy     idx     w
#>   <chr> <dbl> <dbl>
#> 1 a         1 1    
#> 2 b         1 0.111
#> 3 c         1 0.333
#> 4 d         1 0.333
#> 5 b         2 0.444
#> 6 b         3 0.222
#> 7 c         3 0.667
#> 8 b         4 0.222
#> 9 d         4 0.667

# note that `w` sums to 1 only where `x` completely covers `y`
# in this case all x are vully covered by y

dplyr::summarize(dplyr::group_by(y_x, idy), w = sum(w))
#> # A tibble: 4 × 2
#>   idy       w
#>   <chr> <dbl>
#> 1 a         1
#> 2 b         1
#> 3 c         1
#> 4 d         1

# and to use these in an area weighted sum we need the area of each polygon in 
# the source data (x)

(z_y_x <- dplyr::tibble(idy = unique(y_x$idy),
                    val = c(1, 2, 3, 4)) |>
    dplyr::left_join(y_x, by = "idy") |>
    dplyr::mutate(val = ifelse(is.na(w), NA, val),
                  areasqkm = 1.5 ^ 2) |> # area of each polygon in `x`
    dplyr::group_by(idx) |> # group so we get one row per `x`
    # now we weight by the percent of the area from each polygon in `x` per polygon in `y`
    dplyr::mutate(new_val = sum( (val * w * areasqkm), na.rm = TRUE ) / sum(w * areasqkm)))
#> # A tibble: 9 × 6
#> # Groups:   idx [4]
#>   idy     val   idx     w areasqkm new_val
#>   <chr> <dbl> <dbl> <dbl>    <dbl>   <dbl>
#> 1 a         1     1 1         2.25    2   
#> 2 b         2     1 0.111     2.25    2   
#> 3 b         2     2 0.444     2.25    2   
#> 4 b         2     3 0.222     2.25    2.75
#> 5 b         2     4 0.222     2.25    3.5 
#> 6 c         3     1 0.333     2.25    2   
#> 7 c         3     3 0.667     2.25    2.75
#> 8 d         4     1 0.333     2.25    2   
#> 9 d         4     4 0.667     2.25    3.5

# summarize to show the result
(z_y_x <- dplyr::summarise(z_y_x, new_val = unique(new_val)))
#> # A tibble: 4 × 2
#>     idx new_val
#>   <dbl>   <dbl>
#> 1     1    2   
#> 2     2    2   
#> 3     3    2.75
#> 4     4    3.5

x$val <- z_y_x$new_val
y$val <- c(1, 2, 3, 4)

plot(x["val"], border = 'red', lwd = 3, breaks = c(0, 1, 2, 3, 4))


plot(x["val"], border = 'red', lwd = 3, reset = FALSE, breaks = c(0, 1, 2, 3, 4))
plot(y["val"], border = 'green', add = TRUE, breaks = c(0, 1, 2, 3, 4))


x <- dplyr::select(x, -val)
y <- dplyr::select(y, -val)

# with `normalize = TRUE`, `w` will sum to 1 when the target
# completely covers the source rather than when the source completly
# covers the target.

(x_y_norm <- calculate_area_intersection_weights(x, y, normalize = TRUE))
#> # A tibble: 9 × 3
#>     idx idy       w
#>   <dbl> <chr> <dbl>
#> 1     1 a     1    
#> 2     1 b     0.111
#> 3     2 b     0.444
#> 4     3 b     0.222
#> 5     4 b     0.222
#> 6     1 c     0.333
#> 7     3 c     0.667
#> 8     1 d     0.333
#> 9     4 d     0.667

dplyr::summarize(dplyr::group_by(x_y_norm, idy), w = sum(w))
#> # A tibble: 4 × 2
#>   idy       w
#>   <chr> <dbl>
#> 1 a         1
#> 2 b         1
#> 3 c         1
#> 4 d         1

# and to use these in an area weighted sum we need the area of each polygon in 
# the source data (x)

(z_x_y_norm <- dplyr::tibble(idx = unique(x_y_norm$idx),
                    val = c(1, 2, 3, 4)) |>
    dplyr::left_join(x_y_norm, by = "idx") |>
    dplyr::mutate(val = ifelse(is.na(w), NA, val)) |> # area of each polygon in `y`
    dplyr::group_by(idy) |> # group so we get one row per `y`
    # now we weight by the percent of the area from each polygon in `y` per polygon in `x`
    dplyr::mutate(new_val = sum( (val * w), na.rm = TRUE )))
#> # A tibble: 9 × 5
#> # Groups:   idy [4]
#>     idx   val idy       w new_val
#>   <dbl> <dbl> <chr> <dbl>   <dbl>
#> 1     1     1 a     1        1   
#> 2     1     1 b     0.111    2.56
#> 3     1     1 c     0.333    2.33
#> 4     1     1 d     0.333    3   
#> 5     2     2 b     0.444    2.56
#> 6     3     3 b     0.222    2.56
#> 7     3     3 c     0.667    2.33
#> 8     4     4 b     0.222    2.56
#> 9     4     4 d     0.667    3

# summarize to show the result
(z_x_y_norm <- dplyr::summarise(z_x_y_norm, new_val = unique(new_val)))
#> # A tibble: 4 × 2
#>   idy   new_val
#>   <chr>   <dbl>
#> 1 a        1   
#> 2 b        2.56
#> 3 c        2.33
#> 4 d        3

all.equal(z_x_y, z_x_y_norm)
#> [1] TRUE

x$val <- c(1, 2, 3, 4)
y$val <- z_x_y_norm$new_val

plot(x["val"], border = 'red', lwd = 3, breaks = c(0, 1, 2, 3, 4))


plot(x["val"], border = 'red', lwd = 3, reset = FALSE, breaks = c(0, 1, 2, 3, 4))
plot(y["val"], border = 'green', add = TRUE, breaks = c(0, 1, 2, 3, 4))


x <- dplyr::select(x, -val)
y <- dplyr::select(y, -val)

(y_x_norm <- calculate_area_intersection_weights(y, x, normalize = TRUE))
#> # A tibble: 9 × 3
#>   idy     idx      w
#>   <chr> <dbl>  <dbl>
#> 1 a         1 0.562 
#> 2 b         1 0.0625
#> 3 c         1 0.188 
#> 4 d         1 0.188 
#> 5 b         2 1     
#> 6 b         3 0.25  
#> 7 c         3 0.75  
#> 8 b         4 0.25  
#> 9 d         4 0.75

dplyr::summarize(dplyr::group_by(y_x_norm, idx), w = sum(w))
#> # A tibble: 4 × 2
#>     idx     w
#>   <dbl> <dbl>
#> 1     1     1
#> 2     2     1
#> 3     3     1
#> 4     4     1

# and to use these in an area weighted sum we need the area of each polygon in 
# the source data (x)

(z_y_x_norm <- dplyr::tibble(idy = unique(y_x_norm$idy),
                    val = c(1, 2, 3, 4)) |>
    dplyr::left_join(y_x_norm, by = "idy") |>
    dplyr::mutate(val = ifelse(is.na(w), NA, val)) |>
    dplyr::group_by(idx) |> # group so we get one row per `x`
    # now we weight by the percent of the area from each polygon in `x` per polygon in `y`
    dplyr::mutate(new_val = sum( (val * w), na.rm = TRUE )))
#> # A tibble: 9 × 5
#> # Groups:   idx [4]
#>   idy     val   idx      w new_val
#>   <chr> <dbl> <dbl>  <dbl>   <dbl>
#> 1 a         1     1 0.562     2   
#> 2 b         2     1 0.0625    2   
#> 3 b         2     2 1         2   
#> 4 b         2     3 0.25      2.75
#> 5 b         2     4 0.25      3.5 
#> 6 c         3     1 0.188     2   
#> 7 c         3     3 0.75      2.75
#> 8 d         4     1 0.188     2   
#> 9 d         4     4 0.75      3.5

# summarize to show the result
(z_y_x_norm <- dplyr::summarise(z_y_x_norm, new_val = unique(new_val)))
#> # A tibble: 4 × 2
#>     idx new_val
#>   <dbl>   <dbl>
#> 1     1    2   
#> 2     2    2   
#> 3     3    2.75
#> 4     4    3.5

x$val <- z_y_x_norm$new_val
y$val <- c(1, 2, 3, 4)

plot(x["val"], border = 'red', lwd = 3, breaks = c(0, 1, 2, 3, 4))


plot(x["val"], border = 'red', lwd = 3, reset = FALSE, breaks = c(0, 1, 2, 3, 4))
plot(y["val"], border = 'green', add = TRUE, breaks = c(0, 1, 2, 3, 4))


all.equal(z_y_x$new_val, z_y_x_norm$new_val)
#> [1] TRUE

^{Created on 2024-02-03 with reprex v2.0.2}

dblodgett-usgs commented 9 months ago

Also see https://doi-usgs.github.io/ncdfgeom/articles/polygon_intersection.html

DOI-USGS / ncdfgeom

Reconcile how ncdfgeom::calculate_area_intersection_weights() calculates weights with gdptools #94