DOI-USGS / ncdfgeom

NetCDF-CF Geometry and Timeseries Tools for R: https://code.usgs.gov/water/ncdfgeom
https://doi-usgs.github.io/ncdfgeom/
15 stars 7 forks source link

Reconcile how ncdfgeom::calculate_area_intersection_weights() calculates weights with gdptools #94

Closed elliewhite-usgs closed 8 months ago

elliewhite-usgs commented 9 months ago

image

image

The major difference between the two methods is what goes in the denominator when calculating weights; in the case of ncdfgeom it is source geometry area and in the case of gdptools it is target geometry area.

This caused a bit of confusion, which would be nice to fix by:

dblodgett-usgs commented 8 months ago

I've been chasing this down and think I understand what's going on now. gdptools is producing a normalized weight -- which is the fractional intersection as produced by ncdfgeom divided by the target polygon area. This leverages the fact that the sum(w * areasqkm[source]) == areasqkm[target] by definition.

dblodgett-usgs commented 8 months ago

I think I have this figured out.

library(ncdfgeom)

x1 = sf::st_polygon(list(rbind(c(-1,-1), c(1,-1),
                           c(1,1), c(-1,1),
                           c(-1,-1))))
x2 = x1 + 2
x3 = x1 + c(0, 2)
x4 = x1 + c(2, 0)
x = sf::st_sfc(x1, x2, x3, x4)
y1 = x1 * 0.75 + c(-.25, -.25)
y2 = y1 + 1.5
y3 = y1 + c(0, 1.5)
y4 = y1 + c(1.5, 0)
y = sf::st_sfc(y1,y2,y3, y4)
plot(x, border = 'red', lwd = 3)
plot(y, border = 'green', add = TRUE)


sf::st_crs(x) <- sf::st_crs(y) <- sf::st_crs(5070)

x <- sf::st_sf(x, data.frame(idx = c(1, 2, 3, 4)))
y <- sf::st_sf(y, data.frame(idy = c("a", "b", "c", "d")))

sf::st_agr(y) <- sf::st_agr(x) <- "constant"

# say we have data from `x` that we want sampled to `y`.
# this gives the percent of each `x` that intersects each `y`

(x_y <- calculate_area_intersection_weights(x, y, normalize = FALSE))
#> Loading required namespace: areal
#> # A tibble: 9 × 3
#>     idx idy        w
#>   <dbl> <chr>  <dbl>
#> 1     1 a     0.562 
#> 2     1 b     0.0625
#> 3     2 b     0.25  
#> 4     3 b     0.125 
#> 5     4 b     0.125 
#> 6     1 c     0.188 
#> 7     3 c     0.375 
#> 8     1 d     0.188 
#> 9     4 d     0.375

# note that `w` sums to 1 where `y` completely covers `x`.
# w does not sum to 1 where `y` partially covers `x`
dplyr::summarize(dplyr::group_by(x_y, idx), w = sum(w))
#> # A tibble: 4 × 2
#>     idx     w
#>   <dbl> <dbl>
#> 1     1  1   
#> 2     2  0.25
#> 3     3  0.5 
#> 4     4  0.5

# and to use these in an area weighted sum we need the area of each polygon in 
# the source data (x)

(z_x_y <- dplyr::tibble(idx = unique(x_y$idx),
                   val = c(1, 2, 3, 4)) |>
  dplyr::left_join(x_y, by = "idx") |>
  dplyr::mutate(val = ifelse(is.na(w), NA, val),
                areasqkm = 2 ^ 2) |> # area of each polygon in `y`
  dplyr::group_by(idy) |> # group so we get one row per `y`
  # now we weight by the percent of the area from each polygon in `y` per polygon in `x`
  dplyr::mutate(new_val = sum( (val * w * areasqkm), na.rm = TRUE ) / sum(w * areasqkm)))
#> # A tibble: 9 × 6
#> # Groups:   idy [4]
#>     idx   val idy        w areasqkm new_val
#>   <dbl> <dbl> <chr>  <dbl>    <dbl>   <dbl>
#> 1     1     1 a     0.562         4    1   
#> 2     1     1 b     0.0625        4    2.56
#> 3     1     1 c     0.188         4    2.33
#> 4     1     1 d     0.188         4    3   
#> 5     2     2 b     0.25          4    2.56
#> 6     3     3 b     0.125         4    2.56
#> 7     3     3 c     0.375         4    2.33
#> 8     4     4 b     0.125         4    2.56
#> 9     4     4 d     0.375         4    3

# summarize to show the result
(z_x_y <- dplyr::summarise(z_x_y, new_val = unique(new_val)))
#> # A tibble: 4 × 2
#>   idy   new_val
#>   <chr>   <dbl>
#> 1 a        1   
#> 2 b        2.56
#> 3 c        2.33
#> 4 d        3

x$val <- c(1, 2, 3, 4)
y$val <- z_x_y$new_val

plot(x["val"], border = 'red', lwd = 3, breaks = c(0, 1, 2, 3, 4))


plot(x["val"], border = 'red', lwd = 3, reset = FALSE, breaks = c(0, 1, 2, 3, 4))
plot(y["val"], border = 'green', add = TRUE, breaks = c(0, 1, 2, 3, 4))


# we can go in reverse if we had data from x that we want sampled to y

x <- dplyr::select(x, -val)
y <- dplyr::select(y, -val)

# if we reverse the above, we take data from y that we want to sample to x
(y_x <- calculate_area_intersection_weights(y, x, normalize = FALSE))
#> # A tibble: 9 × 3
#>   idy     idx     w
#>   <chr> <dbl> <dbl>
#> 1 a         1 1    
#> 2 b         1 0.111
#> 3 c         1 0.333
#> 4 d         1 0.333
#> 5 b         2 0.444
#> 6 b         3 0.222
#> 7 c         3 0.667
#> 8 b         4 0.222
#> 9 d         4 0.667

# note that `w` sums to 1 only where `x` completely covers `y`
# in this case all x are vully covered by y

dplyr::summarize(dplyr::group_by(y_x, idy), w = sum(w))
#> # A tibble: 4 × 2
#>   idy       w
#>   <chr> <dbl>
#> 1 a         1
#> 2 b         1
#> 3 c         1
#> 4 d         1

# and to use these in an area weighted sum we need the area of each polygon in 
# the source data (x)

(z_y_x <- dplyr::tibble(idy = unique(y_x$idy),
                    val = c(1, 2, 3, 4)) |>
    dplyr::left_join(y_x, by = "idy") |>
    dplyr::mutate(val = ifelse(is.na(w), NA, val),
                  areasqkm = 1.5 ^ 2) |> # area of each polygon in `x`
    dplyr::group_by(idx) |> # group so we get one row per `x`
    # now we weight by the percent of the area from each polygon in `x` per polygon in `y`
    dplyr::mutate(new_val = sum( (val * w * areasqkm), na.rm = TRUE ) / sum(w * areasqkm)))
#> # A tibble: 9 × 6
#> # Groups:   idx [4]
#>   idy     val   idx     w areasqkm new_val
#>   <chr> <dbl> <dbl> <dbl>    <dbl>   <dbl>
#> 1 a         1     1 1         2.25    2   
#> 2 b         2     1 0.111     2.25    2   
#> 3 b         2     2 0.444     2.25    2   
#> 4 b         2     3 0.222     2.25    2.75
#> 5 b         2     4 0.222     2.25    3.5 
#> 6 c         3     1 0.333     2.25    2   
#> 7 c         3     3 0.667     2.25    2.75
#> 8 d         4     1 0.333     2.25    2   
#> 9 d         4     4 0.667     2.25    3.5

# summarize to show the result
(z_y_x <- dplyr::summarise(z_y_x, new_val = unique(new_val)))
#> # A tibble: 4 × 2
#>     idx new_val
#>   <dbl>   <dbl>
#> 1     1    2   
#> 2     2    2   
#> 3     3    2.75
#> 4     4    3.5

x$val <- z_y_x$new_val
y$val <- c(1, 2, 3, 4)

plot(x["val"], border = 'red', lwd = 3, breaks = c(0, 1, 2, 3, 4))


plot(x["val"], border = 'red', lwd = 3, reset = FALSE, breaks = c(0, 1, 2, 3, 4))
plot(y["val"], border = 'green', add = TRUE, breaks = c(0, 1, 2, 3, 4))


x <- dplyr::select(x, -val)
y <- dplyr::select(y, -val)

# with `normalize = TRUE`, `w` will sum to 1 when the target
# completely covers the source rather than when the source completly
# covers the target.

(x_y_norm <- calculate_area_intersection_weights(x, y, normalize = TRUE))
#> # A tibble: 9 × 3
#>     idx idy       w
#>   <dbl> <chr> <dbl>
#> 1     1 a     1    
#> 2     1 b     0.111
#> 3     2 b     0.444
#> 4     3 b     0.222
#> 5     4 b     0.222
#> 6     1 c     0.333
#> 7     3 c     0.667
#> 8     1 d     0.333
#> 9     4 d     0.667

dplyr::summarize(dplyr::group_by(x_y_norm, idy), w = sum(w))
#> # A tibble: 4 × 2
#>   idy       w
#>   <chr> <dbl>
#> 1 a         1
#> 2 b         1
#> 3 c         1
#> 4 d         1

# and to use these in an area weighted sum we need the area of each polygon in 
# the source data (x)

(z_x_y_norm <- dplyr::tibble(idx = unique(x_y_norm$idx),
                    val = c(1, 2, 3, 4)) |>
    dplyr::left_join(x_y_norm, by = "idx") |>
    dplyr::mutate(val = ifelse(is.na(w), NA, val)) |> # area of each polygon in `y`
    dplyr::group_by(idy) |> # group so we get one row per `y`
    # now we weight by the percent of the area from each polygon in `y` per polygon in `x`
    dplyr::mutate(new_val = sum( (val * w), na.rm = TRUE )))
#> # A tibble: 9 × 5
#> # Groups:   idy [4]
#>     idx   val idy       w new_val
#>   <dbl> <dbl> <chr> <dbl>   <dbl>
#> 1     1     1 a     1        1   
#> 2     1     1 b     0.111    2.56
#> 3     1     1 c     0.333    2.33
#> 4     1     1 d     0.333    3   
#> 5     2     2 b     0.444    2.56
#> 6     3     3 b     0.222    2.56
#> 7     3     3 c     0.667    2.33
#> 8     4     4 b     0.222    2.56
#> 9     4     4 d     0.667    3

# summarize to show the result
(z_x_y_norm <- dplyr::summarise(z_x_y_norm, new_val = unique(new_val)))
#> # A tibble: 4 × 2
#>   idy   new_val
#>   <chr>   <dbl>
#> 1 a        1   
#> 2 b        2.56
#> 3 c        2.33
#> 4 d        3

all.equal(z_x_y, z_x_y_norm)
#> [1] TRUE

x$val <- c(1, 2, 3, 4)
y$val <- z_x_y_norm$new_val

plot(x["val"], border = 'red', lwd = 3, breaks = c(0, 1, 2, 3, 4))


plot(x["val"], border = 'red', lwd = 3, reset = FALSE, breaks = c(0, 1, 2, 3, 4))
plot(y["val"], border = 'green', add = TRUE, breaks = c(0, 1, 2, 3, 4))


x <- dplyr::select(x, -val)
y <- dplyr::select(y, -val)

(y_x_norm <- calculate_area_intersection_weights(y, x, normalize = TRUE))
#> # A tibble: 9 × 3
#>   idy     idx      w
#>   <chr> <dbl>  <dbl>
#> 1 a         1 0.562 
#> 2 b         1 0.0625
#> 3 c         1 0.188 
#> 4 d         1 0.188 
#> 5 b         2 1     
#> 6 b         3 0.25  
#> 7 c         3 0.75  
#> 8 b         4 0.25  
#> 9 d         4 0.75

dplyr::summarize(dplyr::group_by(y_x_norm, idx), w = sum(w))
#> # A tibble: 4 × 2
#>     idx     w
#>   <dbl> <dbl>
#> 1     1     1
#> 2     2     1
#> 3     3     1
#> 4     4     1

# and to use these in an area weighted sum we need the area of each polygon in 
# the source data (x)

(z_y_x_norm <- dplyr::tibble(idy = unique(y_x_norm$idy),
                    val = c(1, 2, 3, 4)) |>
    dplyr::left_join(y_x_norm, by = "idy") |>
    dplyr::mutate(val = ifelse(is.na(w), NA, val)) |>
    dplyr::group_by(idx) |> # group so we get one row per `x`
    # now we weight by the percent of the area from each polygon in `x` per polygon in `y`
    dplyr::mutate(new_val = sum( (val * w), na.rm = TRUE )))
#> # A tibble: 9 × 5
#> # Groups:   idx [4]
#>   idy     val   idx      w new_val
#>   <chr> <dbl> <dbl>  <dbl>   <dbl>
#> 1 a         1     1 0.562     2   
#> 2 b         2     1 0.0625    2   
#> 3 b         2     2 1         2   
#> 4 b         2     3 0.25      2.75
#> 5 b         2     4 0.25      3.5 
#> 6 c         3     1 0.188     2   
#> 7 c         3     3 0.75      2.75
#> 8 d         4     1 0.188     2   
#> 9 d         4     4 0.75      3.5

# summarize to show the result
(z_y_x_norm <- dplyr::summarise(z_y_x_norm, new_val = unique(new_val)))
#> # A tibble: 4 × 2
#>     idx new_val
#>   <dbl>   <dbl>
#> 1     1    2   
#> 2     2    2   
#> 3     3    2.75
#> 4     4    3.5

x$val <- z_y_x_norm$new_val
y$val <- c(1, 2, 3, 4)

plot(x["val"], border = 'red', lwd = 3, breaks = c(0, 1, 2, 3, 4))


plot(x["val"], border = 'red', lwd = 3, reset = FALSE, breaks = c(0, 1, 2, 3, 4))
plot(y["val"], border = 'green', add = TRUE, breaks = c(0, 1, 2, 3, 4))


all.equal(z_y_x$new_val, z_y_x_norm$new_val)
#> [1] TRUE

Created on 2024-02-03 with reprex v2.0.2

dblodgett-usgs commented 8 months ago

Also see https://doi-usgs.github.io/ncdfgeom/articles/polygon_intersection.html