Robinlovelace/overline-tests

The aim of this repo is to test functionality in the overline function and try to speed-it up.

if(!file.exists("routes.geojson")) {
  routes = pct::get_pct_routes_fast("isle-of-wight")
  routes = routes %>% 
    slice(1:1000)
  sf::write_sf(routes, "routes.geojson", delete_dsn = TRUE)
}

routes = geojsonsf::geojson_sf("routes.geojson")
nrow(routes)

[1] 1000

res = bench::mark(time_unit = "s",
  original = {stplanr::overline(routes, attrib = "foot")}
)

2024-01-24 00:50:25.932463 constructing segments

2024-01-24 00:50:26.773686 building geometry

2024-01-24 00:50:26.972302 simplifying geometry

2024-01-24 00:50:26.972523 aggregating flows

2024-01-24 00:50:27.212716 rejoining segments into linestrings

2024-01-24 00:50:30.133867 constructing segments

2024-01-24 00:50:30.843851 building geometry

2024-01-24 00:50:31.032385 simplifying geometry

2024-01-24 00:50:31.0326 aggregating flows

2024-01-24 00:50:31.197022 rejoining segments into linestrings

Warning: Some expressions had a GC in every iteration; so filtering is
disabled.

res |>
  dplyr::select(expression, median, mem_alloc) |>
  mutate(routes_per_second = nrow(routes) / median) |>
  knitr::kable()

expression	median	mem_alloc	routes_per_second
original	1.120444	213MB	892.5036

A breakdown of the overline function is as follows:

sl = sf::st_sf(
  data.frame(
    id = c("a", "b"),
    foot = c(1, 2),
    geometry = sf::st_sfc(
      sf::st_linestring(
        matrix(
          c(0, 0, 1, 1, 2, 2),
          ncol = 2,
          byrow = TRUE
        )
      ),
      sf::st_linestring(
        matrix(
          c(0, 1, 1, 1, 2, 2),
          ncol = 2,
          byrow = TRUE
        )
      )
    )
  )
)
plot(sl)

sl_overline = stplanr::overline(sl, attrib = "foot")
sl_overline

Simple feature collection with 3 features and 1 field
Geometry type: LINESTRING
Dimension:     XY
Bounding box:  xmin: 0 ymin: 0 xmax: 2 ymax: 2
CRS:           NA
  foot              geometry
1    1 LINESTRING (0 0, 1 1)
2    2 LINESTRING (0 1, 1 1)
3    3 LINESTRING (1 1, 2 2)

plot(sl_overline)

We can represent this in GeoPandas as follows:

import geopandas as gpd
from shapely.geometry import LineString

sl = gpd.GeoDataFrame(
    {
        "id": ["a", "b"],
        "value": [1, 2]
    },
    geometry = [
        LineString([(0, 0), (1, 1), (2, 2)]),
        LineString([(0, 1), (1, 1), (2, 2)])
    ]
)

# plot with values:
sl.plot(column = "value")
# The output should be along the lines of:
sl_overline = gpd.GeoDataFrame(
    {
        "value": [1, 2, 3]
    },
    geometry = [
        LineString([(0, 0), (1, 1)]),
        LineString([(0, 1), (1, 1)]),
        LineString([(1, 1), (2, 2)])
    ]
)
sl_overline.plot(column = "value")

sfheaders is x faster:

bench::mark(check = FALSE,
  sf = {c1 = sf::st_coordinates(sl)},
  sfheaders = {c1_new = sfheaders::sf_to_df(sl)}
)

# A tibble: 2 × 6
  expression      min   median `itr/sec` mem_alloc `gc/sec`
  <bch:expr> <bch:tm> <bch:tm>     <dbl> <bch:byt>    <dbl>
1 sf          20.78µs  23.83µs    39687.      384B     27.8
2 sfheaders    8.61µs   9.53µs   100085.      86KB     20.0

waldo::compare(c1, c1_new)

`old` is a double vector (0, 1, 2, 0, 1, ...)
`new` is an S3 object of class <data.frame>, a list

c1_new

  sfg_id linestring_id x y
1      1             1 0 0
2      1             1 1 1
3      1             1 2 2
4      2             2 0 1
5      2             2 1 1
6      2             2 2 2

bench::mark(check = FALSE,
  sf = {l1 = c1[, 3]},
  sfheaders = {l1_new = c1_new$sfg_id}
)

# A tibble: 2 × 6
  expression      min   median `itr/sec` mem_alloc `gc/sec`
  <bch:expr> <bch:tm> <bch:tm>     <dbl> <bch:byt>    <dbl>
1 sf            382ns    476ns  1843788.        0B        0
2 sfheaders     435ns    461ns  1763858.        0B        0

l1_new

[1] 1 1 1 2 2 2

kit is 2x faster:

l1_start = duplicated(l1) # find the break points between lines
l1_start = c(l1_start[2:length(l1)], FALSE)
bench::mark(check = FALSE,
  old = {l1_start = c(duplicated(l1)[2:length(l1)], FALSE)},
  new = {l1_start_new = c(!kit::fduplicated(l1_new))}
)

# A tibble: 2 × 6
  expression      min   median `itr/sec` mem_alloc `gc/sec`
  <bch:expr> <bch:tm> <bch:tm>     <dbl> <bch:byt>    <dbl>
1 old          1.81µs   2.23µs   410423.        0B     41.0
2 new        770.09ns 822.01ns  1131931.      31KB      0

get_start_end = function(l1) {
  dups = kit::fduplicated(l1)
  start_points = !dups
  end_points = c(start_points[-1], TRUE)
  list(start_points = start_points, end_points = end_points)
}
get_start_end(l1_new)

$start_points
[1]  TRUE FALSE FALSE  TRUE FALSE FALSE

$end_points
[1] FALSE FALSE  TRUE FALSE FALSE  TRUE

c1

     X Y L1
[1,] 0 0  1
[2,] 1 1  1
[3,] 2 2  1
[4,] 0 1  2
[5,] 1 1  2
[6,] 2 2  2

c2 <- c1[2:nrow(c1), 1:2] # Create new coords offset by one row
c2 <- rbind(c2, c(NA, NA))
c2[nrow(c1), ] <- c(NA, NA)
c2[!l1_start, 1] <- NA
c2[!l1_start, 2] <- NA
c3 <- cbind(c1, c2)
c3

     X Y L1  X  Y
[1,] 0 0  1  1  1
[2,] 1 1  1  2  2
[3,] 2 2  1 NA NA
[4,] 0 1  2  1  1
[5,] 1 1  2  2  2
[6,] 2 2  2 NA NA

coordinate_offset_old = function(c1, l1_start) {
  c2 <- c1[2:nrow(c1), 1:2] # Create new coords offset by one row
  c2 <- rbind(c2, c(NA, NA))
  c2[nrow(c1), ] <- c(NA, NA)
  c2[!l1_start, 1] <- NA
  c2[!l1_start, 2] <- NA
  c3 <- cbind(c1[, c("L1", "X", "Y")], c2)
  c3 <- c3[!is.na(c3[, 4]), ]
  c3
}
coordinate_offset = function(c1) {
  se = get_start_end(c1[, "L1"])
  coords_exploded = cbind(
    c1[!se$end_points, c("L1", "X", "Y")],
    c1[!se$start_points, 1:2]
    )
  colnames(coords_exploded) = c("L1", "X", "Y", "X2", "Y2")
}
coordinate_offset(c1)

bench::mark(check = FALSE,
  old = {c3 = coordinate_offset_old(c1, l1_start)},
  new = {c3_new = coordinate_offset(c1)}
)

# A tibble: 2 × 6
  expression      min   median `itr/sec` mem_alloc `gc/sec`
  <bch:expr> <bch:tm> <bch:tm>     <dbl> <bch:byt>    <dbl>
1 old          4.52µs   5.99µs   150659.   111.5KB     45.2
2 new          5.17µs   6.07µs   150227.    35.9KB     30.1

waldo::compare(c3, c3_new)

`old` is a double vector (1, 1, 2, 2, 0, ...)
`new` is a character vector ('L1', 'X', 'Y', 'X2', 'Y2')

sl_df = sf::st_drop_geometry(sl)
se = get_start_end(c1[, "L1"])
sl_df_expanded = sl_df[c3[, "L1"], ]

knitr::opts_chunk$set(
  eval = FALSE
)

sl_wide <- cbind(c3_new, sl_df_expanded)
bench::mark(check = FALSE,
  dplyr = dplyr::group_by_at(sl_wide, c("1", "2", "3", "4")) |>
    summarise_if(is.numeric, sum),
  collapse = collapse::fgroup_by(sl_wide, c("1", "2", "3", "4")) |>
    collapse::fsummarise(res = mean(foot))
)
sls = collapse::fgroup_by(sl_wide, c("1", "2", "3", "4")) |>
  collapse::fsummarise(foot = sum(foot))
attrib <- names(sls)[5:ncol(sls)]
coords <- as.matrix(sls[, 1:4])
sl_values <- sls[, -c(1:4)]
sl_crs = sf::st_crs(sl)

bench::mark(check = FALSE,
  sf = {geom <- sf::st_as_sfc(
        pbapply::pblapply(1:nrow(coords), function(y) {
          sf::st_linestring(matrix(coords[y, ], ncol = 2, byrow = T))
        }),
      crs = sl_crs
    )},
  sfh = {geomh <- 
        pbapply::pblapply(1:nrow(coords), function(y) {
          sfheaders::sfc_linestring(matrix(coords[y, ], ncol = 2, byrow = T))
        })}
    )

plot(geom)

            lapply(sl, function(y) {
              y <- dplyr::group_by_at(y, attrib)
              y <- dplyr::summarise(y, do_union = FALSE, .groups = "drop")
            })

    # Recombine into fewer lines
    if (simplify) {
      if (!quiet) {
        message(paste0(Sys.time(), " simplifying geometry"))
      }
      if (nrow(sl) > regionalise) {
        message(paste0("large data detected, using regionalisation, nrow = ", nrow(sl)))
        suppressWarnings(cents <- sf::st_centroid(sl))
        # Fix for https://github.com/r-spatial/sf/issues/1777
        if(sf::st_is_longlat(cents)){
          bbox <- sf::st_bbox(cents)
          bbox[1] <- bbox[1] - abs(bbox[1] * 0.001)
          bbox[2] <- bbox[2] - abs(bbox[2] * 0.001)
          bbox[3] <- bbox[3] + abs(bbox[3] * 0.001)
          bbox[4] <- bbox[4] + abs(bbox[4] * 0.001)
          bbox <- sf::st_as_sfc(bbox)
          grid <- sf::st_make_grid(bbox, what = "polygons")
        } else {
          grid <- sf::st_make_grid(cents, what = "polygons")
        }

        suppressWarnings(inter <- unlist(lapply(sf::st_intersects(cents, grid), `[[`, 1)))
        sl$grid <- inter
        rm(cents, grid, inter)
        # split into a list of df by grid
        sl <- dplyr::group_split(sl, grid)
        message(paste0(Sys.time(), " regionalisation complete, aggregating flows"))
        if (ncores > 1) {
          cl <- parallel::makeCluster(ncores)
          parallel::clusterExport(
            cl = cl,
            varlist = c("attrib"),
            envir = environment()
          )
          parallel::clusterEvalQ(cl, {
            library(sf)
            # library(dplyr)
          })
          overlined_simple <- if (requireNamespace("pbapply", quietly = TRUE)) {
            pbapply::pblapply(sl, function(y) {
              y <- dplyr::group_by_at(y, attrib)
              y <- dplyr::summarise(y, do_union = FALSE, .groups = "drop")
            }, cl = cl)
          } else {
            lapply(sl, function(y) {
              y <- dplyr::group_by_at(y, attrib)
              y <- dplyr::summarise(y, do_union = FALSE, .groups = "drop")
            })
          }

          parallel::stopCluster(cl)
          rm(cl)
        } else {
          overlined_simple <- if (requireNamespace("pbapply", quietly = TRUE)) {
            pbapply::pblapply(sl, function(y) {
              y <- dplyr::group_by_at(y, attrib)
              y <- dplyr::summarise(y, do_union = FALSE, .groups = "drop")
            })
          } else {
            lapply(sl, function(y) {
              y <- dplyr::group_by_at(y, attrib)
              y <- dplyr::summarise(y, do_union = FALSE, .groups = "drop")
            })
          }
        }
        rm(sl)
        overlined_simple <- data.table::rbindlist(overlined_simple)
        overlined_simple <- sf::st_sf(overlined_simple)
        overlined_simple <- overlined_simple[seq_len(nrow(overlined_simple)), ]
      } else {
        if (!quiet) {
          message(paste0(Sys.time(), " aggregating flows"))
        }
        overlined_simple <- dplyr::group_by_at(sl, attrib)
        overlined_simple <- dplyr::summarise(overlined_simple, do_union = FALSE, .groups = "drop")
        rm(sl)
      }

      overlined_simple <- as.data.frame(overlined_simple)
      overlined_simple <- sf::st_sf(overlined_simple)

      # Separate our the linestrings and the mulilinestrings
      if (!quiet) {
        message(paste0(Sys.time(), " rejoining segments into linestrings"))
      }
      overlined_simple <- sf::st_line_merge(overlined_simple)
      geom_types <- sf::st_geometry_type(overlined_simple)
      overlined_simple_l <- overlined_simple[geom_types == "LINESTRING", ]
      overlined_simple_ml <- overlined_simple[geom_types == "MULTILINESTRING", ]
      suppressWarnings(overlined_simple_ml <-
        sf::st_cast(
          sf::st_cast(overlined_simple_ml, "MULTILINESTRING"),
          "LINESTRING"
        ))

      return(rbind(overlined_simple_l, overlined_simple_ml))
    } else {
      return(sl)
    }

Testing…

overline2 <-
  function(sl,
           attrib,
           ncores = 1,
           simplify = TRUE,
           regionalise = 1e9,
           quiet = ifelse(nrow(sl) < 1000, TRUE, FALSE),
           fun = sum) {
    browser()
    if(as.character(unique(sf::st_geometry_type(sl))) == "MULTILINESTRING") {
      message("Converting from MULTILINESTRING to LINESTRING")
      sl <- sf::st_cast(sl, "LINESTRING")
    }
    if (!"sfc_LINESTRING" %in% class(sf::st_geometry(sl))) {
      stop("Only LINESTRING is supported")
    }
    if (is(sl, "data.table")) {
      sl_df <- as.data.frame(sf::st_drop_geometry(sl))
      sl <- sf::st_sf(sl_df, geometry = sl$geometry)
    }
    if (any(c("1", "2", "3", "4", "grid") %in% attrib)) {
      stop("1, 2, 3, 4, grid are not a permitted column names, please rename that column")
    }
    sl <- sf::st_zm(sl)
    sl <- sl[, attrib]
    sl_crs <- sf::st_crs(sl)
    if (!quiet) {
      message(paste0(Sys.time(), " constructing segments"))
    }
    c1 <- sf::st_coordinates(sl)
    sf::st_geometry(sl) <- NULL
    l1 <- c1[, 3] # Get which line each point is part of
    c1 <- c1[, 1:2]
    l1_start <- duplicated(l1) # find the break points between lines
    l1_start <- c(l1_start[2:length(l1)], FALSE)
    c2 <- c1[2:nrow(c1), 1:2] # Create new coords offset by one row
    c2 <- rbind(c2, c(NA, NA))
    c2[nrow(c1), ] <- c(NA, NA)
    c2[!l1_start, 1] <- NA
    c2[!l1_start, 2] <- NA
    c3 <- cbind(c1, c2) # make new matrix of start and end coords
    rm(c1, c2)
    c3 <- c3[!is.na(c3[, 3]), ]
    sl <- sl[l1[l1_start], , drop = FALSE] # repeate attributes
    rm(l1, l1_start)

    # if (!quiet) {
    #   message(paste0(Sys.time(), " transposing 'B to A' to 'A to B'"))
    # }
    attributes(c3)$dimnames <- NULL
    c3 <- t(apply(c3, MARGIN = 1, FUN = function(y) {
      if (y[1] != y[3]) {
        if (y[1] > y[3]) {
          c(y[3], y[4], y[1], y[2])
        } else {
          y
        }
      } else {
        if (y[2] > y[4]) {
          c(y[3], y[4], y[1], y[2])
        } else {
          y
        }
      }
    }))

    # if (!quiet) {
    #   message(paste0(Sys.time(), " removing duplicates"))
    # }
    sl <- cbind(c3, sl)
    rm(c3)

    # browser()
    # if(requireNamespace("data.table", quietly = TRUE)) {
    #   sl = data.table::data.table(sl)
    # }
    slg <- dplyr::group_by_at(sl, c("1", "2", "3", "4"))
    sls <- dplyr::ungroup(dplyr::summarise_all(slg, .funs = fun))
    attrib <- names(sls)[5:ncol(sls)]
    coords <- as.matrix(sls[, 1:4])
    sl <- sls[, -c(1:4)]

    # Make Geometry
    if (!quiet) {
      message(paste0(Sys.time(), " building geometry"))
    }
    sf::st_geometry(sl) <- sf::st_as_sfc(
      if (requireNamespace("pbapply", quietly = TRUE)) {
        pbapply::pblapply(1:nrow(coords), function(y) {
          sf::st_linestring(matrix(coords[y, ], ncol = 2, byrow = T))
        })
      } else {
        lapply(1:nrow(coords), function(y) {
          sf::st_linestring(matrix(coords[y, ], ncol = 2, byrow = T))
        })
      },
      crs = sl_crs
    )
    rm(coords)

    # Recombine into fewer lines
    if (simplify) {
      if (!quiet) {
        message(paste0(Sys.time(), " simplifying geometry"))
      }
      if (nrow(sl) > regionalise) {
        message(paste0("large data detected, using regionalisation, nrow = ", nrow(sl)))
        suppressWarnings(cents <- sf::st_centroid(sl))
        # Fix for https://github.com/r-spatial/sf/issues/1777
        if(sf::st_is_longlat(cents)){
          bbox <- sf::st_bbox(cents)
          bbox[1] <- bbox[1] - abs(bbox[1] * 0.001)
          bbox[2] <- bbox[2] - abs(bbox[2] * 0.001)
          bbox[3] <- bbox[3] + abs(bbox[3] * 0.001)
          bbox[4] <- bbox[4] + abs(bbox[4] * 0.001)
          bbox <- sf::st_as_sfc(bbox)
          grid <- sf::st_make_grid(bbox, what = "polygons")
        } else {
          grid <- sf::st_make_grid(cents, what = "polygons")
        }

        suppressWarnings(inter <- unlist(lapply(sf::st_intersects(cents, grid), `[[`, 1)))
        sl$grid <- inter
        rm(cents, grid, inter)
        # split into a list of df by grid
        sl <- dplyr::group_split(sl, grid)
        message(paste0(Sys.time(), " regionalisation complete, aggregating flows"))
        if (ncores > 1) {
          cl <- parallel::makeCluster(ncores)
          parallel::clusterExport(
            cl = cl,
            varlist = c("attrib"),
            envir = environment()
          )
          parallel::clusterEvalQ(cl, {
            library(sf)
            # library(dplyr)
          })
          overlined_simple <- if (requireNamespace("pbapply", quietly = TRUE)) {
            pbapply::pblapply(sl, function(y) {
              y <- dplyr::group_by_at(y, attrib)
              y <- dplyr::summarise(y, do_union = FALSE, .groups = "drop")
            }, cl = cl)
          } else {
            lapply(sl, function(y) {
              y <- dplyr::group_by_at(y, attrib)
              y <- dplyr::summarise(y, do_union = FALSE, .groups = "drop")
            })
          }

          parallel::stopCluster(cl)
          rm(cl)
        } else {
          overlined_simple <- if (requireNamespace("pbapply", quietly = TRUE)) {
            pbapply::pblapply(sl, function(y) {
              y <- dplyr::group_by_at(y, attrib)
              y <- dplyr::summarise(y, do_union = FALSE, .groups = "drop")
            })
          } else {
            lapply(sl, function(y) {
              y <- dplyr::group_by_at(y, attrib)
              y <- dplyr::summarise(y, do_union = FALSE, .groups = "drop")
            })
          }
        }
        rm(sl)
        overlined_simple <- data.table::rbindlist(overlined_simple)
        overlined_simple <- sf::st_sf(overlined_simple)
        overlined_simple <- overlined_simple[seq_len(nrow(overlined_simple)), ]
      } else {
        if (!quiet) {
          message(paste0(Sys.time(), " aggregating flows"))
        }
        overlined_simple <- dplyr::group_by_at(sl, attrib)
        overlined_simple <- dplyr::summarise(overlined_simple, do_union = FALSE, .groups = "drop")
        rm(sl)
      }

      overlined_simple <- as.data.frame(overlined_simple)
      overlined_simple <- sf::st_sf(overlined_simple)

      # Separate our the linestrings and the mulilinestrings
      if (!quiet) {
        message(paste0(Sys.time(), " rejoining segments into linestrings"))
      }
      overlined_simple <- sf::st_line_merge(overlined_simple)
      geom_types <- sf::st_geometry_type(overlined_simple)
      overlined_simple_l <- overlined_simple[geom_types == "LINESTRING", ]
      overlined_simple_ml <- overlined_simple[geom_types == "MULTILINESTRING", ]
      suppressWarnings(overlined_simple_ml <-
        sf::st_cast(
          sf::st_cast(overlined_simple_ml, "MULTILINESTRING"),
          "LINESTRING"
        ))

      return(rbind(overlined_simple_l, overlined_simple_ml))
    } else {
      return(sl)
    }
  }

Robinlovelace / overline-tests

readme