Open njtierney opened 4 months ago
Were you thinking that geoarrow/geoparquet would be used only as a storage format? If so, I think it would make sense to restore the geometry format that was defined in the target, unlike what I did in the example which forced wk::wkb()
or geoarrow::geoarrow_vctr()
formats.
Something like the following should preserve input format, provided the geometries are able to be converted to/from geoarrow_vctr, via wk::wk_handle()
. The basic idea is to just preserve the class of each geometry and use that to lookup the appropriate wk::wk_writer()
.
[!NOTE]
I didn't test sf objects with their overloaded mutate and sticky geometries.
There's likely a much better approach than this! Suggest getting input from @paleolimbot.
library(targets)
tar_script({
requireNamespace("geoarrow")
tar_format_geoparquet <- tar_format(
read = function(path) {
translate <- function(x, cls) {
writer <- wk::wk_writer(structure(list(), class = cls))
wk::wk_handle(x, writer) |>
wk::wk_set_crs(wk::wk_crs(x)) |>
wk::wk_set_geodesic(wk::wk_is_geodesic(x))
}
object <- arrow::read_parquet(path)
classes <- attr(object, "wk_classes")
object |>
dplyr::mutate(
dplyr::across(
wk::is_handleable,
\(x) translate(x, classes[[dplyr::cur_column()]])
)
)
},
write = function(object, path) {
attr(object, "wk_classes") <- lapply(dplyr::select(object, wk::is_handleable), class)
object |>
dplyr::mutate(
dplyr::across(
wk::is_handleable,
geoarrow::as_geoarrow_vctr
)
) |>
arrow::write_parquet(path)
},
# omitted
marshal = function(object) NULL,
unmarshal = function(object) NULL
)
# plan
list(
tar_target(
sa4s_wkb,
as.data.frame(strayr::read_absmap("sa42021")) |>
dplyr::mutate(geometry = wk::as_wkb(geometry)),
format = tar_format_geoparquet
),
tar_target(
sa4s_sfc,
as.data.frame(strayr::read_absmap("sa42021")),
format = tar_format_geoparquet
)
)
})
tar_make()
#> Loading required namespace: geoarrow
#> ▶ dispatched target sa4s_sfc
#> trying URL 'https://github.com/wfmackey/absmapsdata/blob/master/data/absmapsdata_file_list.rda?raw=true'
#> Content type 'application/octet-stream' length 407 bytes
#> ==================================================
#> downloaded 407 bytes
#>
#> trying URL 'https://github.com/wfmackey/absmapsdata/raw/master/data/sa42021.rda'
#> Content type 'application/octet-stream' length 3044178 bytes (2.9 MB)
#> ==================================================
#> downloaded 2.9 MB
#>
#> ● completed target sa4s_sfc [1.164 seconds]
#> ▶ dispatched target sa4s_wkb
#> Reading sa42021 file found in /tmp/RtmpqFXeVe
#> ● completed target sa4s_wkb [0.258 seconds]
#> ▶ ended pipeline [2.3 seconds]
requireNamespace("geoarrow")
#> Loading required namespace: geoarrow
str(targets::tar_read(sa4s_wkb))
#> tibble [108 × 10] (S3: tbl_df/tbl/data.frame)
#> $ sa4_code_2021 : chr [1:108] "101" "102" "103" "104" ...
#> $ sa4_name_2021 : chr [1:108] "Capital Region" "Central Coast" "Central West" "Coffs Harbour - Grafton" ...
#> $ gcc_code_2021 : chr [1:108] "1RNSW" "1GSYD" "1RNSW" "1RNSW" ...
#> $ gcc_name_2021 : chr [1:108] "Rest of NSW" "Greater Sydney" "Rest of NSW" "Rest of NSW" ...
#> $ state_code_2021: chr [1:108] "1" "1" "1" "1" ...
#> $ state_name_2021: chr [1:108] "New South Wales" "New South Wales" "New South Wales" "New South Wales" ...
#> $ areasqkm_2021 : num [1:108] 51896 1681 70297 13230 339356 ...
#> $ cent_lat : num [1:108] -35.6 -33.3 -33.2 -29.8 -31 ...
#> $ cent_long : num [1:108] 149 151 148 153 145 ...
#> $ geometry : wk_wkb[1:108] <MULTIPOLYGON (((150.3113 -35.66587, 150.3126 -35.66813, 150
#> - attr(*, "sf_column")= chr "geometry"
#> - attr(*, "agr")= Factor w/ 3 levels "constant","aggregate",..: NA NA NA NA NA NA NA NA NA
#> ..- attr(*, "names")= chr [1:9] "sa4_code_2021" "sa4_name_2021" "gcc_code_2021" "gcc_name_2021" ...
#> - attr(*, "wk_classes")=List of 1
#> ..$ geometry: chr [1:2] "wk_wkb" "wk_vctr"
str(targets::tar_read(sa4s_sfc))
#> tibble [108 × 10] (S3: tbl_df/tbl/data.frame)
#> $ sa4_code_2021 : chr [1:108] "101" "102" "103" "104" ...
#> $ sa4_name_2021 : chr [1:108] "Capital Region" "Central Coast" "Central West" "Coffs Harbour - Grafton" ...
#> $ gcc_code_2021 : chr [1:108] "1RNSW" "1GSYD" "1RNSW" "1RNSW" ...
#> $ gcc_name_2021 : chr [1:108] "Rest of NSW" "Greater Sydney" "Rest of NSW" "Rest of NSW" ...
#> $ state_code_2021: chr [1:108] "1" "1" "1" "1" ...
#> $ state_name_2021: chr [1:108] "New South Wales" "New South Wales" "New South Wales" "New South Wales" ...
#> $ areasqkm_2021 : num [1:108] 51896 1681 70297 13230 339356 ...
#> $ cent_lat : num [1:108] -35.6 -33.3 -33.2 -29.8 -31 ...
#> $ cent_long : num [1:108] 149 151 148 153 145 ...
#> $ geometry :sfc_MULTIPOLYGON of length 108; first list element: List of 7
#> ..$ :List of 1
#> .. ..$ : num [1:4, 1:2] 150.3 150.3 150.3 150.3 -35.7 ...
#> ..$ :List of 1
#> .. ..$ : num [1:5, 1:2] 150 150 150 150 150 ...
#> ..$ :List of 1
#> .. ..$ : num [1:5, 1:2] 150 150 150 150 150 ...
#> ..$ :List of 1
#> .. ..$ : num [1:18, 1:2] 150 150 150 150 150 ...
#> ..$ :List of 1
#> .. ..$ : num [1:6, 1:2] 150 150 150 150 150 ...
#> ..$ :List of 1
#> .. ..$ : num [1:4, 1:2] 150.3 150.3 150.3 150.3 -35.7 ...
#> ..$ :List of 1
#> .. ..$ : num [1:5217, 1:2] 150 150 150 150 150 ...
#> ..- attr(*, "class")= chr [1:3] "XY" "MULTIPOLYGON" "sfg"
#> - attr(*, "sf_column")= chr "geometry"
#> - attr(*, "agr")= Factor w/ 3 levels "constant","aggregate",..: NA NA NA NA NA NA NA NA NA
#> ..- attr(*, "names")= chr [1:9] "sa4_code_2021" "sa4_name_2021" "gcc_code_2021" "gcc_name_2021" ...
#> - attr(*, "wk_classes")=List of 1
#> ..$ geometry: chr [1:2] "sfc_MULTIPOLYGON" "sfc"
Created on 2024-03-04 with reprex v2.1.0
This is very cool!
At a high level, yes, geoarrow + feather|ipc|Parquet via the Arrow package will give you very fast read/write and small file sizes! IPC is often overlooked but is one of the fastest and has a forthcoming bonus that the nanoarrow package will be able to read it in a month or two.
I think a standard arrow::write_parquet|feather|ipc_stream()
will work for the write: I'm pretty sure I implemented all the requisite methods so that sf or a data.frame with an sfc will be written with geoarrow extension types (using basically the code that you have above).
For the read, I would recommend using a restore function like sf::st_as_sfc()
or wk::as_wkb()
. Currently that goes through the wk system for most things but it doesn't have to and is probably faster without wk in the middle.
See examples from @anthonynorth https://github.com/njtierney/demo-geospatial-targets/issues/1