eitsupi opened 6 months ago
The current implementation on the `next` branch (benchmarked below as `neopolars`) is much slower at converting between R vectors and Polars Series.
``` r
# Construct an Arrow array from an R vector
long_vec_1 <- 1:10^6

bench::mark(
  arrow = {
    arrow::as_arrow_array(long_vec_1)
  },
  nanoarrow = {
    nanoarrow::as_nanoarrow_array(long_vec_1)
  },
  polars = {
    polars::as_polars_series(long_vec_1)
  },
  neopolars = {
    neopolars::as_polars_series(long_vec_1)
  },
  check = FALSE,
  min_iterations = 5
)
#> # A tibble: 4 × 6
#>   expression      min   median `itr/sec` mem_alloc `gc/sec`
#>   <bch:expr> <bch:tm> <bch:tm>     <dbl> <bch:byt>    <dbl>
#> 1 arrow        2.62ms   2.92ms      328.   19.82MB     2.04
#> 2 nanoarrow  496.13µs 644.87µs     1252.  458.41KB     2.03
#> 3 polars       2.06ms   2.26ms      405.    6.33MB     0
#> 4 neopolars    84.6ms   90.1ms      10.9    1.59MB     0

# Export Arrow data as an R vector
arrow_array_1 <- arrow::as_arrow_array(long_vec_1)
nanoarrow_array_1 <- nanoarrow::as_nanoarrow_array(long_vec_1)
polars_series_1 <- polars::as_polars_series(long_vec_1)
neopolars_series_1 <- neopolars::as_polars_series(long_vec_1)

bench::mark(
  arrow = {
    as.vector(arrow_array_1)
  },
  nanoarrow = {
    as.vector(nanoarrow_array_1)
  },
  polars = {
    as.vector(polars_series_1)
  },
  neopolars = {
    as.vector(neopolars_series_1)
  },
  check = TRUE,
  min_iterations = 5
)
#> # A tibble: 4 × 6
#>   expression      min   median `itr/sec` mem_alloc `gc/sec`
#>   <bch:expr> <bch:tm> <bch:tm>     <dbl> <bch:byt>    <dbl>
#> 1 arrow       13.94µs  15.84µs    46309.    4.59KB     4.63
#> 2 nanoarrow   559.9µs   1.85ms      513.    3.85MB    72.8
#> 3 polars       6.45ms   8.79ms      112.    5.93MB     9.13
#> 4 neopolars  148.82ms 164.65ms      6.02    5.24MB     0
```
Created on 2024-09-05 with reprex v2.1.1
This is strange because the construction process seems to be almost identical (the main branch takes a different code path depending on whether the vector contains `NA`, but in practice the speed does not seem to change whether `NA` is present or not).
https://github.com/pola-rs/r-polars/blob/f55eade690b6b505b310d05ef66492e5e47ddb68/src/rust/src/conversion_r_to_s.rs#L138-L150

https://github.com/pola-rs/r-polars/blob/72897e5bd1331e1ca9c079b02094e1d326249a71/src/rust/src/series/construction.rs#L22-L28
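One way to reproduce that check (a sketch; `long_vec_na` is a new variable introduced here, reusing `long_vec_1` from the reprex above):

``` r
# Time Series construction with and without NA values present.
# If the NA handling path were the bottleneck, these two timings should differ.
long_vec_na <- long_vec_1
long_vec_na[1] <- NA_integer_

bench::mark(
  without_na = polars::as_polars_series(long_vec_1),
  with_na    = polars::as_polars_series(long_vec_na),
  check = FALSE,
  min_iterations = 5
)
```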
Is the superior export speed of `arrow` and `nanoarrow` perhaps due to the use of ALTREP?
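A quick way to check (a sketch reusing the objects from the reprex above; `.Internal(inspect())` prints the object's internal representation, including an ALTREP class name if the vector is ALTREP-backed rather than an eager copy):

``` r
# If as.vector() returns an ALTREP wrapper around the underlying buffer,
# inspect() should report an ALTREP class; a plain materialized copy will not.
.Internal(inspect(as.vector(arrow_array_1)))
.Internal(inspect(as.vector(nanoarrow_array_1)))
.Internal(inspect(as.vector(polars_series_1)))
```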
It seems to take about 100 times longer than the conversion from an `arrow::Table`. Could `arrow` be using ALTREP to defer the materialization?
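If that is the case, the export timings above would mostly measure the cost of creating the lazy wrapper rather than of actually reading the data. A rough way to account for that (a sketch reusing the objects from the reprex above; `sum()` is just an arbitrary operation that touches every element and should therefore include any deferred materialization cost):

``` r
# Compare "convert only" against "convert and read all values".
# For a lazy ALTREP result most of the cost should shift into the *_read case;
# for an eagerly copied result the two timings should be similar.
bench::mark(
  arrow_convert       = as.vector(arrow_array_1),
  arrow_convert_read  = sum(as.vector(arrow_array_1)),
  polars_convert      = as.vector(polars_series_1),
  polars_convert_read = sum(as.vector(polars_series_1)),
  check = FALSE,
  min_iterations = 5
)
```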
<details>
<summary>Details</summary>

``` r
library(polars)
library(arrow, warn.conflicts = FALSE)

polars_info()
#> Polars R package version : 0.16.3
#> Rust Polars crate version: 0.39.2
#>
#> Thread pool size: 16
#>
#> Features:
#> default                    TRUE
#> full_features              TRUE
#> disable_limit_max_threads  TRUE
#> nightly                    TRUE
#> sql                        TRUE
#> rpolars_debug_print       FALSE
#>
#> Code completion: deactivated

arrow_info()
#> Arrow package version: 15.0.1
#>
#> Capabilities:
#>
#> acero      TRUE
#> dataset    TRUE
#> substrait FALSE
#> parquet    TRUE
#> json       TRUE
#> s3         TRUE
#> gcs        TRUE
#> utf8proc   TRUE
#> re2        TRUE
#> snappy     TRUE
#> gzip       TRUE
#> brotli     TRUE
#> zstd       TRUE
#> lz4        TRUE
#> lz4_frame  TRUE
#> lzo       FALSE
#> bz2        TRUE
#> jemalloc   TRUE
#> mimalloc   TRUE
#>
#> Memory:
#>
#> Allocator jemalloc
#> Current    0 bytes
#> Max        0 bytes
#>
#> Runtime:
#>
#> SIMD Level          avx2
#> Detected SIMD Level avx2
#>
#> Build:
#>
#> C++ Library Version  15.0.1
#> C++ Compiler            GNU
#> C++ Compiler Version 11.4.0

big_df <- do.call(rbind, lapply(1:5, \(x) nycflights13::flights))

from_r <- bench::mark(
  as_polars_df = as_polars_df(big_df),
  as_arrow_table = as_arrow_table(big_df),
  check = FALSE,
  min_iterations = 5
)

big_pldf <- as_polars_df(big_df)
big_at <- as_arrow_table(big_df)

to_r <- bench::mark(
  pldf = as.data.frame(big_pldf),
  at = as.data.frame(big_at),
  check = FALSE,
  min_iterations = 5
)
#> Warning: Some expressions had a GC in every iteration; so filtering is
#> disabled.

from_r
#> # A tibble: 2 × 6
#>   expression          min   median `itr/sec` mem_alloc `gc/sec`
#>
```

</details>