duckdb / duckdb-r

The duckdb R package
https://r.duckdb.org/
Other
105 stars 21 forks source link

`duckdb_register_arrow()` lose factor variable type #188

Open brianmsm opened 1 week ago

brianmsm commented 1 week ago

When a tibble with a variable as a factor is registered in duckdb, it maintains its characteristic of being a factor, just as when it is registered as an arrow table. However, when an arrow table with this characteristic is registered in duckdb, the configuration is lost.

library(dplyr, warn.conflicts = FALSE)
library(duckdb)
#> Cargando paquete requerido: DBI
library(arrow)
#> 
#> Adjuntando el paquete: 'arrow'
#> The following object is masked from 'package:utils':
#> 
#>     timestamp

data(penguins, package = "palmerpenguins")
penguins_arrow <- arrow_table(penguins)
penguins_arrow
#> Table
#> 344 rows x 8 columns
#> $species <dictionary<values=string, indices=int8>>
#> $island <dictionary<values=string, indices=int8>>
#> $bill_length_mm <double>
#> $bill_depth_mm <double>
#> $flipper_length_mm <int32>
#> $body_mass_g <int32>
#> $sex <dictionary<values=string, indices=int8>>
#> $year <int32>

con_duck <- DBI::dbConnect(duckdb::duckdb(), ":memory:")

# species, island and sex as a factors when penguins is tbl
copy_to(con_duck, penguins)
tbl(con_duck, "penguins")
#> # Source:   table<penguins> [?? x 8]
#> # Database: DuckDB v1.0.0 [brian@Windows 10 x64:R 4.4.1/:memory:]
#>    species island    bill_length_mm bill_depth_mm flipper_length_mm body_mass_g
#>    <fct>   <fct>              <dbl>         <dbl>             <int>       <int>
#>  1 Adelie  Torgersen           39.1          18.7               181        3750
#>  2 Adelie  Torgersen           39.5          17.4               186        3800
#>  3 Adelie  Torgersen           40.3          18                 195        3250
#>  4 Adelie  Torgersen           NA            NA                  NA          NA
#>  5 Adelie  Torgersen           36.7          19.3               193        3450
#>  6 Adelie  Torgersen           39.3          20.6               190        3650
#>  7 Adelie  Torgersen           38.9          17.8               181        3625
#>  8 Adelie  Torgersen           39.2          19.6               195        4675
#>  9 Adelie  Torgersen           34.1          18.1               193        3475
#> 10 Adelie  Torgersen           42            20.2               190        4250
#> # ℹ more rows
#> # ℹ 2 more variables: sex <fct>, year <int>

# missing factors when penguins is arrow table
duckdb_register_arrow(con_duck,
                      "penguins_arrow",
                      penguins_arrow)

tbl(con_duck, "penguins_arrow")
#> # Source:   table<penguins_arrow> [?? x 8]
#> # Database: DuckDB v1.0.0 [brian@Windows 10 x64:R 4.4.1/:memory:]
#>    species island    bill_length_mm bill_depth_mm flipper_length_mm body_mass_g
#>    <chr>   <chr>              <dbl>         <dbl>             <int>       <int>
#>  1 Adelie  Torgersen           39.1          18.7               181        3750
#>  2 Adelie  Torgersen           39.5          17.4               186        3800
#>  3 Adelie  Torgersen           40.3          18                 195        3250
#>  4 Adelie  Torgersen           NA            NA                  NA          NA
#>  5 Adelie  Torgersen           36.7          19.3               193        3450
#>  6 Adelie  Torgersen           39.3          20.6               190        3650
#>  7 Adelie  Torgersen           38.9          17.8               181        3625
#>  8 Adelie  Torgersen           39.2          19.6               195        4675
#>  9 Adelie  Torgersen           34.1          18.1               193        3475
#> 10 Adelie  Torgersen           42            20.2               190        4250
#> # ℹ more rows
#> # ℹ 2 more variables: sex <chr>, year <int>

Created on 2024-06-26 with reprex v2.1.0

krlmlr commented 3 days ago

Thanks. Do you see the same behavior when storing the penguins in a Parquet file?

brianmsm commented 3 days ago

By storing the data as a parquet file and reading it from arrow, it keeps the configuration... but if it is imported directly to duckdb, it loses it.

I have done several different combinations. I'm not sure if it was redundant

library(dplyr, warn.conflicts = FALSE)
library(duckdb)
#> Cargando paquete requerido: DBI
library(arrow)
#> 
#> Adjuntando el paquete: 'arrow'
#> The following object is masked from 'package:utils':
#> 
#>     timestamp

# Load data
data(penguins, package = "palmerpenguins")
penguins_arrow <- arrow_table(penguins)

# Write 2 files
write_parquet(penguins, "penguins.parquet")
write_parquet(penguins_arrow, "penguins_arrow.parquet")

# Read 2 files keeping factor/dictionary type
read_parquet("penguins.parquet")
#> # A tibble: 344 × 8
#>    species island    bill_length_mm bill_depth_mm flipper_length_mm body_mass_g
#>    <fct>   <fct>              <dbl>         <dbl>             <int>       <int>
#>  1 Adelie  Torgersen           39.1          18.7               181        3750
#>  2 Adelie  Torgersen           39.5          17.4               186        3800
#>  3 Adelie  Torgersen           40.3          18                 195        3250
#>  4 Adelie  Torgersen           NA            NA                  NA          NA
#>  5 Adelie  Torgersen           36.7          19.3               193        3450
#>  6 Adelie  Torgersen           39.3          20.6               190        3650
#>  7 Adelie  Torgersen           38.9          17.8               181        3625
#>  8 Adelie  Torgersen           39.2          19.6               195        4675
#>  9 Adelie  Torgersen           34.1          18.1               193        3475
#> 10 Adelie  Torgersen           42            20.2               190        4250
#> # ℹ 334 more rows
#> # ℹ 2 more variables: sex <fct>, year <int>
read_parquet("penguins.parquet", as_data_frame = FALSE)
#> Table
#> 344 rows x 8 columns
#> $species <dictionary<values=string, indices=int32>>
#> $island <dictionary<values=string, indices=int32>>
#> $bill_length_mm <double>
#> $bill_depth_mm <double>
#> $flipper_length_mm <int32>
#> $body_mass_g <int32>
#> $sex <dictionary<values=string, indices=int32>>
#> $year <int32>

read_parquet("penguins_arrow.parquet")
#> # A tibble: 344 × 8
#>    species island    bill_length_mm bill_depth_mm flipper_length_mm body_mass_g
#>    <fct>   <fct>              <dbl>         <dbl>             <int>       <int>
#>  1 Adelie  Torgersen           39.1          18.7               181        3750
#>  2 Adelie  Torgersen           39.5          17.4               186        3800
#>  3 Adelie  Torgersen           40.3          18                 195        3250
#>  4 Adelie  Torgersen           NA            NA                  NA          NA
#>  5 Adelie  Torgersen           36.7          19.3               193        3450
#>  6 Adelie  Torgersen           39.3          20.6               190        3650
#>  7 Adelie  Torgersen           38.9          17.8               181        3625
#>  8 Adelie  Torgersen           39.2          19.6               195        4675
#>  9 Adelie  Torgersen           34.1          18.1               193        3475
#> 10 Adelie  Torgersen           42            20.2               190        4250
#> # ℹ 334 more rows
#> # ℹ 2 more variables: sex <fct>, year <int>
read_parquet("penguins_arrow.parquet", as_data_frame = FALSE)
#> Table
#> 344 rows x 8 columns
#> $species <dictionary<values=string, indices=int32>>
#> $island <dictionary<values=string, indices=int32>>
#> $bill_length_mm <double>
#> $bill_depth_mm <double>
#> $flipper_length_mm <int32>
#> $body_mass_g <int32>
#> $sex <dictionary<values=string, indices=int32>>
#> $year <int32>

# Settings duck connection
con_duck <- DBI::dbConnect(duckdb::duckdb(), ":memory:")

# Doesn't keep the variable as a factor
tbl(con_duck, "read_parquet('penguins.parquet')")
#> # Source:   SQL [?? x 8]
#> # Database: DuckDB v1.0.0 [brian@Windows 10 x64:R 4.4.1/:memory:]
#>    species island    bill_length_mm bill_depth_mm flipper_length_mm body_mass_g
#>    <chr>   <chr>              <dbl>         <dbl>             <int>       <int>
#>  1 Adelie  Torgersen           39.1          18.7               181        3750
#>  2 Adelie  Torgersen           39.5          17.4               186        3800
#>  3 Adelie  Torgersen           40.3          18                 195        3250
#>  4 Adelie  Torgersen           NA            NA                  NA          NA
#>  5 Adelie  Torgersen           36.7          19.3               193        3450
#>  6 Adelie  Torgersen           39.3          20.6               190        3650
#>  7 Adelie  Torgersen           38.9          17.8               181        3625
#>  8 Adelie  Torgersen           39.2          19.6               195        4675
#>  9 Adelie  Torgersen           34.1          18.1               193        3475
#> 10 Adelie  Torgersen           42            20.2               190        4250
#> # ℹ more rows
#> # ℹ 2 more variables: sex <chr>, year <int>
tbl(con_duck, "read_parquet('penguins_arrow.parquet')")
#> # Source:   SQL [?? x 8]
#> # Database: DuckDB v1.0.0 [brian@Windows 10 x64:R 4.4.1/:memory:]
#>    species island    bill_length_mm bill_depth_mm flipper_length_mm body_mass_g
#>    <chr>   <chr>              <dbl>         <dbl>             <int>       <int>
#>  1 Adelie  Torgersen           39.1          18.7               181        3750
#>  2 Adelie  Torgersen           39.5          17.4               186        3800
#>  3 Adelie  Torgersen           40.3          18                 195        3250
#>  4 Adelie  Torgersen           NA            NA                  NA          NA
#>  5 Adelie  Torgersen           36.7          19.3               193        3450
#>  6 Adelie  Torgersen           39.3          20.6               190        3650
#>  7 Adelie  Torgersen           38.9          17.8               181        3625
#>  8 Adelie  Torgersen           39.2          19.6               195        4675
#>  9 Adelie  Torgersen           34.1          18.1               193        3475
#> 10 Adelie  Torgersen           42            20.2               190        4250
#> # ℹ more rows
#> # ℹ 2 more variables: sex <chr>, year <int>

# Another way: Doesn't keep the variable as a factor, also
dbExecute(con_duck,
          "CREATE VIEW penguins AS
           SELECT * FROM penguins.parquet")
#> [1] 0

tbl(con_duck, "penguins")
#> # Source:   table<penguins> [?? x 8]
#> # Database: DuckDB v1.0.0 [brian@Windows 10 x64:R 4.4.1/:memory:]
#>    species island    bill_length_mm bill_depth_mm flipper_length_mm body_mass_g
#>    <chr>   <chr>              <dbl>         <dbl>             <int>       <int>
#>  1 Adelie  Torgersen           39.1          18.7               181        3750
#>  2 Adelie  Torgersen           39.5          17.4               186        3800
#>  3 Adelie  Torgersen           40.3          18                 195        3250
#>  4 Adelie  Torgersen           NA            NA                  NA          NA
#>  5 Adelie  Torgersen           36.7          19.3               193        3450
#>  6 Adelie  Torgersen           39.3          20.6               190        3650
#>  7 Adelie  Torgersen           38.9          17.8               181        3625
#>  8 Adelie  Torgersen           39.2          19.6               195        4675
#>  9 Adelie  Torgersen           34.1          18.1               193        3475
#> 10 Adelie  Torgersen           42            20.2               190        4250
#> # ℹ more rows
#> # ℹ 2 more variables: sex <chr>, year <int>

dbExecute(con_duck,
          "CREATE VIEW penguins_arrow AS
           SELECT * FROM penguins_arrow.parquet")
#> [1] 0

tbl(con_duck, "penguins_arrow")
#> # Source:   table<penguins_arrow> [?? x 8]
#> # Database: DuckDB v1.0.0 [brian@Windows 10 x64:R 4.4.1/:memory:]
#>    species island    bill_length_mm bill_depth_mm flipper_length_mm body_mass_g
#>    <chr>   <chr>              <dbl>         <dbl>             <int>       <int>
#>  1 Adelie  Torgersen           39.1          18.7               181        3750
#>  2 Adelie  Torgersen           39.5          17.4               186        3800
#>  3 Adelie  Torgersen           40.3          18                 195        3250
#>  4 Adelie  Torgersen           NA            NA                  NA          NA
#>  5 Adelie  Torgersen           36.7          19.3               193        3450
#>  6 Adelie  Torgersen           39.3          20.6               190        3650
#>  7 Adelie  Torgersen           38.9          17.8               181        3625
#>  8 Adelie  Torgersen           39.2          19.6               195        4675
#>  9 Adelie  Torgersen           34.1          18.1               193        3475
#> 10 Adelie  Torgersen           42            20.2               190        4250
#> # ℹ more rows
#> # ℹ 2 more variables: sex <chr>, year <int>

# Another way: Doesn't keep the variable as a factor, also
dbExecute(con_duck, "COPY penguins TO 'penguins2.parquet'")
#> [1] 344
tbl(con_duck, "read_parquet('penguins2.parquet')")
#> # Source:   SQL [?? x 8]
#> # Database: DuckDB v1.0.0 [brian@Windows 10 x64:R 4.4.1/:memory:]
#>    species island    bill_length_mm bill_depth_mm flipper_length_mm body_mass_g
#>    <chr>   <chr>              <dbl>         <dbl>             <int>       <int>
#>  1 Adelie  Torgersen           39.1          18.7               181        3750
#>  2 Adelie  Torgersen           39.5          17.4               186        3800
#>  3 Adelie  Torgersen           40.3          18                 195        3250
#>  4 Adelie  Torgersen           NA            NA                  NA          NA
#>  5 Adelie  Torgersen           36.7          19.3               193        3450
#>  6 Adelie  Torgersen           39.3          20.6               190        3650
#>  7 Adelie  Torgersen           38.9          17.8               181        3625
#>  8 Adelie  Torgersen           39.2          19.6               195        4675
#>  9 Adelie  Torgersen           34.1          18.1               193        3475
#> 10 Adelie  Torgersen           42            20.2               190        4250
#> # ℹ more rows
#> # ℹ 2 more variables: sex <chr>, year <int>

dbExecute(con_duck, "COPY penguins TO 'penguins_arrow2.parquet'")
#> [1] 344
tbl(con_duck, "read_parquet('penguins_arrow2.parquet')")
#> # Source:   SQL [?? x 8]
#> # Database: DuckDB v1.0.0 [brian@Windows 10 x64:R 4.4.1/:memory:]
#>    species island    bill_length_mm bill_depth_mm flipper_length_mm body_mass_g
#>    <chr>   <chr>              <dbl>         <dbl>             <int>       <int>
#>  1 Adelie  Torgersen           39.1          18.7               181        3750
#>  2 Adelie  Torgersen           39.5          17.4               186        3800
#>  3 Adelie  Torgersen           40.3          18                 195        3250
#>  4 Adelie  Torgersen           NA            NA                  NA          NA
#>  5 Adelie  Torgersen           36.7          19.3               193        3450
#>  6 Adelie  Torgersen           39.3          20.6               190        3650
#>  7 Adelie  Torgersen           38.9          17.8               181        3625
#>  8 Adelie  Torgersen           39.2          19.6               195        4675
#>  9 Adelie  Torgersen           34.1          18.1               193        3475
#> 10 Adelie  Torgersen           42            20.2               190        4250
#> # ℹ more rows
#> # ℹ 2 more variables: sex <chr>, year <int>

dbDisconnect(con_duck)

Created on 2024-07-04 with reprex v2.1.0

krlmlr commented 3 days ago

Thanks, interesting. This means it's more difficult to create a reproducible example using duckdb only, and might be a problem with the R bindings. Need to investigate that.

eitsupi commented 3 days ago

Since there are no categorical types in Parquet, DuckDB, which follows the Parquet specification, cannot store or read them. See this issue duckdb/duckdb#3309.

brianmsm commented 3 days ago

Unfortunately it seems that this issue is now closed. One of the last comments indicates the possibility of using the ENUM type to work with categorical data/factors, but I understand that it is not implemented yet. Arrow stores the categorical data as dictionaries and maintains the information by saving the files in parquet format and bringing them back into session (transforming dictionary to factor).

library(dplyr, warn.conflicts = FALSE)
library(arrow)
#> 
#> Adjuntando el paquete: 'arrow'
#> The following object is masked from 'package:utils':
#> 
#>     timestamp

# Load data
data(penguins, package = "palmerpenguins")

# Write parquet
write_parquet(penguins, "penguins.parquet")

# Read 2 files keeping factor/dictionary type
read_parquet("penguins.parquet", as_data_frame = FALSE)
#> Table
#> 344 rows x 8 columns
#> $species <dictionary<values=string, indices=int32>>
#> $island <dictionary<values=string, indices=int32>>
#> $bill_length_mm <double>
#> $bill_depth_mm <double>
#> $flipper_length_mm <int32>
#> $body_mass_g <int32>
#> $sex <dictionary<values=string, indices=int32>>
#> $year <int32>
read_parquet("penguins.parquet")
#> # A tibble: 344 × 8
#>    species island    bill_length_mm bill_depth_mm flipper_length_mm body_mass_g
#>    <fct>   <fct>              <dbl>         <dbl>             <int>       <int>
#>  1 Adelie  Torgersen           39.1          18.7               181        3750
#>  2 Adelie  Torgersen           39.5          17.4               186        3800
#>  3 Adelie  Torgersen           40.3          18                 195        3250
#>  4 Adelie  Torgersen           NA            NA                  NA          NA
#>  5 Adelie  Torgersen           36.7          19.3               193        3450
#>  6 Adelie  Torgersen           39.3          20.6               190        3650
#>  7 Adelie  Torgersen           38.9          17.8               181        3625
#>  8 Adelie  Torgersen           39.2          19.6               195        4675
#>  9 Adelie  Torgersen           34.1          18.1               193        3475
#> 10 Adelie  Torgersen           42            20.2               190        4250
#> # ℹ 334 more rows
#> # ℹ 2 more variables: sex <fct>, year <int>

Created on 2024-07-04 with reprex v2.1.0

The same happens in python, where it transforms dictionary to categorical:

import polars as pl
import pyarrow.parquet as pq
pq.read_table("penguins.parquet")
#> pyarrow.Table
species: dictionary<values=string, indices=int32, ordered=0>
island: dictionary<values=string, indices=int32, ordered=0>
bill_length_mm: double
bill_depth_mm: double
flipper_length_mm: int32
body_mass_g: int32
sex: dictionary<values=string, indices=int32, ordered=0>
year: int32
----
species: [  -- dictionary:
["Adelie","Chinstrap","Gentoo"]  -- indices:
[0,0,0,0,0,...,1,1,1,1,1]]
island: [  -- dictionary:
["Biscoe","Dream","Torgersen"]  -- indices:
[2,2,2,2,2,...,1,1,1,1,1]]
bill_length_mm: [[39.1,39.5,40.3,null,36.7,...,55.8,43.5,49.6,50.8,50.2]]
bill_depth_mm: [[18.7,17.4,18,null,19.3,...,19.8,18.1,18.2,19,18.7]]
flipper_length_mm: [[181,186,195,null,193,...,207,202,193,210,198]]
body_mass_g: [[3750,3800,3250,null,3450,...,4000,3400,3775,4100,3775]]
sex: [  -- dictionary:
["female","male"]  -- indices:
[1,0,0,null,0,...,1,0,1,1,0]]
year: [[2007,2007,2007,2007,2007,...,2009,2009,2009,2009,2009]]
pl.read_parquet('penguins.parquet')
#> shape: (344, 8)
┌───────────┬───────────┬────────────────┬───────────────┬───────────────────┬─────────────┬────────┬──────┐
│ species   ┆ island    ┆ bill_length_mm ┆ bill_depth_mm ┆ flipper_length_mm ┆ body_mass_g ┆ sex    ┆ year │
│ ---       ┆ ---       ┆ ---            ┆ ---           ┆ ---               ┆ ---         ┆ ---    ┆ ---  │
│ cat       ┆ cat       ┆ f64            ┆ f64           ┆ i32               ┆ i32         ┆ cat    ┆ i32  │
╞═══════════╪═══════════╪════════════════╪═══════════════╪═══════════════════╪═════════════╪════════╪══════╡
│ Adelie    ┆ Torgersen ┆ 39.1           ┆ 18.7          ┆ 181               ┆ 3750        ┆ male   ┆ 2007 │
│ Adelie    ┆ Torgersen ┆ 39.5           ┆ 17.4          ┆ 186               ┆ 3800        ┆ female ┆ 2007 │
│ Adelie    ┆ Torgersen ┆ 40.3           ┆ 18.0          ┆ 195               ┆ 3250        ┆ female ┆ 2007 │
│ Adelie    ┆ Torgersen ┆ null           ┆ null          ┆ null              ┆ null        ┆ null   ┆ 2007 │
│ Adelie    ┆ Torgersen ┆ 36.7           ┆ 19.3          ┆ 193               ┆ 3450        ┆ female ┆ 2007 │
│ …         ┆ …         ┆ …              ┆ …             ┆ …                 ┆ …           ┆ …      ┆ …    │
│ Chinstrap ┆ Dream     ┆ 55.8           ┆ 19.8          ┆ 207               ┆ 4000        ┆ male   ┆ 2009 │
│ Chinstrap ┆ Dream     ┆ 43.5           ┆ 18.1          ┆ 202               ┆ 3400        ┆ female ┆ 2009 │
│ Chinstrap ┆ Dream     ┆ 49.6           ┆ 18.2          ┆ 193               ┆ 3775        ┆ male   ┆ 2009 │
│ Chinstrap ┆ Dream     ┆ 50.8           ┆ 19.0          ┆ 210               ┆ 4100        ┆ male   ┆ 2009 │
│ Chinstrap ┆ Dream     ┆ 50.2           ┆ 18.7          ┆ 198               ┆ 3775        ┆ female ┆ 2009 │
└───────────┴───────────┴────────────────┴───────────────┴───────────────────┴─────────────┴────────┴──────┘

Certainly, you could have as a workaround to import the data first from arrow... however, when you want to move to duckdb, you would lose this information in the same way (duckdb_register_arrow()). In this particular case, it makes it difficult to take advantage of the Zero-copy Data Integration.

library(dplyr, warn.conflicts = FALSE)
library(duckdb)
#> Cargando paquete requerido: DBI
library(arrow)
#> 
#> Adjuntando el paquete: 'arrow'
#> The following object is masked from 'package:utils':
#> 
#>     timestamp

# Load data
data(penguins, package = "palmerpenguins")
write_parquet(penguins, "penguins.parquet")
# Settings duck connection
con_duck <- DBI::dbConnect(duckdb::duckdb(), ":memory:")
copy_to(con_duck, penguins, "penguins")

# Keep factor variable
tbl(con_duck, "penguins")
#> # Source:   table<penguins> [?? x 8]
#> # Database: DuckDB v1.0.0 [brian@Windows 10 x64:R 4.4.1/:memory:]
#>    species island    bill_length_mm bill_depth_mm flipper_length_mm body_mass_g
#>    <fct>   <fct>              <dbl>         <dbl>             <int>       <int>
#>  1 Adelie  Torgersen           39.1          18.7               181        3750
#>  2 Adelie  Torgersen           39.5          17.4               186        3800
#>  3 Adelie  Torgersen           40.3          18                 195        3250
#>  4 Adelie  Torgersen           NA            NA                  NA          NA
#>  5 Adelie  Torgersen           36.7          19.3               193        3450
#>  6 Adelie  Torgersen           39.3          20.6               190        3650
#>  7 Adelie  Torgersen           38.9          17.8               181        3625
#>  8 Adelie  Torgersen           39.2          19.6               195        4675
#>  9 Adelie  Torgersen           34.1          18.1               193        3475
#> 10 Adelie  Torgersen           42            20.2               190        4250
#> # ℹ more rows
#> # ℹ 2 more variables: sex <fct>, year <int>

# Try Zero-Copy Data Transfer: Keep dictionary settings
tbl(con_duck, "penguins") %>% 
  to_arrow()
#> RecordBatchReader
#> 8 columns
#> species: dictionary<values=string, indices=uint8>
#> island: dictionary<values=string, indices=uint8>
#> bill_length_mm: double
#> bill_depth_mm: double
#> flipper_length_mm: int32
#> body_mass_g: int32
#> sex: dictionary<values=string, indices=uint8>
#> year: int32

# Try Zero-Copy Data Transfer: Doesn't keep the variable as a factor
tbl(con_duck, "penguins") %>% 
  to_arrow() %>% 
  # Intermediate operations
  compute() %>% 
  to_duckdb()
#> # Source:   table<arrow_001> [?? x 8]
#> # Database: DuckDB v1.0.0 [brian@Windows 10 x64:R 4.4.1/:memory:]
#>    species island    bill_length_mm bill_depth_mm flipper_length_mm body_mass_g
#>    <chr>   <chr>              <dbl>         <dbl>             <int>       <int>
#>  1 Adelie  Torgersen           39.1          18.7               181        3750
#>  2 Adelie  Torgersen           39.5          17.4               186        3800
#>  3 Adelie  Torgersen           40.3          18                 195        3250
#>  4 Adelie  Torgersen           NA            NA                  NA          NA
#>  5 Adelie  Torgersen           36.7          19.3               193        3450
#>  6 Adelie  Torgersen           39.3          20.6               190        3650
#>  7 Adelie  Torgersen           38.9          17.8               181        3625
#>  8 Adelie  Torgersen           39.2          19.6               195        4675
#>  9 Adelie  Torgersen           34.1          18.1               193        3475
#> 10 Adelie  Torgersen           42            20.2               190        4250
#> # ℹ more rows
#> # ℹ 2 more variables: sex <chr>, year <int>

# Try Zero-Copy Data Transfer: Doesn't keep the variable as a factor
read_parquet("penguins.parquet", as_data_frame = FALSE) %>% 
  to_duckdb()
#> # Source:   table<arrow_002> [?? x 8]
#> # Database: DuckDB v1.0.0 [brian@Windows 10 x64:R 4.4.1/:memory:]
#>    species island    bill_length_mm bill_depth_mm flipper_length_mm body_mass_g
#>    <chr>   <chr>              <dbl>         <dbl>             <int>       <int>
#>  1 Adelie  Torgersen           39.1          18.7               181        3750
#>  2 Adelie  Torgersen           39.5          17.4               186        3800
#>  3 Adelie  Torgersen           40.3          18                 195        3250
#>  4 Adelie  Torgersen           NA            NA                  NA          NA
#>  5 Adelie  Torgersen           36.7          19.3               193        3450
#>  6 Adelie  Torgersen           39.3          20.6               190        3650
#>  7 Adelie  Torgersen           38.9          17.8               181        3625
#>  8 Adelie  Torgersen           39.2          19.6               195        4675
#>  9 Adelie  Torgersen           34.1          18.1               193        3475
#> 10 Adelie  Torgersen           42            20.2               190        4250
#> # ℹ more rows
#> # ℹ 2 more variables: sex <chr>, year <int>

Created on 2024-07-04 with reprex v2.1.0