annakrystalli opened 1 year ago
Feedback from seeking help on the rOpenSci Slack showed that, in simplified cases, casting using a schema and then filtering mixed data based on the schema data type works fine. I've been trying to work up the complexity ladder from the simplified example that works to the breaking behaviour described above.
So far no joy: I cannot reproduce the errors encountered in my flusight hub example.
Here's what I've tried so far:
library(arrow)
#>
#> Attaching package: 'arrow'
#> The following object is masked from 'package:utils':
#>
#> timestamp
library(dplyr)
#>
#> Attaching package: 'dplyr'
#> The following objects are masked from 'package:stats':
#>
#> filter, lag
#> The following objects are masked from 'package:base':
#>
#> intersect, setdiff, setequal, union
tmpdir <- tempdir()
data.frame(a = 1:10) |>
write_csv_arrow(sink = file.path(tmpdir, "int.csv"))
data.frame(a = letters[1:10]) |>
write_csv_arrow(sink = file.path(tmpdir, "alpha.csv"))
ds1 <- open_dataset(file.path(tmpdir, "int.csv"),
format = "csv", schema = schema(a = string()))
ds2 <- open_dataset(file.path(tmpdir, "alpha.csv"),
format = "csv", schema = schema(a = string()))
open_dataset(list(ds1, ds2)) |> filter(a == "1") |>
collect()
#> # A tibble: 1 × 1
#> a
#> <chr>
#> 1 1
# Add NAs and parquet files
data.frame(a = c(NA, 1:10)) |>
write_csv_arrow(sink = file.path(tmpdir, "int.csv"))
data.frame(a = c(NA, NA, seq(0, 1, by = 0.1))) |>
write.csv(file.path(tmpdir, "dbl.csv"), row.names = FALSE)
data.frame(a = c(NA, NA, letters[1:10])) |>
write_parquet(sink = file.path(tmpdir, "alpha.parquet"))
data.frame(a = c(NA, NA, letters[1:10])) |>
write.csv(file.path(tmpdir, "alpha.csv"), row.names = FALSE)
ds1 <- open_dataset(file.path(tmpdir, "int.csv"),
format = "csv", schema = schema(a = string()))
ds2 <- open_dataset(file.path(tmpdir, "alpha.parquet"),
format = "parquet", schema = schema(a = string()))
ds3 <- open_dataset(file.path(tmpdir, "dbl.csv"),
format = "csv", schema = schema(a = string()))
ds4 <- open_dataset(file.path(tmpdir, "alpha.csv"),
format = "csv", schema = schema(a = string()))
open_dataset(list(ds1, ds2)) |> filter(a == "1") |>
collect()
#> # A tibble: 1 × 1
#> a
#> <chr>
#> 1 1
open_dataset(list(ds1, ds2, ds3, ds4)) |> filter(a == "1") |>
collect()
#> # A tibble: 2 × 1
#> a
#> <chr>
#> 1 1
#> 2 1
open_dataset(list(ds1, ds2, ds3, ds4)) |> filter(a == "0.5") |>
collect()
#> # A tibble: 1 × 1
#> a
#> <chr>
#> 1 0.5
# Use a folder instead of file names and exclude invalid files
csv <- open_dataset(tmpdir,
format = "csv", col_types = schema(a = string()),
strings_can_be_null = TRUE,
unify_schemas = TRUE,
factory_options = list(exclude_invalid_files = TRUE))
parquet <- open_dataset(tmpdir,
format = "parquet", schema = schema(a = string()),
unify_schemas = TRUE,
factory_options = list(exclude_invalid_files = TRUE))
open_dataset(list(csv, parquet)) |> filter(a == "0.5") |>
collect()
#> # A tibble: 1 × 1
#> a
#> <chr>
#> 1 0.5
open_dataset(list(csv, parquet)) |> filter(!is.na(a)) |>
collect()
#> # A tibble: 41 × 1
#> a
#> <chr>
#> 1 a
#> 2 b
#> 3 c
#> 4 d
#> 5 e
#> 6 f
#> 7 g
#> 8 h
#> 9 i
#> 10 j
#> # ℹ 31 more rows
# Add folder partitioning
library(fs)
tmpdir <- tempdir()
dir_create(tmpdir, "partition")
tmpdir <- path(tmpdir, "partition")
dir_create(tmpdir, "csv")
dir_create(tmpdir, "parquet")
data.frame(a = c(NA, 1:10)) |>
write_csv_arrow(sink = file.path(tmpdir, "csv", "int.csv"))
data.frame(a = c(NA, NA, seq(0, 1, by = 0.1))) |>
write.csv(file.path(tmpdir, "csv", "dbl.csv"), row.names = FALSE)
data.frame(a = c(NA, NA, letters[1:10])) |>
write.csv(file.path(tmpdir, "csv", "alpha.csv"), row.names = FALSE)
data.frame(a = c(NA, NA, letters[1:10])) |>
write_parquet(sink = file.path(tmpdir, "parquet", "alpha.parquet"))
data.frame(a = c(NA, 1:10)) |>
write_parquet(sink = file.path(tmpdir, "parquet", "int.parquet"))
data.frame(a = c(NA, NA, seq(0, 1, by = 0.1))) |>
write_parquet(file.path(tmpdir, "parquet", "dbl.parquet"))
csv <- open_dataset(tmpdir,
format = "csv", col_types = schema(a = string()),
strings_can_be_null = TRUE,
unify_schemas = TRUE,
factory_options = list(exclude_invalid_files = TRUE),
partitioning = "model")
parquet <- open_dataset(tmpdir,
format = "parquet",
schema = schema(a = string(),
model = string()),
unify_schemas = TRUE,
factory_options = list(exclude_invalid_files = TRUE),
partitioning = "model")
open_dataset(list(csv, parquet)) |> filter(a == "0.5") |>
collect()
#> # A tibble: 2 × 2
#> a model
#> <chr> <chr>
#> 1 0.5 csv
#> 2 0.5 parquet
open_dataset(list(csv, parquet)) |> filter(model == "csv", !is.na(a)) |>
collect()
#> # A tibble: 31 × 2
#> a model
#> <chr> <chr>
#> 1 0 csv
#> 2 0.1 csv
#> 3 0.2 csv
#> 4 0.3 csv
#> 5 0.4 csv
#> 6 0.5 csv
#> 7 0.6 csv
#> 8 0.7 csv
#> 9 0.8 csv
#> 10 0.9 csv
#> # ℹ 21 more rows
open_dataset(list(csv, parquet)) |> filter(model == "parquet", !is.na(a)) |>
collect()
#> # A tibble: 31 × 2
#> a model
#> <chr> <chr>
#> 1 0 parquet
#> 2 0.1 parquet
#> 3 0.2 parquet
#> 4 0.30000000000000004 parquet
#> 5 0.4 parquet
#> 6 0.5 parquet
#> 7 0.6000000000000001 parquet
#> 8 0.7000000000000001 parquet
#> 9 0.8 parquet
#> 10 0.9 parquet
#> # ℹ 21 more rows
open_dataset(list(csv, parquet)) |> filter(model == "csv", a == "0.5") |>
collect()
#> # A tibble: 1 × 2
#> a model
#> <chr> <chr>
#> 1 0.5 csv
open_dataset(list(csv, parquet)) |> filter(model == "parquet", a == "0.5") |>
collect()
#> # A tibble: 1 × 2
#> a model
#> <chr> <chr>
#> 1 0.5 parquet
# Additional note:
# In contrast to casting using a schema when opening the dataset,
# mutating/casting to a different data type within a query does work
# when done just prior to collecting the data
open_dataset(list(csv, parquet)) |>
filter(!a %in% letters) |>
mutate(a = as.double(a)) |>
collect()
#> # A tibble: 51 × 2
#> a model
#> <dbl> <chr>
#> 1 NA csv
#> 2 NA csv
#> 3 NA parquet
#> 4 NA parquet
#> 5 1 csv
#> 6 2 csv
#> 7 3 csv
#> 8 4 csv
#> 9 5 csv
#> 10 6 csv
#> # ℹ 41 more rows
# but it does not work when trying to filter immediately afterwards,
# even though the first filter clause ensures that all remaining data
# can be safely converted to double. It seems the second filter is
# being applied to rows that should have already been excluded by the
# first filter (a possible workaround sketch follows the error below).
open_dataset(list(csv, parquet)) |>
filter(!a %in% letters) |>
mutate(a = as.double(a)) |>
filter(a == 0.5) |>
collect()
#> Error in `compute.arrow_dplyr_query()`:
#> ! Invalid: Failed to parse string: 'j' as a scalar of type double
#> Backtrace:
#> ▆
#> 1. ├─dplyr::collect(...)
#> 2. └─arrow:::collect.arrow_dplyr_query(...)
#> 3. └─arrow:::compute.arrow_dplyr_query(x)
#> 4. └─base::tryCatch(...)
#> 5. └─base (local) tryCatchList(expr, classes, parentenv, handlers)
#> 6. └─base (local) tryCatchOne(expr, names, parentenv, handlers[[1L]])
#> 7. └─value[[3L]](cond)
#> 8. └─arrow:::augment_io_error_msg(e, call, schema = schema())
#> 9. └─rlang::abort(msg, call = call)
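# A possible workaround (a sketch only, not verified against the
# original flusight hub data): materialise the first filter with
# compute() so the cast and the second filter run on an in-memory
# Arrow Table rather than being pushed down to the scan, where the
# two filters appear to be reordered.
open_dataset(list(csv, parquet)) |>
  filter(!a %in% letters) |>
  compute() |>
  mutate(a = as.double(a)) |>
  filter(a == 0.5) |>
  collect()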
# Use unify_schemas = FALSE when opening individual datasets
csv <- open_dataset(tmpdir,
format = "csv", col_types = schema(a = string()),
strings_can_be_null = TRUE,
unify_schemas = FALSE,
factory_options = list(exclude_invalid_files = TRUE),
partitioning = "model")
parquet <- open_dataset(tmpdir,
format = "parquet",
schema = schema(a = string(),
model = string()),
unify_schemas = FALSE,
factory_options = list(exclude_invalid_files = TRUE),
partitioning = "model")
open_dataset(list(csv, parquet)) |> filter(a == "0.5") |>
collect()
#> # A tibble: 2 × 2
#> a model
#> <chr> <chr>
#> 1 0.5 csv
#> 2 0.5 parquet
open_dataset(list(csv, parquet)) |> filter(model == "csv", !is.na(a)) |>
collect()
#> # A tibble: 31 × 2
#> a model
#> <chr> <chr>
#> 1 0 csv
#> 2 0.1 csv
#> 3 0.2 csv
#> 4 0.3 csv
#> 5 0.4 csv
#> 6 0.5 csv
#> 7 0.6 csv
#> 8 0.7 csv
#> 9 0.8 csv
#> 10 0.9 csv
#> # ℹ 21 more rows
open_dataset(list(csv, parquet)) |> filter(model == "parquet", !is.na(a)) |>
collect()
#> # A tibble: 31 × 2
#> a model
#> <chr> <chr>
#> 1 a parquet
#> 2 b parquet
#> 3 c parquet
#> 4 d parquet
#> 5 e parquet
#> 6 f parquet
#> 7 g parquet
#> 8 h parquet
#> 9 i parquet
#> 10 j parquet
#> # ℹ 21 more rows
open_dataset(list(csv, parquet)) |> filter(model == "csv", a == "0.5") |>
collect()
#> # A tibble: 1 × 2
#> a model
#> <chr> <chr>
#> 1 0.5 csv
open_dataset(list(csv, parquet)) |> filter(model == "parquet", a == "0.5") |>
collect()
#> # A tibble: 1 × 2
#> a model
#> <chr> <chr>
#> 1 0.5 parquet
# Use extra column in one parquet file
data.frame(a = c(NA, NA, letters[1:10]), b = "extra-parquet-column") |>
write_parquet(sink = file.path(tmpdir, "parquet", "alpha.parquet"))
csv <- open_dataset(tmpdir,
format = "csv", col_types = schema(a = string()),
strings_can_be_null = TRUE,
unify_schemas = FALSE,
factory_options = list(exclude_invalid_files = TRUE),
partitioning = "model")
parquet <- open_dataset(tmpdir,
format = "parquet",
schema = schema(a = string(),
b = string(),
model = string()),
unify_schemas = FALSE,
factory_options = list(exclude_invalid_files = TRUE),
partitioning = "model")
open_dataset(list(csv, parquet)) |> filter(a == "0.5") |>
collect()
#> # A tibble: 2 × 3
#> a model b
#> <chr> <chr> <chr>
#> 1 0.5 csv <NA>
#> 2 0.5 parquet <NA>
open_dataset(list(csv, parquet)) |> filter(model == "csv", !is.na(a)) |>
collect()
#> # A tibble: 31 × 3
#> a model b
#> <chr> <chr> <chr>
#> 1 0 csv <NA>
#> 2 0.1 csv <NA>
#> 3 0.2 csv <NA>
#> 4 0.3 csv <NA>
#> 5 0.4 csv <NA>
#> 6 0.5 csv <NA>
#> 7 0.6 csv <NA>
#> 8 0.7 csv <NA>
#> 9 0.8 csv <NA>
#> 10 0.9 csv <NA>
#> # ℹ 21 more rows
open_dataset(list(csv, parquet)) |> filter(model == "parquet", !is.na(a)) |>
collect()
#> # A tibble: 31 × 3
#> a model b
#> <chr> <chr> <chr>
#> 1 0 parquet <NA>
#> 2 0.1 parquet <NA>
#> 3 0.2 parquet <NA>
#> 4 0.30000000000000004 parquet <NA>
#> 5 0.4 parquet <NA>
#> 6 0.5 parquet <NA>
#> 7 0.6000000000000001 parquet <NA>
#> 8 0.7000000000000001 parquet <NA>
#> 9 0.8 parquet <NA>
#> 10 0.9 parquet <NA>
#> # ℹ 21 more rows
open_dataset(list(csv, parquet)) |> filter(model == "csv", a == "0.5") |>
collect()
#> # A tibble: 1 × 3
#> a model b
#> <chr> <chr> <chr>
#> 1 0.5 csv <NA>
open_dataset(list(csv, parquet)) |> filter(model == "parquet", a == "0.5") |>
collect()
#> # A tibble: 1 × 3
#> a model b
#> <chr> <chr> <chr>
#> 1 0.5 parquet <NA>
# Use extra column in one csv file
data.frame(a = c(NA, NA, letters[1:10])) |>
write_parquet(sink = file.path(tmpdir, "parquet", "alpha.parquet"))
data.frame(a = c(NA, NA, letters[1:10]),
b = "extra-csv-column") |>
write.csv(file.path(tmpdir, "csv", "alpha.csv"), row.names = FALSE)
csv <- open_dataset(tmpdir,
format = "csv",
col_types = schema(a = string(),
b = string()),
strings_can_be_null = TRUE,
unify_schemas = FALSE,
factory_options = list(exclude_invalid_files = TRUE),
partitioning = "model")
parquet <- open_dataset(tmpdir,
format = "parquet",
schema = schema(a = string(),
b = string(),
model = string()),
unify_schemas = FALSE,
factory_options = list(exclude_invalid_files = TRUE),
partitioning = "model")
open_dataset(list(csv, parquet)) |> filter(a == "0.5") |>
collect()
#> # A tibble: 2 × 3
#> a b model
#> <chr> <chr> <chr>
#> 1 0.5 <NA> csv
#> 2 0.5 <NA> parquet
open_dataset(list(csv, parquet)) |> filter(model == "csv", !is.na(a)) |>
collect()
#> # A tibble: 31 × 3
#> a b model
#> <chr> <chr> <chr>
#> 1 0 <NA> csv
#> 2 0.1 <NA> csv
#> 3 0.2 <NA> csv
#> 4 0.3 <NA> csv
#> 5 0.4 <NA> csv
#> 6 0.5 <NA> csv
#> 7 0.6 <NA> csv
#> 8 0.7 <NA> csv
#> 9 0.8 <NA> csv
#> 10 0.9 <NA> csv
#> # ℹ 21 more rows
open_dataset(list(csv, parquet)) |> filter(model == "parquet", !is.na(a)) |>
collect()
#> # A tibble: 31 × 3
#> a b model
#> <chr> <chr> <chr>
#> 1 a <NA> parquet
#> 2 b <NA> parquet
#> 3 c <NA> parquet
#> 4 d <NA> parquet
#> 5 e <NA> parquet
#> 6 f <NA> parquet
#> 7 g <NA> parquet
#> 8 h <NA> parquet
#> 9 i <NA> parquet
#> 10 j <NA> parquet
#> # ℹ 21 more rows
open_dataset(list(csv, parquet)) |> filter(model == "csv", a == "0.5") |>
collect()
#> # A tibble: 1 × 3
#> a b model
#> <chr> <chr> <chr>
#> 1 0.5 <NA> csv
open_dataset(list(csv, parquet)) |> filter(model == "parquet", a == "0.5") |>
collect()
#> # A tibble: 1 × 3
#> a b model
#> <chr> <chr> <chr>
#> 1 0.5 <NA> parquet
# Use extra column in one parquet and one csv file
data.frame(a = c(NA, NA, letters[1:10]),
b = "extra-csv-column") |>
write.csv(file.path(tmpdir, "csv", "alpha.csv"), row.names = FALSE)
data.frame(a = c(NA, NA, letters[1:10]), b = "extra-parquet-column") |>
write_parquet(sink = file.path(tmpdir, "parquet", "alpha.parquet"))
csv <- open_dataset(tmpdir,
format = "csv",
col_types = schema(a = string(),
b = string()),
strings_can_be_null = TRUE,
unify_schemas = FALSE,
factory_options = list(exclude_invalid_files = TRUE),
partitioning = "model")
parquet <- open_dataset(tmpdir,
format = "parquet",
schema = schema(a = string(),
b = string(),
model = string()),
unify_schemas = FALSE,
factory_options = list(exclude_invalid_files = TRUE),
partitioning = "model")
open_dataset(list(csv, parquet)) |> filter(a == "0.5") |>
collect()
#> # A tibble: 2 × 3
#> a b model
#> <chr> <chr> <chr>
#> 1 0.5 <NA> csv
#> 2 0.5 <NA> parquet
open_dataset(list(csv, parquet)) |> filter(model == "csv", is.na(a)) |>
collect()
#> # A tibble: 4 × 3
#> a b model
#> <chr> <chr> <chr>
#> 1 <NA> extra-csv-column csv
#> 2 <NA> extra-csv-column csv
#> 3 <NA> <NA> csv
#> 4 <NA> <NA> csv
open_dataset(list(csv, parquet)) |> filter(model == "parquet", is.na(a)) |>
collect()
#> # A tibble: 5 × 3
#> a b model
#> <chr> <chr> <chr>
#> 1 <NA> extra-parquet-column parquet
#> 2 <NA> extra-parquet-column parquet
#> 3 <NA> <NA> parquet
#> 4 <NA> <NA> parquet
#> 5 <NA> <NA> parquet
open_dataset(list(csv, parquet)) |> filter(model == "csv", a == "0.5") |>
collect()
#> # A tibble: 1 × 3
#> a b model
#> <chr> <chr> <chr>
#> 1 0.5 <NA> csv
open_dataset(list(csv, parquet)) |> filter(model == "parquet", a == "0.5") |>
collect()
#> # A tibble: 1 × 3
#> a b model
#> <chr> <chr> <chr>
#> 1 0.5 <NA> parquet
# Use extra column in one parquet and one csv file + unify_schemas = TRUE
csv <- open_dataset(tmpdir,
format = "csv",
col_types = schema(a = string(),
b = string()),
strings_can_be_null = TRUE,
unify_schemas = TRUE,
factory_options = list(exclude_invalid_files = TRUE),
partitioning = "model")
parquet <- open_dataset(tmpdir,
format = "parquet",
schema = schema(a = string(),
b = string(),
model = string()),
unify_schemas = TRUE,
factory_options = list(exclude_invalid_files = TRUE),
partitioning = "model")
open_dataset(list(csv, parquet)) |> filter(a == "0.5") |>
collect()
#> # A tibble: 2 × 3
#> a b model
#> <chr> <chr> <chr>
#> 1 0.5 <NA> csv
#> 2 0.5 <NA> parquet
open_dataset(list(csv, parquet)) |> filter(model == "csv", is.na(a)) |>
collect()
#> # A tibble: 4 × 3
#> a b model
#> <chr> <chr> <chr>
#> 1 <NA> extra-csv-column csv
#> 2 <NA> extra-csv-column csv
#> 3 <NA> <NA> csv
#> 4 <NA> <NA> csv
open_dataset(list(csv, parquet)) |> filter(model == "parquet", is.na(a)) |>
collect()
#> # A tibble: 5 × 3
#> a b model
#> <chr> <chr> <chr>
#> 1 <NA> extra-parquet-column parquet
#> 2 <NA> extra-parquet-column parquet
#> 3 <NA> <NA> parquet
#> 4 <NA> <NA> parquet
#> 5 <NA> <NA> parquet
open_dataset(list(csv, parquet)) |> filter(model == "csv", a == "0.5") |>
collect()
#> # A tibble: 1 × 3
#> a b model
#> <chr> <chr> <chr>
#> 1 0.5 <NA> csv
open_dataset(list(csv, parquet)) |> filter(model == "parquet", a == "0.5") |>
collect()
#> # A tibble: 1 × 3
#> a b model
#> <chr> <chr> <chr>
#> 1 0.5 <NA> parquet
Created on 2023-06-06 with reprex v2.0.2
I'm having issues with filtering data from arrow Datasets on columns where the data type in the underlying source data files is mixed. In particular, the `type_id` column contains double data in some files and character data in others. I've written a reprex to seek outside advice on this. The reprex contains stripped-down source code of functions used within `connect_hub()` but overall describes the workflow accurately. Data used in this example can be reviewed here (in the `forecasts/` directory): https://github.com/Infectious-Disease-Modeling-Hubs/hubUtils/tree/schema-from-config/inst/testhubs/flusight
Created on 2023-05-17 with reprex v2.0.2
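In stripped-down form, the workflow in that reprex is roughly the following (a sketch only: the real `connect_hub()` builds the schema from the hub config, and the path and column below are illustrative):

library(arrow)
library(dplyr)

hub_schema <- schema(type_id = string())

# open the csv and parquet model-output files with a common schema,
# then combine them into a single dataset and query it
csv <- open_dataset("forecasts", format = "csv",
                    col_types = hub_schema,
                    factory_options = list(exclude_invalid_files = TRUE))
parquet <- open_dataset("forecasts", format = "parquet",
                        schema = hub_schema,
                        factory_options = list(exclude_invalid_files = TRUE))
open_dataset(list(csv, parquet)) |>
  filter(type_id == "0.5") |>
  collect()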
I'm trying to understand the purpose of providing a schema to the `arrow::open_dataset()` function. It appears that it works on the way out, i.e. it converts the data to the data type specified in the schema when extracting data, but does not appear to have any effect on the way in, i.e. when sending `dplyr` queries back to the dataset. Is there a way to make queries aware of the transformations specified in the schema?
If not, are there any plans to do so in `arrow`? I do appreciate that taking the schema into account when sending queries is a lot more work than just using it when extracting data, but it feels like a limitation that hampers the package's utility and the purpose of the schema.
To get around it in our own use case we would have to enforce certain data types when writing the data, which is something we would ideally like to avoid (a minimal sketch of that workaround is below). Any advice is greatly welcomed!
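For concreteness, a minimal sketch of what enforcing the type at write time would look like (illustrative column name and temporary path only, not our real pipeline):

library(arrow)
library(dplyr)

# coerce type_id to character before writing, so every file shares a
# single on-disk type and no schema cast is needed when reading
data.frame(type_id = c(0.5, 0.9)) |>
  mutate(type_id = as.character(type_id)) |>
  write_parquet(file.path(tempdir(), "model-output.parquet"))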
P.S. This behaviour is not a result of mixing file formats, as it's also a problem when opening, for example, only csv files: the dataset has files with different `type_id` data types across different files (see the illustration below).
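To illustrate what "mixed `type_id` data types across files" means, the source data contains files along these lines (hypothetical minimal files; the real data is linked above):

library(arrow)
tmp <- tempdir()
# one file where type_id holds doubles (e.g. quantile levels)...
data.frame(type_id = c(0.5, 0.9)) |>
  write_csv_arrow(file.path(tmp, "quantile.csv"))
# ...and another where it holds characters (e.g. category labels)
data.frame(type_id = c("large_increase", "large_decrease")) |>
  write_csv_arrow(file.path(tmp, "pmf.csv"))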