Open cole-brokamp opened 4 months ago
I've been working through this example, but I'm getting stuck: I'm not sure what to do with the list of attributes after removing duplicated fields. How do I re-assign this to an fr_tdr
as a schema to use as a template?
library(tidyverse)
library(fr)
t1 <- tibble::tibble(id = c("a", "b", "c"),
value = c(1, 2, 3))
t2 <- tibble::tibble(id = c("a", "b", "c"),
symbol = c(".", ",", "!"))
f1 <-
as_fr_tdr(t1, name = "tibble 1") |>
update_field("id", title = "ID", description = "identifier") |>
update_field("value", title = "Value", description = "value")
f2 <-
as_fr_tdr(t2, name = "tibble 2") |>
update_field("id", title = "ID", description = "identifier") |>
update_field("symbol", title = "Symbol", description = "symbol")
# left_join(f1, f2, by = "id")
# Collect the field definitions of every tdr into one flat list.
f_list <- list(f1, f2)
schema_list <-
  f_list |>
  purrr::map(as_list) |>
  purrr::map("schema") |>
  purrr::map("fields") |>
  flatten()
# Keep only the first occurrence of each field definition; logical
# subsetting (rather than unique()) preserves any names on the list.
schema_list <- schema_list[!duplicated(schema_list)]
Your example made me realize that we could use the .template
option to recover the metadata from the fields in the second tdr that we are joining in:
library(fr)
f1 <- tibble::tibble(
id = c("a", "b", "c"),
value = c(1, 2, 3)
) |>
as_fr_tdr(name = "tibble 1") |>
update_field("id", title = "ID", description = "identifier") |>
update_field("value", title = "Value", description = "value")
f2 <- tibble::tibble(
id = c("a", "b", "c"),
symbol = c(".", ",", "!")
) |>
as_fr_tdr(name = "tibble 2") |>
update_field("id", title = "ID", description = "identifier") |>
update_field("symbol", title = "Symbol", description = "symbol")
f3 <- tibble::tibble(
id = c("a", "b", "c"),
length = c(4, 5, 6)
) |>
as_fr_tdr(name = "tibble 3") |>
update_field("id", title = "ID", description = "identifier") |>
update_field("length", title = "Length", description = "the length")
out <-
purrr::reduce(
list(f1, f2, f3),
\(x, y) as_fr_tdr(dplyr::left_join(x, y, by = "id"), .template = y))
out@schema
Using the .template
option ensures that any fields with missing metadata in x are pulled from y. However, the TDR-specific metadata is always overwritten by that of the .template TDR, so the final TDR ends up with the metadata from the last TDR in the list. In practice, the TDR-specific metadata for merged TDRs would likely have to be set by the user at the end of a script anyway, like we are doing in hh_acs_data.
side note, I just tried to use reprex
for the first time and it worked great!
library(fr)
f1 <- tibble::tibble(
id = c("a", "b", "c"),
value = c(1, 2, 3)
) |>
as_fr_tdr(name = "tibble 1") |>
update_field("id", title = "ID", description = "identifier") |>
update_field("value", title = "Value", description = "value")
f2 <- tibble::tibble(
id = c("a", "b", "c"),
symbol = c(".", ",", "!")
) |>
as_fr_tdr(name = "tibble 2") |>
update_field("id", title = "ID", description = "identifier") |>
update_field("symbol", title = "Symbol", description = "symbol")
f3 <- tibble::tibble(
id = c("a", "b", "c"),
length = c(4, 5, 6)
) |>
as_fr_tdr(name = "tibble 3") |>
update_field("id", title = "ID", description = "identifier") |>
update_field("length", title = "Length", description = "the length")
out <-
purrr::reduce(
list(f1, f2, f3),
\(x, y) as_fr_tdr(dplyr::left_join(x, y, by = "id"), .template = y))
out
#> tibble 3
#> # A tibble: 3 × 4
#> id value symbol length
#> <chr> <dbl> <chr> <dbl>
#> 1 a 1 . 4
#> 2 b 2 , 5
#> 3 c 3 ! 6
out@schema
#> id
#> - type: string
#> - title: ID
#> - description: identifier
#> value
#> - type: number
#> symbol
#> - type: string
#> length
#> - type: number
#> - title: Length
#> - description: the length
Created on 2024-06-27 with reprex v2.1.0
Something fishy is still going on, because the intermediate fields are missing the title
and description
schema field properties. It works in the case of two TDRs, but something is going wrong during the reduce...
Oh, thanks! I've never used reprex
before.. super nice.
I lose title
and description
for value
even just using left_join
on 2 TDRs.
library(fr)
f1 <- tibble::tibble(
id = c("a", "b", "c"),
value = c(1, 2, 3)
) |>
as_fr_tdr(name = "tibble 1") |>
update_field("id", title = "ID", description = "identifier") |>
update_field("value", title = "Value", description = "value")
f2 <- tibble::tibble(
id = c("a", "b", "c"),
symbol = c(".", ",", "!")
) |>
as_fr_tdr(name = "tibble 2") |>
update_field("id", title = "ID", description = "identifier") |>
update_field("symbol", title = "Symbol", description = "symbol")
out <- as_fr_tdr(dplyr::left_join(f1, f2, by = "id"), .template = f2)
out
#> tibble 2
#> # A tibble: 3 × 3
#> id value symbol
#> <chr> <dbl> <chr>
#> 1 a 1 .
#> 2 b 2 ,
#> 3 c 3 !
out@schema
#> id
#> - type: string
#> - title: ID
#> - description: identifier
#> value
#> - type: number
#> symbol
#> - type: string
#> - title: Symbol
#> - description: symbol
Created on 2024-06-28 with reprex v2.1.0
looks like it is only keeping metadata for the fields specified in the tdr supplied to .template
.
what about ...
library(fr)
f1 <- tibble::tibble(
id = c("a", "b", "c"),
value = c(1, 2, 3)
) |>
as_fr_tdr(name = "tibble 1") |>
update_field("id", title = "ID", description = "identifier") |>
update_field("value", title = "Value", description = "value")
f2 <- tibble::tibble(
id = c("a", "b", "c"),
symbol = c(".", ",", "!")
) |>
as_fr_tdr(name = "tibble 2") |>
update_field("id", title = "ID", description = "identifier") |>
update_field("symbol", title = "Symbol", description = "symbol")
f3 <- tibble::tibble(
id = c("a", "b", "c"),
length = c(4, 5, 6)
) |>
as_fr_tdr(name = "tibble 3") |>
update_field("id", title = "ID", description = "identifier") |>
update_field("length", title = "Length", description = "the length")
out <-
purrr::reduce(
list(f1, f2, f3),
\(x, y) as_fr_tdr(dplyr::left_join(x, y, by = "id"), .template = x))
#' Combine the field lists of two schemas
#'
#' Keeps every field of `f1_schema_fields`, in its original order, and
#' appends the fields of `f2_schema_fields` whose names do not already occur
#' in `f1_schema_fields`. When a name occurs in both inputs, the entry from
#' `f1_schema_fields` is the one that is kept.
#'
#' @param f1_schema_fields Named list of fields from the first schema.
#' @param f2_schema_fields Named list of fields from the second schema.
#' @return A named list holding the combined fields.
make_combined_schema <- function(f1_schema_fields, f2_schema_fields) {
  # Do not hoist the shared fields to the front: keeping the first schema's
  # order means the result follows the column order of a left join (all
  # columns of the first table, then the new columns of the second) even
  # when the shared key is not the first column.
  only_in_f2 <- !names(f2_schema_fields) %in% names(f1_schema_fields)
  c(f1_schema_fields, f2_schema_fields[only_in_f2])
}
# Fold the field lists of the three tdrs together, left to right, and store
# the combined list on the joined tdr.
out@schema@fields <-
  list(f1, f2, f3) |>
  purrr::map(\(tdr) tdr@schema@fields) |>
  purrr::reduce(make_combined_schema)
out
#> tibble 1
#> # A tibble: 3 × 4
#> id value symbol length
#> <chr> <dbl> <chr> <dbl>
#> 1 a 1 . 4
#> 2 b 2 , 5
#> 3 c 3 ! 6
out@schema
#> id
#> - type: string
#> - title: ID
#> - description: identifier
#> value
#> - type: number
#> - title: Value
#> - description: value
#> symbol
#> - type: string
#> - title: Symbol
#> - description: symbol
#> length
#> - type: number
#> - title: Length
#> - description: the length
Created on 2024-06-28 with reprex v2.1.0
okay, re-wrote to be one function
library(fr)
f1 <- tibble::tibble(
id = c("a", "b", "c"),
value = c(1, 2, 3)
) |>
as_fr_tdr(name = "tibble 1") |>
update_field("id", title = "ID", description = "identifier") |>
update_field("value", title = "Value", description = "value")
f2 <- tibble::tibble(
id = c("a", "b", "c"),
symbol = c(".", ",", "!")
) |>
as_fr_tdr(name = "tibble 2") |>
update_field("id", title = "ID", description = "identifier") |>
update_field("symbol", title = "Symbol", description = "symbol")
f3 <- tibble::tibble(
id = c("a", "b", "c"),
length = c(4, 5, 6)
) |>
as_fr_tdr(name = "tibble 3") |>
update_field("id", title = "ID", description = "identifier") |>
update_field("length", title = "Length", description = "the length")
#' Left-join two tdr objects, keeping field metadata from both
#'
#' Joins `y` onto `x` with `dplyr::left_join()`, converts the result with
#' `as_fr_tdr()` using `x` as the `.template`, and then replaces the schema
#' fields of the result with the fields of `x` followed by the fields of `y`
#' whose names are not already present in `x`. For a field name present in
#' both, the definition from `x` is used.
#'
#' @param x,y Objects exposing `@schema@fields` that `dplyr::left_join()`
#'   accepts (fr_tdr objects in the examples of this thread).
#' @param ... Passed on to `dplyr::left_join()`, for example `by = "id"`.
#' @return The value of `as_fr_tdr()` for the joined data, with its
#'   `@schema@fields` replaced by the combined field list.
fr_left_join <- function(x, y, ...) {
  out <- as_fr_tdr(dplyr::left_join(x, y, ...), .template = x)
  x_fields <- x@schema@fields
  y_fields <- y@schema@fields
  # Keep x's field order and append only the fields that are new in y, so
  # the field list follows the left-join column order (x's columns first,
  # then y's new columns) even when the join key is not x's first column.
  # NOTE(review): columns renamed by left_join's `suffix` (non-key names
  # shared by x and y) are not matched by name here -- confirm callers only
  # share the key columns.
  only_in_y <- !names(y_fields) %in% names(x_fields)
  out@schema@fields <- c(x_fields, y_fields[only_in_y])
  out
}
# Join the tdrs pairwise from left to right on the shared `id` column.
out <- purrr::reduce(
  list(f1, f2, f3),
  \(joined, nxt) fr_left_join(joined, nxt, by = "id")
)
out
#> tibble 1
#> # A tibble: 3 × 4
#> id value symbol length
#> <chr> <dbl> <chr> <dbl>
#> 1 a 1 . 4
#> 2 b 2 , 5
#> 3 c 3 ! 6
out@schema
#> id
#> - type: string
#> - title: ID
#> - description: identifier
#> value
#> - type: number
#> - title: Value
#> - description: value
#> symbol
#> - type: string
#> - title: Symbol
#> - description: symbol
#> length
#> - type: number
#> - title: Length
#> - description: the length
Created on 2024-06-28 with reprex v2.1.0
looks like it is only keeping metadata for the fields specified in the tdr supplied to
.template
.
That is by design: https://github.com/cole-brokamp/fr/blob/e6fc0a9a12f1bbde668987f6f433f0af11e8b87e/R/fr_tdr.R#L30-L31
But this function is really for converting a data.frame to a tdr object, and is not designed to take tdr objects. We are kind of bending it here because the input tdr object is converted to a data.frame in order to be used in the as_fr_tdr() function, which drops the metadata and returns only the data.
I think the best thing to do would be to add a method for as_fr_tdr
for an existing fr_tdr object, instead of a data.frame. We would have to think more carefully about the .template option here and what to do with conflicting metadata.
by
column(s) exist and have the exact same field-based metadata