Open krlmlr opened 2 years ago
If we changed the way dm_wrap_tbl()
works by default, we could make unwrapping much easier to guess. The idea is to:
library(dm)
library(tidyverse)
nyc13 <-
dm_nycflights13() %>%
dm_select(flights, origin, dest, time_hour, tailnum, carrier)
nyc13_flights <-
nyc13 %>%
dm_wrap_tbl(flights) %>%
pull_tbl(flights)
names(nyc13_flights)
#> [1] "origin" "dest" "time_hour" "tailnum" "carrier" "airlines"
#> [7] "airports" "planes" "weather"
# Current state:
nyc13_flights[1, ] %>%
jsonlite::toJSON(pretty = TRUE)
#> [
#> {
#> "origin": "JFK",
#> "dest": "BQN",
#> "time_hour": "2013-01-10 23:00:00",
#> "tailnum": "N571JB",
#> "carrier": "B6",
#> "airlines": {
#> "name": "JetBlue Airways"
#> },
#> "airports": {
#> "name": "John F Kennedy Intl",
#> "lat": 40.6398,
#> "lon": -73.7789,
#> "alt": 13,
#> "tz": -5,
#> "dst": "A",
#> "tzone": "America/New_York"
#> },
#> "planes": {
#> "year": 2003,
#> "type": "Fixed wing multi engine",
#> "manufacturer": "AIRBUS",
#> "model": "A320-232",
#> "engines": 2,
#> "seats": 200,
#> "engine": "Turbo-fan"
#> },
#> "weather": {
#> "year": 2013,
#> "month": 1,
#> "day": 10,
#> "hour": 23,
#> "temp": 39.02,
#> "dewp": 21.92,
#> "humid": 49.93,
#> "wind_dir": 30,
#> "wind_speed": 4.6031,
#> "precip": 0,
#> "pressure": 1034.6,
#> "visib": 10
#> }
#> }
#> ]
# Desired:
nyc13_flights[1, ] %>%
jsonlite::toJSON(pretty = TRUE)
#> [
#> {
#> "dest": "BQN",
#> "airports=origin": {
#> "faa": "JFK",
#> "name": "John F Kennedy Intl",
#> "lat": 40.6398,
#> "lon": -73.7789,
#> "alt": 13,
#> "tz": -5,
#> "dst": "A",
#> "tzone": "America/New_York"
#> },
#> "planes=tailnum": {
#> "tailnum": "N571JB",
#> "year": 2003,
#> "type": "Fixed wing multi engine",
#> "manufacturer": "AIRBUS",
#> "model": "A320-232",
#> "engines": 2,
#> "seats": 200,
#> "engine": "Turbo-fan"
#> },
#> "airlines=carrier": {
#> "carrier": "B6",
#> "name": "JetBlue Airways"
#> },
#> "weather=origin,time_hour": {
#> "origin": "JFK",
#> "time_hour": "2013-01-10 23:00:00",
#> "year": 2013,
#> "month": 1,
#> "day": 10,
#> "hour": 23,
#> "temp": 39.02,
#> "dewp": 21.92,
#> "humid": 49.93,
#> "wind_dir": 30,
#> "wind_speed": 4.6031,
#> "precip": 0,
#> "pressure": 1034.6,
#> "visib": 10
#> }
#> }
#> ]
Created on 2022-06-09 by the reprex package (v2.0.1)
library(dm)
library(tidyverse)
nyc13 <-
dm_nycflights13() %>%
dm_select(flights, origin, dest, time_hour, tailnum, carrier) %>%
dm_select_tbl(airlines, flights) %>%
dm_zoom_to(flights) %>%
head(150) %>%
dm_update_zoomed()
nyc13_airlines <-
nyc13 %>%
dm_wrap_tbl(airlines) %>%
pull_tbl(airlines)
# Current state
names(nyc13_airlines)
#> [1] "carrier" "name" "flights"
nyc13_airlines[1, ] %>%
jsonlite::toJSON(pretty = TRUE)
#> [
#> {
#> "carrier": "9E",
#> "name": "Endeavor Air Inc.",
#> "flights": [
#> {
#> "origin": "JFK",
#> "dest": "ATL",
#> "time_hour": "2013-01-10 06:00:00",
#> "tailnum": "N170PQ"
#> },
#> {
#> "origin": "JFK",
#> "dest": "PIT",
#> "time_hour": "2013-01-10 07:00:00",
#> "tailnum": "N801AY"
#> },
#> {
#> "origin": "EWR",
#> "dest": "CVG",
#> "time_hour": "2013-01-10 07:00:00",
#> "tailnum": "N8855A"
#> }
#> ]
#> }
#> ]
# Desired state:
nyc13_airlines[1, ] %>%
jsonlite::toJSON(pretty = TRUE)
#> [
#> {
#> "carrier": "9E",
#> "name": "Endeavor Air Inc.",
#> "flights=carrier": [
#> {
#> "origin": "JFK",
#> "dest": "ATL",
#> "time_hour": "2013-01-10 06:00:00",
#> "tailnum": "N170PQ"
#> },
#> {
#> "origin": "JFK",
#> "dest": "PIT",
#> "time_hour": "2013-01-10 07:00:00",
#> "tailnum": "N801AY"
#> },
#> {
#> "origin": "EWR",
#> "dest": "CVG",
#> "time_hour": "2013-01-10 07:00:00",
#> "tailnum": "N8855A"
#> }
#> ]
#> }
#> ]
Created on 2022-06-09 by the reprex package (v2.0.1)
I wonder if this changes our ability to rely on only the ptype to specify the transformation. Should our spec be more like a plan that defines what tables to nest-join or pack-join, and what the names of the new tables should be?
We could still compute directly a spec from the source ptype in dm_wrap_tbl()
, but it feels like the spec should be a bit richer or more flexible.
A few comments @krlmlr
origin
and faa
is not enforced, or it would be by ordering the columnsflights
after packing to place them only in packed columns. A key in the packing table might be used by several parents (in compound keys for example), so I'm not sure that they should be removed when placed in the packed column. In a partially wrapped model this would also break some queries that would have still worked had we kept those.We might solve those issues by :
Your packed example would become :
# Desired:
nyc13_flights[1, ] %>%
jsonlite::toJSON(pretty = TRUE)
#> [
#> {
#> "origin": "JFK",
#> "dest": "BQN",
#> "time_hour": "2013-01-10 23:00:00",
#> "tailnum": "N571JB",
#> "carrier": "B6",
#> "airports<": {
#> "faa=origin": "JFK",
#> "name": "John F Kennedy Intl",
#> "lat": 40.6398,
#> "lon": -73.7789,
#> "alt": 13,
#> "tz": -5,
#> "dst": "A",
#> "tzone": "America/New_York"
#> },
#> "planes<": {
#> "tailnum=tailnum": "N571JB",
#> "year": 2003,
#> "type": "Fixed wing multi engine",
#> "manufacturer": "AIRBUS",
#> "model": "A320-232",
#> "engines": 2,
#> "seats": 200,
#> "engine": "Turbo-fan"
#> },
#> "airlines<": {
#> "carrier= carrier": "B6",
#> "name": "JetBlue Airways"
#> },
#> "weather<,": {
#> "origin=origin": "JFK",
#> "time_hour=time_hour": "2013-01-10 23:00:00",
#> "year": 2013,
#> "month": 1,
#> "day": 10,
#> "hour": 23,
#> "temp": 39.02,
#> "dewp": 21.92,
#> "humid": 49.93,
#> "wind_dir": 30,
#> "wind_speed": 4.6031,
#> "precip": 0,
#> "pressure": 1034.6,
#> "visib": 10
#> }
#> }
#> ]
your nested example would become :
# Desired state:
nyc13_airlines[1, ] %>%
jsonlite::toJSON(pretty = TRUE)
#> [
#> {
#> "carrier": "9E",
#> "name": "Endeavor Air Inc.",
#> "flights>": [
#> {
#> "origin": "JFK",
#> "dest": "ATL",
#> "time_hour": "2013-01-10 06:00:00",
#> "tailnum": "N170PQ",
#> "carrier=carrier": "9E"
#> },
#> {
#> "origin": "JFK",
#> "dest": "PIT",
#> "time_hour": "2013-01-10 07:00:00",
#> "tailnum": "N801AY",
#> "carrier=carrier": "9E"
#> },
#> {
#> "origin": "EWR",
#> "dest": "CVG",
#> "time_hour": "2013-01-10 07:00:00",
#> "tailnum": "N8855A",
#> "carrier=carrier": "9E"
#> }
#> ]
#> }
#> ]
All of the above doesn't mention primary keys. To reconstruct robustly without ptype we need to tag them too. I propose to suffix col names of primary keys with *
.
Invariant:
dm_unwrap_tbl()
with guessed ptype, thendm_wrap_tbl()
should yield the same result.