cynkra / dm

Working with relational data models in R
https://dm.cynkra.com
Other
494 stars 50 forks source link

Guess ptype for `dm_unwrap_tbl()` #1046

Open krlmlr opened 2 years ago

krlmlr commented 2 years ago

Invariant: dm_unwrap_tbl() with guessed ptype, then dm_wrap_tbl() should yield the same result.

krlmlr commented 2 years ago

If we changed the way dm_wrap_tbl() works by default, we could make unwrapping much easier to guess. The idea is to:

Packed

library(dm)
library(tidyverse)

nyc13 <- 
  dm_nycflights13() %>% 
  dm_select(flights, origin, dest, time_hour, tailnum, carrier)

nyc13_flights <-
  nyc13 %>%
  dm_wrap_tbl(flights) %>% 
  pull_tbl(flights)
names(nyc13_flights)
#> [1] "origin"    "dest"      "time_hour" "tailnum"   "carrier"   "airlines" 
#> [7] "airports"  "planes"    "weather"

# Current state:
nyc13_flights[1, ] %>% 
  jsonlite::toJSON(pretty = TRUE)
#> [
#>   {
#>     "origin": "JFK",
#>     "dest": "BQN",
#>     "time_hour": "2013-01-10 23:00:00",
#>     "tailnum": "N571JB",
#>     "carrier": "B6",
#>     "airlines": {
#>       "name": "JetBlue Airways"
#>     },
#>     "airports": {
#>       "name": "John F Kennedy Intl",
#>       "lat": 40.6398,
#>       "lon": -73.7789,
#>       "alt": 13,
#>       "tz": -5,
#>       "dst": "A",
#>       "tzone": "America/New_York"
#>     },
#>     "planes": {
#>       "year": 2003,
#>       "type": "Fixed wing multi engine",
#>       "manufacturer": "AIRBUS",
#>       "model": "A320-232",
#>       "engines": 2,
#>       "seats": 200,
#>       "engine": "Turbo-fan"
#>     },
#>     "weather": {
#>       "year": 2013,
#>       "month": 1,
#>       "day": 10,
#>       "hour": 23,
#>       "temp": 39.02,
#>       "dewp": 21.92,
#>       "humid": 49.93,
#>       "wind_dir": 30,
#>       "wind_speed": 4.6031,
#>       "precip": 0,
#>       "pressure": 1034.6,
#>       "visib": 10
#>     }
#>   }
#> ]

# Desired:
nyc13_flights[1, ] %>% 
  jsonlite::toJSON(pretty = TRUE)
#> [
#>   {
#>     "dest": "BQN",
#>     "airports=origin": {
#>       "faa": "JFK",
#>       "name": "John F Kennedy Intl",
#>       "lat": 40.6398,
#>       "lon": -73.7789,
#>       "alt": 13,
#>       "tz": -5,
#>       "dst": "A",
#>       "tzone": "America/New_York"
#>     },
#>     "planes=tailnum": {
#>       "tailnum": "N571JB",
#>       "year": 2003,
#>       "type": "Fixed wing multi engine",
#>       "manufacturer": "AIRBUS",
#>       "model": "A320-232",
#>       "engines": 2,
#>       "seats": 200,
#>       "engine": "Turbo-fan"
#>     },
#>     "airlines=carrier": {
#>       "carrier": "B6",
#>       "name": "JetBlue Airways"
#>     },
#>     "weather=origin,time_hour": {
#>       "origin": "JFK",
#>       "time_hour": "2013-01-10 23:00:00",
#>       "year": 2013,
#>       "month": 1,
#>       "day": 10,
#>       "hour": 23,
#>       "temp": 39.02,
#>       "dewp": 21.92,
#>       "humid": 49.93,
#>       "wind_dir": 30,
#>       "wind_speed": 4.6031,
#>       "precip": 0,
#>       "pressure": 1034.6,
#>       "visib": 10
#>     }
#>   }
#> ]

Created on 2022-06-09 by the reprex package (v2.0.1)

krlmlr commented 2 years ago

Nested

library(dm)
library(tidyverse)

nyc13 <- 
  dm_nycflights13() %>% 
  dm_select(flights, origin, dest, time_hour, tailnum, carrier) %>% 
  dm_select_tbl(airlines, flights) %>% 
  dm_zoom_to(flights) %>% 
  head(150) %>% 
  dm_update_zoomed()

nyc13_airlines <-
  nyc13 %>%
  dm_wrap_tbl(airlines) %>% 
  pull_tbl(airlines)

# Current state
names(nyc13_airlines)
#> [1] "carrier" "name"    "flights"
nyc13_airlines[1, ] %>% 
  jsonlite::toJSON(pretty = TRUE)
#> [
#>   {
#>     "carrier": "9E",
#>     "name": "Endeavor Air Inc.",
#>     "flights": [
#>       {
#>         "origin": "JFK",
#>         "dest": "ATL",
#>         "time_hour": "2013-01-10 06:00:00",
#>         "tailnum": "N170PQ"
#>       },
#>       {
#>         "origin": "JFK",
#>         "dest": "PIT",
#>         "time_hour": "2013-01-10 07:00:00",
#>         "tailnum": "N801AY"
#>       },
#>       {
#>         "origin": "EWR",
#>         "dest": "CVG",
#>         "time_hour": "2013-01-10 07:00:00",
#>         "tailnum": "N8855A"
#>       }
#>     ]
#>   }
#> ]

# Desired state:
nyc13_airlines[1, ] %>% 
  jsonlite::toJSON(pretty = TRUE)
#> [
#>   {
#>     "carrier": "9E",
#>     "name": "Endeavor Air Inc.",
#>     "flights=carrier": [
#>       {
#>         "origin": "JFK",
#>         "dest": "ATL",
#>         "time_hour": "2013-01-10 06:00:00",
#>         "tailnum": "N170PQ"
#>       },
#>       {
#>         "origin": "JFK",
#>         "dest": "PIT",
#>         "time_hour": "2013-01-10 07:00:00",
#>         "tailnum": "N801AY"
#>       },
#>       {
#>         "origin": "EWR",
#>         "dest": "CVG",
#>         "time_hour": "2013-01-10 07:00:00",
#>         "tailnum": "N8855A"
#>       }
#>     ]
#>   }
#> ]

Created on 2022-06-09 by the reprex package (v2.0.1)

krlmlr commented 2 years ago

I wonder if this changes our ability to rely on only the ptype to specify the transformation. Should our spec be more like a plan that defines what tables to nest-join or pack-join, and what the names of the new tables should be?

krlmlr commented 2 years ago

We could still compute directly a spec from the source ptype in dm_wrap_tbl(), but it feels like the spec should be a bit richer or more flexible.

moodymudskipper commented 1 year ago

A few comments @krlmlr

We might solve those issues by :


Your packed example would become :

# Desired:
nyc13_flights[1, ] %>% 
  jsonlite::toJSON(pretty = TRUE)
#> [
#>   {
#>     "origin": "JFK",
#>     "dest": "BQN",
#>     "time_hour": "2013-01-10 23:00:00",
#>     "tailnum": "N571JB",
#>     "carrier": "B6",
#>     "airports<": {
#>       "faa=origin": "JFK",
#>       "name": "John F Kennedy Intl",
#>       "lat": 40.6398,
#>       "lon": -73.7789,
#>       "alt": 13,
#>       "tz": -5,
#>       "dst": "A",
#>       "tzone": "America/New_York"
#>     },
#>     "planes<": {
#>       "tailnum=tailnum": "N571JB",
#>       "year": 2003,
#>       "type": "Fixed wing multi engine",
#>       "manufacturer": "AIRBUS",
#>       "model": "A320-232",
#>       "engines": 2,
#>       "seats": 200,
#>       "engine": "Turbo-fan"
#>     },
#>     "airlines<": {
#>       "carrier= carrier": "B6",
#>       "name": "JetBlue Airways"
#>     },
#>     "weather<,": {
#>       "origin=origin": "JFK",
#>       "time_hour=time_hour": "2013-01-10 23:00:00",
#>       "year": 2013,
#>       "month": 1,
#>       "day": 10,
#>       "hour": 23,
#>       "temp": 39.02,
#>       "dewp": 21.92,
#>       "humid": 49.93,
#>       "wind_dir": 30,
#>       "wind_speed": 4.6031,
#>       "precip": 0,
#>       "pressure": 1034.6,
#>       "visib": 10
#>     }
#>   }
#> ]

your nested example would become :

# Desired state:
nyc13_airlines[1, ] %>% 
  jsonlite::toJSON(pretty = TRUE)
#> [
#>   {
#>     "carrier": "9E",
#>     "name": "Endeavor Air Inc.",
#>     "flights>": [
#>       {
#>         "origin": "JFK",
#>         "dest": "ATL",
#>         "time_hour": "2013-01-10 06:00:00",
#>         "tailnum": "N170PQ",
#>         "carrier=carrier": "9E"
#>       },
#>       {
#>         "origin": "JFK",
#>         "dest": "PIT",
#>         "time_hour": "2013-01-10 07:00:00",
#>         "tailnum": "N801AY",
#>         "carrier=carrier": "9E"
#>       },
#>       {
#>         "origin": "EWR",
#>         "dest": "CVG",
#>         "time_hour": "2013-01-10 07:00:00",
#>         "tailnum": "N8855A",
#>         "carrier=carrier": "9E"
#>       }
#>     ]
#>   }
#> ]
moodymudskipper commented 1 year ago

All of the above doesn't mention primary keys. To reconstruct robustly without ptype we need to tag them too. I propose to suffix col names of primary keys with *.