ropensci / drake

An R-focused pipeline toolkit for reproducibility and high-performance computing
https://docs.ropensci.org/drake
GNU General Public License v3.0
1.34k stars 128 forks source link

Add nesting in cross #1342

Closed djbirke closed 3 years ago

djbirke commented 3 years ago

Prework

Proposal

I often create multiple targets dynamically using cross and at a later stage want to cross these targets with another target. For these cases, I would love to have the ability to use something like tidyr::nesting. Currently, drake only allows either a fully parallel map or a full cross.

Would this be a helpful feature? Is there an easy alternative solution that I am missing?

Here is a made up example:

library(drake)
library(tidyverse)
library(broom)

# Mean mpg of `df` after applying one size filter and one weight filter.
#
# `size_filter` and `weight_filter` are containers whose first element is a
# function taking a data frame and returning a data frame.
compute_mpg_mean <- function(df, size_filter, weight_filter) {
  keep_size <- pluck(size_filter, 1)
  keep_weight <- pluck(weight_filter, 1)
  # Apply the size filter first, then the weight filter.
  subset_df <- keep_weight(keep_size(df))
  summarize(subset_df, mean(mpg))
}

# Fit `mpg ~ cyl + wt` on `df` after applying one size filter and one weight
# filter, and return the tidied coefficient table.
#
# `size_filter` and `weight_filter` are containers whose first element is a
# function taking a data frame and returning a data frame.
compute_mpg_model <- function(df, size_filter, weight_filter) {
  keep_size <- pluck(size_filter, 1)
  keep_weight <- pluck(weight_filter, 1)
  # Apply the size filter first, then the weight filter.
  subset_df <- keep_weight(keep_size(df))
  fit <- lm(formula = mpg ~ cyl + wt, data = subset_df)
  tidy(fit)
}

# Assemble a report from one mean, one model table, and one rounding rule.
#
# `rounding` is a container whose first element is the rounding function; its
# names are stored in the report as a label.
create_report <- function(mpg_mean, mpg_model, rounding) {
  rounded_mean <- pluck(rounding, 1)(mpg_mean)
  # Round every numeric column of the model table with the same function.
  rounded_model <- mutate(
    mpg_model,
    across(where(is.numeric), pluck(rounding, 1))
  )
  list(
    rounding = names(rounding),
    mean = rounded_mean,
    model = rounded_model
  )
}

# Workflow plan for the example. The last target, `report`, shows the problem
# this issue is about.
plan <- drake::drake_plan(
  # Two named filters splitting the cars on `cyl` (<= 4 vs. > 4).
  size_filter = list(
    small = function(df) filter(df, cyl <= 4),
    large = function(df) filter(df, cyl > 4)
  ),

  # Two named filters splitting the cars on `wt` (<= 3 vs. > 3).
  weight_filter = list(
    light = function(df) filter(df, wt <= 3),
    heavy = function(df) filter(df, wt > 3)
  ),

  # One sub-target per (size, weight) combination: 2 x 2 = 4 sub-targets.
  mpg_mean = target(
    compute_mpg_mean(mtcars, size_filter, weight_filter),
    dynamic = cross(size_filter, weight_filter)
  ),

  # Same crossing as `mpg_mean`, so the sub-targets line up one-to-one.
  mpg_model = target(
    compute_mpg_model(mtcars, size_filter, weight_filter),
    dynamic = cross(size_filter, weight_filter)
  ),

  # Two named rounding rules.
  rounding = list(
    up = ceiling,
    down = floor
  ),

  # Crossing all three inputs pairs every mean with every model:
  # 2 x 4 x 4 = 32 sub-targets, although only the 2 x 4 = 8 combinations with
  # matching mean and model are wanted.
  report = target(
    create_report(mpg_mean, mpg_model, rounding),
    dynamic = cross(rounding, mpg_mean, mpg_model) # Too many crossings
    # What I want:
    # dynamic = expand(rounding, nesting(mpg_mean, mpg_model))
  )
)

# Build every target in the plan.
drake::make(plan)
#> ▶ target size_filter
#> ▶ target weight_filter
#> ▶ target rounding
#> ▶ dynamic mpg_mean
#> > subtarget mpg_mean_598176e9
#> > subtarget mpg_mean_486930e2
#> > subtarget mpg_mean_b2592e0a
#> > subtarget mpg_mean_73e9b787
#> ■ finalize mpg_mean
#> ▶ dynamic mpg_model
#> > subtarget mpg_model_598176e9
#> > subtarget mpg_model_486930e2
#> > subtarget mpg_model_b2592e0a
#> > subtarget mpg_model_73e9b787
#> ■ finalize mpg_model
#> ▶ dynamic report
#> > subtarget report_7e3078df
#> > subtarget report_3115dded
#> > subtarget report_6a141431
#> > subtarget report_22f01bbf
#> > subtarget report_b8b5fcb2
#> > subtarget report_b9b01686
#> > subtarget report_c621ae01
#> > subtarget report_133b302a
#> > subtarget report_331d8008
#> > subtarget report_51d30895
#> > subtarget report_12818c3e
#> > subtarget report_00a97a63
#> > subtarget report_39043afb
#> > subtarget report_3542397c
#> > subtarget report_68e7163f
#> > subtarget report_e66beed0
#> > subtarget report_9d7cda8f
#> > subtarget report_66956bb3
#> > subtarget report_4a97a130
#> > subtarget report_86d8781b
#> > subtarget report_597d6c17
#> > subtarget report_8f876914
#> > subtarget report_e2bc5429
#> > subtarget report_30b9037f
#> > subtarget report_2a610e92
#> > subtarget report_6fd7d395
#> > subtarget report_d926fd5d
#> > subtarget report_7a40901e
#> > subtarget report_97a44e74
#> > subtarget report_d70d0041
#> > subtarget report_ae76502d
#> > subtarget report_1dc203ae
#> ■ finalize report

Created on 2020-11-09 by the reprex package (v0.3.0)

wlandau commented 3 years ago

If I understand correctly, the desired pattern is something like cross(rounding, map(mpg_mean, mpg_model)). You can achieve this behavior by adding a new intermediate target like mpg_combo below.

library(drake)
library(tidyverse)
library(broom)

# Mean mpg (column `mean`) of `df` after applying one size filter and one
# weight filter, returned as a tibble.
#
# `size_filter` and `weight_filter` are containers whose first element is a
# function taking a data frame and returning a data frame.
compute_mpg_mean <- function(df, size_filter, weight_filter) {
  by_size <- pluck(size_filter, 1)(df)
  by_size_and_weight <- pluck(weight_filter, 1)(by_size)
  mpg_summary <- summarize(by_size_and_weight, mean = mean(mpg))
  as_tibble(mpg_summary)
}

# Fit `mpg ~ cyl + wt` on `df` after applying one size filter and one weight
# filter, and return the tidied coefficient table as a tibble.
#
# `size_filter` and `weight_filter` are containers whose first element is a
# function taking a data frame and returning a data frame.
compute_mpg_model <- function(df, size_filter, weight_filter) {
  by_size <- pluck(size_filter, 1)(df)
  by_size_and_weight <- pluck(weight_filter, 1)(by_size)
  fit <- lm(formula = mpg ~ cyl + wt, data = by_size_and_weight)
  as_tibble(tidy(fit))
}

# Assemble a report from one mean, one model table, and one rounding rule.
#
# `rounding` is a container whose first element is the rounding function; its
# names are stored in the report as a label.
create_report <- function(mpg_mean, mpg_model, rounding) {
  rounded_mean <- pluck(rounding, 1)(mpg_mean)
  # Round every numeric column of the model table with the same function.
  rounded_model <- mutate(
    mpg_model,
    across(where(is.numeric), pluck(rounding, 1))
  )
  list(
    rounding = names(rounding),
    mean = rounded_mean,
    model = rounded_model
  )
}

# Workflow plan with the suggested workaround: an intermediate target
# (`mpg_combo`) pairs each mean with its model before crossing with `rounding`.
plan <- drake::drake_plan(
  # Two named filters splitting the cars on `cyl` (<= 4 vs. > 4).
  size_filter = list(
    small = function(df) filter(df, cyl <= 4),
    large = function(df) filter(df, cyl > 4)
  ),
  # Two named filters splitting the cars on `wt` (<= 3 vs. > 3).
  weight_filter = list(
    light = function(df) filter(df, wt <= 3),
    heavy = function(df) filter(df, wt > 3)
  ),
  # One sub-target per (size, weight) combination: 2 x 2 = 4 sub-targets.
  mpg_mean = target(
    compute_mpg_mean(mtcars, size_filter, weight_filter),
    dynamic = cross(size_filter, weight_filter)
  ),
  # Same crossing as `mpg_mean`, so the sub-targets line up one-to-one.
  mpg_model = target(
    compute_mpg_model(mtcars, size_filter, weight_filter),
    dynamic = cross(size_filter, weight_filter)
  ),
  # Two named rounding rules.
  rounding = list(
    up = ceiling,
    down = floor
  ),
  # New target:
  # bundles the i-th mean with the i-th model (element-wise map, 4 sub-targets).
  mpg_combo = target(
    list(mpg_mean = mpg_mean, mpg_model = mpg_model),
    dynamic = map(mpg_mean, mpg_model)
  ),
  # Use mpg_combo:
  # crossing `rounding` with the bundles gives 2 x 4 = 8 sub-targets.
  report = target(
    create_report(mpg_combo$mpg_mean, mpg_combo$mpg_model, rounding),
    dynamic = cross(rounding, mpg_combo)
  )
)

# Build every target in the plan.
make(plan)
#> ▶ target size_filter
#> ▶ target weight_filter
#> ▶ target rounding
#> ▶ dynamic mpg_mean
#> > subtarget mpg_mean_f128af79
#> > subtarget mpg_mean_163d6d92
#> > subtarget mpg_mean_5c75ea3a
#> > subtarget mpg_mean_cf74ce20
#> ■ finalize mpg_mean
#> ▶ dynamic mpg_model
#> > subtarget mpg_model_f128af79
#> > subtarget mpg_model_163d6d92
#> > subtarget mpg_model_5c75ea3a
#> > subtarget mpg_model_cf74ce20
#> ■ finalize mpg_model
#> ▶ dynamic mpg_combo
#> > subtarget mpg_combo_91a0be6a
#> > subtarget mpg_combo_8b56ff2f
#> > subtarget mpg_combo_4c0ed8c1
#> > subtarget mpg_combo_f99862b7
#> ■ finalize mpg_combo
#> ▶ dynamic report
#> > subtarget report_e30bdb11
#> > subtarget report_2ac5718b
#> > subtarget report_0717b285
#> > subtarget report_64995612
#> > subtarget report_734119fe
#> > subtarget report_25ed0c15
#> > subtarget report_b6c7543e
#> > subtarget report_12d4d8e1
#> ■ finalize report

# Number of sub-targets created for the dynamic target `report`.
length(subtargets(report))
#> [1] 8

# Read back the first sub-target of `report`.
readd(report, subtargets = 1)
#> $rounding
#> [1] "up"
#> 
#> $mean
#> # A tibble: 1 x 1
#>    mean
#>   <dbl>
#> 1    28
#> 
#> $model
#> # A tibble: 3 x 5
#>   term        estimate std.error statistic p.value
#>   <chr>          <dbl>     <dbl>     <dbl>   <dbl>
#> 1 (Intercept)       46         7         8       1
#> 2 cyl               NA        NA        NA      NA
#> 3 wt                -8         3        -3       1

Created on 2020-11-09 by the reprex package (v0.3.0)

djbirke commented 3 years ago

Thank you for the fast response. Does the creation of the new mpg_combo target duplicate the storage/cache taken due to mpg_mean and mpg_model? In my use case, these objects are fairly large datasets.

And do you have any thoughts on introducing a smarter map/cross/nesting syntax into drake/targets? From my experience the need for map+cross occurs frequently, e.g. when doing analysis on different overlapping subsets of the data with different parameters and wanting to combine some of them into several tables. I would love to keep the plan "slim" by avoiding a whole bunch of meta-targets.

wlandau commented 3 years ago

> Thank you for the fast response. Does the creation of the new mpg_combo target duplicate the storage/cache taken due to mpg_mean and mpg_model? In my use case, these objects are fairly large datasets.

Unfortunately it does duplicate data. To reduce storage, the best advice I can give is to use a specialized storage format, switch to targets, or find ways to reduce the size of the data.

> And do you have any thoughts on introducing a smarter map/cross/nesting syntax into drake/targets? From my experience the need for map+cross occurs frequently, e.g. when doing analysis on different overlapping subsets of the data with different parameters and wanting to combine some of them into several tables. I would love to keep the plan "slim" by avoiding a whole bunch of meta-targets.

I do not plan on it. Dynamic branching is already an enormous undertaking. Nested syntax multiplies the challenges, and the workaround is straightforward.

djbirke commented 3 years ago

Thank you for the additional advice and sharing your plans.

wlandau commented 3 years ago

Sure.

On reflection, although drake's internals can't handle this, there might be something we can do in targets. I will think about it.

djbirke commented 3 years ago

Thank you - I look forward to using targets for my next project!

I want to share one more note on my example above. If report and the inputs that go into it are needed subsequently, for example for an export, then the workaround introduces an easy way to make a mistake, because the order of the arguments to cross in the meta targets needs to match the order used in the cross calls of the initial targets. Otherwise, the following can happen.

(If I am missing an easy alternative here to access the inputs that went into report, please let me know.)

library(drake)
library(tidyverse)
library(broom)

# Mean mpg (column `mean`) of `df` after applying one size filter and one
# weight filter, returned as a tibble.
#
# `size_filter` and `weight_filter` are containers whose first element is a
# function taking a data frame and returning a data frame.
compute_mpg_mean <- function(df, size_filter, weight_filter) {
  apply_size <- pluck(size_filter, 1)
  apply_weight <- pluck(weight_filter, 1)
  # Apply the size filter first, then the weight filter.
  filtered <- apply_weight(apply_size(df))
  as_tibble(summarize(filtered, mean = mean(mpg)))
}

# Fit `mpg ~ cyl + wt` on `df` after applying one size filter and one weight
# filter, and return the tidied coefficient table as a tibble.
#
# `size_filter` and `weight_filter` are containers whose first element is a
# function taking a data frame and returning a data frame.
compute_mpg_model <- function(df, size_filter, weight_filter) {
  apply_size <- pluck(size_filter, 1)
  apply_weight <- pluck(weight_filter, 1)
  # Apply the size filter first, then the weight filter.
  filtered <- apply_weight(apply_size(df))
  fit <- lm(formula = mpg ~ cyl + wt, data = filtered)
  as_tibble(tidy(fit))
}

# Assemble a report from one mean, one model table, and one rounding rule.
#
# `rounding` is a container whose first element is the rounding function; its
# names are stored in the report as a label.
create_report <- function(mpg_mean, mpg_model, rounding) {
  rounded_mean <- pluck(rounding, 1)(mpg_mean)
  # Round every numeric column of the model table with the same function.
  rounded_model <- mutate(
    mpg_model,
    across(where(is.numeric), pluck(rounding, 1))
  )
  list(
    rounding = names(rounding),
    mean = rounded_mean,
    model = rounded_model
  )
}

# Print the file name a report would be exported under, followed by the
# rounding label stored inside the report, then return the report unchanged.
#
# The file name is "<size>_<weight>_<rounding>.txt", built from the names of
# the three inputs. Nothing is written to disk.
export_report <- function(report, size_filter, weight_filter, rounding) {
  stem <- paste(
    names(size_filter),
    names(weight_filter),
    names(rounding),
    sep = "_"
  )
  filename <- paste0(stem, ".txt")
  print(filename)
  # Printed next to the file name so a mismatch between the two is visible.
  print(report$rounding)
  report
}

# Workflow plan extending the workaround with an export step. It deliberately
# demonstrates the pitfall: `report_export_meta` crosses its inputs in a
# different order than the one that produced `report`.
plan <- drake::drake_plan(
  # Two named filters splitting the cars on `cyl` (<= 4 vs. > 4).
  size_filter = list(
    small = function(df) filter(df, cyl <= 4),
    large = function(df) filter(df, cyl > 4)
  ),
  # Two named filters on `wt`.
  # NOTE(review): `heavy` uses `wt > 1` here, whereas the earlier examples in
  # this thread use `wt > 3` - confirm whether this was intended.
  weight_filter = list(
    light = function(df) filter(df, wt <= 3),
    heavy = function(df) filter(df, wt > 1)
  ),
  # One sub-target per (size, weight) combination: 2 x 2 = 4 sub-targets.
  mpg_mean = target(
    compute_mpg_mean(mtcars, size_filter, weight_filter),
    dynamic = cross(size_filter, weight_filter)
  ),
  # Same crossing as `mpg_mean`, so the sub-targets line up one-to-one.
  mpg_model = target(
    compute_mpg_model(mtcars, size_filter, weight_filter),
    dynamic = cross(size_filter, weight_filter)
  ),
  # Two named rounding rules.
  rounding = list(
    up = ceiling,
    down = floor
  ),
  # New target:
  # bundles the i-th mean with the i-th model (element-wise map, 4 sub-targets).
  mpg_combo = target(
    list(mpg_mean = mpg_mean, mpg_model = mpg_model),
    dynamic = map(mpg_mean, mpg_model)
  ),

  # Use mpg_combo:
  # crossing `rounding` with the bundles gives 2 x 4 = 8 sub-targets.
  report = target(
    create_report(mpg_combo$mpg_mean, mpg_combo$mpg_model, rounding),
    dynamic = cross(rounding, mpg_combo)
  ),

  # Meta target meant to carry, for each report, the inputs it was built from.
  # It also has 8 sub-targets, but `rounding` is the last argument to cross()
  # here, while `report` above was crossed with `rounding` first.
  report_export_meta = target(
    list(size_filter = size_filter, weight_filter = weight_filter, rounding = rounding),
    dynamic = cross(size_filter, weight_filter, rounding) # Here the order is wrong!
  ),

  # Pairs the i-th report with the i-th meta entry. Because of the ordering
  # mismatch above, the printed file name and the printed `report$rounding`
  # disagree for some sub-targets (see the output of make()).
  report_export = target(
    export_report(
      report,
      report_export_meta$size_filter,
      report_export_meta$weight_filter,
      report_export_meta$rounding
      ),
    dynamic = map(report, report_export_meta)
  )
)

# Build every target in the plan.
make(plan)
#> ▶ target size_filter
#> ▶ target weight_filter
#> ▶ target rounding
#> ▶ dynamic mpg_mean
#> > subtarget mpg_mean_c66bc902
#> > subtarget mpg_mean_eb271d0b
#> > subtarget mpg_mean_2bb00aff
#> > subtarget mpg_mean_0c5cb176
#> ■ finalize mpg_mean
#> ▶ dynamic mpg_model
#> > subtarget mpg_model_c66bc902
#> > subtarget mpg_model_eb271d0b
#> > subtarget mpg_model_2bb00aff
#> > subtarget mpg_model_0c5cb176
#> ■ finalize mpg_model
#> ▶ dynamic report_export_meta
#> > subtarget report_export_meta_83dbd833
#> > subtarget report_export_meta_9cd00daa
#> > subtarget report_export_meta_2e81ba3c
#> > subtarget report_export_meta_5dccb22b
#> > subtarget report_export_meta_e9626856
#> > subtarget report_export_meta_25a59884
#> > subtarget report_export_meta_5826a845
#> > subtarget report_export_meta_ed252f9f
#> ■ finalize report_export_meta
#> ▶ dynamic mpg_combo
#> > subtarget mpg_combo_91a0be6a
#> > subtarget mpg_combo_e645fe85
#> > subtarget mpg_combo_4c0ed8c1
#> > subtarget mpg_combo_be50761a
#> ■ finalize mpg_combo
#> ▶ dynamic report
#> > subtarget report_e30bdb11
#> > subtarget report_dfb57c7a
#> > subtarget report_0717b285
#> > subtarget report_f1ee8803
#> > subtarget report_734119fe
#> > subtarget report_1de28e16
#> > subtarget report_b6c7543e
#> > subtarget report_639a4cb2
#> ■ finalize report
#> ▶ dynamic report_export
#> > subtarget report_export_3ac734a5
#> [1] "small_light_up.txt"
#> [1] "up"
#> > subtarget report_export_410ab743
#> [1] "small_light_down.txt"
#> [1] "up"
#> > subtarget report_export_5af56515
#> [1] "small_heavy_up.txt"
#> [1] "up"
#> > subtarget report_export_21617a50
#> [1] "small_heavy_down.txt"
#> [1] "up"
#> > subtarget report_export_4fd365eb
#> [1] "large_light_up.txt"
#> [1] "down"
#> > subtarget report_export_af2e1b99
#> [1] "large_light_down.txt"
#> [1] "down"
#> > subtarget report_export_bb7c2f1f
#> [1] "large_heavy_up.txt"
#> [1] "down"
#> > subtarget report_export_8b1ba519
#> [1] "large_heavy_down.txt"
#> [1] "down"
#> ■ finalize report_export

Created on 2020-11-10 by the reprex package (v0.3.0)

wlandau commented 3 years ago

All these are good points. My initial resistance was the implementation difficulties. In general, something this flexible has the potential to totally break the design. It certainly would in drake. But in targets, it actually turned out to help the internals rather than hurt them, which really surprised me. An implementation is now available via https://github.com/wlandau/targets/pull/212. I recommend having a look.

djbirke commented 3 years ago

Amazing. I already appreciate your development of drake and targets, but your incredible responsiveness here and the fact that you incorporated this feature request within the day of its posting completely fills me with gratitude.

I want to buy you a beer/pizza or donate to a project/charity of your choice. Would you accept anything like that?

wlandau commented 3 years ago

So glad to hear this helps you. I do not always know if these improvements are possible, and it is gratifying when they do work out.

That's nice of you to offer. Unfortunately, even small/indirect gifts raise ethics and compliance issues at work, so I cannot accept. However, spreading the word in the R world is always a huge contribution. rOpenSci is always looking for use cases like this one, and blogs, tweets, talks, and even simple word of mouth make a difference.