tidyverse / dplyr

dplyr: A grammar of data manipulation
https://dplyr.tidyverse.org/
Other
4.78k stars 2.12k forks source link

Mutate using a function that creates multiple columns from a single column #6933

Closed cgoo4 closed 1 year ago

cgoo4 commented 1 year ago

I use this pattern quite frequently (usually grouped) with the TTR (technical trading) package. It has functions like TDI and MACD which create multiple columns from a single column.

The code below does the job, but I'm wondering if there might be a more elegant way to mutate or reframe (new feature)?

Maybe something along the lines of: reframe(TTR::TDI(var1), .name_prefix = "{.col}_") to create an arbitrary number of columns from a single column?

library(dplyr)
library(tidyr)

df <- tibble(
  var1 = seq(1, 100, 1),
  var2 = seq(100, 1, -1)
)

# Example 1 - Trend Detection

df |> 
  mutate(var1_ = TTR::TDI(var1) |> bind_cols()) |> 
  unnest(cols = var1_, names_sep = "")
#> # A tibble: 100 × 4
#>     var1  var2 var1_tdi var1_di
#>    <dbl> <dbl>    <dbl>   <dbl>
#>  1     1   100       NA      NA
#>  2     2    99       NA      NA
#>  3     3    98       NA      NA
#>  4     4    97       NA      NA
#>  5     5    96       NA      NA
#>  6     6    95       NA      NA
#>  7     7    94       NA      NA
#>  8     8    93       NA      NA
#>  9     9    92       NA      NA
#> 10    10    91       NA      NA
#> # ℹ 90 more rows

df |> 
  mutate(across(starts_with("var"), \(x) TTR::TDI(x) |> bind_cols())) |> 
  unnest(cols = c(var1, var2), names_sep = "_")
#> # A tibble: 100 × 4
#>    var1_tdi var1_di var2_tdi var2_di
#>       <dbl>   <dbl>    <dbl>   <dbl>
#>  1       NA      NA       NA      NA
#>  2       NA      NA       NA      NA
#>  3       NA      NA       NA      NA
#>  4       NA      NA       NA      NA
#>  5       NA      NA       NA      NA
#>  6       NA      NA       NA      NA
#>  7       NA      NA       NA      NA
#>  8       NA      NA       NA      NA
#>  9       NA      NA       NA      NA
#> 10       NA      NA       NA      NA
#> # ℹ 90 more rows

# Example 2 - Price Oscillator

df |> 
  mutate(var1_ = TTR::MACD(var1) |> bind_cols()) |> 
  unnest(cols = var1_, names_sep = "")
#> # A tibble: 100 × 4
#>     var1  var2 var1_macd var1_signal
#>    <dbl> <dbl>     <dbl>       <dbl>
#>  1     1   100        NA          NA
#>  2     2    99        NA          NA
#>  3     3    98        NA          NA
#>  4     4    97        NA          NA
#>  5     5    96        NA          NA
#>  6     6    95        NA          NA
#>  7     7    94        NA          NA
#>  8     8    93        NA          NA
#>  9     9    92        NA          NA
#> 10    10    91        NA          NA
#> # ℹ 90 more rows

Created on 2023-10-02 with reprex v2.0.2

DavisVaughan commented 1 year ago

across() gained a new .unpack argument in 1.1.0

library(dplyr)
#> 
#> Attaching package: 'dplyr'
#> The following objects are masked from 'package:stats':
#> 
#>     filter, lag
#> The following objects are masked from 'package:base':
#> 
#>     intersect, setdiff, setequal, union
library(tidyr)

df <- tibble(
  var1 = seq(1, 100, 1),
  var2 = seq(100, 1, -1)
)

df |>
  mutate(across(c(var1, var2), \(.x) as_tibble(TTR::TDI(.x)), .unpack = TRUE))
#> # A tibble: 100 × 6
#>     var1  var2 var1_tdi var1_di var2_tdi var2_di
#>    <dbl> <dbl>    <dbl>   <dbl>    <dbl>   <dbl>
#>  1     1   100       NA      NA       NA      NA
#>  2     2    99       NA      NA       NA      NA
#>  3     3    98       NA      NA       NA      NA
#>  4     4    97       NA      NA       NA      NA
#>  5     5    96       NA      NA       NA      NA
#>  6     6    95       NA      NA       NA      NA
#>  7     7    94       NA      NA       NA      NA
#>  8     8    93       NA      NA       NA      NA
#>  9     9    92       NA      NA       NA      NA
#> 10    10    91       NA      NA       NA      NA
#> # ℹ 90 more rows

df |>
  mutate(across(
    c(var1, var2),
    list(
      tdi = \(.x) as_tibble(TTR::TDI(.x)),
      macd = \(.x) as_tibble(TTR::MACD(.x))
    ),
    .unpack = TRUE
  ))
#> # A tibble: 100 × 10
#>     var1  var2 var1_tdi_tdi var1_tdi_di var1_macd_macd var1_macd_signal
#>    <dbl> <dbl>        <dbl>       <dbl>          <dbl>            <dbl>
#>  1     1   100           NA          NA             NA               NA
#>  2     2    99           NA          NA             NA               NA
#>  3     3    98           NA          NA             NA               NA
#>  4     4    97           NA          NA             NA               NA
#>  5     5    96           NA          NA             NA               NA
#>  6     6    95           NA          NA             NA               NA
#>  7     7    94           NA          NA             NA               NA
#>  8     8    93           NA          NA             NA               NA
#>  9     9    92           NA          NA             NA               NA
#> 10    10    91           NA          NA             NA               NA
#> # ℹ 90 more rows
#> # ℹ 4 more variables: var2_tdi_tdi <dbl>, var2_tdi_di <dbl>,
#> #   var2_macd_macd <dbl>, var2_macd_signal <dbl>

It might be nice to use .names = "{.col}_" here rather than naming the list elements with tdi = and macd = since you know that the functions return column names that are descriptive enough on their own, but right now that errors because the column name uniqueness check is done too early when .unpack is used. I'll open another issue for that.