tidymodels / rules

parsnip extension for rule-based models
https://rules.tidymodels.org
Other
40 stars 4 forks source link

Error: undefined columns selected #21

Closed vidarsumo closed 3 years ago

vidarsumo commented 3 years ago

The problem

I'm running many different ML algorithms and Cubist is giving me an error I can't solve. I do have the development version of the rules package.

/cc @mdancho84

Reproducible example

``` r
library(tidymodels)
#> -- Attaching packages -------------------------------------- tidymodels 0.1.1 --
#> v broom     0.7.2          v recipes   0.1.15    
#> v dials     0.0.9.9000     v rsample   0.0.8.9000
#> v dplyr     1.0.2          v tibble    3.0.4     
#> v ggplot2   3.3.2          v tidyr     1.1.2     
#> v infer     0.5.3          v tune      0.1.1.9001
#> v modeldata 0.1.0          v workflows 0.2.1.9000
#> v parsnip   0.1.4          v yardstick 0.0.7     
#> v purrr     0.3.4
#> Warning: package 'broom' was built under R version 4.0.3
#> Warning: package 'modeldata' was built under R version 4.0.3
#> Warning: package 'recipes' was built under R version 4.0.3
#> -- Conflicts ----------------------------------------- tidymodels_conflicts() --
#> x purrr::discard() masks scales::discard()
#> x dplyr::filter()  masks stats::filter()
#> x dplyr::lag()     masks stats::lag()
#> x recipes::step()  masks stats::step()
library(modeltime)
library(rules)
#> 
#> Attaching package: 'rules'
#> The following object is masked from 'package:dials':
#> 
#>     max_rules
library(timetk)
library(tidyverse)

# Download the example data used by this reprex (columns: date, sala, abc, id).
df <- read_csv("https://raw.githubusercontent.com/vidarsumo/reprex_data/main/reprex_data.csv")
#> 
#> -- Column specification --------------------------------------------------------
#> cols(
#>   date = col_date(format = ""),
#>   sala = col_double(),
#>   abc = col_character(),
#>   id = col_double()
#> )

# `id` was parsed as a double; store it as text so it is treated as a label.
df$id <- as.character(df$id)
# Replace negative `sala` values with 0 (log1p() is applied to `sala` below).
df <- df %>%
    mutate(sala = ifelse(sala < 0, 0, sala))

# Forecast horizon: reused as the future frame length, the lag, and the
# assessment size of the time series split.
horizon <- 8
# Number of distinct ids; used for the step_other() threshold in the recipe.
number_of_groups <- n_distinct(df$id)

# Build the feature table: log1p-transform the target, call future_frame() per
# id with `.length_out = horizon`, then add Fourier, lag and rolling-mean
# features and a row id column.
full_data_tbl <- df %>%

    mutate(sala = log1p(sala)) %>%

    group_by(id) %>%
    future_frame(date, .length_out = horizon, .bind_data = TRUE) %>%

    # Add lags and rolling features / Fourier terms
    mutate(id = as_factor(id)) %>%
    group_by(id) %>%
    arrange(date) %>%
    tk_augment_fourier(date, .periods = c(1:6, 8, 10, 12, 26, 52), .K = 5) %>%
    tk_augment_lags(sala, .lags = horizon) %>%
    tk_augment_slidify(
        # Evaluates to "sala_lag8" when horizon is 8; presumably the column
        # created by tk_augment_lags() above (TODO confirm the naming).
        paste0("sala_lag", horizon),
        # sala_lag12,
        .f = ~mean(.x, na.rm = TRUE),
        .period = c(4, 8, 26, 52),
        .partial = TRUE,
        .align = "center"
    ) %>%
    # NOTE(review): no ungroup() is called, so the result is still grouped by
    # `id` (time_series_split() later reports "Groups detected").
    rowid_to_column(var = "rowid")
#> New names:
#> * NA -> ...1
#> New names:
#> * NA -> ...1
#> New names:
#> * NA -> ...1
#> New names:
#> * NA -> ...1
#> New names:
#> * NA -> ...1
#> New names:
#> * NA -> ...1
#> New names:
#> * NA -> ...1
#> New names:
#> * NA -> ...1
#> New names:
#> * NA -> ...1
#> New names:
#> * NA -> ...1
#> New names:
#> * NA -> ...1
#> New names:
#> * NA -> ...1
#> New names:
#> * NA -> ...1
#> New names:
#> * NA -> ...1
#> New names:
#> * NA -> ...1
#> New names:
#> * NA -> ...1
#> New names:
#> * NA -> ...1
#> New names:
#> * NA -> ...1
#> New names:
#> * NA -> ...1
#> New names:
#> * NA -> ...1
#> New names:
#> * NA -> ...1
#> New names:
#> * NA -> ...1
#> New names:
#> * NA -> ...1
#> New names:
#> * NA -> ...1
#> New names:
#> * NA -> ...1
#> New names:
#> * NA -> ...1
#> New names:
#> * NA -> ...1
#> New names:
#> * NA -> ...1
#> New names:
#> * NA -> ...1
#> * NA -> ...2
#> * NA -> ...3
#> * NA -> ...4
#> New names:
#> * NA -> ...1
#> * NA -> ...2
#> * NA -> ...3
#> * NA -> ...4
#> New names:
#> * NA -> ...1
#> * NA -> ...2
#> * NA -> ...3
#> * NA -> ...4
#> New names:
#> * NA -> ...1
#> * NA -> ...2
#> * NA -> ...3
#> * NA -> ...4
#> New names:
#> * NA -> ...1
#> * NA -> ...2
#> * NA -> ...3
#> * NA -> ...4
#> New names:
#> * NA -> ...1
#> * NA -> ...2
#> * NA -> ...3
#> * NA -> ...4
#> New names:
#> * NA -> ...1
#> * NA -> ...2
#> * NA -> ...3
#> * NA -> ...4
#> New names:
#> * NA -> ...1
#> * NA -> ...2
#> * NA -> ...3
#> * NA -> ...4
#> New names:
#> * NA -> ...1
#> * NA -> ...2
#> * NA -> ...3
#> * NA -> ...4
#> New names:
#> * NA -> ...1
#> * NA -> ...2
#> * NA -> ...3
#> * NA -> ...4
#> New names:
#> * NA -> ...1
#> * NA -> ...2
#> * NA -> ...3
#> * NA -> ...4
#> New names:
#> * NA -> ...1
#> * NA -> ...2
#> * NA -> ...3
#> * NA -> ...4
#> New names:
#> * NA -> ...1
#> * NA -> ...2
#> * NA -> ...3
#> * NA -> ...4
#> New names:
#> * NA -> ...1
#> * NA -> ...2
#> * NA -> ...3
#> * NA -> ...4
#> New names:
#> * NA -> ...1
#> * NA -> ...2
#> * NA -> ...3
#> * NA -> ...4
#> New names:
#> * NA -> ...1
#> * NA -> ...2
#> * NA -> ...3
#> * NA -> ...4
#> New names:
#> * NA -> ...1
#> * NA -> ...2
#> * NA -> ...3
#> * NA -> ...4
#> New names:
#> * NA -> ...1
#> * NA -> ...2
#> * NA -> ...3
#> * NA -> ...4
#> New names:
#> * NA -> ...1
#> * NA -> ...2
#> * NA -> ...3
#> * NA -> ...4
#> New names:
#> * NA -> ...1
#> * NA -> ...2
#> * NA -> ...3
#> * NA -> ...4
#> New names:
#> * NA -> ...1
#> * NA -> ...2
#> * NA -> ...3
#> * NA -> ...4
#> New names:
#> * NA -> ...1
#> * NA -> ...2
#> * NA -> ...3
#> * NA -> ...4
#> New names:
#> * NA -> ...1
#> * NA -> ...2
#> * NA -> ...3
#> * NA -> ...4
#> New names:
#> * NA -> ...1
#> * NA -> ...2
#> * NA -> ...3
#> * NA -> ...4
#> New names:
#> * NA -> ...1
#> * NA -> ...2
#> * NA -> ...3
#> * NA -> ...4
#> New names:
#> * NA -> ...1
#> * NA -> ...2
#> * NA -> ...3
#> * NA -> ...4
#> New names:
#> * NA -> ...1
#> * NA -> ...2
#> * NA -> ...3
#> * NA -> ...4
#> New names:
#> * NA -> ...1
#> * NA -> ...2
#> * NA -> ...3
#> * NA -> ...4

# Keep rows where the target `sala` is present, then drop rows with an NA in
# any remaining column.
data_prepared_tbl <- full_data_tbl %>%
    filter(!is.na(sala)) %>%
    drop_na()

# Time-based train/test split on `date`, using `horizon` as the assessment size.
splits <- data_prepared_tbl %>%
    time_series_split(date, assess = horizon, cumulative = TRUE)
#> Groups detected. Removing groups.
#> Data is not ordered by the 'date_var'. Resamples will be arranged by `date`.
#> Overlapping Timestamps Detected. Processing overlapping time series together using sliding windows.

# Preprocessing recipe built on the training split: `sala` is the outcome and
# every other column enters through the `.` in the formula.
recipe_spec <- recipe(sala ~ ., data = training(splits)) %>%
    # Move `rowid` to the custom "indicator" role.
    update_role(rowid, new_role = "indicator") %>%
    step_timeseries_signature(date) %>%
    # Remove columns whose names match any of these patterns. The `.` is
    # unescaped, so it matches any single character, not only a literal dot.
    step_rm(matches("(.xts)|(.iso)|(hour)|(minute)|(second)|(am.pm)")) %>%
    # Remove every column whose name contains "day".
    step_rm(contains("day")) %>%
    step_ns(contains("index.num"), deg_free = 3) %>%
    step_normalize(contains("index.num"), date_year) %>%
    # The threshold is derived from the number of distinct ids computed earlier.
    step_other(id, threshold = 1/(number_of_groups + 1)) %>%
    step_dummy(all_nominal(), one_hot = TRUE) %>%
    # Move `date` to the "indicator" role after the date-based steps above.
    update_role(date, new_role = "indicator")

# Bundle a Cubist rule-based regression model with the recipe and fit it on the
# training portion of the split.
# NOTE(review): this call is where the reprex stops with "undefined columns
# selected"; the reporter later attributed the failure to column names.
wlfw_fit_cubist <- workflow() %>%
    add_model(
        spec = cubist_rules(mode = "regression") %>% set_engine("Cubist")
    ) %>%
    add_recipe(recipe_spec) %>%
    fit(training(splits))
#> Error in `[.data.frame`(x, , as.character(splits$variable[i])): undefined columns selected
#> Timing stopped at: 2.85 0.02 2.86

```

Created on 2020-11-15 by the reprex package (v0.3.0)

Session info ``` r devtools::session_info() #> - Session info --------------------------------------------------------------- #> setting value #> version R version 4.0.2 (2020-06-22) #> os Windows 10 x64 #> system x86_64, mingw32 #> ui RTerm #> language (EN) #> collate Icelandic_Iceland.1252 #> ctype Icelandic_Iceland.1252 #> tz Africa/Casablanca #> date 2020-11-15 #> #> - Packages ------------------------------------------------------------------- #> ! package * version date lib #> assertthat 0.2.1 2019-03-21 [1] #> backports 1.2.0 2020-11-02 [1] #> broom * 0.7.2 2020-10-20 [1] #> callr 3.5.1 2020-10-13 [1] #> cellranger 1.1.0 2016-07-27 [1] #> class 7.3-17 2020-04-26 [2] #> cli 2.1.0 2020-10-12 [1] #> codetools 0.2-16 2018-12-24 [2] #> colorspace 2.0-0 2020-11-11 [1] #> crayon 1.3.4 2017-09-16 [1] #> Cubist 0.2.3 2020-01-10 [1] #> curl 4.3 2019-12-02 [1] #> DBI 1.1.0 2019-12-15 [1] #> dbplyr 2.0.0 2020-11-03 [1] #> desc 1.2.0 2018-05-01 [1] #> devtools 2.3.2 2020-09-18 [1] #> dials * 0.0.9.9000 2020-09-20 [1] #> DiceDesign 1.8-1 2019-07-31 [1] #> digest 0.6.27 2020-10-24 [1] #> dplyr * 1.0.2 2020-08-18 [1] #> ellipsis 0.3.1 2020-05-15 [1] #> evaluate 0.14 2019-05-28 [1] #> fansi 0.4.1 2020-01-08 [1] #> forcats * 0.5.0 2020-03-01 [1] #> foreach 1.5.1 2020-10-15 [1] #> fs 1.5.0 2020-07-31 [1] #> furrr 0.2.1 2020-10-21 [1] #> future 1.20.1 2020-11-03 [1] #> generics 0.1.0 2020-10-31 [1] #> ggplot2 * 3.3.2 2020-06-19 [1] #> globals 0.13.1 2020-10-11 [1] #> glue 1.4.2 2020-08-27 [1] #> gower 0.2.2 2020-06-23 [1] #> GPfit 1.0-8 2019-02-08 [1] #> gtable 0.3.0 2019-03-25 [1] #> hardhat 0.1.5 2020-11-09 [1] #> haven 2.3.1 2020-06-01 [1] #> highr 0.8 2019-03-20 [1] #> hms 0.5.3 2020-01-08 [1] #> htmltools 0.5.0 2020-06-16 [1] #> httr 1.4.2 2020-07-20 [1] #> infer * 0.5.3 2020-07-14 [1] #> ipred 0.9-9 2019-04-28 [1] #> iterators 1.0.13 2020-10-15 [1] #> jsonlite 1.7.1 2020-09-07 [1] #> knitr 1.30 2020-09-22 [1] #> lattice 0.20-41 2020-04-02 [2] #> lava 1.6.8.1 2020-11-04 
[1] #> lhs 1.1.1 2020-10-05 [1] #> lifecycle 0.2.0 2020-03-06 [1] #> listenv 0.8.0 2019-12-05 [1] #> lubridate 1.7.9.2 2020-11-13 [1] #> magrittr 1.5 2014-11-22 [1] #> MASS 7.3-51.6 2020-04-26 [2] #> Matrix 1.2-18 2019-11-27 [2] #> memoise 1.1.0 2017-04-21 [1] #> modeldata * 0.1.0 2020-10-22 [1] #> modelr 0.1.8 2020-05-19 [1] #> modeltime * 0.3.1.9000 2020-11-15 [1] #> munsell 0.5.0 2018-06-12 [1] #> nnet 7.3-14 2020-04-26 [2] #> parallelly 1.21.0 2020-10-27 [1] #> parsnip * 0.1.4 2020-10-27 [1] #> pillar 1.4.6 2020-07-10 [1] #> pkgbuild 1.1.0 2020-07-13 [1] #> pkgconfig 2.0.3 2019-09-22 [1] #> pkgload 1.1.0 2020-05-29 [1] #> plyr 1.8.6 2020-03-03 [1] #> prettyunits 1.1.1 2020-01-24 [1] #> pROC 1.16.2 2020-03-19 [1] #> processx 3.4.4 2020-09-03 [1] #> prodlim 2019.11.13 2019-11-17 [1] #> ps 1.4.0 2020-10-07 [1] #> purrr * 0.3.4 2020-04-17 [1] #> R6 2.5.0 2020-10-28 [1] #> Rcpp 1.0.5 2020-07-06 [1] #> D RcppParallel 5.0.2 2020-06-24 [1] #> readr * 1.4.0 2020-10-05 [1] #> readxl 1.3.1 2019-03-13 [1] #> recipes * 0.1.15 2020-11-11 [1] #> remotes 2.2.0 2020-07-21 [1] #> reprex 0.3.0 2019-05-16 [1] #> reshape2 1.4.4 2020-04-09 [1] #> rlang 0.4.8 2020-10-08 [1] #> rmarkdown 2.5 2020-10-21 [1] #> rpart 4.1-15 2019-04-12 [2] #> rprojroot 2.0.2 2020-11-15 [1] #> rsample * 0.0.8.9000 2020-10-31 [1] #> rstudioapi 0.13 2020-11-12 [1] #> rules * 0.1.0.9000 2020-11-15 [1] #> rvest 0.3.6 2020-07-25 [1] #> scales * 1.1.1 2020-05-11 [1] #> sessioninfo 1.1.1 2018-11-05 [1] #> slider 0.1.5 2020-07-21 [1] #> StanHeaders 2.21.0-6 2020-08-16 [1] #> stringi 1.5.3 2020-09-09 [1] #> stringr * 1.4.0 2019-02-10 [1] #> survival 3.1-12 2020-04-10 [2] #> testthat 3.0.0 2020-10-31 [1] #> tibble * 3.0.4 2020-10-12 [1] #> tidymodels * 0.1.1 2020-07-14 [1] #> tidyr * 1.1.2 2020-08-27 [1] #> tidyselect 1.1.0 2020-05-11 [1] #> tidyverse * 1.3.0 2019-11-21 [1] #> timeDate 3043.102 2018-02-21 [1] #> timetk * 2.5.0.9000 2020-11-15 [1] #> tune * 0.1.1.9001 2020-10-31 [1] #> usethis 1.6.3 2020-09-17 [1] 
#> vctrs 0.3.4 2020-08-29 [1] #> warp 0.2.0 2020-10-21 [1] #> withr 2.3.0 2020-09-22 [1] #> workflows * 0.2.1.9000 2020-10-10 [1] #> xfun 0.19 2020-10-30 [1] #> xml2 1.3.2 2020-04-23 [1] #> xts 0.12.1 2020-09-09 [1] #> yaml 2.2.1 2020-02-01 [1] #> yardstick * 0.0.7 2020-07-13 [1] #> zoo 1.8-8 2020-05-02 [1] #> source #> CRAN (R 4.0.0) #> CRAN (R 4.0.2) #> CRAN (R 4.0.3) #> CRAN (R 4.0.2) #> CRAN (R 4.0.0) #> CRAN (R 4.0.2) #> CRAN (R 4.0.2) #> CRAN (R 4.0.2) #> CRAN (R 4.0.2) #> CRAN (R 4.0.0) #> CRAN (R 4.0.0) #> CRAN (R 4.0.0) #> CRAN (R 4.0.0) #> CRAN (R 4.0.2) #> CRAN (R 4.0.0) #> CRAN (R 4.0.2) #> Github (tidymodels/dials@2b79300) #> CRAN (R 4.0.0) #> CRAN (R 4.0.2) #> CRAN (R 4.0.2) #> CRAN (R 4.0.0) #> CRAN (R 4.0.0) #> CRAN (R 4.0.0) #> CRAN (R 4.0.0) #> CRAN (R 4.0.2) #> CRAN (R 4.0.2) #> CRAN (R 4.0.2) #> CRAN (R 4.0.2) #> CRAN (R 4.0.3) #> CRAN (R 4.0.1) #> CRAN (R 4.0.2) #> CRAN (R 4.0.2) #> CRAN (R 4.0.1) #> CRAN (R 4.0.0) #> CRAN (R 4.0.0) #> CRAN (R 4.0.2) #> CRAN (R 4.0.0) #> CRAN (R 4.0.0) #> CRAN (R 4.0.0) #> CRAN (R 4.0.1) #> CRAN (R 4.0.2) #> CRAN (R 4.0.2) #> CRAN (R 4.0.0) #> CRAN (R 4.0.2) #> CRAN (R 4.0.1) #> CRAN (R 4.0.2) #> CRAN (R 4.0.2) #> CRAN (R 4.0.2) #> CRAN (R 4.0.2) #> CRAN (R 4.0.0) #> CRAN (R 4.0.0) #> CRAN (R 4.0.2) #> CRAN (R 4.0.0) #> CRAN (R 4.0.2) #> CRAN (R 4.0.2) #> CRAN (R 4.0.0) #> CRAN (R 4.0.3) #> CRAN (R 4.0.0) #> Github (business-science/modeltime@69329e5) #> CRAN (R 4.0.0) #> CRAN (R 4.0.2) #> CRAN (R 4.0.3) #> CRAN (R 4.0.2) #> CRAN (R 4.0.2) #> CRAN (R 4.0.2) #> CRAN (R 4.0.0) #> CRAN (R 4.0.0) #> CRAN (R 4.0.0) #> CRAN (R 4.0.0) #> CRAN (R 4.0.0) #> CRAN (R 4.0.2) #> CRAN (R 4.0.0) #> CRAN (R 4.0.2) #> CRAN (R 4.0.0) #> CRAN (R 4.0.2) #> CRAN (R 4.0.2) #> CRAN (R 4.0.2) #> CRAN (R 4.0.2) #> CRAN (R 4.0.0) #> CRAN (R 4.0.3) #> CRAN (R 4.0.2) #> CRAN (R 4.0.0) #> CRAN (R 4.0.0) #> CRAN (R 4.0.2) #> CRAN (R 4.0.2) #> CRAN (R 4.0.2) #> CRAN (R 4.0.2) #> Github (tidymodels/rsample@13f990b) #> CRAN (R 4.0.3) #> Github 
(tidymodels/rules@ecbbc5d) #> CRAN (R 4.0.2) #> CRAN (R 4.0.0) #> CRAN (R 4.0.0) #> CRAN (R 4.0.1) #> CRAN (R 4.0.1) #> CRAN (R 4.0.2) #> CRAN (R 4.0.0) #> CRAN (R 4.0.2) #> CRAN (R 4.0.3) #> CRAN (R 4.0.2) #> CRAN (R 4.0.1) #> CRAN (R 4.0.2) #> CRAN (R 4.0.0) #> CRAN (R 4.0.0) #> CRAN (R 4.0.0) #> Github (business-science/timetk@4f6882d) #> Github (tidymodels/tune@a8ef10e) #> CRAN (R 4.0.2) #> CRAN (R 4.0.1) #> CRAN (R 4.0.3) #> CRAN (R 4.0.2) #> Github (tidymodels/workflows@8f5d914) #> CRAN (R 4.0.3) #> CRAN (R 4.0.0) #> CRAN (R 4.0.2) #> CRAN (R 4.0.0) #> CRAN (R 4.0.2) #> CRAN (R 4.0.2) #> #> [1] C:/Users/vidar/Documents/R/win-library/4.0 #> [2] C:/Program Files/R/R-4.0.2/library #> #> D -- DLL MD5 mismatch, broken installation. ```
vidarsumo commented 3 years ago

The error was related to column names. Everything is working now after I fixed it.

github-actions[bot] commented 3 years ago

This issue has been automatically locked. If you believe you have found a related problem, please file a new issue (with a reprex: https://reprex.tidyverse.org) and link to this issue.