business-science / modeltime

Modeltime unlocks time series forecast models and machine learning in one framework
https://business-science.github.io/modeltime/
Other
522 stars 79 forks source link

modeltime_nested_refit overwriting issue #167

Open brianbarry7 opened 2 years ago

brianbarry7 commented 2 years ago

I am running into an issue with modeltime_nested_refit. I am not able to run parallel processing. When I run sequentially, I lose the confidence intervals. I couldn't reproduce it in the reprex, but on my real dataset, when I run sequentially, only the last group is forecasted. I was able to run a reprex where I can parallel process for the test set but can't for the future set.

# Libraries 
# Wrangle
library(tidyverse)
library(lubridate)
library(skimr)

# Timeseries
library(timetk)

# Model
library(tidymodels)
library(modeltime)

# Visualize
library(plotly)

# Raw Data
# Ten products ("1".."10") share one daily calendar and an identical linear
# order ramp. The calendar is generated once and the per-product tibbles are
# stacked with bind_rows(), so the ramp length always follows the date range
# instead of relying on a hard-coded 1:449 repeated ten times.
raw_data <- local({
  order_dates <- tk_make_timeseries(start_date = "2021-01-01", end_date = "2022-03-25")

  map(
    as.character(1:10),
    function(product_id) {
      tibble(
        Date       = order_dates,
        Product_ID = product_id,
        orders     = seq_along(order_dates)
      )
    }
  ) %>%
    bind_rows()
})

# Keep only the id, date and target columns, in that order.
filtered_data <- select(raw_data, Product_ID, Date, orders)

# Full modelling table: log-transformed target, a 28-step future frame per
# product, lagged targets, and centred 7-period rolling means of two lags.
prep_full_tbl <- filtered_data %>%
  group_by(Product_ID) %>%
  # Log transform
  mutate(orders_trans = log1p(orders)) %>%
  # Extend each product's series by 28 future time stamps.
  extend_timeseries(.id_var = Product_ID, .date_var = Date, .length_future = 28) %>%
  # The raw target is no longer needed once the transformed one exists.
  select(-orders) %>%
  tk_augment_lags(orders_trans, .lags = c(28, 29, 30, 35, 42, 49)) %>%
  # Both rolling features share the same window settings, so they are built
  # in a single call (lag28 first, then lag42, matching the original order).
  tk_augment_slidify(
    c(orders_trans_lag28, orders_trans_lag42),
    .period  = 7,
    .partial = TRUE,
    .align   = "center",
    .f       = mean
  ) %>%
  ungroup()

# Nest the panel into one row per product and add a 28-step test split.
#
# nest_timeseries() receives the id column through `.id_var`, so no prior
# group_by() is needed. Grouping by Product_ID here leaves the nested table
# (and everything derived from it) grouped, which matches the parallel refit
# failure reported below: "`.modeltime_tables` must be size 1, not 10 ...
# The error occurred in group 1: Product_ID = "1"". Passing the ungrouped
# table keeps downstream mutate() calls operating on all rows at once.
nested_data_tbl <- prep_full_tbl %>%
  nest_timeseries(
    .id_var = Product_ID,
    .length_future = 28
  ) %>%
  split_nested_timeseries(
    .length_test = 28
  )

# Recipes
# Base preprocessing shared by all models, specified against the training
# split returned by extract_nested_train_split(nested_data_tbl).
rec_spec_base <- recipe(orders_trans ~ ., extract_nested_train_split(nested_data_tbl)) %>%
  # Calendar features derived from Date; sub-daily and xts/iso variants are dropped.
  step_timeseries_signature(Date) %>%
  step_rm(matches("(.xts)|(.iso)|(hour)|(minute)|(second)|(am.pm)")) %>%
  # Remove zero-variance predictors BEFORE normalising, so a constant column
  # (e.g. a single-year training window) is never divided by a zero sd.
  step_zv(all_predictors()) %>%
  # Normalise once; the original applied this same step twice, which is redundant.
  step_normalize(matches("(index.num)|(year)|(yday)")) %>%
  step_dummy(all_nominal_predictors(), one_hot = TRUE) %>%
  # Drop rows whose lag features are still missing.
  step_naomit(matches("_lag")) %>%
  step_fourier(Date, period = c(2, 7, 14, 21, 28, 90), K = 2)

# Model-specific recipes derived from the shared base recipe.
# The glmnet and xgboost recipes drop the raw Date column.
rec_glmnet <- step_rm(rec_spec_base, Date)

rec_xgb <- step_rm(rec_spec_base, Date)

# The recipe used by the prophet workflow keeps Date (base recipe unchanged).
recipe_profit <- rec_spec_base

# Workflows
# Three glmnet linear regressions sharing rec_glmnet; they differ only in
# penalty and mixture.

# penalty = 0.1, mixture = 0.5
wflw_glmnet_1 <- workflow() %>%
  add_model(set_engine(linear_reg(penalty = 0.1, mixture = 0.5), "glmnet")) %>%
  add_recipe(rec_glmnet)

# penalty = 0.01, mixture = 0
wflw_glmnet_2 <- workflow() %>%
  add_model(set_engine(linear_reg(penalty = 0.01, mixture = 0), "glmnet")) %>%
  add_recipe(rec_glmnet)

# penalty = 0.01, mixture = 1
wflw_glmnet_3 <- workflow() %>%
  add_model(set_engine(linear_reg(penalty = 0.01, mixture = 1), "glmnet")) %>%
  add_recipe(rec_glmnet)

# Two xgboost regression workflows sharing rec_xgb; only the learning rate differs.
wflw_xgb_1 <- workflow() %>%
  add_model(
    boost_tree(mode = "regression", learn_rate = 0.35) %>%
      set_engine("xgboost")
  ) %>%
  add_recipe(rec_xgb)

wflw_xgb_2 <- workflow() %>%
  add_model(
    boost_tree(mode = "regression", learn_rate = 0.5) %>%
      set_engine("xgboost")
  ) %>%
  add_recipe(rec_xgb)

# Prophet workflow: default prophet_reg() spec on the "prophet" engine,
# paired with the recipe that keeps the Date column.
wflw_prophet <- workflow() %>%
  add_model(set_engine(prophet_reg(), "prophet")) %>%
  add_recipe(recipe_profit)

# Fit all six workflows to the nested data on a 4-worker parallel backend,
# then shut the backend down again.
parallel_start(4)

nested_modeltime_tbl <- modeltime_nested_fit(
  nested_data_tbl,
  model_list = list(
    wflw_glmnet_1,
    wflw_glmnet_2,
    wflw_glmnet_3,
    wflw_xgb_1,
    wflw_xgb_2,
    wflw_prophet
  ),
  control = control_nested_fit(verbose = TRUE, allow_par = TRUE)
)

parallel_stop()

# Grab best fit
# Select the model that minimises "mae", with filter_test_forecasts = TRUE.
nested_best_tbl <- modeltime_nested_select_best(
  nested_modeltime_tbl,
  metric                = "mae",
  minimize              = TRUE,
  filter_test_forecasts = TRUE
)

# Cannot for the life of me figure out how to make this parallel
# Refit the selected models with allow_par = TRUE on a 4-worker backend.
parallel_start(4)
nested_best_refit_tbl <- nested_best_tbl %>%
  modeltime_nested_refit(
    control = control_nested_refit(
      verbose   = TRUE,
      allow_par = TRUE
    )
  )
# Pair parallel_start() with parallel_stop(), as done for the nested fit, so
# the workers are not left registered when the sequential attempt runs next.
parallel_stop()

# Matt, I see this error when I try to parallel process
# Using existing parallel backend with 4 clusters (cores)...
# Beginning Parallel Loop | 0.011 seconds
# Error in `dplyr::mutate()`:
#   ! Problem while computing `.modeltime_tables = mdl_time_list`.
# x `.modeltime_tables` must be size 1, not 10.
# i The error occurred in group 1: Product_ID = "1".
# Run `rlang::last_error()` to see where the error occurred.
# Warning messages:
#   1: closing unused connection 10 (<-kubernetes.docker.internal:11433) 
# 2: closing unused connection 9 (<-kubernetes.docker.internal:11433) 
# 3: closing unused connection 8 (<-kubernetes.docker.internal:11433) 
# 4: closing unused connection 7 (<-kubernetes.docker.internal:11433) 

# Try sequential
# Same refit as the parallel attempt, but with allow_par = FALSE.
nested_best_refit_tbl <- modeltime_nested_refit(
  nested_best_tbl,
  control = control_nested_refit(verbose = TRUE, allow_par = FALSE)
)

# Everything runs 
# i [1/10] Starting Modeltime Table: ID 1...
# √ Model 6 Passed PROPHET W/ REGRESSORS.
# √ [1/10] Finished Modeltime Table: ID 1
# 
# i [10/10] Starting Modeltime Table: ID 10...
# √ Model 6 Passed PROPHET W/ REGRESSORS.
# √ [10/10] Finished Modeltime Table: ID 10
# 
# i [2/10] Starting Modeltime Table: ID 2...
# √ Model 6 Passed PROPHET W/ REGRESSORS.
# √ [2/10] Finished Modeltime Table: ID 2
# 
# i [3/10] Starting Modeltime Table: ID 3...
# √ Model 6 Passed PROPHET W/ REGRESSORS.
# √ [3/10] Finished Modeltime Table: ID 3
# 
# i [4/10] Starting Modeltime Table: ID 4...
# √ Model 6 Passed PROPHET W/ REGRESSORS.
# √ [4/10] Finished Modeltime Table: ID 4
# 
# i [5/10] Starting Modeltime Table: ID 5...
# √ Model 6 Passed PROPHET W/ REGRESSORS.
# √ [5/10] Finished Modeltime Table: ID 5
# 
# i [6/10] Starting Modeltime Table: ID 6...
# √ Model 6 Passed PROPHET W/ REGRESSORS.
# √ [6/10] Finished Modeltime Table: ID 6
# 
# i [7/10] Starting Modeltime Table: ID 7...
# √ Model 6 Passed PROPHET W/ REGRESSORS.
# √ [7/10] Finished Modeltime Table: ID 7
# 
# i [8/10] Starting Modeltime Table: ID 8...
# √ Model 6 Passed PROPHET W/ REGRESSORS.
# √ [8/10] Finished Modeltime Table: ID 8
# 
# i [9/10] Starting Modeltime Table: ID 9...
# √ Model 6 Passed PROPHET W/ REGRESSORS.
# √ [9/10] Finished Modeltime Table: ID 9
# 
# Finished in: 1.067408 mins.

# Looks like I am not able to reproduce the overwriting part. 
# When I run this with my real data, only the forecast for the last Product_ID shows up. 
# When I scale to 100 products, only the first 4 are forecasted. 
# Visualize Future Forecast
# Interactive plot faceted by Product_ID, two facets per row.
nested_best_refit_tbl %>%
  extract_nested_future_forecast() %>%
  group_by(Product_ID) %>%
  plot_modeltime_forecast(
    .facet_vars  = Product_ID,
    .facet_ncol  = 2,
    .interactive = TRUE
  )

# For some reason my confidence intervals disappear 
# plot_time_series(...): Groups are previously detected. Grouping by: Product_ID
# Warning message:
#   Expecting the following names to be in the data frame: .conf_hi, .conf_lo. 
# Proceeding with '.conf_interval_show = FALSE' to visualize the forecast without confidence intervals.
# Alternatively, try using `modeltime_calibrate()` before forecasting to add confidence intervals. 
mdancho84 commented 2 years ago

Excellent. I will take a look.

spsanderson commented 2 years ago

I keep getting an error saying that the parallel backend is failing.

It started about two weeks ago.

mdancho84 commented 2 years ago

OK, is there a reproducible example? I'm getting ready to send a new version of modeltime to CRAN. If it doesn't make it into this release, then I don't know when it will happen; probably in three months (quarterly update cycle).

spsanderson commented 2 years ago

I wouldn't worry for now, could be local to my machine. I have a script I run here:

https://github.com/spsanderson/healthyverse_tsa/blob/master/README.Rmd