Wrong prediction interval when going from log to original scale

vidarsumo commented 2 years ago

library(modeltime)
library(tidymodels)
library(timetk)
library(tidyverse)

# Full = Training + Forecast Datasets
full_data_tbl <- walmart_sales_weekly %>%
    select(id, Date, Weekly_Sales) %>%

    # Apply Group-wise Time Series Manipulations
    group_by(id) %>%
    future_frame(
        .date_var   = Date,
        .length_out = 12,
        .bind_data  = TRUE
    ) %>%
    ungroup() %>%

    # Consolidate IDs
    mutate(
        id = fct_drop(id),
        Weekly_Sales = log(Weekly_Sales)
        )

# Training Data
data_prepared_tbl <- full_data_tbl %>%
    filter(!is.na(Weekly_Sales))

train_data <- data_prepared_tbl %>% 
    filter(Date <= "2012-08-03")

test_data <- data_prepared_tbl %>% 
    filter(Date > "2012-08-03")

recipe_spec <- recipe(Weekly_Sales ~ ., data = train_data) %>%
    step_timeseries_signature(Date) %>%
    step_rm(matches("(.iso$)|(.xts$)|(day)|(hour)|(minute)|(second)|(am.pm)")) %>%
    step_mutate(Date_week = factor(Date_week, ordered = TRUE)) %>%
    step_dummy(all_nominal(), one_hot = TRUE)

model_spec <- linear_reg(mode = "regression", engine = "lm")

lm_mtbl <- workflow() %>% 
    add_model(model_spec) %>% 
    add_recipe(recipe_spec) %>% 
    fit(train_data) %>% 
    modeltime_table()

lm_calib_mtbl <- lm_mtbl %>% 
    modeltime_calibrate(test_data, id = "id")

# Back to original scale
calibr_orig_scale_tbl <- lm_calib_mtbl %>% 
    select(-.model) %>% 
    unnest(.calibration_data) %>% 
    mutate(
        .actual     = exp(.actual),
        .prediction = exp(.prediction),
        .residuals  = .actual - .prediction
    ) %>% 
    group_by(.model_id, .model_desc, .type) %>% 
    nest() %>% 
    ungroup() %>% 
    rename(".calibration_data" = "data")

model_calibr_orig_scale_mtbl <- lm_calib_mtbl %>% 
    select(.model) %>% 
    bind_cols(calibr_orig_scale_tbl)

lm_fc_tbl <- model_calibr_orig_scale_mtbl %>% 
    modeltime_forecast(
        new_data = test_data,
        actual_data = data_prepared_tbl,
        conf_by_id = TRUE
    )

lm_fc_tbl %>% 
    mutate(.value = exp(.value)) %>% 
    group_by(id) %>% 
    plot_modeltime_forecast(.facet_ncol = 3)

vidarsumo commented 2 years ago

So I found a way for this to work. Not sure how it fit's into your package though.

So after I create lm_calib_mtbl from above I run this code and get what I want:

# Back to original scale
calibr_orig_scale_tbl <- lm_calib_mtbl %>% 
  select(-.model) %>% 
  unnest(.calibration_data) %>% 
  mutate(
    .actual     = exp(.actual),
    .prediction = exp(.prediction),
    .residuals  = .actual - .prediction
  ) %>% rename(".index" = "Date")

# Predict using non-calibrated model (doesn't matter which one you use)
lm_fc_tbl <- lm_calib_mtbl %>% 
  modeltime_forecast(
    new_data = test_data,
    actual_data = data_prepared_tbl,
    conf_by_id = TRUE
  )

lm_fc_tbl %>% 
  filter(id %in% c("1_93", "1_95")) %>% 
  mutate(.value = exp(.value)) %>% 
  left_join(calibr_orig_scale_tbl) %>% 
  group_by(id) %>%
  mutate(
    .conf_lo = case_when(
      is.na(.conf_lo) ~ NA_real_,
      TRUE ~ qnorm((1 - 0.95) / 2, mean = .value, sd = sd(.residuals, na.rm = TRUE))
    ),
    .conf_hi = case_when(
      is.na(.conf_hi) ~ NA_real_,
      TRUE ~ qnorm((1 +  0.95) / 2, mean = .value, sd = sd(.residuals, na.rm = TRUE))
    )
  ) %>% 
  select(-c(.actual, .prediction, .residuals)) %>%
  group_by(id) %>%
  plot_modeltime_forecast()

oguchihy commented 1 year ago

Wouldn't

mutate(.conf_lo = exp(.conf_lo),
             .conf_hi = exp(.conf_hi))

produce the same result? It worked for me.

vidarsumo commented 1 year ago

Do you get the same results as I got?

oguchihy commented 1 year ago

I need a copy of the data file that you used (could be from the learning lab number) to check. It worked on my own private data. Thanks

On Sep 21, 2022, at 1:33 PM, Vidar @.***> wrote:

Do you get the same results as I got?

— Reply to this email directly, view it on GitHub, or unsubscribe. You are receiving this because you commented.

vidarsumo commented 1 year ago

There are severael questions on Matts slacks channel regarding this. I don't understand why exp() simply works now. So I tried to run my first example doing what you proposed, and it magically worked... I don't remember why I had this issue to begin with since simply exp(.conf_lo) and exp(.conf_hi) works.

library(modeltime)
library(tidymodels)
library(timetk)
library(tidyverse)

# Full = Training + Forecast Datasets
full_data_tbl <- walmart_sales_weekly %>%
    select(id, Date, Weekly_Sales) %>%

    # Apply Group-wise Time Series Manipulations
    group_by(id) %>%
    future_frame(
        .date_var   = Date,
        .length_out = 12,
        .bind_data  = TRUE
    ) %>%
    ungroup() %>%

    # Consolidate IDs
    mutate(
        id = fct_drop(id),
        Weekly_Sales = log(Weekly_Sales)
    )

# Training Data
data_prepared_tbl <- full_data_tbl %>%
    filter(!is.na(Weekly_Sales))

train_data <- data_prepared_tbl %>% 
    filter(Date <= "2012-08-03")

test_data <- data_prepared_tbl %>% 
    filter(Date > "2012-08-03")

recipe_spec <- recipe(Weekly_Sales ~ ., data = train_data) %>%
    step_timeseries_signature(Date) %>%
    step_rm(matches("(.iso$)|(.xts$)|(day)|(hour)|(minute)|(second)|(am.pm)")) %>%
    step_mutate(Date_week = factor(Date_week, ordered = TRUE)) %>%
    step_dummy(all_nominal(), one_hot = TRUE)

model_spec <- linear_reg(mode = "regression", engine = "lm")

lm_mtbl <- workflow() %>% 
    add_model(model_spec) %>% 
    add_recipe(recipe_spec) %>% 
    fit(train_data) %>% 
    modeltime_table()

lm_calib_mtbl <- lm_mtbl %>% 
    modeltime_calibrate(test_data, id = "id")

lm_fc_tbl <- lm_calib_mtbl %>% 
    modeltime_forecast(
        new_data = test_data,
        actual_data = data_prepared_tbl,
        conf_by_id = TRUE
    )

lm_fc_tbl %>% 
    mutate(
        .value   = exp(.value),
        .conf_lo = exp(.conf_lo),
        .conf_hi = exp(.conf_hi)
    ) %>%
    group_by(id) %>% 
    plot_modeltime_forecast(.facet_ncol = 3)

oguchihy commented 1 year ago

Great! Thanks for confirming that it works. I still could not find the dataset you used, so I wasn't able to check on it myself at this end before now. Just glad it works Let's report this solution to @mdancho84?

business-science / modeltime

Wrong prediction interval when going from log to original scale #159