cmu-delphi / epipredict

Tools for building predictive models in epidemiology.
https://cmu-delphi.github.io/epipredict/
Other
8 stars 8 forks source link

Provide better error/recovery when test lags don't exist ("zero-length inputs [...]") #333

Open brookslogan opened 1 month ago

brookslogan commented 1 month ago

Same message as #128. Looks like this is due to having a gap of data availability between seasons, and the requested forecast date having data for some lags but not all. Ideally we would give a good message in test lag preparation, rather than balk in residual quantiles.

library(dplyr)
#> 
#> Attaching package: 'dplyr'
#> The following objects are masked from 'package:stats':
#> 
#>     filter, lag
#> The following objects are masked from 'package:base':
#> 
#>     intersect, setdiff, setequal, union
library(tidyr)
library(purrr)
library(ggplot2)
library(epidatr)
#> ! epidatr cache is being used (set env var EPIDATR_USE_CACHE=FALSE if not
#>   intended).
#> ℹ The cache directory is ~/.cache/R/epidatr.
#> ℹ The cache will be cleared after 14 days and will be pruned if it exceeds 4096
#>   MB.
#> ℹ The log of cache transactions is stored at ~/.cache/R/epidatr/logfile.txt.
library(epiprocess)
#> 
#> Attaching package: 'epiprocess'
#> The following object is masked from 'package:stats':
#> 
#>     filter
library(epipredict)
#> Loading required package: parsnip
#> 
#> Attaching package: 'epipredict'
#> The following object is masked from 'package:ggplot2':
#> 
#>     layer

flusurv_analysis_issue <- as.Date("2019-08-01") %>%
  MMWRweek::MMWRweek() %>%
  {.$MMWRyear * 100L + .$MMWRweek}

flusurv_issue_data <-
  pub_flusurv(
    locations = "network_all",
    issues = epirange(123401, flusurv_analysis_issue)
  )
#> Warning: Loading from the cache at /home/fullname/.cache/R/epidatr; see
#> ~/.cache/R/epidatr/logfile.txt for more details.
#> This warning is displayed once every 8 hours.

flusurv_archive <- flusurv_issue_data %>%
  select(geo_value = location,
         time_value = epiweek,
         version = release_date,
         starts_with("rate_")) %>%
  as_epi_archive(compactify = TRUE)

archive <- flusurv_archive

forecast_dates <- seq(min(archive$DT$version) + 120L, archive$versions_end,
                      by = "6 weeks")

horizons <- c(0, 7, 14, 21, 28) # relative to forecast_date

example_forecaster <- function(snapshot_edf, forecast_date) {
  shared_reporting_latency <- as.integer(forecast_date - max(snapshot_edf$time_value))
  horizons %>%
    map(function(horizon) {
      snapshot_edf %>%
        arx_forecaster(
          outcome = "rate_overall",
          predictors = "rate_overall",
          args_list = arx_args_list(
            # (this is incomplete; latency often varies signficantly by covariate and can't be ignored, so we also need lag adjustment.)
            ahead = shared_reporting_latency + horizon,
            ## ahead = horizon,
            quantile_levels = c(0.1, 0.5, 0.9)
          )) %>%
        .$predictions %>%
        mutate(forecast_date = forecast_date,
               target_date = forecast_date + horizon)
    }) %>%
    bind_rows()
}

latest_edf <- archive %>% epix_as_of(.$versions_end)

unfaithful_forecasts <- latest_edf %>%
  # pretend we get observations about today, on today, with no revisions
  mutate(version = time_value) %>%
  as_epi_archive(versions_end = max(forecast_dates)) %>%
  epix_slide(
    # pretend version releases are on forecast dates
    ref_time_values = forecast_dates,
    before = 365000L, # 1000-year time window --> don't filter out any `time_value`s
    ~ example_forecaster(.x, .ref_time_value),
    names_sep = NULL
  ) %>%
  select(-time_value)
#> Warning in max.default(structure(numeric(0), class = "Date"), na.rm = FALSE):
#> no non-missing arguments to max; returning -Inf
#> Error in `map()` at rlang/R/dots.R:91:3:
#> ℹ In index: 1.
#> Caused by error in `lm.fit()`:
#> ! 0 (non-NA) cases
#> Backtrace:
#>      ▆
#>   1. ├─... %>% select(-time_value)
#>   2. ├─dplyr::select(., -time_value)
#>   3. ├─epiprocess::epix_slide(...) at dplyr/R/select.R:54:3
#>   4. │ └─x$slide(...)
#>   5. │   ├─... %>% ungroup()
#>   6. │   └─self$group_by()$slide(...) at dplyr/R/group-by.R:153:3
#>   7. │     └─base::lapply(...)
#>   8. │       └─epiprocess (local) FUN(X[[i]], ...)
#>   9. │         ├─dplyr::group_modify(...)
#>  10. │         ├─epiprocess:::group_modify.epi_df(...) at dplyr/R/group-map.R:156:3
#>  11. │         │ └─dplyr::dplyr_reconstruct(NextMethod(), .data)
#>  12. │         │   └─dplyr:::dplyr_new_data_frame(data) at dplyr/R/generics.R:196:3
#>  13. │         │     ├─row.names %||% .row_names_info(x, type = 0L) at dplyr/R/utils.R:18:3
#>  14. │         │     └─base::.row_names_info(x, type = 0L) at dplyr/R/utils.R:18:3
#>  15. │         ├─base::NextMethod()
#>  16. │         └─dplyr:::group_modify.data.frame(...)
#>  17. │           └─epiprocess (local) .f(.data, group_keys(.data), ...) at dplyr/R/group-map.R:166:3
#>  18. │             └─f(.data_group, .group_key, ref_time_value, ...)
#>  19. │               └─global example_forecaster(.x, .ref_time_value)
#>  20. │                 └─... %>% bind_rows()
#>  21. ├─dplyr::ungroup(.)
#>  22. ├─dplyr::bind_rows(.)
#>  23. │ └─rlang::list2(...) at dplyr/R/bind-rows.R:31:3
#>  24. ├─purrr::map(...) at rlang/R/dots.R:91:3
#>  25. │ └─purrr:::map_("list", .x, .f, ..., .progress = .progress) at purrr/R/map.R:129:3
#>  26. │   ├─purrr:::with_indexed_errors(...) at purrr/R/map.R:174:3
#>  27. │   │ └─base::withCallingHandlers(...) at purrr/R/map.R:201:3
#>  28. │   ├─purrr:::call_with_cleanup(...) at purrr/R/map.R:174:3
#>  29. │   └─.f(.x[[i]], ...)
#>  30. │     └─... %>% ...
#>  31. ├─dplyr::mutate(...)
#>  32. ├─epipredict::arx_forecaster(...) at dplyr/R/mutate.R:146:3
#>  33. │ ├─generics::fit(wf, epi_data)
#>  34. │ ├─epipredict:::fit.epi_workflow(wf, epi_data)
#>  35. │ ├─base::NextMethod()
#>  36. │ └─workflows:::fit.workflow(wf, epi_data)
#>  37. │   └─workflows::.fit_model(workflow, control)
#>  38. │     ├─generics::fit(action_model, workflow = workflow, control = control)
#>  39. │     └─workflows:::fit.action_model(...)
#>  40. │       └─workflows:::fit_from_xy(spec, mold, case_weights, control_parsnip)
#>  41. │         ├─generics::fit_xy(...)
#>  42. │         └─parsnip::fit_xy.model_spec(...)
#>  43. │           └─parsnip:::xy_form(...)
#>  44. │             └─parsnip:::form_form(...)
#>  45. │               └─parsnip:::eval_mod(...)
#>  46. │                 └─rlang::eval_tidy(e, env = envir, ...)
#>  47. ├─stats::lm(formula = ..y ~ ., data = data) at rlang/R/eval-tidy.R:121:3
#>  48. │ └─stats::lm.fit(...)
#>  49. │   └─base::stop("0 (non-NA) cases")
#>  50. └─base::.handleSimpleError(...)
#>  51.   └─purrr (local) h(simpleError(msg, call))
#>  52.     └─cli::cli_abort(...) at purrr/R/map.R:215:9
#>  53.       └─rlang::abort(...) at cli/R/rlang.R:45:3

Created on 2024-05-14 with reprex v2.0.2

brookslogan commented 1 month ago

The difference between this and #332 is that this is dealing with a lagset being bad for test data, while #332 is about a shiftset being bad for training data.

dajmcdon commented 2 weeks ago

Can you make a simple example without the slide?