Closed ruarai closed 1 month ago
Thank you so much for reporting and providing an example config @ruarai !
This issue is a somewhat known but annoying current limitation that we are feeling more and more inclined to do something about sooner rather than later.
In the meantime though, for your particular issue to do with thousands of samples, we are a couple of weeks away from releasing a whole new framework for handling samples that should, at least, make the process of validating samples more memory efficient (with some additional optimisations for execution time also planned).
You can find information about the new sample config specification in this announcement, with some details on how to test drive it in this comment.
Hopefully this will help. Would be grateful to hear how you get on either way!
BTW, here's a quick mock-up I put together using your config and converting it to the new sample config spec:
{
"schema_version": "https://raw.githubusercontent.com/Infectious-Disease-Modeling-Hubs/schemas/main/v3.0.0/tasks-schema.json",
"rounds": [{
"round_id_from_variable": true,
"round_id": "forecast_date",
"model_tasks": [{
"task_ids": {
"forecast_date": {
"required": null,
"optional": ["2024-05-02", "2024-05-09", "2024-05-16", "2024-05-23", "2024-05-30", "2024-06-06", "2024-06-13", "2024-06-20", "2024-06-27", "2024-07-04", "2024-07-11", "2024-07-18", "2024-07-25", "2024-08-01", "2024-08-08", "2024-08-15", "2024-08-22", "2024-08-29", "2024-09-05", "2024-09-12", "2024-09-19", "2024-09-26", "2024-10-03", "2024-10-10", "2024-10-17", "2024-10-24", "2024-10-31", "2024-11-07", "2024-11-14", "2024-11-21", "2024-11-28", "2024-12-05", "2024-12-12", "2024-12-19", "2024-12-26"]
},
"origin_date": {
"required": null,
"optional": ["2024-05-01", "2024-05-02", "2024-05-03", "2024-05-04", "2024-05-05", "2024-05-06", "2024-05-07", "2024-05-08", "2024-05-09", "2024-05-10", "2024-05-11", "2024-05-12", "2024-05-13", "2024-05-14", "2024-05-15", "2024-05-16", "2024-05-17", "2024-05-18", "2024-05-19", "2024-05-20", "2024-05-21", "2024-05-22", "2024-05-23", "2024-05-24", "2024-05-25", "2024-05-26", "2024-05-27", "2024-05-28", "2024-05-29", "2024-05-30", "2024-05-31", "2024-06-01", "2024-06-02", "2024-06-03", "2024-06-04", "2024-06-05", "2024-06-06", "2024-06-07", "2024-06-08", "2024-06-09", "2024-06-10", "2024-06-11", "2024-06-12", "2024-06-13", "2024-06-14", "2024-06-15", "2024-06-16", "2024-06-17", "2024-06-18", "2024-06-19", "2024-06-20", "2024-06-21", "2024-06-22", "2024-06-23", "2024-06-24", "2024-06-25", "2024-06-26", "2024-06-27", "2024-06-28", "2024-06-29", "2024-06-30", "2024-07-01", "2024-07-02", "2024-07-03", "2024-07-04", "2024-07-05", "2024-07-06", "2024-07-07", "2024-07-08", "2024-07-09", "2024-07-10", "2024-07-11", "2024-07-12", "2024-07-13", "2024-07-14", "2024-07-15", "2024-07-16", "2024-07-17", "2024-07-18", "2024-07-19", "2024-07-20", "2024-07-21", "2024-07-22", "2024-07-23", "2024-07-24", "2024-07-25", "2024-07-26", "2024-07-27", "2024-07-28", "2024-07-29", "2024-07-30", "2024-07-31", "2024-08-01", "2024-08-02", "2024-08-03", "2024-08-04", "2024-08-05", "2024-08-06", "2024-08-07", "2024-08-08", "2024-08-09", "2024-08-10", "2024-08-11", "2024-08-12", "2024-08-13", "2024-08-14", "2024-08-15", "2024-08-16", "2024-08-17", "2024-08-18", "2024-08-19", "2024-08-20", "2024-08-21", "2024-08-22", "2024-08-23", "2024-08-24", "2024-08-25", "2024-08-26", "2024-08-27", "2024-08-28", "2024-08-29", "2024-08-30", "2024-08-31", "2024-09-01", "2024-09-02", "2024-09-03", "2024-09-04", "2024-09-05", "2024-09-06", "2024-09-07", "2024-09-08", "2024-09-09", "2024-09-10", "2024-09-11", "2024-09-12", "2024-09-13", "2024-09-14", "2024-09-15", "2024-09-16", "2024-09-17", "2024-09-18", 
"2024-09-19", "2024-09-20", "2024-09-21", "2024-09-22", "2024-09-23", "2024-09-24", "2024-09-25", "2024-09-26", "2024-09-27", "2024-09-28", "2024-09-29", "2024-09-30", "2024-10-01", "2024-10-02", "2024-10-03", "2024-10-04", "2024-10-05", "2024-10-06", "2024-10-07", "2024-10-08", "2024-10-09", "2024-10-10", "2024-10-11", "2024-10-12", "2024-10-13", "2024-10-14", "2024-10-15", "2024-10-16", "2024-10-17", "2024-10-18", "2024-10-19", "2024-10-20", "2024-10-21", "2024-10-22", "2024-10-23", "2024-10-24", "2024-10-25", "2024-10-26", "2024-10-27", "2024-10-28", "2024-10-29", "2024-10-30", "2024-10-31", "2024-11-01", "2024-11-02", "2024-11-03", "2024-11-04", "2024-11-05", "2024-11-06", "2024-11-07", "2024-11-08", "2024-11-09", "2024-11-10", "2024-11-11", "2024-11-12", "2024-11-13", "2024-11-14", "2024-11-15", "2024-11-16", "2024-11-17", "2024-11-18", "2024-11-19", "2024-11-20", "2024-11-21", "2024-11-22", "2024-11-23", "2024-11-24", "2024-11-25", "2024-11-26", "2024-11-27", "2024-11-28", "2024-11-29", "2024-11-30", "2024-12-01", "2024-12-02", "2024-12-03", "2024-12-04", "2024-12-05", "2024-12-06", "2024-12-07", "2024-12-08", "2024-12-09", "2024-12-10", "2024-12-11", "2024-12-12", "2024-12-13", "2024-12-14", "2024-12-15", "2024-12-16", "2024-12-17", "2024-12-18", "2024-12-19", "2024-12-20", "2024-12-21", "2024-12-22", "2024-12-23", "2024-12-24", "2024-12-25", "2024-12-26", "2024-12-27", "2024-12-28", "2024-12-29", "2024-12-30", "2024-12-31"]
},
"target": {
"required": ["case incidence", "reff"],
"optional": null
},
"horizon": {
"required": null,
"optional": [-14, -13, -12, -11, -10, -9, -8, -7, -6, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28]
},
"location": {
"required": null,
"optional": ["AUS", "NZ", "ACT", "NSW", "NT", "QLD", "SA", "TAS", "VIC", "WA"]
},
"pathogen": {
"required": null,
"optional": ["RSV", "SARSCOV2", "flu"]
}
},
"output_type": {
"sample": {
"output_type_id_params": {
"is_required": true,
"type": "integer",
"min_samples_per_task": 2000,
"max_samples_per_task": 2000
},
"value": {
"type": "integer",
"minimum": 0
}
}
},
"target_metadata": [
{
"target_id": "case incidence",
"target_name": "Daily incident cases",
"target_units": "count",
"target_keys": {
"target": "case incidence"
},
"description": "Daily newly reported cases.",
"target_type": "discrete",
"is_step_ahead": true,
"time_unit": "day"
},
{
"target_id": "reff",
"target_name": "Daily R effective",
"target_units": "reff",
"target_keys": {
"target": "reff"
},
"description": "Average number of people a given case will infect in a given time in context.",
"target_type": "continuous",
"is_step_ahead": true,
"time_unit": "day"
}
]
}],
"submissions_due": {
"relative_to": "forecast_date",
"start": -1,
"end": 1
}
}
]
}
Creating the expanded grid of valid values with the hubData
version on this branch (install with remotes::install_github("Infectious-Disease-Modeling-Hubs/hubData", ref = "feature/handle-samples")
) and
hubData::expand_model_out_val_grid(
config_tasks = config_tasks,
round_id = "2024-05-02"
)
now takes 4.146 secs, uses 35.40 MB of memory, and looks like:
# A tibble: 632,100 × 8
forecast_date origin_date target horizon location pathogen output_type output_type_id
<date> <date> <chr> <int> <chr> <chr> <chr> <int>
1 2024-05-02 2024-05-01 case incidence -14 AUS RSV sample NA
2 2024-05-02 2024-05-02 case incidence -14 AUS RSV sample NA
3 2024-05-02 2024-05-03 case incidence -14 AUS RSV sample NA
4 2024-05-02 2024-05-04 case incidence -14 AUS RSV sample NA
5 2024-05-02 2024-05-05 case incidence -14 AUS RSV sample NA
6 2024-05-02 2024-05-06 case incidence -14 AUS RSV sample NA
7 2024-05-02 2024-05-07 case incidence -14 AUS RSV sample NA
8 2024-05-02 2024-05-08 case incidence -14 AUS RSV sample NA
9 2024-05-02 2024-05-09 case incidence -14 AUS RSV sample NA
10 2024-05-02 2024-05-10 case incidence -14 AUS RSV sample NA
# ℹ 632,090 more rows
# ℹ Use `print(n = ...)` to see more rows
That's because output_type_ids are now set to NA
and we use different methods for determining and checking the number of samples against the schema.
Thanks very much @annakrystalli! This is very helpful (especially the example config!).
I look forward to using the v3.0.0 schema, and will let you know if I run into anything else 🙂
Hi, really appreciate the work that has been put into this project. We are trying to set up a forecasting hub and have a slightly unusual use case that seems to be causing issues with hubValidations.
The issue stems from the fact that we have a large number of possible "origin_date" values (i.e. any date this year), alongside a large number of possible "output_type_id" values for our "sample" output type (i.e. any integer from 1 through to 2000).
I think this leads
expand_model_out_val_grid
to produce an enormous table of possible values (with these options also being combined with the possible horizon values), which crashes R with an out-of-memory error. We can get around this for now with a patched version of hubValidations that skips these checks, but perhaps there is some way we could tackle this more directly?
Thanks!
Here's our tasks.json file for reference: