Closed jdhoffa closed 5 months ago
Is the plan for this to have a package architecture, or will this be a collection of scripts?
As a pacta.* repository, this will be a completely vanilla R package.
The goal will be (very soon) to have workflow.scenario.preparation look like:
# Illustrative sketch of the intended workflow script. It is not runnable
# as written: `data` and `...` are placeholders.
# ... set paths
# ... load files
geco_2022 <- pacta.scenario.data.preparation::prepare_geco_2022(data, ...)
weo_2022 <- pacta.scenario.data.preparation::prepare_weo_2022(data, ...)
# ... etc. (one prepare_*() call per scenario source)
# combine / format / whatever
# ... write files
with all the prepare_X_YYYY() functions being totally vanilla R-object handling functions
@jdhoffa I think the intent of this is now complete?
Yessir! Closing as completed.
FYI, for posterity, this now passes all expectations:
# Load the package under development from the current working directory.
devtools::load_all()
#> ℹ Loading pacta.scenario.data.preparation
# Root folders for the run.
# NOTE(review): presumably read by the sourced process_scenario_*.R scripts;
# confirm against those scripts.
scenario_preparation_inputs_path <- "~/data/pactarawdata/scenario-sources"
scenario_preparation_outputs_path <- "outputs"

# Folder and file names for every raw scenario source, grouped by key prefix.
config <- list(
  # GECO 2022
  geco_2022_path = "geco_2022",
  geco_2022_automotive_filename = "geco2022_automotive_stocks_geco2021_retirement_rates_CORRECTED.csv",
  geco_2022_aviation_filename = "GECO2022_Aviation_processed_data.csv",
  geco_2022_fossil_fuels_15c_filename = "geco2022_15c_ff_rawdata.csv",
  geco_2022_fossil_fuels_ndc_filename = "geco2022_ndc_ff_rawdata.csv",
  geco_2022_fossil_fuels_ref_filename = "geco2022_ref_ff_rawdata.csv",
  geco_2022_power_15c_filename = "geco2022_15c_power_rawdata_region.csv",
  geco_2022_power_ndc_filename = "geco2022_ndc_power_rawdata_region.csv",
  geco_2022_power_ref_filename = "geco2022_ref_power_rawdata_region.csv",
  geco_2022_steel_filename = "GECO2022_Steel_processed_data.csv",

  # GECO 2023
  geco_2023_raw_path = "geco_2023-20240201",
  geco_2023_15c_raw_filename = "20240129_PACTA data request_GECO data template_15C_data.xlsx",
  geco_2023_ndc_raw_filename = "20240129_PACTA data request_GECO data template_NDC-LTS_data.xlsx",
  geco_2023_ref_raw_filename = "20240129_PACTA data request_GECO data template_Ref_data.xlsx",

  # GECO 2023 supplement
  geco_2023_supplement_path = "geco_2023_supplemental-20240227/GECO2023_20231219",
  geco_2023_supplement_15c_raw_filename = "GECO2023_20231219_15C_Total.xlsx",
  geco_2023_supplement_ndc_raw_filename = "GECO2023_20231219_NDC_Total.xlsx",
  geco_2023_supplement_ref_raw_filename = "GECO2023_20231219_Ref_Total.xlsx",

  # ISF 2021
  isf_2021_raw_path = "ISF2021",
  isf_2021_power_raw_filename = "NZAOA_raw_data_power.xlsx",
  isf_2021_not_power_raw_filename = "NZAOA_rawdata_notpower_P4I.xlsx",

  # ISF 2023
  isf_2023_raw_path = "isf_2023-20240222",
  isf_2023_scope_global_raw_filepath = "doi_10_5061_dryad_cz8w9gj82__v20230901/4.7 Database_August_upload/Scope_Country/Scope_Global.xlsx",
  isf_2023_s_global_raw_filepath = "doi_10_5061_dryad_cz8w9gj82__v20230901/4.7 Database_August_upload/S_Country/S_Global.xlsx",
  isf_2023_country_annex_path = "doi_10_5061_dryad_cz8w9gj82__v20230901/4.7 Database_August_upload/Annex_Country",

  # WEO 2022
  weo_2022_raw_path = "weo_2022/WEO2022 extended data",
  weo_2022_ext_data_regions_raw_filepath = "raw_data_from_provider/used_in_pacta.scenario_preparation/WEO2022_Extended_Data_Regions.csv",
  weo_2022_ext_data_world_raw_filepath = "raw_data_from_provider/used_in_pacta.scenario_preparation/WEO2022_Extended_Data_World.csv",
  weo_2022_fossil_fuels_raw_filepath = "light_process/weo2022_fossilfuel_demand_supply.csv",
  weo_2022_nze_auto_raw_filepath = "processed_data/nze_may_2021_report_old_data/hard_process_auto/used/NZE2021_RawData_2050.xlsx",
  weo_2022_nze_steel_raw_filepath = "raw_data_from_provider/used_in_pacta.scenario_preparation/WEO2022_NZE_SteelData.csv",
  weo_2022_sales_aps_auto_raw_filepath = "raw_data_from_provider/used_in_pacta.scenario_preparation/SalesAPS_rawdata.csv",
  weo_2022_electric_sales_aps_auto_raw_filename = "raw_data_from_provider/used_in_pacta.scenario_preparation/IEA-EV-dataEV salesCarsProjection-APS.csv",

  # WEO 2023
  weo_2023_raw_path = "weo_2023-20240222/WEO2023 extended data",
  weo_2023_ext_data_regions_raw_filename = "WEO2023_Extended_Data_Regions.csv",
  weo_2023_ext_data_world_raw_filename = "WEO2023_Extended_Data_World.csv",
  weo_2023_fig_chptr_3_raw_filename = "WEO2023_Figures_Chapter_03.xlsx",

  # IEA Global EV
  iea_global_ev_raw_path = "iea_global_ev_2023-20240226",
  iea_global_ev_raw_filename = "IEA Global EV Data 2023.csv",

  # MPP ATS
  mpp_ats_raw_path = "mpp_ats-20240227",
  mpp_ats_raw_filename = "2022-08-12 - MPP ATS - RPK and GHG intensity.xlsx"
)
# GECO 2022 ----
# Run the workflow script (expected to leave `geco_2022` in the calling
# environment), then validate the result.
source("../workflow.scenario.preparation/process_scenario_geco_2022.R")
pacta.data.validation::validate_intermediate_scenario_output(geco_2022)

# Compare the dataset shipped in pacta.scenario.preparation with the fresh
# result. Both sides are sorted on the same keys; on the reference side the
# `units` of Aviation rows are overwritten with "tCO2/pkm" before comparing.
waldo::compare(
  pacta.scenario.preparation::geco_2022 |>
    dplyr::arrange(scenario, scenario_geography, sector, technology) |>
    dplyr::mutate(units = ifelse(sector == "Aviation", "tCO2/pkm", units)),
  geco_2022 |>
    dplyr::arrange(scenario, scenario_geography, sector, technology),
  tolerance = 1e-15
)
#> ✔ No differences
# GECO 2023 ----
# Run the workflow script (expected to leave `geco_2023` in the calling
# environment), then validate the result.
source("../workflow.scenario.preparation/process_scenario_geco_2023.R")
pacta.data.validation::validate_intermediate_scenario_output(geco_2023)

# Compare the dataset shipped in pacta.scenario.preparation with the fresh
# result, both sorted on the same keys (default waldo tolerance).
waldo::compare(
  pacta.scenario.preparation::geco_2023 |>
    dplyr::arrange(scenario, scenario_geography, sector, technology),
  geco_2023 |>
    dplyr::arrange(scenario, scenario_geography, sector, technology)
)
#> ✔ No differences
# ISF 2021 ----
# Run the workflow script (expected to leave `isf_2021` in the calling
# environment), then validate the result.
source("../workflow.scenario.preparation/process_scenario_isf_2021.R")
pacta.data.validation::validate_intermediate_scenario_output(isf_2021)

# Compare the dataset shipped in pacta.scenario.preparation with the fresh
# result, both sorted on the same keys (default waldo tolerance).
waldo::compare(
  pacta.scenario.preparation::isf_2021 |>
    dplyr::arrange(scenario, scenario_geography, sector, technology),
  isf_2021 |>
    dplyr::arrange(scenario, scenario_geography, sector, technology)
)
#> ✔ No differences
# ISF 2023 ----
# Run the workflow script (expected to leave `isf_2023` in the calling
# environment), then validate the result.
source("../workflow.scenario.preparation/process_scenario_isf_2023.R")
pacta.data.validation::validate_intermediate_scenario_output(isf_2023)

# Compare the dataset shipped in pacta.scenario.preparation with the fresh
# result. "Oil&Gas" and "Coal" rows are dropped from both sides, and four
# geography labels in the fresh result are rewritten to the spellings listed
# below before comparing.
waldo::compare(
  pacta.scenario.preparation::isf_2023 |>
    dplyr::arrange(scenario, scenario_geography, sector, technology) |>
    dplyr::filter(!sector %in% c("Oil&Gas", "Coal")),
  isf_2023 |>
    dplyr::arrange(scenario, scenario_geography, sector, technology) |>
    dplyr::mutate(
      scenario_geography = dplyr::case_when(
        scenario_geography == "Saudi Arabia" ~ "Saudi_Arabia",
        scenario_geography == "South Africa" ~ "South_Africa",
        scenario_geography == "South Korea" ~ "South_Korea",
        scenario_geography == "US" ~ "USA",
        .default = scenario_geography
      )
    ) |>
    dplyr::filter(!sector %in% c("Oil&Gas", "Coal")),
  tolerance = 1e-15
)
#> ✔ No differences
# WEO 2022 ----
# Run the workflow script (expected to leave `weo_2022` in the calling
# environment), then validate the result.
source("../workflow.scenario.preparation/process_scenario_weo_2022.R")
pacta.data.validation::validate_intermediate_scenario_output(weo_2022)

# Compare the dataset shipped in pacta.scenario.preparation (sorted, with
# `source` as the leading key) with the fresh result.
# NOTE(review): unlike the other checks, the fresh `weo_2022` is compared
# as-is, without sorting here -- confirm that this is intentional.
waldo::compare(
  pacta.scenario.preparation::weo_2022 |>
    dplyr::arrange(source, scenario, scenario_geography, sector, technology),
  weo_2022,
  tolerance = 1e-15
)
#> ✔ No differences
# WEO 2023 ----
# Run the workflow script (expected to leave `weo_2023` in the calling
# environment), then validate the result.
source("../workflow.scenario.preparation/process_scenario_weo_2023.R")
pacta.data.validation::validate_intermediate_scenario_output(weo_2023)

# Compare the dataset shipped in pacta.scenario.preparation with the fresh
# result, both sorted on the same keys (default waldo tolerance). On the
# fresh side, `technology` is set to NA for Aviation rows before comparing.
waldo::compare(
  pacta.scenario.preparation::weo_2023 |>
    dplyr::arrange(scenario, scenario_geography, sector, technology),
  weo_2023 |>
    dplyr::arrange(scenario, scenario_geography, sector, technology) |>
    dplyr::mutate(
      technology = dplyr::if_else(sector == "Aviation", NA, technology)
    )
)
#> ✔ No differences
My plan is to:
prepare_X script from https://github.com/RMI-PACTA/pacta.scenario.preparation/blob/main/data-raw/prepare_geco2022.R into workflow.scenario.preparation
prepare_X_YYYY_scenario() function (e.g. prepare_geco_2022_scenario()) out of workflow.scenario.preparation
I will do this for:
This may seem like a lot of steps, but I want to be very careful that we preserve the same data at every step of the refactor (since automated regression tests are not possible here).
Supersedes #2
AB#9915