RMI-PACTA / pacta.scenario.data.preparation

The goal of {pacta.scenario.data.preparation} is to prepare and format all scenario input datasets required to run the {pacta.portfolio.allocate} tool.
https://rmi-pacta.github.io/pacta.scenario.data.preparation/
Other
1 stars 0 forks source link

Migrate `prepare_X_YYYY_scenario()` functions out of `main.R` #12

Closed jdhoffa closed 5 months ago

jdhoffa commented 7 months ago

My plan is to:

I will do this for:

This may seem like a lot of steps, but I want to be very careful that we preserve the same data at every step of the refactor (since automated regression tests are not possible here).

Supersedes #2

AB#9915

AlexAxthelm commented 7 months ago

Is the plan for this to have a package architecture, or will this be a collection of scripts?

jdhoffa commented 7 months ago

As a pacta.* repository, this will be a completely vanilla R package.

The goal will be (very soon) to have workflow.scenario.preparation look like:

# Sketch of the intended workflow script (pseudo-code, not runnable as-is).
# ... set paths
# ... load files
geco_2022 <- pacta.scenario.data.preparation::prepare_geco_2022(data, ...)
weo_2022 <- pacta.scenario.data.preparation::prepare_weo_2022(data, ...)
# ... etc.
# ... combine / format as needed
# ... write files

with all the prepare_X_YYYY() functions being totally vanilla R-object handling functions.

cjyetman commented 5 months ago

@jdhoffa I think the intent of this is now complete?

jdhoffa commented 5 months ago

Yessir! Closing as completed.

cjyetman commented 5 months ago

FYI, for posterity, this now passes all expectations

devtools::load_all()
#> ℹ Loading pacta.scenario.data.preparation

# Input and output locations.
# NOTE(review): presumably read by the sourced process_scenario_*.R scripts;
# confirm against those scripts.
scenario_preparation_inputs_path <- "~/data/pactarawdata/scenario-sources"
scenario_preparation_outputs_path <- "outputs"

# Raw-data locations, grouped per scenario source. The groups are concatenated
# into one flat named list, so entries are still accessed as `config$<name>`.
config <- c(
  # GECO 2022
  list(
    geco_2022_path = "geco_2022",
    geco_2022_automotive_filename = "geco2022_automotive_stocks_geco2021_retirement_rates_CORRECTED.csv",
    geco_2022_aviation_filename = "GECO2022_Aviation_processed_data.csv",
    geco_2022_fossil_fuels_15c_filename = "geco2022_15c_ff_rawdata.csv",
    geco_2022_fossil_fuels_ndc_filename = "geco2022_ndc_ff_rawdata.csv",
    geco_2022_fossil_fuels_ref_filename = "geco2022_ref_ff_rawdata.csv",
    geco_2022_power_15c_filename = "geco2022_15c_power_rawdata_region.csv",
    geco_2022_power_ndc_filename = "geco2022_ndc_power_rawdata_region.csv",
    geco_2022_power_ref_filename = "geco2022_ref_power_rawdata_region.csv",
    geco_2022_steel_filename = "GECO2022_Steel_processed_data.csv"
  ),
  # GECO 2023 (main data plus supplement)
  list(
    geco_2023_raw_path = "geco_2023-20240201",
    geco_2023_15c_raw_filename = "20240129_PACTA data request_GECO data template_15C_data.xlsx",
    geco_2023_ndc_raw_filename = "20240129_PACTA data request_GECO data template_NDC-LTS_data.xlsx",
    geco_2023_ref_raw_filename = "20240129_PACTA data request_GECO data template_Ref_data.xlsx",
    geco_2023_supplement_path = "geco_2023_supplemental-20240227/GECO2023_20231219",
    geco_2023_supplement_15c_raw_filename = "GECO2023_20231219_15C_Total.xlsx",
    geco_2023_supplement_ndc_raw_filename = "GECO2023_20231219_NDC_Total.xlsx",
    geco_2023_supplement_ref_raw_filename = "GECO2023_20231219_Ref_Total.xlsx"
  ),
  # ISF 2021
  list(
    isf_2021_raw_path = "ISF2021",
    isf_2021_power_raw_filename = "NZAOA_raw_data_power.xlsx",
    isf_2021_not_power_raw_filename = "NZAOA_rawdata_notpower_P4I.xlsx"
  ),
  # ISF 2023
  list(
    isf_2023_raw_path = "isf_2023-20240222",
    isf_2023_scope_global_raw_filepath = "doi_10_5061_dryad_cz8w9gj82__v20230901/4.7 Database_August_upload/Scope_Country/Scope_Global.xlsx",
    isf_2023_s_global_raw_filepath = "doi_10_5061_dryad_cz8w9gj82__v20230901/4.7 Database_August_upload/S_Country/S_Global.xlsx",
    isf_2023_country_annex_path = "doi_10_5061_dryad_cz8w9gj82__v20230901/4.7 Database_August_upload/Annex_Country"
  ),
  # WEO 2022
  # NOTE(review): the last entry is named `*_raw_filename` although its value
  # is a relative path like the sibling `*_raw_filepath` entries; the name is
  # kept unchanged because other scripts may look it up by this key.
  list(
    weo_2022_raw_path = "weo_2022/WEO2022 extended data",
    weo_2022_ext_data_regions_raw_filepath = "raw_data_from_provider/used_in_pacta.scenario_preparation/WEO2022_Extended_Data_Regions.csv",
    weo_2022_ext_data_world_raw_filepath = "raw_data_from_provider/used_in_pacta.scenario_preparation/WEO2022_Extended_Data_World.csv",
    weo_2022_fossil_fuels_raw_filepath = "light_process/weo2022_fossilfuel_demand_supply.csv",
    weo_2022_nze_auto_raw_filepath = "processed_data/nze_may_2021_report_old_data/hard_process_auto/used/NZE2021_RawData_2050.xlsx",
    weo_2022_nze_steel_raw_filepath = "raw_data_from_provider/used_in_pacta.scenario_preparation/WEO2022_NZE_SteelData.csv",
    weo_2022_sales_aps_auto_raw_filepath = "raw_data_from_provider/used_in_pacta.scenario_preparation/SalesAPS_rawdata.csv",
    weo_2022_electric_sales_aps_auto_raw_filename = "raw_data_from_provider/used_in_pacta.scenario_preparation/IEA-EV-dataEV salesCarsProjection-APS.csv"
  ),
  # WEO 2023
  list(
    weo_2023_raw_path = "weo_2023-20240222/WEO2023 extended data",
    weo_2023_ext_data_regions_raw_filename = "WEO2023_Extended_Data_Regions.csv",
    weo_2023_ext_data_world_raw_filename = "WEO2023_Extended_Data_World.csv",
    weo_2023_fig_chptr_3_raw_filename = "WEO2023_Figures_Chapter_03.xlsx"
  ),
  # IEA Global EV data
  list(
    iea_global_ev_raw_path = "iea_global_ev_2023-20240226",
    iea_global_ev_raw_filename = "IEA Global EV Data 2023.csv"
  ),
  # MPP ATS
  list(
    mpp_ats_raw_path = "mpp_ats-20240227",
    mpp_ats_raw_filename = "2022-08-12 - MPP ATS - RPK and GHG intensity.xlsx"
  )
)

# GECO 2022: run the workflow script, validate its result, then diff it against
# the `geco_2022` dataset from {pacta.scenario.preparation}.
# NOTE(review): `geco_2022` is presumably created by the sourced script.
source("../workflow.scenario.preparation/process_scenario_geco_2022.R")

pacta.data.validation::validate_intermediate_scenario_output(geco_2022)

# Both sides are sorted by the same keys. On the reference side the Aviation
# rows get the unit label "tCO2/pkm" before comparing.
waldo::compare(
  pacta.scenario.preparation::geco_2022 |>
    dplyr::arrange(scenario, scenario_geography, sector, technology) |>
    dplyr::mutate(units = ifelse(sector == "Aviation", "tCO2/pkm", units)),
  geco_2022 |>
    dplyr::arrange(scenario, scenario_geography, sector, technology),
  tolerance = 1e-15
)
#> ✔ No differences

# GECO 2023: run the workflow script, validate its result, then diff it against
# the `geco_2023` dataset from {pacta.scenario.preparation}.
# NOTE(review): `geco_2023` is presumably created by the sourced script.
source("../workflow.scenario.preparation/process_scenario_geco_2023.R")

pacta.data.validation::validate_intermediate_scenario_output(geco_2023)

# Both sides are sorted by the same keys; no tolerance is supplied here.
waldo::compare(
  pacta.scenario.preparation::geco_2023 |>
    dplyr::arrange(scenario, scenario_geography, sector, technology),
  geco_2023 |>
    dplyr::arrange(scenario, scenario_geography, sector, technology)
)
#> ✔ No differences

# ISF 2021: run the workflow script, validate its result, then diff it against
# the `isf_2021` dataset from {pacta.scenario.preparation}.
# NOTE(review): `isf_2021` is presumably created by the sourced script.
source("../workflow.scenario.preparation/process_scenario_isf_2021.R")

pacta.data.validation::validate_intermediate_scenario_output(isf_2021)

# Both sides are sorted by the same keys; no tolerance is supplied here.
waldo::compare(
  pacta.scenario.preparation::isf_2021 |>
    dplyr::arrange(scenario, scenario_geography, sector, technology),
  isf_2021 |>
    dplyr::arrange(scenario, scenario_geography, sector, technology)
)
#> ✔ No differences

# ISF 2023: run the workflow script, validate its result, then diff it against
# the `isf_2023` dataset from {pacta.scenario.preparation}.
# NOTE(review): `isf_2023` is presumably created by the sourced script.
source("../workflow.scenario.preparation/process_scenario_isf_2023.R")

pacta.data.validation::validate_intermediate_scenario_output(isf_2023)

# The "Oil&Gas" and "Coal" sectors are dropped from both sides. On the new
# side, four geography labels are rewritten (spaces -> underscores, "US" ->
# "USA") before comparing.
waldo::compare(
  pacta.scenario.preparation::isf_2023 |>
    dplyr::arrange(scenario, scenario_geography, sector, technology) |>
    dplyr::filter(!sector %in% c("Oil&Gas", "Coal")),
  isf_2023 |>
    dplyr::arrange(scenario, scenario_geography, sector, technology) |>
    dplyr::mutate(
      scenario_geography = dplyr::case_when(
        scenario_geography == "Saudi Arabia" ~ "Saudi_Arabia",
        scenario_geography == "South Africa" ~ "South_Africa",
        scenario_geography == "South Korea" ~ "South_Korea",
        scenario_geography == "US" ~ "USA",
        .default = scenario_geography
      )
    ) |>
    dplyr::filter(!sector %in% c("Oil&Gas", "Coal")),
  tolerance = 1e-15
)
#> ✔ No differences

# WEO 2022: run the workflow script, validate its result, then diff it against
# the `weo_2022` dataset from {pacta.scenario.preparation}.
# NOTE(review): `weo_2022` is presumably created by the sourced script.
source("../workflow.scenario.preparation/process_scenario_weo_2022.R")

pacta.data.validation::validate_intermediate_scenario_output(weo_2022)

# Sort BOTH sides by the same keys so the check does not depend on the row
# order produced by the workflow script. Previously only the reference side
# was sorted, so any reordering of `weo_2022` would have been reported as a
# difference. `dplyr::arrange()` is a no-op on already-sorted data, so the
# recorded result below is unaffected.
waldo::compare(
  dplyr::arrange(pacta.scenario.preparation::weo_2022, source, scenario, scenario_geography, sector, technology),
  dplyr::arrange(weo_2022, source, scenario, scenario_geography, sector, technology),
  tolerance = 1e-15
)
#> ✔ No differences

# WEO 2023: run the workflow script, validate its result, then diff it against
# the `weo_2023` dataset from {pacta.scenario.preparation}.
# NOTE(review): `weo_2023` is presumably created by the sourced script.
source("../workflow.scenario.preparation/process_scenario_weo_2023.R")

pacta.data.validation::validate_intermediate_scenario_output(weo_2023)

# Both sides are sorted by the same keys. On the new side, `technology` is
# blanked to NA for Aviation rows before comparing.
waldo::compare(
  pacta.scenario.preparation::weo_2023 |>
    dplyr::arrange(scenario, scenario_geography, sector, technology),
  weo_2023 |>
    dplyr::arrange(scenario, scenario_geography, sector, technology) |>
    dplyr::mutate(technology = dplyr::if_else(sector == "Aviation", NA, technology))
)
#> ✔ No differences