ihmeuw-demographics / hierarchyUtils

Demographics Related Utility Functions
https://ihmeuw-demographics.github.io/hierarchyUtils/
BSD 3-Clause "New" or "Revised" License
8 stars 3 forks source link

BUG: Speed up agg function #64

Closed hcomfo95 closed 3 years ago

hcomfo95 commented 3 years ago

Describe the bug I wrote a wrapper (for loop) to run agg on unique nid-underlyingnid-location-year-sex identifiers. For VR data, there are ~90,000 unique identifiers. It would be nice if the loop could run faster. It took over 6 hours.

gbd_year <- 2020

age_map <- mortdb::get_age_map(gbd_year = gbd_year, type = "all")

age_map_u5 <- age_map[age_group_id == 1, c("age_group_years_start", "age_group_years_end")]
colnames(age_map_u5) <- c("age_start", "age_end")

##################
## Set versions ##
##################

if (gbd_year == 2020) {

  vr_data_version <- 280 # process id 20

} else if (gbd_year == 2019) {

  vr_data_version <- 262 # process id 20

}

#################################
## Estimate death completeness ##
#################################

# 1. Read in VR (aggregate by sex and age)

deaths_reported <- mortdb::get_mort_outputs(model_name = "death number empirical",
                                            model_type = "data",
                                            age_group_ids = c(1:5, 28, 42, 49:52, 161, 388:391),
                                            run_id = vr_data_version,
                                            demographic_metadata = T)

deaths_reported <- deaths_reported[source_name %like% "VR|SRS|DSP" & outlier == 0]

deaths_reported[, sex := tolower(sex)]

setnames(deaths_reported, "mean", "deaths_reported")

setnames(deaths_reported, "age_group_years_start", "age_start")
setnames(deaths_reported, "age_group_years_end", "age_end")

deaths_reported <- deaths_reported[, c("nid", "underlying_nid", "ihme_loc_id", "year_id",
                                       "sex", "age_start", "age_end", "deaths_reported")]

value_cols <- "deaths_reported"
id_cols <- names(deaths_reported)[!names(deaths_reported) %in% value_cols]

deaths_reported[, unique_identifier := paste0(nid, "_", underlying_nid, "_", ihme_loc_id, "_", year_id, "_", sex)]

deaths_reported_agg_age <- data.table()
# deaths_reported_agg_age <- fread(paste0(root, "/temp/hcomfo95/death_comp_temp.csv"))

for (i in unique(deaths_reported$unique_identifier)) {

  temp <- deaths_reported[unique_identifier == i, !c("unique_identifier")]

  temp_agg <- hierarchyUtils::agg(
    dt = temp,
    id_cols = id_cols,
    value_cols = value_cols,
    col_stem = "age",
    col_type = "interval",
    mapping = age_map_u5,
    missing_dt_severity = "none",
    present_agg_severity = "skip",
    overlapping_dt_severity = "none"
  )

  deaths_reported_agg_age <- rbind(deaths_reported_agg_age, temp_agg)

  temp_agg <- NULL

}