Closed jeanpaulrsoucy closed 1 year ago
@benkcwong See the list of datasets from Covid19Canada
that still need to be completed above. Note the "name" value next to each dataset: this will be used in the "name" column of the output dataset as well as in the name of the output file. Please use the function convert_pt_names
above and the example code that follows to structure the conversion script. Note that since the examples refer to health region-level datasets whereas the remaining datasets are province/territory-level, write.csv
will use "pt" instead of "hr" in the file name and the quote argument will have a value of "1:3", since there is no sub_region_1
column.
To complete this issue, please submit a pull request with the 6 additional files in the raw_data/ccodwg
directory and post the script you used as a comment on this issue.
The code below is adapted from JP's script (above) to convert the remaining datasets (recovered, testing, vaccine distribution, vaccine administration total, vaccine administration 2 dose, vaccine administration 3 dose):
# load pipe
library(magrittr)
# function: convert province/territory names in CCODWG to two-letter abbreviations
#
# d: a data frame (or tibble) with a `region` column holding CCODWG
#    province/territory names.
# Returns `d` with `region` replaced by the two-letter abbreviation. Values
# that are not in the lookup table (including NA) are left unchanged.
# Stops with an error if `d` has no `region` column.
convert_pt_names <- function(d) {
  if (!"region" %in% names(d)) {
    stop("`d` must contain a `region` column", call. = FALSE)
  }
  # CCODWG region names
  pt_convert <- c(
    "Alberta" = "AB",
    "BC" = "BC",
    "Manitoba" = "MB",
    "New Brunswick" = "NB",
    "NL" = "NL",
    "Nova Scotia" = "NS",
    "Nunavut" = "NU",
    "NWT" = "NT",
    "Ontario" = "ON",
    "PEI" = "PE",
    "Quebec" = "QC",
    "Saskatchewan" = "SK",
    "Yukon" = "YT",
    "Repatriated" = "Repatriated"
  )
  # plain lookup instead of the superseded dplyr::recode(); only matched
  # entries are overwritten so unknown names pass through untouched
  lookup <- function(x) {
    idx <- match(x, names(pt_convert))
    hit <- !is.na(idx)
    if (any(hit)) {
      x[hit] <- unname(pt_convert[idx[hit]])
    }
    x
  }
  if (is.factor(d$region)) {
    # recode the levels so a factor column stays a factor
    levels(d$region) <- lookup(levels(d$region))
  } else {
    d$region <- lookup(d$region)
  }
  # return data frame with converted region column
  d
}
# cases (health region level): download the CCODWG cumulative case time
# series, reshape to name/region/sub_region_1/date/value and write it out
readr::read_csv(
  "https://raw.githubusercontent.com/ccodwg/Covid19Canada/master/timeseries_hr/cases_timeseries_hr.csv"
) %>%
  dplyr::transmute(
    name = "cases",
    region = .data$province,
    # CCODWG labels unassigned cases "Not Reported"; the output uses "Unknown"
    sub_region_1 = ifelse(
      .data$health_region == "Not Reported",
      "Unknown",
      .data$health_region
    ),
    date = as.Date(.data$date_report, format = "%d-%m-%Y"),
    value = .data$cumulative_cases
  ) %>%
  convert_pt_names() %>%
  # quote the four text columns (name, region, sub_region_1, date)
  utils::write.csv(
    file = "raw_data/ccodwg/can_cases_hr_ts.csv",
    quote = 1:4,
    row.names = FALSE
  )
# deaths (health region level): download the CCODWG cumulative mortality time
# series, reshape to name/region/sub_region_1/date/value and write it out
readr::read_csv(
  "https://raw.githubusercontent.com/ccodwg/Covid19Canada/master/timeseries_hr/mortality_timeseries_hr.csv"
) %>%
  dplyr::transmute(
    name = "deaths",
    region = .data$province,
    # CCODWG labels unassigned deaths "Not Reported"; the output uses "Unknown"
    sub_region_1 = ifelse(
      .data$health_region == "Not Reported",
      "Unknown",
      .data$health_region
    ),
    date = as.Date(.data$date_death_report, format = "%d-%m-%Y"),
    value = .data$cumulative_deaths
  ) %>%
  convert_pt_names() %>%
  # quote the four text columns (name, region, sub_region_1, date)
  utils::write.csv(
    file = "raw_data/ccodwg/can_deaths_hr_ts.csv",
    quote = 1:4,
    row.names = FALSE
  )
# recovered (province/territory level): download the CCODWG cumulative
# recovered time series, reshape to name/region/date/value and write it out.
# PT-level files carry no sub_region_1 column.
readr::read_csv("https://raw.githubusercontent.com/ccodwg/Covid19Canada/master/timeseries_prov/recovered_timeseries_prov.csv") %>%
  dplyr::transmute(
    region = .data$province,
    date = as.Date(.data$date_recovered, "%d-%m-%Y"),
    value = .data$cumulative_recovered
  ) %>%
  convert_pt_names() %>%
  tibble::add_column(name = "recovered", .before = 1) %>%
  # quote = 1:3 covers the three text columns: name, region, date
  write.csv("raw_data/ccodwg/can_recovered_pt_ts.csv", row.names = FALSE, quote = 1:3)
# testing (province/territory level): download the CCODWG cumulative testing
# time series, reshape to name/region/date/value and write it out.
# PT-level files carry no sub_region_1 column.
readr::read_csv("https://raw.githubusercontent.com/ccodwg/Covid19Canada/master/timeseries_prov/testing_timeseries_prov.csv") %>%
  dplyr::transmute(
    region = .data$province,
    date = as.Date(.data$date_testing, "%d-%m-%Y"),
    value = .data$cumulative_testing
  ) %>%
  convert_pt_names() %>%
  tibble::add_column(name = "testing", .before = 1) %>%
  # quote = 1:3 covers the three text columns: name, region, date
  write.csv("raw_data/ccodwg/can_testing_pt_ts.csv", row.names = FALSE, quote = 1:3)
# vaccine_distribution (province/territory level): download the CCODWG
# cumulative vaccine distribution time series, reshape to
# name/region/date/value and write it out.
# PT-level files carry no sub_region_1 column.
readr::read_csv("https://raw.githubusercontent.com/ccodwg/Covid19Canada/master/timeseries_prov/vaccine_distribution_timeseries_prov.csv") %>%
  dplyr::transmute(
    region = .data$province,
    date = as.Date(.data$date_vaccine_distributed, "%d-%m-%Y"),
    value = .data$cumulative_dvaccine
  ) %>%
  convert_pt_names() %>%
  tibble::add_column(name = "vaccine_distribution", .before = 1) %>%
  # quote = 1:3 covers the three text columns: name, region, date
  write.csv("raw_data/ccodwg/can_vaccine_distribution_pt_ts.csv", row.names = FALSE, quote = 1:3)
# vaccine_administration_total_doses (province/territory level): download the
# CCODWG cumulative vaccine administration time series, reshape to
# name/region/date/value and write it out.
# PT-level files carry no sub_region_1 column.
readr::read_csv("https://raw.githubusercontent.com/ccodwg/Covid19Canada/master/timeseries_prov/vaccine_administration_timeseries_prov.csv") %>%
  dplyr::transmute(
    region = .data$province,
    date = as.Date(.data$date_vaccine_administered, "%d-%m-%Y"),
    value = .data$cumulative_avaccine
  ) %>%
  convert_pt_names() %>%
  tibble::add_column(name = "vaccine_administration_total_doses", .before = 1) %>%
  # quote = 1:3 covers the three text columns: name, region, date
  write.csv("raw_data/ccodwg/can_vaccine_administration_total_doses_pt_ts.csv", row.names = FALSE, quote = 1:3)
# vaccine_administration_dose_2 (province/territory level): download the
# CCODWG cumulative vaccine completion time series, reshape to
# name/region/date/value and write it out.
# PT-level files carry no sub_region_1 column.
readr::read_csv("https://raw.githubusercontent.com/ccodwg/Covid19Canada/master/timeseries_prov/vaccine_completion_timeseries_prov.csv") %>%
  dplyr::transmute(
    region = .data$province,
    date = as.Date(.data$date_vaccine_completed, "%d-%m-%Y"),
    value = .data$cumulative_cvaccine
  ) %>%
  convert_pt_names() %>%
  tibble::add_column(name = "vaccine_administration_dose_2", .before = 1) %>%
  # quote = 1:3 covers the three text columns: name, region, date
  write.csv("raw_data/ccodwg/can_vaccine_administration_dose_2_pt_ts.csv", row.names = FALSE, quote = 1:3)
# vaccine_administration_dose_3 (province/territory level): download the
# CCODWG cumulative additional-doses time series, reshape to
# name/region/date/value and write it out.
# PT-level files carry no sub_region_1 column.
readr::read_csv("https://raw.githubusercontent.com/ccodwg/Covid19Canada/master/timeseries_prov/vaccine_additionaldoses_timeseries_prov.csv") %>%
  dplyr::transmute(
    region = .data$province,
    date = as.Date(.data$date_vaccine_additionaldoses, "%d-%m-%Y"),
    value = .data$cumulative_additionaldosesvaccine
  ) %>%
  convert_pt_names() %>%
  tibble::add_column(name = "vaccine_administration_dose_3", .before = 1) %>%
  # quote = 1:3 covers the three text columns: name, region, date
  write.csv("raw_data/ccodwg/can_vaccine_administration_dose_3_pt_ts.csv", row.names = FALSE, quote = 1:3)
Thanks, @benkcwong!
Only thing I did was remove the "sub_region_1" column from the PT data files.
We should add the remaining datasets from
Covid19Canada
to the raw_data
directory, as they may be useful for constructing future datasets. The code below was used to convert the health region-level datasets for cases and deaths: