ccodwg / CovidTimelineCanada

A definitive dataset for COVID-19 in Canada
https://opencovid.ca/
Other
26 stars 8 forks source link

Finish conversion of CCODWG data #38

Closed jeanpaulrsoucy closed 1 year ago

jeanpaulrsoucy commented 2 years ago

We should add the remaining datasets from Covid19Canada to the raw_data directory, as they may be useful for constructing future datasets.

The below code was used to convert the health region-level datasets for cases and deaths:

# load pipe
library(magrittr)

# function: convert province/territory names in CCODWG to two-letter abbreviations
#
# Args:
#   d: data frame with a `region` column holding CCODWG province/territory
#      names (character or factor).
# Returns:
#   `d` with `region` recoded through the lookup table below; all other columns
#   are left untouched. Values that are not in the table are kept as-is, and a
#   warning lists them so that an unexpected upstream name does not slip into
#   the output unnoticed. NA regions stay NA and are not reported.
convert_pt_names <- function(d) {
  if (!"region" %in% names(d)) {
    stop("`d` must contain a `region` column", call. = FALSE)
  }
  # CCODWG region names
  pt_convert <- c(
    "Alberta" = "AB",
    "BC" = "BC",
    "Manitoba" = "MB",
    "New Brunswick" = "NB",
    "NL" = "NL",
    "Nova Scotia" = "NS",
    "Nunavut" = "NU",
    "NWT" = "NT",
    "Ontario" = "ON",
    "PEI" = "PE",
    "Quebec" = "QC",
    "Saskatchewan" = "SK",
    "Yukon" = "YT",
    "Repatriated" = "Repatriated"
  )
  # recode a vector of names, leaving values without a mapping unchanged
  recode_names <- function(x) {
    idx <- match(x, names(pt_convert))
    hit <- !is.na(idx)
    if (any(hit)) {
      x[hit] <- unname(pt_convert[idx[hit]])
    }
    x
  }
  region <- d[["region"]]
  # names that have no entry in the lookup table (NA is not reported)
  unmapped <- setdiff(unique(as.character(region)), c(names(pt_convert), NA))
  if (length(unmapped) > 0) {
    warning(
      "unmapped region name(s) left unchanged: ",
      paste(unmapped, collapse = ", "),
      call. = FALSE
    )
  }
  if (is.factor(region)) {
    # recode the levels so that a factor column stays a factor
    levels(region) <- recode_names(levels(region))
  } else {
    region <- recode_names(region)
  }
  d[["region"]] <- region
  # return data frame with converted region column
  d
}

# cases (health region level)
# Downloads the CCODWG health region case time series, reshapes it to the
# columns name / region / sub_region_1 / date / value and writes the result to
# raw_data/ccodwg/can_cases_hr_ts.csv.
readr::read_csv("https://raw.githubusercontent.com/ccodwg/Covid19Canada/master/timeseries_hr/cases_timeseries_hr.csv") %>%
  dplyr::transmute(
    name = "cases",
    region = .data$province,
    # relabel "Not Reported" health regions as "Unknown"
    sub_region_1 = ifelse(.data$health_region == "Not Reported", "Unknown", .data$health_region),
    date = as.Date(.data$date_report, format = "%d-%m-%Y"),
    value = .data$cumulative_cases
  ) %>%
  convert_pt_names() %>%
  # quote = 1:4 quotes name, region, sub_region_1 and date
  write.csv(file = "raw_data/ccodwg/can_cases_hr_ts.csv", quote = 1:4, row.names = FALSE)

# deaths (health region level)
# Downloads the CCODWG health region mortality time series, reshapes it to the
# columns name / region / sub_region_1 / date / value and writes the result to
# raw_data/ccodwg/can_deaths_hr_ts.csv.
readr::read_csv("https://raw.githubusercontent.com/ccodwg/Covid19Canada/master/timeseries_hr/mortality_timeseries_hr.csv") %>%
  dplyr::transmute(
    name = "deaths",
    region = .data$province,
    # relabel "Not Reported" health regions as "Unknown"
    sub_region_1 = ifelse(.data$health_region == "Not Reported", "Unknown", .data$health_region),
    date = as.Date(.data$date_death_report, format = "%d-%m-%Y"),
    value = .data$cumulative_deaths
  ) %>%
  convert_pt_names() %>%
  # quote = 1:4 quotes name, region, sub_region_1 and date
  write.csv(file = "raw_data/ccodwg/can_deaths_hr_ts.csv", quote = 1:4, row.names = FALSE)
jeanpaulrsoucy commented 2 years ago

@benkcwong See above for the list of datasets from Covid19Canada that still need to be converted. Note the "name" value next to each dataset: this will be used in the "name" column of the output dataset as well as in the name of the output file. Please use the function convert_pt_names above and the example code that follows it to structure the conversion script. Note that the examples refer to health region-level datasets, whereas the remaining datasets are province/territory-level: write.csv should use "pt" instead of "hr" in the file name, and the quote argument should be "1:3", since there is no sub_region_1 column.

To complete this issue, please submit a pull request adding the 6 additional files to the raw_data/ccodwg directory, and post the script you used as a comment on this issue.

benkcwong commented 2 years ago

The code below is adapted from JP's script (above) to convert the remaining datasets (recovered, testing, vaccine distribution, vaccine administration total, vaccine administration 2 dose, vaccine administration 3 dose):

# load pipe
library(magrittr)

# function: convert province/territory names in CCODWG to two-letter abbreviations
#
# Args:
#   d: data frame with a `region` column holding CCODWG province/territory
#      names (character or factor).
# Returns:
#   `d` with `region` recoded through the lookup table below; all other columns
#   are left untouched. Values that are not in the table are kept as-is, and a
#   warning lists them so that an unexpected upstream name does not slip into
#   the output unnoticed. NA regions stay NA and are not reported.
convert_pt_names <- function(d) {
  if (!"region" %in% names(d)) {
    stop("`d` must contain a `region` column", call. = FALSE)
  }
  # CCODWG region names
  pt_convert <- c(
    "Alberta" = "AB",
    "BC" = "BC",
    "Manitoba" = "MB",
    "New Brunswick" = "NB",
    "NL" = "NL",
    "Nova Scotia" = "NS",
    "Nunavut" = "NU",
    "NWT" = "NT",
    "Ontario" = "ON",
    "PEI" = "PE",
    "Quebec" = "QC",
    "Saskatchewan" = "SK",
    "Yukon" = "YT",
    "Repatriated" = "Repatriated"
  )
  # recode a vector of names, leaving values without a mapping unchanged
  recode_names <- function(x) {
    idx <- match(x, names(pt_convert))
    hit <- !is.na(idx)
    if (any(hit)) {
      x[hit] <- unname(pt_convert[idx[hit]])
    }
    x
  }
  region <- d[["region"]]
  # names that have no entry in the lookup table (NA is not reported)
  unmapped <- setdiff(unique(as.character(region)), c(names(pt_convert), NA))
  if (length(unmapped) > 0) {
    warning(
      "unmapped region name(s) left unchanged: ",
      paste(unmapped, collapse = ", "),
      call. = FALSE
    )
  }
  if (is.factor(region)) {
    # recode the levels so that a factor column stays a factor
    levels(region) <- recode_names(levels(region))
  } else {
    region <- recode_names(region)
  }
  d[["region"]] <- region
  # return data frame with converted region column
  d
}

# cases (health region level)
# Downloads the CCODWG health region case time series, reshapes it to the
# columns name / region / sub_region_1 / date / value and writes the result to
# raw_data/ccodwg/can_cases_hr_ts.csv.
readr::read_csv("https://raw.githubusercontent.com/ccodwg/Covid19Canada/master/timeseries_hr/cases_timeseries_hr.csv") %>%
  dplyr::transmute(
    name = "cases",
    region = .data$province,
    # relabel "Not Reported" health regions as "Unknown"
    sub_region_1 = ifelse(.data$health_region == "Not Reported", "Unknown", .data$health_region),
    date = as.Date(.data$date_report, format = "%d-%m-%Y"),
    value = .data$cumulative_cases
  ) %>%
  convert_pt_names() %>%
  # quote = 1:4 quotes name, region, sub_region_1 and date
  write.csv(file = "raw_data/ccodwg/can_cases_hr_ts.csv", quote = 1:4, row.names = FALSE)

# deaths (health region level)
# Downloads the CCODWG health region mortality time series, reshapes it to the
# columns name / region / sub_region_1 / date / value and writes the result to
# raw_data/ccodwg/can_deaths_hr_ts.csv.
readr::read_csv("https://raw.githubusercontent.com/ccodwg/Covid19Canada/master/timeseries_hr/mortality_timeseries_hr.csv") %>%
  dplyr::transmute(
    name = "deaths",
    region = .data$province,
    # relabel "Not Reported" health regions as "Unknown"
    sub_region_1 = ifelse(.data$health_region == "Not Reported", "Unknown", .data$health_region),
    date = as.Date(.data$date_death_report, format = "%d-%m-%Y"),
    value = .data$cumulative_deaths
  ) %>%
  convert_pt_names() %>%
  # quote = 1:4 quotes name, region, sub_region_1 and date
  write.csv(file = "raw_data/ccodwg/can_deaths_hr_ts.csv", quote = 1:4, row.names = FALSE)

# recovered (province/territory level)
# PT-level output has no sub_region_1 column: the columns written are
# name / region / date / value, so quote = 1:3 quotes name, region and date.
readr::read_csv("https://raw.githubusercontent.com/ccodwg/Covid19Canada/master/timeseries_prov/recovered_timeseries_prov.csv") %>%
  dplyr::transmute(
    region = .data$province,
    date = as.Date(.data$date_recovered, "%d-%m-%Y"),
    value = .data$cumulative_recovered
  ) %>%
  convert_pt_names() %>%
  tibble::add_column(name = "recovered", .before = 1) %>%
  write.csv("raw_data/ccodwg/can_recovered_pt_ts.csv", row.names = FALSE, quote = 1:3)

# testing (province/territory level)
# PT-level output has no sub_region_1 column: the columns written are
# name / region / date / value, so quote = 1:3 quotes name, region and date.
readr::read_csv("https://raw.githubusercontent.com/ccodwg/Covid19Canada/master/timeseries_prov/testing_timeseries_prov.csv") %>%
  dplyr::transmute(
    region = .data$province,
    date = as.Date(.data$date_testing, "%d-%m-%Y"),
    value = .data$cumulative_testing
  ) %>%
  convert_pt_names() %>%
  tibble::add_column(name = "testing", .before = 1) %>%
  write.csv("raw_data/ccodwg/can_testing_pt_ts.csv", row.names = FALSE, quote = 1:3)

# vaccine_distribution (province/territory level)
# PT-level output has no sub_region_1 column: the columns written are
# name / region / date / value, so quote = 1:3 quotes name, region and date.
readr::read_csv("https://raw.githubusercontent.com/ccodwg/Covid19Canada/master/timeseries_prov/vaccine_distribution_timeseries_prov.csv") %>%
  dplyr::transmute(
    region = .data$province,
    date = as.Date(.data$date_vaccine_distributed, "%d-%m-%Y"),
    value = .data$cumulative_dvaccine
  ) %>%
  convert_pt_names() %>%
  tibble::add_column(name = "vaccine_distribution", .before = 1) %>%
  write.csv("raw_data/ccodwg/can_vaccine_distribution_pt_ts.csv", row.names = FALSE, quote = 1:3)

# vaccine_administration_total_doses (province/territory level)
# PT-level output has no sub_region_1 column: the columns written are
# name / region / date / value, so quote = 1:3 quotes name, region and date.
readr::read_csv("https://raw.githubusercontent.com/ccodwg/Covid19Canada/master/timeseries_prov/vaccine_administration_timeseries_prov.csv") %>%
  dplyr::transmute(
    region = .data$province,
    date = as.Date(.data$date_vaccine_administered, "%d-%m-%Y"),
    value = .data$cumulative_avaccine
  ) %>%
  convert_pt_names() %>%
  tibble::add_column(name = "vaccine_administration_total_doses", .before = 1) %>%
  write.csv("raw_data/ccodwg/can_vaccine_administration_total_doses_pt_ts.csv", row.names = FALSE, quote = 1:3)

# vaccine_administration_dose_2 (province/territory level)
# PT-level output has no sub_region_1 column: the columns written are
# name / region / date / value, so quote = 1:3 quotes name, region and date.
readr::read_csv("https://raw.githubusercontent.com/ccodwg/Covid19Canada/master/timeseries_prov/vaccine_completion_timeseries_prov.csv") %>%
  dplyr::transmute(
    region = .data$province,
    date = as.Date(.data$date_vaccine_completed, "%d-%m-%Y"),
    value = .data$cumulative_cvaccine
  ) %>%
  convert_pt_names() %>%
  tibble::add_column(name = "vaccine_administration_dose_2", .before = 1) %>%
  write.csv("raw_data/ccodwg/can_vaccine_administration_dose_2_pt_ts.csv", row.names = FALSE, quote = 1:3)

# vaccine_administration_dose_3 (province/territory level)
# PT-level output has no sub_region_1 column: the columns written are
# name / region / date / value, so quote = 1:3 quotes name, region and date.
readr::read_csv("https://raw.githubusercontent.com/ccodwg/Covid19Canada/master/timeseries_prov/vaccine_additionaldoses_timeseries_prov.csv") %>%
  dplyr::transmute(
    region = .data$province,
    date = as.Date(.data$date_vaccine_additionaldoses, "%d-%m-%Y"),
    value = .data$cumulative_additionaldosesvaccine
  ) %>%
  convert_pt_names() %>%
  tibble::add_column(name = "vaccine_administration_dose_3", .before = 1) %>%
  write.csv("raw_data/ccodwg/can_vaccine_administration_dose_3_pt_ts.csv", row.names = FALSE, quote = 1:3)
jeanpaulrsoucy commented 1 year ago

Thanks, @benkcwong!

Only thing I did was remove the "sub_region_1" column from the PT data files.

Updated code

```
# load pipe
library(magrittr)

# function: convert province/territory names in CCODWG to two-letter abbreviations
# Takes a data frame with a `region` column and returns it with that column
# recoded through the lookup table below.
convert_pt_names <- function(d) {
  # CCODWG region names
  pt_convert <- c(
    "Alberta" = "AB",
    "BC" = "BC",
    "Manitoba" = "MB",
    "New Brunswick" = "NB",
    "NL" = "NL",
    "Nova Scotia" = "NS",
    "Nunavut" = "NU",
    "NWT" = "NT",
    "Ontario" = "ON",
    "PEI" = "PE",
    "Quebec" = "QC",
    "Saskatchewan" = "SK",
    "Yukon" = "YT",
    "Repatriated" = "Repatriated"
  )
  d[, "region"] <- dplyr::recode(d$region, !!!as.list(pt_convert))
  # return data frame with converted region column
  d
}

# health region-level datasets: columns name / region / sub_region_1 / date /
# value; quote = 1:4 quotes the first four columns

# cases
readr::read_csv("https://raw.githubusercontent.com/ccodwg/Covid19Canada/master/timeseries_hr/cases_timeseries_hr.csv") %>%
  dplyr::transmute(
    region = .data$province,
    sub_region_1 = .data$health_region,
    date = as.Date(.data$date_report, "%d-%m-%Y"),
    value = .data$cumulative_cases
  ) %>%
  convert_pt_names() %>%
  tibble::add_column(name = "cases", .before = 1) %>%
  dplyr::mutate(sub_region_1 = ifelse(.data$sub_region_1 == "Not Reported", "Unknown", .data$sub_region_1)) %>%
  write.csv("raw_data/ccodwg/can_cases_hr_ts.csv", row.names = FALSE, quote = 1:4)

# deaths
readr::read_csv("https://raw.githubusercontent.com/ccodwg/Covid19Canada/master/timeseries_hr/mortality_timeseries_hr.csv") %>%
  dplyr::transmute(
    region = .data$province,
    sub_region_1 = .data$health_region,
    date = as.Date(.data$date_death_report, "%d-%m-%Y"),
    value = .data$cumulative_deaths
  ) %>%
  convert_pt_names() %>%
  tibble::add_column(name = "deaths", .before = 1) %>%
  dplyr::mutate(sub_region_1 = ifelse(.data$sub_region_1 == "Not Reported", "Unknown", .data$sub_region_1)) %>%
  write.csv("raw_data/ccodwg/can_deaths_hr_ts.csv", row.names = FALSE, quote = 1:4)

# province/territory-level datasets: columns name / region / date / value
# (no sub_region_1); quote = 1:3 quotes the first three columns

# recovered
readr::read_csv("https://raw.githubusercontent.com/ccodwg/Covid19Canada/master/timeseries_prov/recovered_timeseries_prov.csv") %>%
  dplyr::transmute(
    region = .data$province,
    date = as.Date(.data$date_recovered, "%d-%m-%Y"),
    value = .data$cumulative_recovered
  ) %>%
  convert_pt_names() %>%
  tibble::add_column(name = "recovered", .before = 1) %>%
  write.csv("raw_data/ccodwg/can_recovered_pt_ts.csv", row.names = FALSE, quote = 1:3)

# testing
readr::read_csv("https://raw.githubusercontent.com/ccodwg/Covid19Canada/master/timeseries_prov/testing_timeseries_prov.csv") %>%
  dplyr::transmute(
    region = .data$province,
    date = as.Date(.data$date_testing, "%d-%m-%Y"),
    value = .data$cumulative_testing
  ) %>%
  convert_pt_names() %>%
  tibble::add_column(name = "testing", .before = 1) %>%
  write.csv("raw_data/ccodwg/can_testing_pt_ts.csv", row.names = FALSE, quote = 1:3)

# vaccine_distribution
readr::read_csv("https://raw.githubusercontent.com/ccodwg/Covid19Canada/master/timeseries_prov/vaccine_distribution_timeseries_prov.csv") %>%
  dplyr::transmute(
    region = .data$province,
    date = as.Date(.data$date_vaccine_distributed, "%d-%m-%Y"),
    value = .data$cumulative_dvaccine
  ) %>%
  convert_pt_names() %>%
  tibble::add_column(name = "vaccine_distribution", .before = 1) %>%
  write.csv("raw_data/ccodwg/can_vaccine_distribution_pt_ts.csv", row.names = FALSE, quote = 1:3)

# vaccine_administration_total_doses
readr::read_csv("https://raw.githubusercontent.com/ccodwg/Covid19Canada/master/timeseries_prov/vaccine_administration_timeseries_prov.csv") %>%
  dplyr::transmute(
    region = .data$province,
    date = as.Date(.data$date_vaccine_administered, "%d-%m-%Y"),
    value = .data$cumulative_avaccine
  ) %>%
  convert_pt_names() %>%
  tibble::add_column(name = "vaccine_administration_total_doses", .before = 1) %>%
  write.csv("raw_data/ccodwg/can_vaccine_administration_total_doses_pt_ts.csv", row.names = FALSE, quote = 1:3)

# vaccine_administration_dose_2
readr::read_csv("https://raw.githubusercontent.com/ccodwg/Covid19Canada/master/timeseries_prov/vaccine_completion_timeseries_prov.csv") %>%
  dplyr::transmute(
    region = .data$province,
    date = as.Date(.data$date_vaccine_completed, "%d-%m-%Y"),
    value = .data$cumulative_cvaccine
  ) %>%
  convert_pt_names() %>%
  tibble::add_column(name = "vaccine_administration_dose_2", .before = 1) %>%
  write.csv("raw_data/ccodwg/can_vaccine_administration_dose_2_pt_ts.csv", row.names = FALSE, quote = 1:3)

# vaccine_administration_dose_3
readr::read_csv("https://raw.githubusercontent.com/ccodwg/Covid19Canada/master/timeseries_prov/vaccine_additionaldoses_timeseries_prov.csv") %>%
  dplyr::transmute(
    region = .data$province,
    date = as.Date(.data$date_vaccine_additionaldoses, "%d-%m-%Y"),
    value = .data$cumulative_additionaldosesvaccine
  ) %>%
  convert_pt_names() %>%
  tibble::add_column(name = "vaccine_administration_dose_3", .before = 1) %>%
  write.csv("raw_data/ccodwg/can_vaccine_administration_dose_3_pt_ts.csv", row.names = FALSE, quote = 1:3)
```