# Data Cleaning ---------------------------------------
# TODO social care demographics - tidy up code and make more efficient.
sc_demog <- data %>%
dplyr::mutate(
# if one chi is missing then use the other
# TODO social care demographics - decide what to do with cases where last 4 digits of chi are different
chi_upi = ifelse(is.na(chi_upi), upi, chi_upi),
upi = ifelse(is.na(upi), chi_upi, upi),
submitted_date_of_birth = ifelse(is.na(submitted_date_of_birth), chi_date_of_birth, submitted_date_of_birth),
chi_date_of_birth = ifelse(is.na(chi_date_of_birth), submitted_date_of_birth, chi_date_of_birth),
chi_date_of_birth = lubridate::as_date(chi_date_of_birth),
submitted_date_of_birth = lubridate::as_date(submitted_date_of_birth),
# check gender code - replace code 99 with 9=
# use CHI sex if available
# TODO social care demographics - check gender matches chi for extra validation check
submitted_gender = replace(.data$submitted_gender, .data$submitted_gender == 99L, 9L),
gender = dplyr::if_else(
is.na(.data$chi_gender_code) | .data$chi_gender_code == 9L,
.data$submitted_gender,
.data$chi_gender_code
)
) %>%
# format postcodes using `phsmethods`
dplyr::mutate(dplyr::across(
tidyselect::contains("postcode"),
~ phsmethods::format_postcode(.x, format = "pc7"))) %>%
dplyr::distinct() %>%
# if only one option is available for chi then choose that
dplyr::mutate(chi = ifelse(chi_upi == upi | is.na(upi), chi_upi,
ifelse(is.na(chi_upi), upi, NA)
)) %>%
dplyr::mutate(
# if only one option is available for DOB then choose that
dob = ifelse(chi_date_of_birth == submitted_date_of_birth | is.na(submitted_date_of_birth), chi_date_of_birth,
ifelse(is.na(chi_date_of_birth), submitted_date_of_birth, NA)),
dob = lubridate::as_date(dob)
) %>%
dplyr::arrange(chi, dob) %>%
dplyr::group_by(social_care_id, sending_location) %>%
tidyr::fill(chi, .direction = c("down")) %>%
tidyr::fill(dob, .direction = c("down")) %>%
dplyr::ungroup() %>%
# create string for DOB from CHI and the DOB to see if they match.
dplyr::mutate(dob_from_chiupi = paste0(stringr::str_sub(chi, 1, 6))) %>%
dplyr::mutate(dob_from_dob = paste0(
stringr::str_sub(as.character(dob), 9, 10),
stringr::str_sub(as.character(dob), 6, 7),
stringr::str_sub(as.character(dob), 3, 4)
)) %>%
# validation flag. if dob goes with chi then flag as 1
dplyr::mutate(chi_validation = ifelse(dob_from_chiupi == dob_from_dob, 1, 0))
# cases where all chi and dob are missing, nothing we can do about these and no validation
missing <- sc_demog %>%
dplyr::filter(is.na(chi_upi) & is.na(upi) & is.na(submitted_date_of_birth) & is.na(chi_date_of_birth) & is.na(chi) & is.na(dob)) %>%
dplyr::select(-dob_from_chiupi, -dob_from_dob, -chi_upi, -upi, -chi_date_of_birth, -submitted_date_of_birth)
# cases where chi and dob match
validated <- sc_demog %>%
dplyr::filter(chi_validation == 1) %>%
dplyr::select(-dob_from_chiupi, -dob_from_dob, -chi_upi, -upi, -chi_date_of_birth, -submitted_date_of_birth)
# match on either dob to chi
sc_demog <- sc_demog %>%
dplyr::anti_join(missing) %>%
dplyr::filter(chi_validation != 1) %>%
# get dob from chi and submitted and see if either match with chi
dplyr::mutate(dob_from_chidob = paste0(
stringr::str_sub(as.character(chi_date_of_birth), 9, 10),
stringr::str_sub(as.character(chi_date_of_birth), 6, 7),
stringr::str_sub(as.character(chi_date_of_birth), 3, 4)
)) %>%
dplyr::mutate(dob_from_submitteddob = paste0(
stringr::str_sub(submitted_date_of_birth, 9, 10),
stringr::str_sub(as.character(submitted_date_of_birth), 6, 7),
stringr::str_sub(as.character(submitted_date_of_birth), 3, 4)
)) %>%
# if either dob matches with chi then use that dob
dplyr::mutate(
dob = ifelse(dob_from_chiupi == dob_from_chidob, chi_date_of_birth, dob),
dob = ifelse(dob_from_chiupi == dob_from_submitteddob, submitted_date_of_birth, dob),
dob = lubridate::as_date(dob)
) %>%
dplyr::mutate(dob_from_dob = paste0(
stringr::str_sub(as.character(dob), 9, 10),
stringr::str_sub(as.character(dob), 6, 7),
stringr::str_sub(as.character(dob), 3, 4)
)) %>%
# if dob and chi match then flag as validated
dplyr::mutate(chi_validation = ifelse(dob_from_chiupi == dob_from_dob, 1, 0)) %>%
dplyr::select(-dob_from_chidob, -dob_from_submitteddob)
# add the validated cases to validated df
validated <- validated %>%
rbind(sc_demog %>%
dplyr::filter(chi_validation == 1) %>%
dplyr::select(-dob_from_chiupi, -dob_from_dob, -chi_upi, -upi, -chi_date_of_birth, -submitted_date_of_birth))
# match on dob to either chi
sc_demog <- sc_demog %>%
dplyr::filter(chi_validation != 1) %>%
# create dob from both chi numbers
dplyr::mutate(dob_from_upi = paste0(stringr::str_sub(upi, 1, 6))) %>%
dplyr::mutate(dob_from_chi_upi = paste0(stringr::str_sub(chi_upi, 1, 6))) %>%
# use whichever one matches
dplyr::mutate(chi = ifelse(dob_from_chi_upi == dob_from_dob, chi_upi, chi)) %>%
dplyr::mutate(chi = ifelse(dob_from_upi == dob_from_dob, upi, chi)) %>%
dplyr::mutate(dob_from_chi = paste0(stringr::str_sub(chi, 1, 6))) %>%
# if chi and dob match then flag as validated
dplyr::mutate(chi_validation = ifelse(dob_from_chi == dob_from_dob, 1, 0))
# all validated cases
validated <- validated %>%
rbind(sc_demog %>%
dplyr::filter(chi_validation == 1) %>%
dplyr::select(-dob_from_chiupi, -dob_from_upi, -dob_from_chi_upi, -dob_from_dob, -chi_upi, -upi, -chi_date_of_birth, -submitted_date_of_birth, -dob_from_chi))
# TODO social care demographics - decide what to do with non-validated chi and cases where dob does not match chi
# Need to decide what to do with social care cases where the chi and the dob do not match.
# this is why I have kept the validated/non-validated df seperate. Hoping we can get back and sort this out.
sc_demog <- sc_demog %>%
dplyr::filter(chi_validation != 1) %>% # all unvalidated cases. most of these are due to missing chi or dob so there is no way to validate.
dplyr::select(-dob_from_chiupi, -dob_from_upi, -dob_from_chi_upi, -dob_from_dob, -chi_upi, -upi, -chi_date_of_birth, -submitted_date_of_birth, -dob_from_chi) %>%
rbind(validated) %>%
rbind(missing)
# count number of na postcodes
na_postcodes <- sc_demog %>%
https://github.com/Public-Health-Scotland/source-linkage-files/blob/1c75e3b775014579bd26d500a0e82496f89e3940/R/process_lookup_sc_demographics.R#L36