Public-Health-Scotland / source-linkage-files

This repo is for the syntax used for the PHS Source Linkage File project
https://public-health-scotland.github.io/source-linkage-files/
Other
4 stars 2 forks source link

social care demographics - decide what to do with non-validated chi and cases wh... #866

Closed github-actions[bot] closed 3 months ago

github-actions[bot] commented 10 months ago

Need to decide what to do with social care cases where the chi and the dob do not match.

this is why I have kept the validated/non-validated df seperate. Hoping we can get back and sort this out.

https://github.com/Public-Health-Scotland/source-linkage-files/blob/1c75e3b775014579bd26d500a0e82496f89e3940/R/process_lookup_sc_demographics.R#L150


  # Data Cleaning ---------------------------------------
# TODO social care demographics - tidy up  code and make more efficient.
  sc_demog <- data %>%
    dplyr::mutate(
      # if one chi is missing then use the other
      # TODO social care demographics - decide what to do with cases where last 4 digits of chi are different
      chi_upi = ifelse(is.na(chi_upi), upi, chi_upi),
      upi = ifelse(is.na(upi), chi_upi, upi),
      submitted_date_of_birth = ifelse(is.na(submitted_date_of_birth), chi_date_of_birth, submitted_date_of_birth),
      chi_date_of_birth = ifelse(is.na(chi_date_of_birth), submitted_date_of_birth, chi_date_of_birth),
      chi_date_of_birth = lubridate::as_date(chi_date_of_birth),
      submitted_date_of_birth = lubridate::as_date(submitted_date_of_birth),
      # check gender code - replace code 99 with 9=

      # use CHI sex if available
       # TODO social care demographics - check gender matches chi for extra validation check
      submitted_gender = replace(.data$submitted_gender, .data$submitted_gender == 99L, 9L),
      gender = dplyr::if_else(
        is.na(.data$chi_gender_code) | .data$chi_gender_code == 9L,
        .data$submitted_gender,
        .data$chi_gender_code
      )
    ) %>%
    # format postcodes using `phsmethods`
    dplyr::mutate(dplyr::across(
      tidyselect::contains("postcode"),
      ~ phsmethods::format_postcode(.x, format = "pc7"))) %>%
    dplyr::distinct() %>%
    # if only one option is available for chi then choose that
    dplyr::mutate(chi = ifelse(chi_upi == upi | is.na(upi), chi_upi,
                               ifelse(is.na(chi_upi), upi, NA)
                               )) %>%
    dplyr::mutate(
      # if only one option is available for DOB then choose that
      dob = ifelse(chi_date_of_birth == submitted_date_of_birth | is.na(submitted_date_of_birth), chi_date_of_birth,
        ifelse(is.na(chi_date_of_birth), submitted_date_of_birth, NA)),
      dob = lubridate::as_date(dob)
    ) %>%
    dplyr::arrange(chi, dob) %>%
    dplyr::group_by(social_care_id, sending_location) %>%
    tidyr::fill(chi, .direction = c("down")) %>%
    tidyr::fill(dob, .direction = c("down")) %>%
    dplyr::ungroup() %>%
  # create string for DOB from CHI and the DOB to see if they match.
  dplyr::mutate(dob_from_chiupi = paste0(stringr::str_sub(chi, 1, 6))) %>%
    dplyr::mutate(dob_from_dob = paste0(
      stringr::str_sub(as.character(dob), 9, 10),
      stringr::str_sub(as.character(dob), 6, 7),
      stringr::str_sub(as.character(dob), 3, 4)
    )) %>%
    # validation flag. if dob goes with chi then flag as 1
    dplyr::mutate(chi_validation = ifelse(dob_from_chiupi == dob_from_dob, 1, 0))

# cases where all chi and dob are missing, nothing we can do about these and no validation
  missing <- sc_demog %>%
    dplyr::filter(is.na(chi_upi) & is.na(upi) & is.na(submitted_date_of_birth) & is.na(chi_date_of_birth) & is.na(chi) & is.na(dob)) %>%
    dplyr::select(-dob_from_chiupi, -dob_from_dob, -chi_upi, -upi, -chi_date_of_birth, -submitted_date_of_birth)

  # cases where chi and dob match
  validated <- sc_demog %>%
    dplyr::filter(chi_validation == 1) %>%
    dplyr::select(-dob_from_chiupi, -dob_from_dob, -chi_upi, -upi, -chi_date_of_birth, -submitted_date_of_birth)

  # match on either dob to chi
  sc_demog <- sc_demog %>%
    dplyr::anti_join(missing) %>%
    dplyr::filter(chi_validation != 1) %>%
    # get dob from chi and submitted and see if either match with chi
    dplyr::mutate(dob_from_chidob = paste0(
      stringr::str_sub(as.character(chi_date_of_birth), 9, 10),
      stringr::str_sub(as.character(chi_date_of_birth), 6, 7),
      stringr::str_sub(as.character(chi_date_of_birth), 3, 4)
    )) %>%
    dplyr::mutate(dob_from_submitteddob = paste0(
      stringr::str_sub(submitted_date_of_birth, 9, 10),
      stringr::str_sub(as.character(submitted_date_of_birth), 6, 7),
      stringr::str_sub(as.character(submitted_date_of_birth), 3, 4)
    )) %>%
    # if either dob matches with chi then use that dob
    dplyr::mutate(
      dob = ifelse(dob_from_chiupi == dob_from_chidob, chi_date_of_birth, dob),
      dob = ifelse(dob_from_chiupi == dob_from_submitteddob, submitted_date_of_birth, dob),
      dob = lubridate::as_date(dob)
    ) %>%
    dplyr::mutate(dob_from_dob = paste0(
      stringr::str_sub(as.character(dob), 9, 10),
      stringr::str_sub(as.character(dob), 6, 7),
      stringr::str_sub(as.character(dob), 3, 4)
    )) %>%
    # if dob and chi match then flag as validated
    dplyr::mutate(chi_validation = ifelse(dob_from_chiupi == dob_from_dob, 1, 0)) %>%
    dplyr::select(-dob_from_chidob, -dob_from_submitteddob)

# add the validated cases to validated df
  validated <- validated %>%
    rbind(sc_demog %>%
      dplyr::filter(chi_validation == 1) %>%
      dplyr::select(-dob_from_chiupi, -dob_from_dob, -chi_upi, -upi, -chi_date_of_birth, -submitted_date_of_birth))

  # match on dob to either chi
  sc_demog <- sc_demog %>%
    dplyr::filter(chi_validation != 1) %>%
    # create dob from both chi numbers
    dplyr::mutate(dob_from_upi = paste0(stringr::str_sub(upi, 1, 6))) %>%
    dplyr::mutate(dob_from_chi_upi = paste0(stringr::str_sub(chi_upi, 1, 6))) %>%
    # use whichever one matches
    dplyr::mutate(chi = ifelse(dob_from_chi_upi == dob_from_dob, chi_upi, chi)) %>%
    dplyr::mutate(chi = ifelse(dob_from_upi == dob_from_dob, upi, chi)) %>%
    dplyr::mutate(dob_from_chi = paste0(stringr::str_sub(chi, 1, 6))) %>%
    # if chi and dob match then flag as validated
    dplyr::mutate(chi_validation = ifelse(dob_from_chi == dob_from_dob, 1, 0))

  # all validated cases
  validated <- validated %>%
    rbind(sc_demog %>%
      dplyr::filter(chi_validation == 1) %>%
      dplyr::select(-dob_from_chiupi, -dob_from_upi, -dob_from_chi_upi, -dob_from_dob, -chi_upi, -upi, -chi_date_of_birth, -submitted_date_of_birth, -dob_from_chi))

  # TODO social care demographics - decide what to do with non-validated chi and cases where dob does not match chi
  # Need to decide what to do with social care cases where the chi and the dob do not match.
  # this is why I have kept the validated/non-validated df seperate. Hoping we can get back and sort this out.
  sc_demog <- sc_demog %>%
    dplyr::filter(chi_validation != 1) %>%   # all unvalidated cases. most of these are due to missing chi or dob so there is no way to validate.
    dplyr::select(-dob_from_chiupi, -dob_from_upi, -dob_from_chi_upi, -dob_from_dob, -chi_upi, -upi, -chi_date_of_birth, -submitted_date_of_birth, -dob_from_chi) %>%
    rbind(validated) %>%
    rbind(missing)

  # count number of na postcodes
  na_postcodes <- sc_demog %>%
Jennit07 commented 9 months ago

Potentially pick this up with a data quality script and give this to DM before the next update

github-actions[bot] commented 4 months ago

This issue is stale because it has been open approximately 5 months with no activity.

SwiftySalmon commented 3 months ago

closing this as it will be part of social care work to improve their processes and data quality