Check if we can fill in ch_names or ch_postcodes when a client has multiple epis...

https://github.com/Public-Health-Scotland/source-linkage-files/blob/4c9aa4b0a1b401fd058ac0b7b522738aa5c1e64f/R/fill_ch_names.R#L202

#' Fix and fill care home name and postcodes
#'
#' @param ch_data partially cleaned up care home data as a
#' [tibble][tibble::tibble-package]
#' @param ch_name_lookup_path path to the 'official' Care Home name Excel
#' Workbook, this defaults to [get_slf_ch_name_lookup_path()]
#'
#' @return the same data with improved accuracy and completeness of the Care
#' Home names and postcodes, as a [tibble][tibble::tibble-package].
#' @export
fill_ch_names <- function(ch_data,
                          ch_name_lookup_path = get_slf_ch_name_lookup_path(),
                          spd_path = get_spd_path()) {
  ch_data <- ch_data %>%
    # Make the care home name more uniform
    dplyr::mutate(ch_name = clean_up_free_text(.data[["ch_name"]])) %>%
    # correct postcode formatting
    dplyr::mutate(
      dplyr::across(
        dplyr::contains("postcode"),
        phsmethods::format_postcode
      ),
      # Replace invalid postcode with NA
      # Get a list of confirmed valid Scottish postcodes from the SPD
      ch_postcode = dplyr::if_else(
        .data[["ch_postcode"]] %in% dplyr::pull(readr::read_rds(spd_path), "pc7"),
        .data[["ch_postcode"]],
        NA_character_
      )
    )

  # Care Home name lookup from the Care Inspectorate
  # Previous contact 'Al Scougal' <Al.Scougal@careinspectorate.gov.scot>
  ch_name_lookup <- readxl::read_xlsx(ch_name_lookup_path) %>%
    # Drop any Care Homes that were closed before 2017/18
    dplyr::select(
      ch_postcode = "AccomPostCodeNo",
      ch_name_validated = "ServiceName",
      ch_date_registered = "DateReg",
      ch_date_cancelled = "DateCanx"
    ) %>%
    dplyr::filter(is.na(.data[["ch_date_cancelled"]]) | .data[["ch_date_cancelled"]] >= start_fy("1718")) %>%
    # Standardise the postcode and CH name
    dplyr::mutate(
      ch_postcode = phsmethods::format_postcode(.data[["ch_postcode"]]),
      ch_name_validated = clean_up_free_text(.data[["ch_name_validated"]]),
      ch_date_registered = lubridate::as_date(.data[["ch_date_registered"]]),
      ch_date_cancelled = lubridate::as_date(.data[["ch_date_cancelled"]])
    ) %>%
    # Merge any duplicates, and get the interval each CH name was active
    dplyr::group_by(.data[["ch_postcode"]], .data[["ch_name_validated"]]) %>%
    dplyr::summarise(
      # Find the latest date for each CH name / postcode
      latest_close_date = dplyr::if_else(is.na(max(.data[["ch_date_cancelled"]])),
        Sys.Date(),
        max(.data[["ch_date_cancelled"]])
      ),
      open_interval = lubridate::interval(
        min(.data[["ch_date_registered"]]),
        .data[["latest_close_date"]]
      )
    ) %>%
    dplyr::ungroup()

  # Generate some metrics for how the submitted names connect to the valid names
  ch_name_best_match <- ch_data %>%
    dplyr::distinct(.data[["ch_postcode"]], .data[["ch_name"]]) %>%
    dplyr::left_join(ch_name_lookup,
      by = dplyr::join_by("ch_postcode"),
      multiple = "all",
      na_matches = "never"
    ) %>%
    tidyr::drop_na() %>%
    # Work out string distances between names for each postcode
    dplyr::mutate(
      match_distance_jaccard = stringdist::stringdist(.data[["ch_name"]], .data[["ch_name_validated"]],
        method = "jaccard"
      ),
      match_distance_cosine = stringdist::stringdist(.data[["ch_name"]], .data[["ch_name_validated"]],
        method = "cosine"
      ),
      match_mean = (.data[["match_distance_jaccard"]] + .data[["match_distance_cosine"]]) / 2.0
    ) %>%
    # Drop any name matches which aren't very close
    dplyr::filter(.data[["match_distance_jaccard"]] <= 0.25 |
      .data[["match_distance_cosine"]] <= 0.3) %>%
    dplyr::group_by(.data[["ch_postcode"]], .data[["ch_name"]], .data[["open_interval"]]) %>%
    dplyr::mutate(
      min_match_mean = min(.data[["match_mean"]], na.rm = TRUE)
    ) %>%
    # Identify the closest match in case there are multiple close matches
    # If there's still multiple matches just pick the shortest
    dplyr::arrange(
      "min_match_mean",
      length(.data[["ch_name_validated"]])
    ) %>%
    dplyr::ungroup() %>%
    dplyr::distinct(.data[["ch_postcode"]],
      .data[["ch_name"]],
      .keep_all = TRUE
    ) %>%
    dplyr::select(
      "ch_postcode",
      "ch_name",
      "ch_name_validated",
      "open_interval",
      "latest_close_date"
    ) %>%
    dplyr::arrange(
      "ch_postcode",
      "ch_name",
      "open_interval"
    )

  no_match_pc_name_bad <- ch_data %>%
    dplyr::anti_join(ch_name_lookup,
      by = dplyr::join_by("ch_postcode"),
      na_matches = "never"
    ) %>%
    dplyr::filter(!is.na(.data[["ch_name"]]) & !is.na(.data[["ch_postcode"]])) %>%
    dplyr::left_join(ch_name_best_match,
                     by = dplyr::join_by(
                       "ch_name",
                       closest("ch_admission_date" <= "latest_close_date")
                     ),
                     multiple = "last",
                     na_matches = "never",
                     suffix = c("_old", "")
    ) %>%
    dplyr::mutate(
      ch_postcode = dplyr::if_else(!is_missing(.data[["ch_postcode"]]),
                               .data[["ch_postcode"]],
                               .data[["ch_postcode_old"]]
      )
    )

  no_match_pc_name_missing <- ch_data %>%
    dplyr::anti_join(ch_name_lookup,
      by = dplyr::join_by("ch_postcode"),
      na_matches = "never"
    ) %>%
    dplyr::filter(is.na(.data[["ch_name"]]) & is.na(.data[["ch_postcode"]]))

  no_match_pc_missing <- ch_data %>%
    dplyr::anti_join(ch_name_lookup,
      by = dplyr::join_by("ch_postcode"),
      na_matches = "never"
    ) %>%
    dplyr::filter(!is.na(.data[["ch_name"]]) & is.na(.data[["ch_postcode"]])) %>%
    dplyr::left_join(ch_name_best_match,
                     by = dplyr::join_by(
                       "ch_name",
                       closest("ch_admission_date" <= "latest_close_date")
                     ),
                     multiple = "last",
                     na_matches = "never",
                     suffix = c("_old", "")
    ) %>%
    dplyr::mutate(
      ch_postcode = dplyr::if_else(!is_missing(.data[["ch_postcode"]]),
                                   .data[["ch_postcode"]],
                                   .data[["ch_postcode_old"]]
      )
    )

  no_match_name_missing <- ch_data %>%
    dplyr::anti_join(ch_name_lookup,
      by = dplyr::join_by("ch_postcode"),
      na_matches = "never"
    ) %>%
    dplyr::filter(is.na(.data[["ch_name"]]) & !is.na(.data[["ch_postcode"]]))

  ch_name_pc_clean <- ch_data %>%
    # Remove records with no matching postcode, we'll add them back later
    dplyr::semi_join(ch_name_lookup,
      by = dplyr::join_by("ch_postcode"),
      na_matches = "never"
    ) %>%
    dplyr::left_join(ch_name_best_match,
      by = dplyr::join_by(
        "ch_postcode",
        "ch_name",
        closest("ch_admission_date" <= "latest_close_date")
      ),
      na_matches = "never"
    ) %>%
    dplyr::mutate(
      ch_name_old = .data[["ch_name"]],
      ch_name = dplyr::if_else(!is_missing(.data[["ch_name_validated"]]),
        .data[["ch_name_validated"]],
        .data[["ch_name"]]
      )
    ) %>%
    # Bring back the records which had no postcode match
    dplyr::bind_rows(
      no_match_pc_name_bad,
      no_match_pc_name_missing,
      no_match_pc_missing,
      no_match_name_missing
    )

  # TODO Check if we can fill in ch_names or ch_postcodes when a client has multiple episodes

  return(ch_name_pc_clean)
}
Public-Health-Scotland / source-linkage-files

Check if we can fill in ch_names or ch_postcodes when a client has multiple epis... #550