check if this is the same as split_episode_counter?

https://github.com/Public-Health-Scotland/source-linkage-files/blob/d9a8bb8384d79b8c03040b7c4e748875aa8ab0f9/Production_scripts/Social_Care/care_home_all_episodes.R#L330

# Load package
library(createslf)

# Variables to be used as function parameters
sc_demog_lookup_path <- get_sc_demog_lookup_path()
spd_path <- get_spd_path()
ch_name_lookup_path <- get_slf_ch_name_lookup_path()
slf_deaths_path <- get_slf_deaths_path()

db_connection <- phs_db_connection(dsn = "DVPROD")

# Read in data
ch_data <-
  dplyr::tbl(
    db_connection,
    dbplyr::in_schema("social_care_2", "carehome_snapshot")
  ) %>%
  dplyr::select(
    "ch_name",
    "ch_postcode",
    "sending_location",
    "social_care_id",
    "financial_year",
    "financial_quarter",
    "period",
    "ch_provider",
    "reason_for_admission",
    "type_of_admission",
    "nursing_care_provision",
    "ch_admission_date",
    "ch_discharge_date",
    "age"
  ) %>%
  # Correct FY 2017
  dplyr::mutate(financial_quarter = dplyr::if_else(
    financial_year == 2017L &
      is.na(financial_quarter),
    4L,
    financial_quarter
  )) %>%
  dplyr::mutate(period = dplyr::if_else(financial_year == 2017L &
    financial_quarter == 4L, "2017Q4", period)) %>%
  dplyr::collect()

ch_clean <- ch_data %>%
  dplyr::mutate(
    dplyr::across(
      c(
        "ch_provider",
        "reason_for_admission",
        "type_of_admission",
        "nursing_care_provision"
      ),
      as.integer
    ),
    record_date = end_fy_quarter(period),
    qtr_start = start_fy_quarter(period),
    # Set missing admission date to start of the submitted quarter
    ch_admission_date = dplyr::if_else(
      is.na(ch_admission_date),
      qtr_start,
      ch_admission_date
    ),
    # If the dis date is before admission, remove the dis date
    ch_discharge_date = dplyr::if_else(
      ch_admission_date > ch_discharge_date,
      lubridate::NA_Date_,
      ch_discharge_date
    )
  )

# read in demographic data
sc_demog <- readr::read_rds(sc_demog_lookup_path)

# Get a list of confirmed valid Scottish postcodes from the SPD
valid_spd_postcodes <- readr::read_rds(spd_path) %>%
  dplyr::pull(pc7)

matched_ch_data <- ch_clean %>%
  dplyr::left_join(sc_demog, by = c("sending_location", "social_care_id")) %>%
  # Make the care home name more uniform
  dplyr::mutate(ch_name = clean_up_free_text(ch_name)) %>%
  # correct postcode formatting
  dplyr::mutate(
    dplyr::across(tidyselect::contains("postcode"), phsmethods::format_postcode),
    # replace invalid postcode with NA
    ch_postcode = dplyr::na_if(
      ch_postcode,
      ch_postcode %in% valid_spd_postcodes
    )
  )

# Care Home Name lookup
ch_name_lookup <-
  # Care Home name lookup from the Care Inspectorate
  # Previous contact Al Scrougal - Al.Scougal@careinspectorate.gov.scot
  readxl::read_xlsx(ch_name_lookup_path) %>%
  # Drop any Care Homes that were closed before 2017/18
  dplyr::filter(is.na(DateCanx) | DateCanx >= start_fy("1718")) %>%
  dplyr::rename(
    ch_postcode = "AccomPostCodeNo",
    ch_name_validated = "ServiceName"
  ) %>%
  dplyr::select(
    "ch_postcode",
    "ch_name_validated",
    "DateReg",
    "DateCanx"
  ) %>%
  # Standardise the postcode and CH name
  dplyr::mutate(
    ch_postcode = phsmethods::format_postcode(ch_postcode),
    ch_name_validated = clean_up_free_text(ch_name_validated)
  ) %>%
  # Merge any duplicates, and get the interval each CH name was active
  dplyr::group_by(ch_postcode, ch_name_validated) %>%
  dplyr::summarise(open_interval = lubridate::interval(
    min(DateReg),
    tidyr::replace_na(max(DateCanx), Sys.Date())
  )) %>%
  # Find the latest date for each CH name
  dplyr::mutate(latest_close_date = max(
    lubridate::int_end(open_interval)
  )) %>%
  dplyr::ungroup() %>%
  dplyr::arrange(open_interval)

# Generate some metrics for how the submitted names connect to the valid names
ch_name_match_metrics <- matched_ch_data %>%
  dplyr::distinct(ch_postcode, ch_name) %>%
  dplyr::left_join(ch_name_lookup, by = "ch_postcode") %>%
  tidyr::drop_na() %>%
  # Work out string distances between names for each postcode
  dplyr::mutate(
    match_distance_jaccard = stringdist::stringdist(ch_name, ch_name_validated,
      method = "jaccard"
    ),
    match_distance_cosine = stringdist::stringdist(ch_name, ch_name_validated,
      method = "cosine"
    ),
    match_mean = (match_distance_jaccard + match_distance_cosine) / 2.0
  ) %>%
  # Drop any name matches which aren't very close
  dplyr::filter(match_distance_jaccard <= 0.25 |
    match_distance_cosine <= 0.3) %>%
  dplyr::group_by(ch_postcode, ch_name) %>%
  # Identify the closest match in case there are multiple close matches
  dplyr::mutate(
    min_jaccard = min(match_distance_jaccard, na.rm = TRUE),
    min_cosine = min(match_distance_cosine, na.rm = TRUE),
    min_match_mean = min(match_mean, na.rm = TRUE)
  ) %>%
  dplyr::ungroup()

no_postcode_match <- matched_ch_data %>%
  dplyr::anti_join(ch_name_lookup, by = "ch_postcode")

name_postcode_clean <- matched_ch_data %>%
  # Remove records with no matching postcode, we'll add them back later
  dplyr::semi_join(ch_name_lookup, by = "ch_postcode") %>%
  # Create a unique ID per row so we can get rid of duplicates later
  dplyr::mutate(ch_record_id = dplyr::row_number()) %>%
  # Match CH names with the generated metrics and the lookup. This will create
  # duplicates which should be filtered out as we identify matches
  dplyr::left_join(ch_name_match_metrics, by = c("ch_postcode", "ch_name")) %>%
  dplyr::mutate(
    # Work out the duration of the stay
    # If the end date is missing set this to the end of the quarter
    stay_interval = lubridate::interval(
      ch_admission_date,
      min(ch_discharge_date, record_date, na.rm = TRUE)
    ),
    # Highlight which stays overlap with an open care home name
    stay_overlaps_open = lubridate::int_overlaps(
      stay_interval, open_interval
    ) &
      lubridate::int_start(stay_interval) >= lubridate::int_start(open_interval),
    # Highlight which names seem to be good matches
    name_match = dplyr::case_when(
      # Exact match
      ch_name == ch_name_validated ~ TRUE,
      # Submitted name is missing and stay dates are valid for the CH
      is.na(ch_name) & stay_overlaps_open ~ TRUE,
      # This name had the closest 'jaccard' distance of all possibilities
      (min_jaccard == match_distance_jaccard) &
        match_distance_jaccard <= 0.25 ~ TRUE,
      # This name had the closest 'cosine' distance of all possibilities
      (min_cosine == match_distance_cosine) &
        match_distance_cosine <= 0.3 ~ TRUE,
      # This name had the closest 'mean' distance (used when the above disagree)
      (min_match_mean == match_mean) & match_mean <= 0.25 ~ TRUE,
      # No good match
      TRUE ~ FALSE
    )
  ) %>%
  # Group by record
  # - There will be duplicate rows per record if there are
  # multiple 'options' for the possible CH name.
  dplyr::group_by(ch_record_id) %>%
  dplyr::mutate(
    # Highlight where the record has no matches out of any of the options
    no_name_matches = !any(name_match),
    # Highlight where the record has no overlaps with any of the options
    no_overlaps = !any(stay_overlaps_open)
  ) %>%
  # Keep a record if:
  # 1) It's name matches `name_match`
  # Or either
  # 2)a) None of the option's names match AND this option overlaps in dates
  # e.g. the submitted name is missing but the dates match)
  # or 2)b) None of the option's names match AND none of the dates overlap
  # (i.e. we don't have any idea what name to use)
  dplyr::filter(dplyr::n() == 1L |
    sum(name_match) == 1L | !any(name_match)) %>%
  # For the records which still have multiple options
  # (usually multiple names matched)
  dplyr::filter(dplyr::n() == 1L |
    lubridate::int_end(open_interval) == latest_close_date) %>%
  dplyr::filter(dplyr::n() == 1L | match_mean == min_match_mean) %>%
  dplyr::ungroup() %>%
  # Bring back to single record with no duplicates introduce by the lookup
  dplyr::distinct(ch_record_id, .keep_all = TRUE) %>%
  # Replace the ch name with our best guess at the proper name from the lookup
  dplyr::mutate(
    ch_name_old = ch_name,
    ch_name = dplyr::if_else(is.na(ch_name_validated),
      ch_name,
      ch_name_validated
    )
  ) %>%
  # Bring back the records which had no postcode match
  dplyr::bind_rows(no_postcode_match)

(check_names <- name_postcode_clean %>%
  dplyr::count(ch_name_old, ch_name, sort = TRUE))

ch_data_clean <- name_postcode_clean %>%
  # sort data
  dplyr::arrange(
    sending_location,
    social_care_id,
    ch_admission_date,
    period
  ) %>%
  dplyr::mutate(
    min_ch_provider = min(ch_provider),
    max_ch_provider = max(ch_provider)
  ) %>%
  dplyr::mutate(ch_provider = dplyr::if_else(
    min_ch_provider != max_ch_provider,
    6L,
    ch_provider
  )) %>%
  dplyr::select(
    -"min_ch_provider",
    -"max_ch_provider"
  ) %>%
  # when multiple social_care_id from sending_location for
  # single CHI replace social_care_id with latest
  dplyr::group_by(sending_location, chi) %>%
  dplyr::mutate(latest_sc_id = dplyr::last(social_care_id)) %>%
  # count changed social_care_id
  dplyr::mutate(
    changed_sc_id = !is.na(chi) & social_care_id != latest_sc_id,
    social_care_id = dplyr::if_else(changed_sc_id, latest_sc_id, social_care_id)
  ) %>%
  dplyr::ungroup() %>%
  dplyr::group_by(sending_location, social_care_id, ch_admission_date) %>%
  # fill in nursing care provision when missing
  # but present in the following entry
  dplyr::mutate(
    nursing_care_provision = dplyr::na_if(nursing_care_provision, 9L)
  ) %>%
  tidyr::fill(nursing_care_provision, .direction = "downup") %>%
  # tidy up ch_provider using 6 when disagreeing values
  tidyr::fill(ch_provider, .direction = "downup") %>%
  # remove any duplicate records before merging for speed and simplicity
  dplyr::distinct() %>%
  # counter for split episodes
  dplyr::mutate(
    split_episode = tidyr::replace_na(
      nursing_care_provision != lag(nursing_care_provision),
      TRUE
    ),
    split_episode_counter = cumsum(split_episode)
  ) %>%
  dplyr::ungroup()

# count changed social_care_id
ch_data_clean %>%
  dplyr::count(changed_sc_id)

# Megre records to a single row per episode
# where admission is the same
ch_episode <- ch_data_clean %>%
  # when nursing_care_provision is different on
  # records within the episode, split the episode
  # at this point.
  dplyr::group_by(
    chi,
    sending_location,
    social_care_id,
    ch_admission_date,
    nursing_care_provision,
    split_episode_counter
  ) %>%
  dplyr::summarise(
    sc_latest_submission = dplyr::last("period"),
    dplyr::across(
      c(
        "ch_discharge_date",
        "ch_provider",
        "record_date",
        "qtr_start",
        "ch_name",
        "ch_postcode",
        "reason_for_admission"
      ),
      dplyr::last
    ),
    dplyr::across(c("gender", "dob", "postcode"), dplyr::first)
  ) %>%
  dplyr::ungroup() %>%
  # Amend dates for split episodes
  # Change the start and end date as appropriate when an episode is split,
  # using the start / end date of the submission quarter
  dplyr::group_by(chi, sending_location, social_care_id, ch_admission_date) %>%
  # counter for latest submission
  # TODO check if this is the same as split_episode_counter?
  dplyr::mutate(
    latest_submission_counter = tidyr::replace_na(
      sc_latest_submission != dplyr::lag(sc_latest_submission),
      TRUE
    ),
    sum_latest_submission = cumsum(latest_submission_counter)
  ) %>%
  # TODO double check this works
  dplyr::mutate(
    # If it's the first episode(s) then keep the admission date(s),
    # otherwise use the start of the quarter
    ch_admission_date = dplyr::if_else(
      sum_latest_submission == min(sum_latest_submission),
      ch_admission_date,
      qtr_start
    ),
    # If it's the last episode(s) then keep the discharge date(s), otherwise
    # use the end of the quarter
    ch_discharge_date = dplyr::if_else(
      sum_latest_submission == max(sum_latest_submission),
      ch_discharge_date,
      record_date
    )
  ) %>%
  dplyr::ungroup()

# Compare to Deaths Data
deaths_data <- readr::read_rds(slf_deaths_path)

# match ch_episode data with deaths data
matched_deaths_data <- ch_episode %>%
  dplyr::left_join(deaths_data, by = "chi") %>%
  # compare discharge date with NRS and CHI death date
  # if either of the dates are 5 or fewer days before discharge
  # adjust the discharge date to the date of death
  # corrects most cases of ‘discharge after death’
  dplyr::mutate(dis_after_death = tidyr::replace_na(
    death_date > (ch_discharge_date - lubridate::days(5L)) &
      death_date < ch_discharge_date,
    FALSE
  )) %>%
  dplyr::mutate(ch_discharge_date = dplyr::if_else(dis_after_death,
    death_date,
    ch_discharge_date
  )) %>%
  dplyr::ungroup() %>%
  # remove any episodes where discharge is now before admission,
  # i.e. death was before admission
  dplyr::filter(
    !tidyr::replace_na(ch_discharge_date < ch_admission_date, FALSE)
  )

# Continuous Care Home Stays

# stay will be continuous as long as the admission date is the next day or
# earlier than the previous discharge date

ch_markers <- matched_deaths_data %>%
  # ch_chi_cis
  dplyr::group_by(chi) %>%
  dplyr::mutate(
    continuous_stay_chi = tidyr::replace_na(
      ch_admission_date <= dplyr::lag(ch_discharge_date) + lubridate::days(1L),
      TRUE
    ),
    ch_chi_cis = cumsum(continuous_stay_chi)
  ) %>%
  dplyr::ungroup() %>%
  # ch_sc_id_cis
  # uses the social care id and sending location so can be used for
  # episodes that are not attached to a CHI number
  # This will restrict continuous stays to each Local Authority
  dplyr::group_by(social_care_id, sending_location) %>%
  dplyr::mutate(
    continuous_stay_sc = tidyr::replace_na(
      ch_admission_date <= dplyr::lag(ch_discharge_date) + lubridate::days(1L),
      TRUE
    ),
    ch_sc_id_cis = cumsum(continuous_stay_sc)
  ) %>%
  dplyr::ungroup()

outfile <- ch_markers %>%
  create_person_id() %>%
  dplyr::rename(
    record_keydate1 = "ch_admission_date",
    record_keydate2 = "ch_discharge_date",
    ch_adm_reason = "reason_for_admission",
    ch_nursing = "nursing_care_provision"
  ) %>%
  dplyr::select(
    "chi",
    "person_id",
    "gender",
    "dob",
    "postcode",
    "sending_location",
    "social_care_id",
    "ch_name",
    "ch_postcode",
    "record_keydate1",
    "record_keydate2",
    "ch_chi_cis",
    "ch_sc_id_cis",
    "ch_provider",
    "ch_nursing",
    "ch_adm_reason",
    "sc_latest_submission"
  )

outfile %>%
  write_rds(get_sc_ch_episodes_path(check_mode = "write"))

# End of Script #
Public-Health-Scotland / source-linkage-files

check if this is the same as split_episode_counter? #537