Public-Health-Scotland / source-linkage-files

This repo is for the syntax used for the PHS Source Linkage File project
https://public-health-scotland.github.io/source-linkage-files/
Other
4 stars 2 forks source link

make the slf_path a function #832

Closed github-actions[bot] closed 1 year ago

github-actions[bot] commented 1 year ago

'

' @param data The in-progress episode file data.

' @inheritParams create_episode_file

' @param vars_to_keep a character vector of the variables to keep, all others

' will be stored.

'

' @return `data` with only the `vars_to_keep` kept

'

' @inheritParams create_episode_file

' @inheritParams store_ep_file_vars

'

' @return The full SLF data.

'

' @inheritParams store_ep_file_vars

'

' @return A data frame with CIJ markers filled in for those missing.

group and are NA. This is why we use this arrange() before the mutate()

'

' @inheritParams store_ep_file_vars

'

' @return The data with CIJ variables corrected.

'

' @inheritParams store_ep_file_vars

'

' @return The data with cost including dna.

not including DNAs using cattend

those with attendance status 5 or 8 (CNWs and DNAs)

'

' @inheritParams store_ep_file_vars

' @inheritParams create_demographic_cohorts

'

' @return The data unchanged (the cohorts are written to disk)

'

' @inheritParams store_ep_file_vars

' @inheritParams get_demographic_cohorts_path

' @param demographic_cohort,service_use_cohort The cohort data

'

' @return The data including the Demographic and Service Use lookups.

https://api.github.com/Public-Health-Scotland/source-linkage-files/blob/63da8b09b2003e972f8f73bab65b8929f0305a8a/R/create_episode_file.R#L139


#' Produce the Source Episode file
#'
#' @param processed_data_list containing data from processed extracts.
#' @param year The year to process, in FY format.
#' @param write_to_disk (optional) Should the data be written to disk default is
#' `TRUE` i.e. write the data to disk.
#' @inheritParams add_nsu_cohort
#' @inheritParams fill_geographies
#' @inheritParams join_cohort_lookups
#' @inheritParams join_deaths_data
#' @inheritParams match_on_ltcs
#' @inheritParams link_delayed_discharge_eps
#' @param anon_chi_out (Default:TRUE) Should `anon_chi` be used in the output
#' (instead of chi)
#'
#' @return a [tibble][tibble::tibble-package] containing the episode file
#' @export
create_episode_file <- function(
    processed_data_list,
    year,
    dd_data = read_file(get_source_extract_path(year, "DD")),
    nsu_cohort = read_file(get_nsu_path(year)),
    ltc_data = read_file(get_ltcs_path(year)),
    slf_pc_lookup = read_file(get_slf_postcode_path()),
    slf_gpprac_lookup = read_file(
      get_slf_gpprac_path(),
      col_select = c("gpprac", "cluster", "hbpraccode")
    ),
    slf_deaths_lookup = read_file(get_slf_deaths_lookup_path(year)),
    write_to_disk = TRUE,
    anon_chi_out = TRUE) {
  episode_file <- dplyr::bind_rows(processed_data_list) %>%
    create_cost_inc_dna() %>%
    apply_cost_uplift() %>%
    store_ep_file_vars(
      year = year,
      vars_to_keep = c(
        "year",
        "recid",
        "record_keydate1",
        "record_keydate2",
        "smrtype",
        "chi",
        "gender",
        "dob",
        "gpprac",
        "hbpraccode",
        "postcode",
        "hbrescode",
        "lca",
        "location",
        "hbtreatcode",
        "ipdc",
        "spec",
        "sigfac",
        "diag1",
        "diag2",
        "diag3",
        "diag4",
        "diag5",
        "diag6",
        "op1a",
        "age",
        "cij_marker",
        "cij_start_date",
        "cij_end_date",
        "cij_pattype_code",
        "cij_ipdc",
        "cij_admtype",
        "cij_adm_spec",
        "cij_dis_spec",
        "cost_total_net",
        "hscp",
        "datazone2011",
        "attendance_status",
        "deathdiag1",
        "deathdiag2",
        "deathdiag3",
        "deathdiag4",
        "deathdiag5",
        "deathdiag6",
        "deathdiag7",
        "deathdiag8",
        "deathdiag9",
        "deathdiag10",
        "deathdiag11",
        "yearstay",
        "apr_beddays",
        "may_beddays",
        "jun_beddays",
        "jul_beddays",
        "aug_beddays",
        "sep_beddays",
        "oct_beddays",
        "nov_beddays",
        "dec_beddays",
        "jan_beddays",
        "feb_beddays",
        "mar_beddays"
      )
    ) %>%
    # Check chi is valid using phsmethods function
    # If the CHI is invalid for whatever reason, set the CHI to NA
    dplyr::mutate(
      chi = dplyr::if_else(
        phsmethods::chi_check(.data$chi) != "Valid CHI",
        NA_character_,
        .data$chi
      ),
      gpprac = convert_eng_gpprac_to_dummy(.data[["gpprac"]]),
      # PC8 format may still be used. Ensure here that all datasets are in PC7 format.
      postcode = phsmethods::format_postcode(.data$postcode, "pc7")
    ) %>%
    correct_cij_vars() %>%
    fill_missing_cij_markers() %>%
    add_ppa_flag() %>%
    link_delayed_discharge_eps(year, dd_data) %>%
    add_nsu_cohort(year, nsu_cohort) %>%
    match_on_ltcs(year, ltc_data) %>%
    correct_demographics(year) %>%
    create_cohort_lookups(year) %>%
    join_cohort_lookups(year) %>%
    join_sparra_hhg(year) %>%
    fill_geographies(
      slf_pc_lookup,
      slf_gpprac_lookup
    ) %>%
    join_deaths_data(
      year,
      slf_deaths_lookup
    ) %>%
    load_ep_file_vars(year)

  if (anon_chi_out) {
    episode_file <- slfhelper::get_anon_chi(episode_file)
  }

  if (write_to_disk) {
    # TODO make the slf_path a function
    slf_episode_path <- get_file_path(
      get_year_dir(year),
      stringr::str_glue(
        "source-episode-file-{year}.parquet"
      ),
      check_mode = "write"
    )

    write_file(episode_file, slf_episode_path)
  }

  return(episode_file)
}

#' Store the unneeded episode file variables
#'
#' @param data The in-progress episode file data.
#' @inheritParams create_episode_file
#' @param vars_to_keep a character vector of the variables to keep, all others
#' will be stored.
#'
#' @return `data` with only the `vars_to_keep` kept
store_ep_file_vars <- function(data, year, vars_to_keep) {
  tempfile_path <- get_file_path(
    directory = get_year_dir(year),
    file_name = stringr::str_glue("temp_ep_file_variable_store_{year}.parquet"),
    check_mode = "write",
    create = TRUE
  )

  check_variables_exist(data, vars_to_keep)

  data <- data %>%
    dplyr::mutate(ep_file_row_id = dplyr::row_number())

  vars_to_store <- c("ep_file_row_id", setdiff(names(data), vars_to_keep))

  dplyr::select(
    data,
    dplyr::all_of(vars_to_store)
  ) %>%
    write_file(
      path = tempfile_path
    )

  return(
    dplyr::select(
      data,
      dplyr::all_of(c("ep_file_row_id", vars_to_keep))
    )
  )
}

#' Load the unneeded episode file variables
#'
#' @inheritParams create_episode_file
#' @inheritParams store_ep_file_vars
#'
#' @return The full SLF data.
load_ep_file_vars <- function(data, year) {
  tempfile_path <- get_file_path(
    directory = get_year_dir(year),
    file_name = stringr::str_glue("temp_ep_file_variable_store_{year}.parquet"),
    check_mode = "write",
    create = TRUE
  )

  full_data <- data %>%
    dplyr::left_join(
      read_file(path = tempfile_path),
      by = "ep_file_row_id",
      unmatched = "error",
      relationship = "one-to-one"
    ) %>%
    dplyr::select(!"ep_file_row_id")

  fs::file_delete(tempfile_path)

  return(full_data)
}

#' Fill any missing CIJ markers for records that should have them
#'
#' @inheritParams store_ep_file_vars
#'
#' @return A data frame with CIJ markers filled in for those missing.
fill_missing_cij_markers <- function(data) {
  fixable_data <- data %>%
    dplyr::filter(
      .data[["recid"]] %in% c("01B", "04B", "GLS", "02B", "DD") & !is.na(.data[["chi"]])
    )

  non_fixable_data <- data %>%
    dplyr::filter(
      !(.data[["recid"]] %in% c("01B", "04B", "GLS", "02B", "DD")) | is.na(.data[["chi"]])
    )

  fixed_data <- fixable_data %>%
    dplyr::group_by(.data$chi) %>%
    # We want any NA cij_markers to be filled in, if they are the first in the
    # group and are NA. This is why we use this arrange() before the mutate()
    dplyr::arrange(dplyr::desc(is.na(.data$cij_marker)), .by_group = TRUE) %>%
    dplyr::mutate(cij_marker = dplyr::if_else(
      is.na(.data$cij_marker) & dplyr::row_number() == 1L,
      1L,
      .data$cij_marker
    )) %>%
    dplyr::ungroup() %>%
    # Tidy up cij_ipdc
    dplyr::mutate(cij_ipdc = dplyr::if_else(
      is_missing(.data$cij_ipdc),
      dplyr::case_when(
        .data$ipdc == "I" ~ "I",
        .data$recid == "01B" & .data$ipdc == "D" ~ "D",
        .default = .data$cij_ipdc
      ),
      .data$cij_ipdc
    )) %>%
    # Ensure every record with a CHI has a valid CIJ marker
    dplyr::group_by(.data$chi, .data$cij_marker) %>%
    dplyr::mutate(
      cij_ipdc = max(.data$cij_ipdc),
      cij_admtype = dplyr::first(.data$cij_admtype),
      cij_pattype_code = dplyr::first(.data$cij_pattype_code),
      cij_pattype = dplyr::first(.data$cij_pattype),
      cij_adm_spec = dplyr::first(.data$cij_adm_spec),
      cij_dis_spec = dplyr::last(.data$cij_dis_spec)
    ) %>%
    dplyr::ungroup()

  return_data <- dplyr::bind_rows(non_fixable_data, fixed_data)

  return(return_data)
}

#' Correct the CIJ variables
#'
#' @inheritParams store_ep_file_vars
#'
#' @return The data with CIJ variables corrected.
correct_cij_vars <- function(data) {
  check_variables_exist(
    data,
    c("chi", "recid", "cij_admtype", "cij_pattype_code")
  )

  data %>%
    # Change some values of cij_pattype_code based on cij_admtype
    dplyr::mutate(
      cij_admtype = dplyr::if_else(
        .data[["cij_admtype"]] == "Unknown",
        "99",
        .data[["cij_admtype"]]
      ),
      cij_pattype_code = dplyr::if_else(
        !is.na(.data$chi) & .data$recid %in% c("01B", "04B", "GLS", "02B"),
        dplyr::case_match(
          .data$cij_admtype,
          c("41", "42") ~ 2L,
          c("40", "48", "99") ~ 9L,
          "18" ~ 0L,
          .default = as.integer(.data$cij_pattype_code)
        ),
        .data$cij_pattype_code
      ),
      # Recode cij_pattype based on above
      cij_pattype = dplyr::case_match(
        .data$cij_pattype_code,
        0L ~ "Non-Elective",
        1L ~ "Elective",
        2L ~ "Maternity",
        9L ~ "Other"
      )
    )
}

#' Create cost total net inc DNA
#'
#' @inheritParams store_ep_file_vars
#'
#' @return The data with cost including dna.
create_cost_inc_dna <- function(data) {
  check_variables_exist(data, c("cost_total_net", "attendance_status"))

  # Create cost including DNAs and modify costs
  # not including DNAs using cattend
  data %>%
    dplyr::mutate(
      cost_total_net_inc_dnas = .data$cost_total_net,
      # In the Cost_Total_Net column set the cost for
      # those with attendance status 5 or 8 (CNWs and DNAs)
      cost_total_net = dplyr::if_else(
        .data$attendance_status %in% c(5L, 8L),
        0.0,
        .data$cost_total_net
      )
    )
}

#' Create the cohort lookups
#'
#' @inheritParams store_ep_file_vars
#' @inheritParams create_demographic_cohorts
#'
#' @return The data unchanged (the cohorts are written to disk)
create_cohort_lookups <- function(data, year, update = latest_update()) {
  # Use future so the cohorts can be created simultaneously (in parallel)
  future::plan(strategy = future.callr::callr, .skip = TRUE)
  options(future.globals.maxSize = 21474836480)

  future_demographic <- future::future({
    create_demographic_cohorts(
      data,
      year,
      update,
      write_to_disk = TRUE
    )
  })
  future_service_use <- future::future({
    create_service_use_cohorts(
      data,
      year,
      update,
      write_to_disk = TRUE
    )
  })

  # This 'blocks' the code until they have both finished executing
  value_demographic <- future::value(future_demographic)
  value_service_use <- future::value(future_service_use)

  return(data)
}

#' Join cohort lookups
#'
#' @inheritParams store_ep_file_vars
#' @inheritParams get_demographic_cohorts_path
#' @param demographic_cohort,service_use_cohort The cohort data
#'
#' @return The data including the Demographic and Service Use lookups.
join_cohort_lookups <- function(
    data,
    year,
    update = latest_update(),
    demographic_cohort = read_file(
      get_demographic_cohorts_path(year, update),
      col_select = c("chi", "demographic_cohort")
    ),
    service_use_cohort = read_file(
      get_service_use_cohorts_path(year, update),
      col_select = c("chi", "service_use_cohort")
    )) {
  join_cohort_lookups <- data %>%
    dplyr::left_join(
      demographic_cohort,
      by = "chi"
    ) %>%
    dplyr::left_join(
      service_use_cohort,
      by = "chi"
    )

  return(join_cohort_lookups)
}
github-actions[bot] commented 1 year ago

Closed in 6f2cb827b2ac8627926b4f6a23120304b0897229