nhs-r-community / demos-and-how-tos

A repo for community contributed demos and how-tos to get common stuff done in the R language
https://nhs-r-community.github.io/demos-and-how-tos/
MIT License
29 stars 13 forks source link

Add age band grouped data #48

Open Lextuga007 opened 1 year ago

Lextuga007 commented 1 year ago

From the Slack discussion in help-with-r, these were the code suggestions (contributions from @tomjemmett, @sebastian-fox, @zx8754 and @TimTaylor):

Currently, five styles of approach stand out:

  1. dplyr::recode()
  2. factor approaches (base or forcats)
  3. Named vector.
  4. Explicit joins (base merge, dplyr or data.table)
  5. dplyr::case_when() / data.table::fcase()
# Collapse the four youngest age bands into a single "0-19" group.
# Ages outside those bands match no case_when() branch and therefore stay NA.
list_of_csv[["Lancaster"]] %>%
  mutate(
    # Create categories; %in% replaces the chained `==` / `|` comparisons
    age_group = dplyr::case_when(
      age %in% c("00_04", "05_09", "10_14", "15_19") ~ "0-19"
    ),
    # Convert to factor; `levels` is spelled out in full rather than relying
    # on partial matching of `level`
    age_group = factor(
      age_group,
      levels = c("0-19")
    )
  )

The `cut()` approach below needs numeric ages, so it doesn't work with character age bands:

 # Bin integer ages into labelled bands. right = FALSE makes every interval
 # closed on the left -- [0, 5), [5, 10), ... -- so age 0 falls in "0-4"
 # instead of becoming NA, and ages 5, 10, 15 and 20 start their own band
 # rather than landing in the band below.
 cut(
   0:25,
   breaks = c(0, 5, 10, 15, 20, Inf),
   labels = c("0-4", "5-9", "10-14", "15-19", "20+"),
   right = FALSE
 )
 #>  [1] 0-4   0-4   0-4   0-4   0-4   5-9   5-9   5-9   5-9   5-9   10-14 10-14
 #> [13] 10-14 10-14 10-14 15-19 15-19 15-19 15-19 15-19 20+   20+   20+   20+
 #> [25] 20+   20+
 #> Levels: 0-4 5-9 10-14 15-19 20+
# Split ages two ways: the four youngest bands become "0-19" and every other
# value (a missing age included, since %in% never returns NA) falls through
# to the "Other age bands" default.
list_of_csv[["Lancaster"]] %>%
  mutate(
    # Create categories
    age_group = dplyr::case_when(
      age %in% c("00_04", "05_09", "10_14", "15_19") ~ "0-19",
      TRUE ~ "Other age bands"
    ),
    # Convert to factor. Both labels must be listed: factor() turns any value
    # that is missing from `levels` into NA, which would silently wipe out
    # the default group.
    age_group = factor(
      age_group,
      levels = c("0-19", "Other age bands")
    )
  )
# lookup approach: a named character vector whose names are the source age
# bands and whose values are the collapsed bands
lookup <- setNames(
  c(rep("00_19", 4L), "20_29", "30_99"),
  c("00_04", "05_09", "10_14", "15_19", "20_29", "30_99")
)

# draw 20 dummy age bands (with replacement) and show them
dat <- sample(names(lookup), size = 20L, replace = TRUE)
print(dat)
#>  [1] "15_19" "15_19" "10_14" "15_19" "30_99" "20_29" "05_09" "30_99" "10_14"
#> [10] "30_99" "10_14" "10_14" "05_09" "30_99" "05_09" "00_04" "20_29" "15_19"
#> [19] "20_29" "15_19"

# index the lookup by name to translate every band in one step
data.frame(old = dat, new = lookup[dat])
#>      old   new
#> 1  15_19 00_19
#> 2  15_19 00_19
#> 3  10_14 00_19
#> 4  15_19 00_19
#> 5  30_99 30_99
#> 6  20_29 20_29
#> 7  05_09 00_19
#> 8  30_99 30_99
#> 9  10_14 00_19
#> 10 30_99 30_99
#> 11 10_14 00_19
#> 12 10_14 00_19
#> 13 05_09 00_19
#> 14 30_99 30_99
#> 15 05_09 00_19
#> 16 00_04 00_19
#> 17 20_29 20_29
#> 18 15_19 00_19
#> 19 20_29 20_29
#> 20 15_19 00_19
# example data using @TimTaylorUKHSA lookup
library(forcats)
library(dplyr)

# seed the RNG so the ten sampled age bands match the output shown below
set.seed(1)
dat <- data.frame(
  age = sample(names(lookup), size = 10L, replace = TRUE)
)

# Collapse the four youngest bands in two ways:
#   ageForcats - forcats::fct_collapse() merging the listed levels into `00_19`
#   ageFactor  - base factor() with the same label repeated for several levels
dat %>%
  mutate(
    ageForcats = fct_collapse(
      age,
      `00_19` = c("00_04", "05_09", "10_14", "15_19")
    ),
    ageFactor = factor(
      age,
      levels = c("00_04", "05_09", "10_14", "15_19", "20_29", "30_99"),
      labels = c("00_19", "00_19", "00_19", "00_19", "20_29", "30_99")
    )
  )

#      age ageForcats ageFactor
# 1  00_04      00_19     00_19
# 2  15_19      00_19     00_19
# 3  00_04      00_19     00_19
# 4  05_09      00_19     00_19
# 5  20_29      20_29     20_29
# 6  10_14      00_19     00_19
# 7  30_99      30_99     30_99
# 8  05_09      00_19     00_19
# 9  10_14      00_19     00_19
# 10 10_14      00_19     00_19

A more compact alternative, but possibly too opaque?

# Recode the four youngest bands via the factor's integer codes.
# The levels are listed explicitly so those bands are always codes 1-4.
# With default levels (sorted unique values of the data) a band absent from
# dat$age shifts the codes, so `< 5` would recode the wrong rows -- and a
# second run, after "00_19" is already present, would collapse every row.
# Values not in `levels` get an NA code and are left untouched.
dat$age[
  as.integer(
    factor(
      dat$age,
      levels = c("00_04", "05_09", "10_14", "15_19", "20_29", "30_99")
    )
  ) < 5
] <- "00_19"