Last observation carried forward with time constraints

prockenschaub commented 5 years ago

Hi all,

I have created a function that performs a last observation carried forward that takes account of how long ago the last observation was recorded. Code for the function below.

library(tibble)
library(lubridate)
library(zoo)
library(testthat)

locf_window <- function(x, by, date, window, unit = "hours"){
  # Perform last observation carried forward (LOCF) based on the time difference 
  # to the last measured observation. Allow for stratification by identifier 
  # (e.g.patient ID) 
  #
  # Parameters 
  # x : character or numeric
  #    vector of measurements on which to perform LOCF 
  # by : character or numeric 
  #   vector indicating the group to stratify by 
  # date : datetime
  #   vector of times at which the value in 'x' was attempted to be measured. 
  # window : numeric 
  #   length of the time window 
  # units : character 
  #    units of the time window 

  if (is.character(x)) { 
    placeholder <- "-Infinity" 
  } else if (is.numeric(x)) { 
    placeholder <- -Inf 
  } else { 
    stop("vector 'x' must either be character or numeric") 
  }

  x <- if_else(is.na(x) & by != lag(by), placeholder, x) 
  date_measure <- as_datetime(ifelse(!is.na(x), date, NA)) 
  date_measure <- zoo::na.locf(date_measure) 
  n_measure <- unlist(tapply(!is.na(x), by, cumsum)) 
  date_measure <- as_datetime(ifelse(n_measure != 0, date_measure, NA)) 

  x <- if_else(is.na(x) & !is.na(date_measure) & 
                 time_length(lag(date_measure) %--% date, unit = unit) < window, 
               zoo::na.locf(x, na.rm = FALSE), x, x) 
  x[x == placeholder] <- NA 
  x 
}

test_locf <- tribble( 
  ~patid, ~start_date , ~value, 
  1, ymd_hms("2010-01-05 12:00:00"), 5, 
  1, ymd_hms("2010-01-05 13:00:00"), NA, 
  1, ymd_hms("2010-01-05 15:59:59"), NA, 
  1, ymd_hms("2010-01-05 17:00:00"), NA, 
  1, ymd_hms("2010-01-05 18:00:00"), 10, 
  2, ymd_hms("2010-01-05 13:00:00"), NA, 
  2, ymd_hms("2010-01-05 14:00:00"), NA, 
  2, ymd_hms("2010-01-05 15:00:00"), 2, 
  2, ymd_hms("2010-01-05 15:31:01"), NA, 
  2, ymd_hms("2010-01-06 16:00:00"), NA 
) %>% as.data.table()

with(test_locf, {
  expect_identical(locf_window(value, start_date, 4, by = patid), c(5, 5, 5, NA, 10, NA, NA, 2, 2, NA))
  expect_error(locf_window(list(2), ymd_hms("2010-01-06 16:00:00"), 4, by = patid))
})

The function generally seems to work, but it seems overly complicated. Does anyone have an idea of how to simplify or speed up the function?

P

alhenry commented 5 years ago

Here's an attempt using data.table package. The idea is to first create a within-stratum/ID temporary grouping variable for each sets of observation ordered by date, so that each group contains last non-missing observation + subsequent missing observation.

library(data.table)
library(lubridate)
#> 
#> Attaching package: 'lubridate'
#> The following objects are masked from 'package:data.table':
#> 
#>     hour, isoweek, mday, minute, month, quarter, second, wday,
#>     week, yday, year
#> The following object is masked from 'package:base':
#> 
#>     date

test_locf <- tibble::tribble( 
  ~patid, ~start_date , ~value, 
  1, ymd_hms("2010-01-05 12:00:00"), 5, 
  1, ymd_hms("2010-01-05 13:00:00"), NA, 
  1, ymd_hms("2010-01-05 15:59:59"), NA, 
  1, ymd_hms("2010-01-05 17:00:00"), NA, 
  1, ymd_hms("2010-01-05 18:00:00"), 10, 
  2, ymd_hms("2010-01-05 13:00:00"), NA, 
  2, ymd_hms("2010-01-05 14:00:00"), NA, 
  2, ymd_hms("2010-01-05 15:00:00"), 2, 
  2, ymd_hms("2010-01-05 15:31:01"), NA, 
  2, ymd_hms("2010-01-06 16:00:00"), NA 
)

locf_window <- function(df, value, by, date, window, unit = "hours"){
  # Parameters
  # df = data.frame with value, by, date columns
  # value = column name  to perform LOCF
  # by = grouping column (e.g. ID)
  # date = date column (created using lubridate package)
  # window = maximum allowed time window for LOCF
  # unit = time unit (passed to lubridate::time_length function)

  DT <- data.table(df, key = c(by,date))
  setnames(DT, c(value, by, date), c("value", "by", "date"))

  # Create a temporary grouping variable per non-missing observation per patient
  DT[!is.na(value), tmp_group := seq_along(value)]

  # Impute grouping for missing obs with zoo::na.locf  (na.rm=F will keep missing first observation)
  DT[, tmp_group := zoo::na.locf(tmp_group, na.rm=F), by=by]

  # Fill in missing observation if time_length < window by group
  DT[ , value := ifelse(time_length(`[`(date, 1L) %--% date, unit = unit) < window,
                        `[`(value, 1L), NA),
      by=c("by", "tmp_group")]

  setnames(DT, c("value", "by", "date"), c(value, by, date))

  return(DT[,-"tmp_group"])
}

locf_window(test_locf, "value", "patid", "start_date", 4)
#>     patid          start_date value
#>  1:     1 2010-01-05 12:00:00     5
#>  2:     1 2010-01-05 13:00:00     5
#>  3:     1 2010-01-05 15:59:59     5
#>  4:     1 2010-01-05 17:00:00    NA
#>  5:     1 2010-01-05 18:00:00    10
#>  6:     2 2010-01-05 13:00:00    NA
#>  7:     2 2010-01-05 14:00:00    NA
#>  8:     2 2010-01-05 15:00:00     2
#>  9:     2 2010-01-05 15:31:01     2
#> 10:     2 2010-01-06 16:00:00    NA

^{Created on 2019-08-08 by the reprex package (v0.3.0)}

The code is a bit lengthy as I'm not very familiar with data.table syntaxes for programming (hence the double setnames) but hopefully this can start the discussion. Any idea to improve this further is welcome :)

prockenschaub commented 5 years ago

First, thanks Albert for looking into this, really appreciated!

Second, sorry for the absolutely crap "reproducible example" which is absolutely not reproducible. Just tried it and it failed horribly. That happens if you screenread something from the Data Safe Haven, then make changes for it to use tidyverse instead of data.table, and then not even try to run it... Example should now be fixed.

Third, I love the temp_group approach and particularly how you get the first element in each group with simple list subsetting: `[`(date, 1L) %--% date. I never really thought about this. The only thing is I would make it more readable by simply doing date[1] %--% date, which I believe should do the same.

Finally, the main problem why this solution doesn't work for dataset is speed. The main problem are the by-statements, which call na.locf() and time_length(), which are both expensive functions, separately for each temp_group:

# Impute grouping for missing obs with zoo::na.locf  (na.rm=F will keep missing first observation)
  DT[, tmp_group := zoo::na.locf(tmp_group, na.rm=F), by=by]

  # Fill in missing observation if time_length < window by group
  DT[ , value := ifelse(time_length(`[`(date, 1L) %--% date, unit = unit) < window,
                        `[`(value, 1L), NA),
      by=c("by", "tmp_group")]

Running the two solutions on a dataset of 100,000 observations takes 83 milliseconds for the code without by and 27 seconds for the code with by (a factor of 325). So the goal is to avoid the by.

Here is quick and dirty code to run the timing of large samples yourself (I will clean it up if I have time):


num_pats <- 1000000L
set.seed(1234)

dt <- data.table(
  patid = 1L:num_pats,
  num_obs = 1L + rpois(num_pats, 8),
  day = make_date(2015, round(runif(num_pats, 1, 12)), round(runif(num_pats, 1, 28)))
)

dt <- dt[, .(val = as.numeric(rpois(num_obs, 10))), by = .(patid, day)]
dt[, datetime := as_datetime(day) %m+% seconds(floor(runif(nrow(dt), 0, 24*60*60)))]
dt[rbinom(val, 1, 0.3) == 0, val := NA]

setorder(dt, patid, datetime)

up_to <- 100000L
microbenchmark::microbenchmark(with(dt[1:up_to], locf_window_patrick(val, datetime, 4, by = patid)), times = 1L)
microbenchmark::microbenchmark(locf_window_ablert(dt[1:up_to], "val", "patid", "datetime", 4), times = 1L)

ucl-ihi / CodeClub

Last observation carried forward with time constraints #7