Add a chunked function for SAS loads

Loading SAS-formatted datasets is often difficult, not only because it is slow but also because the data may not fit within memory limits.

The following function, although still limited by the speed of haven::read_sas(), reads the file in chunks and filters each chunk on a column, similar to the other chunked functions in the package.

#' Read a SAS dataset in chunks, keeping only the rows that match a filter
#'
#' The file is read with `haven::read_sas()` one chunk at a time (via
#' `furrr::future_map()`, so the active `future::plan()` decides whether the
#' chunks are read in parallel). Each chunk is filtered immediately, so only
#' the matching rows of every chunk are kept before the chunks are combined.
#'
#' @param file_location Path to the SAS data file, passed to
#'   `haven::read_sas()`.
#' @param filter_col Unquoted column name (or expression) to filter on. It is
#'   captured with `substitute()` and evaluated inside each chunk, so the
#'   column must be part of `col_select` when `col_select` is supplied.
#' @param filter_v Vector of values to keep; rows are kept when `filter_col`
#'   is `%in%` this vector. The values must have a type that matches the
#'   column for the comparison to succeed.
#' @param col_select Columns to read, forwarded to the `col_select` argument
#'   of `haven::read_sas()`. `NULL` (default) reads all columns.
#' @param chunk_function Currently unused; accepted for interface
#'   compatibility only.
#' @param chunk_size Number of rows to read per chunk. Must be a single
#'   number greater than or equal to 1.
#' @param ... Further arguments forwarded to `haven::read_sas()`. Do not pass
#'   `skip` or `n_max`; they are set per chunk by this function.
#'
#' @return A `data.table` with the filtered rows of all chunks row-bound
#'   together.
sas_chunked <- function(file_location,
                        filter_col,
                        filter_v,
                        col_select = NULL,
                        chunk_function = NULL,
                        chunk_size = 1e6L,
                        ...) {

  # A zero, negative, or missing chunk size would otherwise surface as an
  # obscure error when building the chunk sequence.
  stopifnot(
    is.numeric(chunk_size),
    length(chunk_size) == 1L,
    !is.na(chunk_size),
    chunk_size >= 1
  )

  # Capture the unevaluated filter column and column selection so they can be
  # re-used inside every chunk read.
  filter_col <- substitute(filter_col)
  col_select <- rlang::enquo(col_select)

  # Chunking prep: count rows by reading only the first column of the file.
  num_rows <- nrow(haven::read_sas(file_location, col_select = 1L))

  # Always read at least one chunk so that an empty file still yields a
  # (zero-row) table with the expected columns. seq_len() avoids the
  # `1:0 == c(1, 0)` trap of the colon operator.
  n_chunks <- max(1L, ceiling(num_rows / chunk_size))
  chunk_offsets <- seq_len(n_chunks) - 1L

  # Reads chunk `i` (zero-based) and keeps only the rows matching the filter.
  default_chunk_function <- function(i) {
    tmp <- data.table::as.data.table(
      haven::read_sas(file_location,
                      col_select = !!(col_select),
                      skip = chunk_size * i,
                      n_max = chunk_size,
                      ...))
    tmp[eval(filter_col) %in% filter_v]
  }

  chunk_list <- furrr::future_map(chunk_offsets, default_chunk_function)

  data.table::rbindlist(chunk_list)
}

# Example usage: read the chunks in parallel across 4 background R sessions.
future::plan("multisession", workers = 4)

# `file_location`, `FILTERCOL`, `FILTERVALUE` and `COLS` are placeholders for
# the path to the SAS file, the column to filter on, the values to keep, and
# the columns to read.
sas_chunked(file_location,
            filter_col = FILTERCOL,
            # Must be in the right format for the col to filter properly
            filter_v = FILTERVALUE,
            # Including the filter col
            col_select = COLS)

al-obrien / farrago

Add a chunked function for SAS loads #2