al-obrien / farrago

GNU General Public License v3.0
3 stars 0 forks source link

Add a chunked function for SAS loads #2

Open al-obrien opened 1 month ago

al-obrien commented 1 month ago

Loading SAS-formatted datasets is often difficult, not only because it is slow but also because of memory limits.

The following function, although limited by the speed of haven::read_sas(), allows filtering on a column, similar to the other chunked functions in the package:

#' Load a SAS file in chunks, keeping only rows that match a filter
#'
#' Reads `file_location` with `haven::read_sas()` in blocks of `chunk_size`
#' rows, keeps the rows of each block whose `filter_col` value is in
#' `filter_v`, and row-binds the surviving rows into a single data.table.
#' Blocks are mapped with `furrr::future_map()`, so how they are scheduled
#' depends on the `future::plan()` that is active when this is called.
#'
#' The file is first read once with only its first column selected in order
#' to count the rows; that count determines how many blocks are read.
#'
#' @param file_location Path to the SAS data file, passed to
#'   `haven::read_sas()`.
#' @param filter_col Unquoted name of the column to filter on. It is evaluated
#'   inside each loaded block, so it must be among the columns kept by
#'   `col_select`.
#' @param filter_v Vector of values to keep. Rows are retained when
#'   `filter_col %in% filter_v`, so the values must be comparable with the
#'   column as it is loaded.
#' @param col_select Column selection forwarded to `haven::read_sas()`.
#'   The default `NULL` is passed through unchanged.
#' @param chunk_function Currently unused: the built-in read-and-filter step
#'   is always applied to every block.
#' @param chunk_size Number of rows per block; a single number >= 1.
#' @param ... Further arguments forwarded to `haven::read_sas()` for every
#'   block.
#'
#' @return A data.table holding the filtered rows of all blocks, in block
#'   order.
sas_chunked <- function(file_location,
                        filter_col,
                        filter_v,
                        col_select = NULL,
                        chunk_function = NULL,
                        chunk_size = 1e6L,
                        ...) {

  # A zero, negative or missing chunk size would otherwise surface later as a
  # division by zero or an obscure sequence error.
  stopifnot(
    is.numeric(chunk_size),
    length(chunk_size) == 1L,
    chunk_size >= 1
  )

  # Capture the column expressions unevaluated; they are resolved later
  # against the columns of each loaded block.
  filter_col <- substitute(filter_col)
  col_select <- rlang::enquo(col_select)

  # Evaluate now so the closure handed to the workers carries a value rather
  # than a promise that still points at the caller's environment.
  force(filter_v)

  # Chunking prep: a single-column read is enough to count the rows.
  num_rows <- nrow(haven::read_sas(file_location, col_select = 1L))

  # `1:n` counts down when n is 0 (yielding two blocks for an empty file), so
  # build the indices with seq_len() and read exactly one block in that case.
  n_chunks <- max(1, ceiling(num_rows / chunk_size))

  # Zero-based block indices, kept as doubles so that `chunk_size * i` cannot
  # overflow the integer range on very long files.
  chunk_index <- seq_len(n_chunks) - 1

  # Read block `i` (rows i * chunk_size + 1 onward) and keep matching rows.
  read_filtered_chunk <- function(i) {
    chunk <- data.table::as.data.table(
      haven::read_sas(file_location,
                      col_select = !!col_select,
                      skip = chunk_size * i,
                      n_max = chunk_size,
                      ...))
    chunk[eval(filter_col) %in% filter_v]
  }

  chunk_list <- furrr::future_map(chunk_index, read_filtered_chunk)
  data.table::rbindlist(chunk_list)
}

# Example usage: choose the parallel backend before calling sas_chunked(),
# which maps its chunks with furrr and therefore follows the active plan.
future::plan("multisession", workers = 4)

# `file_location`, FILTERCOL, FILTERVALUE and COLS are placeholders for the
# file path, the column to filter on, the values to keep and the columns to
# load.
sas_chunked(file_location,
            filter_col = FILTERCOL,
            filter_v = FILTERVALUE, # Must be in the right format for the col to filter properly
            col_select = COLS) # Including the filter col