potential `arrow` incorporation

Could the package eventually be written in such a way that the pixel level population data never actually gets loaded into memory?

Here's how we can generate domain level estimates using arrow....

library(arrow)
library(dplyr)

NV <- readRDS("~/Downloads/NV.rds")
NVy <- NV$tsumdatp
NVx <- NV$pltassgn

plt <- left_join(NVy, NVx, by = c("CN" = "PLT_CN")) %>%
  filter(MEASYEAR == "2019") %>%
  mutate(sqrtbio = sqrt(DRYBIO_AG_TPA_live_ADJ_TONS)) |> 
  select(COUNTYFIPS, tcc16, elev, sqrtbio)

mod1 <- lme4::lmer(sqrtbio ~ tcc16 + elev + (1 | COUNTYFIPS), data = plt[plt$sqrtbio > 0, ])
mod2 <- lme4::glmer(sqrtbio != 0 ~ tcc16 + elev + (1 | COUNTYFIPS), data = plt, family = "binomial")

lin_cfs <- summary(mod1)$coefficients[ ,1, drop = TRUE]
names(lin_cfs) <- NULL
log_cfs <- summary(mod2)$coefficients[ ,1, drop = TRUE]
names(log_cfs) <- NULL

lkp_tab_lin <- data.frame(
  COUNTYFIPS = rownames(lme4::ranef(mod1)[[1]]),
  reff_lin = lme4::ranef(mod1)[[1]][,1]
)

lkp_tab <- data.frame(
  COUNTYFIPS = rownames(lme4::ranef(mod2)[[1]]),
  reff_log = lme4::ranef(mod2)[[1]][,1]
) |> 
  left_join(lkp_tab_lin) |> 
  mutate(reff_lin = replace_na(reff_lin, 0))

# runs in ~ 9 seconds
# reminder: popdat is ~33 million rows
popdat <- arrow::open_csv_dataset("~/Downloads/Pixel_data.csv") |> 
  rename(elev = dem) |> 
  mutate(COUNTYFIPS = as.character(COUNTYFIPS)) |> 
  select(COUNTYFIPS, tcc16, elev) |> 
  left_join(lkp_tab, by = "COUNTYFIPS") |> 
  mutate(pred_lin = lin_cfs[1] + tcc16 * lin_cfs[2] + elev * lin_cfs[3] + reff_lin,
         pred_log = 1/(1 + exp(log_cfs[1] + tcc16 * log_cfs[2] + elev * log_cfs[3] + reff_log)),
         pred = pred_lin * pred_log) |> 
  group_by(COUNTYFIPS) |> 
  summarise(est = mean(pred)) |> 
  collect()

harvard-ufds / saeczi

potential `arrow` incorporation #23