Closed joshyam-k closed 7 months ago
Could the package eventually be written in such a way that the pixel level population data never actually gets loaded into memory?
Here's how we can generate domain level estimates using arrow....
arrow
library(arrow) library(dplyr) NV <- readRDS("~/Downloads/NV.rds") NVy <- NV$tsumdatp NVx <- NV$pltassgn plt <- left_join(NVy, NVx, by = c("CN" = "PLT_CN")) %>% filter(MEASYEAR == "2019") %>% mutate(sqrtbio = sqrt(DRYBIO_AG_TPA_live_ADJ_TONS)) |> select(COUNTYFIPS, tcc16, elev, sqrtbio) mod1 <- lme4::lmer(sqrtbio ~ tcc16 + elev + (1 | COUNTYFIPS), data = plt[plt$sqrtbio > 0, ]) mod2 <- lme4::glmer(sqrtbio != 0 ~ tcc16 + elev + (1 | COUNTYFIPS), data = plt, family = "binomial") lin_cfs <- summary(mod1)$coefficients[ ,1, drop = TRUE] names(lin_cfs) <- NULL log_cfs <- summary(mod2)$coefficients[ ,1, drop = TRUE] names(log_cfs) <- NULL lkp_tab_lin <- data.frame( COUNTYFIPS = rownames(lme4::ranef(mod1)[[1]]), reff_lin = lme4::ranef(mod1)[[1]][,1] ) lkp_tab <- data.frame( COUNTYFIPS = rownames(lme4::ranef(mod2)[[1]]), reff_log = lme4::ranef(mod2)[[1]][,1] ) |> left_join(lkp_tab_lin) |> mutate(reff_lin = replace_na(reff_lin, 0)) # runs in ~ 9 seconds # reminder: popdat is ~33 million rows popdat <- arrow::open_csv_dataset("~/Downloads/Pixel_data.csv") |> rename(elev = dem) |> mutate(COUNTYFIPS = as.character(COUNTYFIPS)) |> select(COUNTYFIPS, tcc16, elev) |> left_join(lkp_tab, by = "COUNTYFIPS") |> mutate(pred_lin = lin_cfs[1] + tcc16 * lin_cfs[2] + elev * lin_cfs[3] + reff_lin, pred_log = 1/(1 + exp(log_cfs[1] + tcc16 * log_cfs[2] + elev * log_cfs[3] + reff_log)), pred = pred_lin * pred_log) |> group_by(COUNTYFIPS) |> summarise(est = mean(pred)) |> collect()
probably not within scope of package
Could the package eventually be written in such a way that the pixel level population data never actually gets loaded into memory?
Here's how we can generate domain level estimates using
arrow
....