kevinwolz / hisafer

An R toolbox for the Hi-sAFe biophysical agroforestry model
6 stars 4 forks source link

Add function that estimates output file sizes using just a hip #108

Open kevinwolz opened 6 years ago

kevinwolz commented 6 years ago

Here's a start:

#' Estimate output file sizes from a hip object (work in progress)
#'
#' Unfinished draft. It validates `hip`, keeps the rows of
#' `SUPPORTED.PROFILES` whose profile name appears in `hip$profiles`, and
#' pulls the parameters that are expected to drive output size out of the
#' used-parameter list. The per-profile cell counting is not implemented yet,
#' so the function currently returns `NULL`.
#'
#' @param hip An object that passes `is_hip()`.
#' @return Currently `NULL`. NOTE(review): the intent appears to be an
#'   estimated size per output file, derived from the expected number of
#'   spreadsheet cells -- confirm once the loop body is written.
est_output_file_size <- function(hip) {
  is_hip(hip, error = TRUE)

  # Profiles requested in this hip, without their description column.
  profs <- SUPPORTED.PROFILES %>%
    dplyr::filter(profiles %in% hip$profiles) %>%
    dplyr::select(-description)

  USED_PARAMS <- get_used_params(hip)
  get_used  <- function(param) unlist(USED_PARAMS[[param]]$value)
  get_table <- function(param) USED_PARAMS[[param]]$value

  plotWidth         <- get_used("plotWidth")
  plotHeight        <- get_used("plotHeight")
  nbSimulations     <- get_used("nbSimulations")
  voxelThicknessMax <- get_used("voxelThicknessMax")
  # Total depth of each layers table: the largest cumulative layer thickness.
  soilDepth <- purrr::map_dbl(
    get_table("layers"),
    function(x) max(cumsum(x$thick))
  )

  # Names of the requested profiles that match the pattern `x`.
  get_profs <- function(x) {
    profs$profiles[stringr::str_detect(profs$profiles, x)]
  }

  for (prof in get_profs("plot")) {
    # TODO: count the cells expected in the output file of profile `prof`.
  }

  # Placeholder result until the estimate is implemented.
  NULL
}
kevinwolz commented 6 years ago

Here's some "training data" for estimating file size based simply on the number of spreadsheet cells in the output file. The above function should calculate the number of spreadsheet cells expected in the output file and then use this regression to calculate the file size.

# Observed output files: size on disk (size.mb) against the number of
# spreadsheet cells each file contains (n.cells).
training.data <- dplyr::tibble(
  size.mb = c(
    10.3, 12.2, 3.1, 2.9,
    46.6, 114.3, 0.427, 28.1,
    0.703, 655.7, 5.1, 0.082,
    167.8, 100.7, 830.7, 393.6,
    9.6, 92.9, 7.9, 51.9,
    499.4, 167.8
  ),
  n.cells = c(
    915800, 659855, 360684, 263010,
    4195233, 5850073, 38456, 1422127,
    60214, 30547935, 482000, 8942,
    9900468, 6204870, 32768799, 21501935,
    867844, 7206738, 710127, 2965578,
    20968013, 7705206
  )
)

# Two candidate fits: a line through the origin and a quadratic.
lm.out <- lm(formula = size.mb ~ n.cells - 1, data = training.data)
poly.out <- lm(formula = size.mb ~ n.cells + I(n.cells^2), data = training.data)

# Prediction grid covering the observed cell counts in steps of 1000.
n.range <- range(training.data$n.cells)
x.pred <- seq(from = n.range[1], to = n.range[2], by = 1000)

# Quadratic prediction via predict() ...
poly.pred <- dplyr::tibble(
  size.mb = predict(poly.out, newdata = data.frame(n.cells = x.pred)),
  n.cells = x.pred
)

# ... and the same curve computed by hand from the fitted coefficients.
poly.coefs <- coef(poly.out)
poly.pred.man <- dplyr::tibble(
  size.mb = poly.coefs[1] + poly.coefs[2] * x.pred + poly.coefs[3] * x.pred^2,
  n.cells = x.pred
)

# Points = observations, black line = through-origin fit, red = predict(),
# dashed blue = manual quadratic (should sit on top of the red line).
ggplot(data = training.data, mapping = aes(x = n.cells, y = size.mb)) +
  geom_point() +
  geom_abline(slope = coef(lm.out)) +
  geom_line(data = poly.pred, color = "red") +
  geom_line(data = poly.pred.man, color = "blue", linetype = "dashed") +
  labs(x = "Number of cells in file", y = "File size (Mb)")

summary(poly.out)
coef(poly.out)