Closed ldecicco-USGS closed 4 weeks ago
Shifted this to making a sub-function of importWQP. This allows much easier loops, which in turn allow much easier benchmarking:
# Pull the WQP annual summary of phosphorus-in-water records for Wisconsin.
wi_phos <- readWQPsummary(statecode = "WI",
                          sampleMedia = "Water",
                          characteristicName = "Phosphorus")

library(dplyr)

# Keep only long-record, data-rich sites: more than 100 results spanning a
# period of record (POR) of at least 20 years, ordered busiest-first.
wi_phos_summary <- wi_phos |>
  rename(Site = MonitoringLocationIdentifier) |>
  mutate(Lat = as.numeric(MonitoringLocationLatitude),
         Lon = as.numeric(MonitoringLocationLongitude)) |>
  group_by(Site, Lat, Lon, Provider) |>
  summarise(min_year = min(YearSummarized),
            max_year = max(YearSummarized),
            count = sum(ResultCount),
            # Drop all grouping here: silences the "grouped output"
            # message and makes a trailing ungroup() unnecessary.
            .groups = "drop") |>
  mutate(POR = max_year - min_year) |>
  filter(count > 100,
         POR >= 20) |>
  arrange(desc(count))
# 102 sites
# Benchmark 1: one readWQPdata() request covering every qualifying site.
system.time({
  p1 <- readWQPdata(siteid = unique(wi_phos_summary$Site),
                    characteristicName = "Phosphorus",
                    sampleMedia = "Water")
})
# user system elapsed
# 2.39 0.33 30.16
# Split by providers
# Benchmark 2: same sites, but one raw request per provider
# (ignore_attributes/convertType skip per-call post-processing),
# followed by a single type-conversion pass at the end.
system.time({
  df_lists <- split(wi_phos_summary, wi_phos_summary$Provider)

  # Build one raw data frame per provider, then combine once.
  # Avoids growing a data frame with bind_rows() inside a loop,
  # which copies the accumulator on every iteration (quadratic).
  all_data <- df_lists |>
    lapply(function(df) {
      readWQPdata(siteid = unique(df$Site),
                  providers = unique(df$Provider),
                  characteristicName = "Phosphorus",
                  sampleMedia = "Water",
                  ignore_attributes = TRUE,
                  convertType = FALSE)
    }) |>
    dplyr::bind_rows()

  # NOTE(review): ::: reaches into dataRetrieval's unexported API;
  # this can break without notice on package updates.
  all_data <- dataRetrieval:::parse_WQP(all_data)
})
# user system elapsed
# 1.66 0.35 2.31 !
# Split by providers but ALL data
# Benchmark 3: statewide (ALL sites) download, one raw request per
# provider, with a single type-conversion pass at the end.
system.time({
  # One request per provider, combined once at the end -- avoids
  # growing a data frame with bind_rows() inside a loop (quadratic
  # copying on every iteration).
  all_data_ALL <- c("NWIS", "STORET") |>
    lapply(function(provider) {
      readWQPdata(statecode = "WI",
                  providers = provider,
                  characteristicName = "Phosphorus",
                  sampleMedia = "Water",
                  ignore_attributes = TRUE,
                  convertType = FALSE)
    }) |>
    dplyr::bind_rows()

  # NOTE(review): ::: reaches into dataRetrieval's unexported API;
  # this can break without notice on package updates.
  all_data_ALL <- dataRetrieval:::parse_WQP(all_data_ALL)
})
# Timeout error
I'm working on updating some documentation, and it would be really helpful if importWQP (and probably all the other functions) could accept a data frame. That way we could do this
rather than having to save an intermediate file — it really speeds things up.