DOI-USGS / dataRetrieval

This R package is designed to obtain USGS or EPA water quality sample data, streamflow data, and metadata directly from web services.
https://doi-usgs.github.io/dataRetrieval/
Other
256 stars 85 forks source link

allow data frame as input to importWQP #697

Closed ldecicco-USGS closed 4 weeks ago

ldecicco-USGS commented 4 months ago

I'm working on updating some documentation, and it would be really helpful if importWQP (and probably all the other import functions) could accept a data frame. That way we could do this:

# Summary of available phosphorus results for Wisconsin water samples.
wi_phos <- readWQPsummary(statecode = "WI",
                          sampleMedia = "Water",
                          characteristicName = "Phosphorus")

# One raw download per provider (convertType = FALSE), combined afterwards.
df_lists <- split(wi_phos, wi_phos$Provider)
all_data <- data.frame()
for (provider_sites in df_lists) {
  # The raw summary names the site column MonitoringLocationIdentifier;
  # there is no `Site` column unless it is renamed first.
  site_ids <- unique(provider_sites$MonitoringLocationIdentifier)
  sub_data <- readWQPdata(siteid = site_ids,
                          providers = unique(provider_sites$Provider),
                          characteristicName = "Phosphorus",
                          sampleMedia = "Water",
                          ignore_attributes = TRUE,
                          convertType = FALSE)
  all_data <- dplyr::bind_rows(all_data, sub_data)
}

# Parse once, after every provider's rows are combined. Parsing inside the
# loop would re-process the accumulated rows on each iteration and mix parsed
# rows with the next provider's raw rows.
all_data <- importWQP(all_data, zip = FALSE)

This avoids having to save an intermediate file, which really speeds things up.

ldecicco-USGS commented 4 months ago

I shifted this to a sub-function of importWQP (parse_WQP). This makes loops much easier to write, which in turn makes benchmarking much easier:

library(dplyr)

# Summary of available phosphorus results for Wisconsin water samples.
wi_phos <- readWQPsummary(
  statecode = "WI",
  sampleMedia = "Water",
  characteristicName = "Phosphorus"
)

# Collapse the summary to one row per site/provider, then keep only sites
# with more than 100 results and a period of record (POR) of at least 20
# years, ordered from most to fewest results.
wi_phos_summary <- wi_phos |>
  rename(Site = MonitoringLocationIdentifier) |>
  mutate(
    Lat = as.numeric(MonitoringLocationLatitude),
    Lon = as.numeric(MonitoringLocationLongitude)
  ) |>
  group_by(Site, Lat, Lon, Provider) |>
  summarise(
    min_year = min(YearSummarized),
    max_year = max(YearSummarized),
    count = sum(ResultCount)
  ) |>
  mutate(POR = max_year - min_year) |>
  filter(count > 100 & POR >= 20) |>
  ungroup() |>
  arrange(desc(count))

# Baseline: a single request covering all selected sites (102 sites).
system.time({
  p1 <- readWQPdata(
    siteid = unique(wi_phos_summary$Site),
    sampleMedia = "Water",
    characteristicName = "Phosphorus"
  )
})
# Recorded timing:
#   user  system elapsed
#   2.39    0.33   30.16

# Split by providers: one raw request per provider, parsed once at the end.
system.time({
  df_lists <- split(wi_phos_summary, wi_phos_summary$Provider)

  # Collect each provider's unconverted records in a list and bind them in a
  # single call, rather than growing a data frame inside a loop.
  provider_data <- lapply(df_lists, function(provider_sites) {
    readWQPdata(siteid = unique(provider_sites$Site),
                providers = unique(provider_sites$Provider),
                characteristicName = "Phosphorus",
                sampleMedia = "Water",
                ignore_attributes = TRUE,
                convertType = FALSE)
  })
  all_data <- dplyr::bind_rows(provider_data)

  # Parse the combined raw records in one pass.
  all_data <- dataRetrieval:::parse_WQP(all_data)
})
# user  system elapsed
# 1.66    0.35    2.31 !

# Split by providers but ALL data: request every Wisconsin phosphorus record,
# one provider at a time, and parse once at the end.
system.time({
  # Collect each provider's unconverted records in a list and bind them in a
  # single call, rather than growing a data frame inside a loop.
  provider_data <- lapply(c("NWIS", "STORET"), function(provider) {
    readWQPdata(statecode = "WI",
                providers = provider,
                characteristicName = "Phosphorus",
                sampleMedia = "Water",
                ignore_attributes = TRUE,
                convertType = FALSE)
  })
  all_data_ALL <- dplyr::bind_rows(provider_data)

  # Parse the combined raw records in one pass.
  all_data_ALL <- dataRetrieval:::parse_WQP(all_data_ALL)
})
# Timeout error