r-lib / httr2

Make HTTP requests and process their responses. A modern reimagining of httr.
https://httr2.r-lib.org
Other
237 stars 57 forks source link

performance enhancement parse_match #440

Closed DyfanJones closed 7 months ago

DyfanJones commented 7 months ago

Hi all,

I think I have another performance enhancement. This time focusing on parse_match.

# Sample inputs for the benchmark: a bare path, a protocol-relative
# reference, a file URI, http URLs with combinations of port, query,
# fragment, percent-encoded query values and userinfo, and a non-http scheme.
urls <- list(
  "/",
  "//google.com",
  "file:///",
  "http://google.com/",
  "http://google.com/path",
  "http://google.com/path?a=1&b=2",
  "http://google.com:80/path?a=1&b=2",
  "http://google.com:80/path?a=1&b=2#frag",
  "http://google.com:80/path?a=1&b=2&c=%7B1%7B2%7D3%7D#frag",
  "http://user@google.com:80/path?a=1&b=2",
  "http://user:pass@google.com:80/path?a=1&b=2",
  "svn+ssh://my.svn.server/repo/trunk"
)

# Baseline implementation: extract the capture groups of `pattern` from the
# single string `x` via regexec() + regmatches().
#
# Returns a list with one element per capture group; groups that are empty
# (or did not participate in the match) are represented as NULL.
httr2_parse_match <- function(x, pattern) {
  located <- regexec(pattern, x, perl = TRUE)
  # The first element is the full match; keep only the capture groups.
  groups <- regmatches(x, located)[[1]][-1]

  out <- as.list(groups)
  # Assigning list(NULL) blanks the slot without shortening the list.
  out[groups == ""] <- list(NULL)
  out
}

# Proposed faster implementation: extract the capture groups of `pattern`
# from the single string `x` using only regexpr() and substring().
#
# Returns a list with one element per capture group; groups that are empty
# (or did not participate in the match) are represented as NULL.
new_parse_match <- function(x, pattern) {
  hit <- regexpr(pattern, x, perl = TRUE)
  # With perl = TRUE the match carries per-group start/length attributes.
  starts <- attr(hit, "capture.start")
  ends <- starts + attr(hit, "capture.length") - 1
  ends[ends == -1] <- 0

  groups <- substring(x, starts, ends)
  out <- as.list(groups)
  # Assigning list(NULL) blanks the slot without shortening the list.
  out[groups == ""] <- list(NULL)
  out
}

# URL-splitting regex with nine capture groups: scheme (with and without the
# trailing ":"), authority (with and without the leading "//"), path, query
# (with and without the leading "?") and fragment (with and without "#").
pattern <- "^(([^:/?#]+):)?(//([^/?#]*))?([^?#]*)(\\?([^#]*))?(#(.*))?"

# Time the current and proposed implementations over every sample URL and
# plot the timings.
bench::mark(
  httr2 = lapply(urls, httr2_parse_match, pattern = pattern),
  new = lapply(urls, new_parse_match, pattern = pattern)
) |> ggplot2::autoplot()
#> Loading required namespace: tidyr

Created on 2024-02-09 with reprex v2.1.0

# Sample inputs for the cross-package benchmark: a bare path, a
# protocol-relative reference, a file URI, http URLs with combinations of
# port, query, fragment, percent-encoded query values and userinfo, and a
# non-http scheme.
urls <- list(
  "/",
  "//google.com",
  "file:///",
  "http://google.com/",
  "http://google.com/path",
  "http://google.com/path?a=1&b=2",
  "http://google.com:80/path?a=1&b=2",
  "http://google.com:80/path?a=1&b=2#frag",
  "http://google.com:80/path?a=1&b=2&c=%7B1%7B2%7D3%7D#frag",
  "http://user@google.com:80/path?a=1&b=2",
  "http://user:pass@google.com:80/path?a=1&b=2",
  "svn+ssh://my.svn.server/repo/trunk"
)

# Compare URL parsing speed across four packages on the same sample URLs and
# plot the timings.
# `check = FALSE` turns off bench::mark()'s comparison of the expressions'
# results — presumably because each parser returns a different structure.
bench::mark(
  httr2 = lapply(urls, httr2::url_parse),
  httr = lapply(urls, httr::parse_url),
  urltools = lapply(urls, urltools::url_parse),
  adaR = lapply(urls, adaR::ada_url_parse),
  check = FALSE
) |> ggplot2::autoplot()
#> Loading required namespace: tidyr

Created on 2024-02-09 with reprex v2.1.0

Session info ``` r sessioninfo::session_info() #> ─ Session info ─────────────────────────────────────────────────────────────── #> setting value #> version R version 4.3.2 (2023-10-31) #> os macOS Sonoma 14.0 #> system aarch64, darwin20 #> ui X11 #> language (EN) #> collate en_US.UTF-8 #> ctype en_US.UTF-8 #> tz Europe/London #> date 2024-02-09 #> pandoc 3.1.11.1 @ /opt/homebrew/bin/ (via rmarkdown) #> #> ─ Packages ─────────────────────────────────────────────────────────────────── #> package * version date (UTC) lib source #> adaR 0.3.2 2024-02-08 [1] CRAN (R 4.3.1) #> beeswarm 0.4.0 2021-06-01 [1] CRAN (R 4.3.0) #> bench 1.1.3 2023-05-04 [1] CRAN (R 4.3.0) #> cli 3.6.2 2023-12-11 [1] CRAN (R 4.3.1) #> colorspace 2.1-0 2023-01-23 [1] CRAN (R 4.3.0) #> curl 5.2.0 2023-12-08 [1] CRAN (R 4.3.1) #> digest 0.6.34 2024-01-11 [1] CRAN (R 4.3.1) #> dplyr 1.1.4 2023-11-17 [1] CRAN (R 4.3.1) #> evaluate 0.23 2023-11-01 [1] CRAN (R 4.3.1) #> fansi 1.0.6 2023-12-08 [1] CRAN (R 4.3.1) #> farver 2.1.1 2022-07-06 [1] CRAN (R 4.3.0) #> fastmap 1.1.1 2023-02-24 [1] CRAN (R 4.3.0) #> fs 1.6.3 2023-07-20 [1] CRAN (R 4.3.0) #> generics 0.1.3 2022-07-05 [1] CRAN (R 4.3.0) #> ggbeeswarm 0.7.2 2023-04-29 [1] CRAN (R 4.3.0) #> ggplot2 3.4.4 2023-10-12 [1] CRAN (R 4.3.1) #> glue 1.7.0 2024-01-09 [1] CRAN (R 4.3.1) #> gtable 0.3.4 2023-08-21 [1] CRAN (R 4.3.0) #> highr 0.10 2022-12-22 [1] CRAN (R 4.3.0) #> htmltools 0.5.7 2023-11-03 [1] CRAN (R 4.3.1) #> httr 1.4.7 2023-08-15 [1] CRAN (R 4.3.0) #> httr2 1.0.0.9000 2024-02-09 [1] local #> knitr 1.45 2023-10-30 [1] CRAN (R 4.3.1) #> labeling 0.4.3 2023-08-29 [1] CRAN (R 4.3.0) #> lifecycle 1.0.4 2023-11-07 [1] RSPM (R 4.3.0) #> magrittr 2.0.3 2022-03-30 [1] CRAN (R 4.3.0) #> munsell 0.5.0 2018-06-12 [1] CRAN (R 4.3.0) #> pillar 1.9.0 2023-03-22 [1] CRAN (R 4.3.0) #> pkgconfig 2.0.3 2019-09-22 [1] CRAN (R 4.3.0) #> profmem 0.6.0 2020-12-13 [1] CRAN (R 4.3.0) #> purrr 1.0.2 2023-08-10 [1] CRAN (R 4.3.0) #> R.cache 0.16.0 2022-07-21 [1] CRAN 
(R 4.3.0) #> R.methodsS3 1.8.2 2022-06-13 [1] CRAN (R 4.3.0) #> R.oo 1.26.0 2024-01-24 [1] CRAN (R 4.3.1) #> R.utils 2.12.3 2023-11-18 [1] CRAN (R 4.3.1) #> R6 2.5.1 2021-08-19 [1] CRAN (R 4.3.0) #> rappdirs 0.3.3 2021-01-31 [1] CRAN (R 4.3.0) #> Rcpp 1.0.12 2024-01-09 [1] CRAN (R 4.3.1) #> reprex 2.1.0 2024-01-11 [1] CRAN (R 4.3.1) #> rlang 1.1.3 2024-01-10 [1] CRAN (R 4.3.2) #> rmarkdown 2.25 2023-09-18 [1] CRAN (R 4.3.1) #> rstudioapi 0.15.0 2023-07-07 [1] CRAN (R 4.3.0) #> scales 1.3.0 2023-11-28 [1] CRAN (R 4.3.1) #> sessioninfo 1.2.2 2021-12-06 [1] CRAN (R 4.3.0) #> styler 1.10.2 2023-08-29 [1] CRAN (R 4.3.0) #> tibble 3.2.1 2023-03-20 [1] CRAN (R 4.3.0) #> tidyr 1.3.1 2024-01-24 [1] CRAN (R 4.3.1) #> tidyselect 1.2.0 2022-10-10 [1] CRAN (R 4.3.0) #> triebeard 0.4.1 2023-03-04 [1] CRAN (R 4.3.0) #> urltools 1.7.3 2019-04-14 [1] CRAN (R 4.3.0) #> utf8 1.2.4 2023-10-22 [1] CRAN (R 4.3.1) #> vctrs 0.6.5 2023-12-01 [1] CRAN (R 4.3.1) #> vipor 0.4.7 2023-12-18 [1] CRAN (R 4.3.1) #> withr 3.0.0 2024-01-16 [1] CRAN (R 4.3.1) #> xfun 0.42 2024-02-08 [1] CRAN (R 4.3.1) #> xml2 1.3.6 2023-12-04 [1] CRAN (R 4.3.1) #> yaml 2.3.8 2023-12-11 [1] CRAN (R 4.3.1) #> #> [1] /Users/dyfanjones/Library/R/arm64/4.3/library #> [2] /Library/Frameworks/R.framework/Versions/4.3-arm64/Resources/library #> #> ────────────────────────────────────────────────────────────────────────────── ```

On my machine, this makes httr2 one of the fastest URL parsers in R, which is quite cool 😄 Especially since it is pure R.

hadley commented 7 months ago

Looks good! Want to do a PR again?