paws-r / paws

Paws, a package for Amazon Web Services in R
https://www.paws-r-sdk.com
Other
305 stars 37 forks source link

Consider using stringi as dependency #754

Closed DyfanJones closed 4 months ago

DyfanJones commented 4 months ago
json_convert_string <- function(string) {
  replace <- list(
    c("\\", "\\\\"),
    c('"', '\\"'),
    c("\b", "\\b"),
    c("\f", "\\f"),
    c("\r", "\\r"),
    c("\t", "\\t"),
    c("\n", "\\n")
  )
  for (elem in replace) {
    string <- gsub(elem[1], elem[2], string, fixed = TRUE)
  }
  string <- json_escape_unicode(string)
  string <- sprintf('"%s"', string)
  return(string)
}

json_escape_unicode <- function(string) {
  from <- intToUtf8(1:31, multiple = T)
  to <- paste0("\\u00", format(as.hexmode(1:31), width = 2))
  for (i in 1:31) {
    string <- gsub(from[i], to[i], string, fixed = TRUE)
  }
  return(string)
}

json_convert_string_stringi <- function(string) {
  from <- c("\\", '"', "\b", "\f", "\r", "\t", "\n")
  to <- c("\\\\", '\\"', "\\b", "\\f", "\\r", "\\t", "\\n")
  string <- stringi::stri_replace_all_fixed(string, from, to, vectorize_all =F)
  string <- json_escape_unicode_stringi(string)
  string <- sprintf('"%s"', string)
  return(string)
}

json_escape_unicode_stringi <- function(string) {
  from <- intToUtf8(1:31, multiple = T)
  to <- paste0("\\u00", format(as.hexmode(1:31), width = 2))
  string <- stringi::stri_replace_all_fixed(string, from, to, vectorize_all = F)
  return(string)
}

unicode <- c(intToUtf8(1:31, multiple = T), c("\\", '"', "\b", "\f", "\r", "\t", "\n"))

x <- c(1, 1e1, 1e2, 1e3, 1e4, 1e5, 1e6)
bms <- lapply(x, \(i) {
  string <- paste0(sample(c(letters, LETTERS, unicode), i, replace = T), collapse = "")
  bm <- bench::mark(
    paws = paws.common:::json_escape_unicode(string),
    new_mthd = json_escape_unicode(string),
    stringi = json_escape_unicode_stringi(string)
  )
  bm$id <- i
  return(bm)
})

do.call(rbind, bms) |> ggplot2::autoplot() + 
  ggplot2::facet_wrap("id", labeller = ggplot2::label_both, scales = "free_x")

Created on 2024-02-26 with reprex v2.1.0

image

stringi has some significant performance enhancements. Alternatively move some of the looping gsub to cpp, however cpp isn't my forte so will need help regarding that option :)

DyfanJones commented 4 months ago

Found Rcpp solutions developed from: https://stackoverflow.com/questions/25609174/fast-escaping-deparsing-of-character-vectors-in-r

remotes::install_github("dyfanjones/paws/paws.common", ref="json_escape")
json_escape_unicode <- function(string) {
  from <- intToUtf8(1:31, multiple = TRUE)
  to <- paste0("\\u00", format(as.hexmode(1:31), width = 2))
  for (i in 1:31) {
    string <- gsub(from[i], to[i], string, fixed = TRUE)
  }
  return(string)
}

json_convert_string <- function(string) {
  replace <- list(
    c("\\", "\\\\"),
    c('"', '\\"'),
    c("\b", "\\b"),
    c("\f", "\\f"),
    c("\r", "\\r"),
    c("\t", "\\t"),
    c("\n", "\\n")
  )
  for (elem in replace) {
    string <- gsub(elem[1], elem[2], string, fixed = TRUE)
  }
  string <- json_escape_unicode(string)
  string <- sprintf('"%s"', string)
  return(string)
}

json_convert_string_stringi <- function(string) {
  from <- c("\\", '"', "\b", "\f", "\r", "\t", "\n")
  to <- c("\\\\", '\\"', "\\b", "\\f", "\\r", "\\t", "\\n")
  string <- stringi::stri_replace_all_fixed(string, from, to, vectorize_all =F)
  string <- json_escape_unicode_stringi(string)
  string <- sprintf('"%s"', string)
  return(string)
}

json_escape_unicode_stringi <- function(string) {
  from <- intToUtf8(1:31, multiple = T)
  to <- paste0("\\u00", format(as.hexmode(1:31), width = 2))
  string <- stringi::stri_replace_all_fixed(string, from, to, vectorize_all = F)
  return(string)
}

unicode <- c(intToUtf8(1:31, multiple = T), "\\", '"', "\b", "\f", "\r", "\t", "\n")
x <- c(1, 1e1, 1e2, 1e3, 1e4, 1e5, 1e6)
bms <- lapply(x, \(i) {
  string <- paste0(sample(c(letters, LETTERS, unicode), i, replace = T), collapse = "")
  bm <- bench::mark(
    old_paws = json_convert_string(string),
    stringi = json_convert_string_stringi(string),
    new_paws = paws.common:::json_convert_string(string)
  )
  bm$id <- i
  return(bm)
})

Created on 2024-02-27 with reprex v2.1.0

image

From this we have significant performance improvement.