tidyverse / rvest

Simple web scraping for R
https://rvest.tidyverse.org
Other
1.49k stars 343 forks source link

Provide wrapper around chromote #245

Closed hadley closed 9 months ago

hadley commented 5 years ago

Which implements https://chromedevtools.github.io/devtools-protocol/tot/Input

hadley commented 2 years ago

Now on CRAN: https://cran.r-project.org/web/packages/chromote/

hadley commented 1 year ago
library(chromote)

b <- ChromoteSession$new()
b$Page$navigate("https://www.r-project.org/")
b$view()

x <- b$DOM$getDocument()
p <- b$DOM$querySelector(x$root$nodeId, "p")
b$DOM$getOuterHTML(p$nodeId)
hadley commented 1 year ago

And need to implement interface similar to https://github.com/rstudio/shinytest/blob/main/R/shiny-driver.R so you can "click" and set values

hadley commented 1 year ago
hadley commented 1 year ago

Sample problems:

# https://twitter.com/ryanburge/status/1592910141128867841 --------------------

url <- "https://www.christianitytoday.com/ct/search/?query=abortion"
ct <- read_html_live(url)

items <- ct %>% html_elements(".ais-Hits-item")
items %>% html_element(".title-2") %>% html_text2()
items %>% html_element(".d-inline-block") %>% html_text2()

# https://twitter.com/minebocek/status/1592917026049183745 --------------------

url <- "https://www.forbes.com/top-colleges/"
sess <- read_html_live(url)
sess |> 
  html_elements(".TopColleges2023_tableRowGroup__65RHn") |> 
  html_elements(".TopColleges2023_tableRow__BYOSU")

# https://twitter.com/joshmccrain/status/1592910563327496199 -------------------
# http://joshuamccrain.com/tutorials/web_scraping_R_selenium.html

sess <- read_html_live("https://www.fcc.gov/media/engineering/dtvmaps")
sess$view()

sess$type("#startpoint", "77003")
sess$press("#startpoint", "Enter")

sess |> 
  html_table() |> 
  html_element("#contourdata > table:nth-child(3)") |> 
  select(-1) |> 
  filter(Callsign != "")

sess$press("#startpoint", "ArrowLeft", modifiers = c("Alt", "Shift"))
sess$press("#startpoint", "Delete")

# https://twitter.com/jdsdog10/status/1592912450625884160 ---------------------

url <- "https://www.fantasypros.com/nfl/rankings/half-point-ppr-rb.php"
sess <- read_html_live(url)
sess$view()
sess |> html_element("#ranking-table") |> html_table()

# https://twitter.com/sctyner/status/1592921110730797057 -----------------------
# need to click "load more" a few times

sess <- read_html_live("https://www.bodybuilding.com/exercises/finder")
sess |> html_elements(".ExResult-row") |> length()
sess$click(".ExLoadMore-btn")
sess |> html_elements(".ExResult-row") |> length()
sess$click(".ExLoadMore-btn")
sess |> html_elements(".ExResult-row") |> length()
hadley commented 1 year ago

To use xpath, need to do something like:

document_method <- function(session, method, ...) {
  root_id <- sess$session$DOM$getDocument()$root$nodeId
  obj_id <- session$DOM$resolveNode(root_id)$object$objectId

  js_fun <- paste0("function() { return this", method, "}")
  session$Runtime$callFunctionOn(js_fun, objectId = obj_id, ...)
}

document_method(sess$session, ".evaluate('/', document, null, XPathResult. ORDERED_NODE_ITERATOR_TYPE, null)")

That gives you an XPathResult object which you need to iterate over with something like:

let thisHeading = headings.iterateNext();
let alertText = "Level 2 headings in this document are:\n";
while (thisHeading) {
  alertText += `${thisHeading.textContent}\n`;
  thisHeading = headings.iterateNext();
}

But instead building up an array of nodes.

hadley commented 1 year ago
(function() {
  let xpath = document.evaluate('//p', document, null, XPathResult.ORDERED_NODE_ITERATOR_TYPE, null);
  let nodes = [];

  let thisNode = xpath.iterateNext();
  while (thisNode) {
    nodes.push(thisNode);
    thisNode = xpath.iterateNext();
  } 

  return nodes;
})();
hadley commented 1 year ago

Tried and failed to simulate a mouse click:

    click = function(css) {
      node <- private$wait_for_selector(css)
      loc <- private$get_bounding_rect(node)

      # https://chromedevtools.github.io/devtools-protocol/1-3/Input/#method-dispatchMouseEvent
      self$session$Input$dispatchMouseEvent(
        type = "mousePressed",
        x = (loc$left + loc$right) / 2,
        y = (loc$top + loc$bottom) / 2,
        button = "left",
        clickCount = 1
      )
      self$session$Input$dispatchMouseEvent(
        type = "mouseReleased",
        x = (loc$left + loc$right) / 2,
        y = (loc$top + loc$bottom) / 2,
        button = "left",
        clickCount = 1
      )
      invisible(self)
    },

    get_bounding_rect = function(node) {
      out <- private$call_node_method(
        node,
        ".getBoundingClientRect().toJSON()",
        returnByValue = TRUE
      )
      out$result$value
    },

Always timed out when setting the button argument