Toniiiio / sivis

Turn browser clicks into reproducible scraping code.
10 stars 1 forks source link

Feature Request: Extract links in additional columns #2

Open Toniiiio opened 4 years ago

Toniiiio commented 4 years ago

Type: Feature Request

Homepage: https://www.handelsblatt.com/

Target data (Mostly static): Articles. With preview text and then the link "Mehr,..."

Amount of occurences: Should be on many pages. All Pages that have links within selected texts.

Toniiiio commented 4 years ago

I could do sthg like:

options(stringsAsFactors = FALSE)
library(xml2)
library(httr)
library(DT)
urlGen <- function(nr) 'https://www.spielemax.de/jobs'
nr <- 1
hasResult <- TRUE
output <- list()

while(hasResult){
    print(nr)
    url <- urlGen(nr)
    response <- url %>% httr::GET() %>% httr::content(type = "text")

    html_href <- function(x) rvest::html_attr(x = x, name = "href")

    xpathes <- list(
    data.frame(XPath = 'td[1]/a/strong', type = "html_text"),
    data.frame(XPath = 'td[2]', type = "html_text"),
    data.frame(XPath = 'td[3]', type = "html_text"),
    data.frame(XPath = 'td[4]/a', type = "html_href")
  )

    nodes <- response %>% xml2::read_html() %>% rvest::html_nodes(xpath = 
    "/html/body/div/div/div/main/div/article/div/div/div/table/tbody/tr"
  )

    xpath <- xpathes[[4]]
    node <- nodes
    response <- lapply(xpathes, function(xpath){
    lapply(nodes, function(node) rvest::html_nodes(x = node, xpath = xpath$XPath) %>% {ifelse(length(.), yes = get(xpath$type)(.), no = NA)}) %>% unlist
  })

    hasResult <- length(response) & nr < 3
    output[[nr]] <- response
    nr <- nr + 1
}

output %>% 
    lapply(FUN = data.frame) %>% 
    do.call(what = rbind) %>% 
    DT::datatable()

Open issue would be that 'td[4]/a' is actually `'td[4]/a/strong'

Toniiiio commented 4 years ago

Shiny would look like:

options(stringsAsFactors = FALSE)
library(shiny)
library(DT)

ui <- fluidPage(
  DT::dataTableOutput('foo'),
  verbatimTextOutput('sel')
)

server <- function(input, output, session) {
  data <- head(iris[, 1:4], 4)
  colnames(data) <- c("td[1]/a/strong", "td[2]", "td[3]", "td[4]/a/strong")

  rep <- rep(NA, ncol(data))
  for (i in 1:ncol(data)) {
    rep[i] <- as.character(selectInput(paste0("sel", i), "", choices = c("text", "link", "both"), width = "100px"))
  }
  data <- rbind(rep, data)
  rownames(data) <- as.numeric(rownames(data)) - 1
  rownames(data)[1] <- "extr. attribute"

  output$foo = DT::renderDataTable(
    data, escape = FALSE, selection = 'none', server = FALSE,
    options = list(dom = 't', paging = FALSE, ordering = FALSE),
    callback = JS("table.rows().every(function(i, tab, row) {
        var $this = $(this.node());
        $this.attr('id', this.data()[0]);
        $this.addClass('shiny-input-container');
      });
      Shiny.unbindAll(table.table().node());
      Shiny.bindAll(table.table().node());")
  )
  output$sel = renderPrint({
    str(sapply(1:ncol(data), function(i) input[[paste0("sel", i)]]))
  })
}

shinyApp(ui, server)

Also added the opportunity for extracting both, text and link.

Toniiiio commented 4 years ago

I could do sthg like:

options(stringsAsFactors = FALSE)
library(xml2)
library(httr)
library(DT)
urlGen <- function(nr) 'https://www.spielemax.de/jobs'
nr <- 1
hasResult <- TRUE
output <- list()

while(hasResult){
  print(nr)
  url <- urlGen(nr)
  response <- url %>% httr::GET() %>% httr::content(type = "text")

  html_href <- function(x) rvest::html_attr(x = x, name = "href")

  xpathes <- list(
      data.frame(XPath = 'td[1]/a/strong', type = "html_text"),
      data.frame(XPath = 'td[2]', type = "html_text"),
      data.frame(XPath = 'td[3]', type = "html_text"),
      data.frame(XPath = 'td[4]/a', type = "html_href")
  )

  nodes <- response %>% xml2::read_html() %>% rvest::html_nodes(xpath = 
    "/html/body/div/div/div/main/div/article/div/div/div/table/tbody/tr"
  )

  xpath <- xpathes[[4]]
  node <- nodes
  response <- lapply(xpathes, function(xpath){
      lapply(nodes, function(node) rvest::html_nodes(x = node, xpath = xpath$XPath) %>% {ifelse(length(.), yes = get(xpath$type)(.), no = NA)}) %>% unlist
  })

  hasResult <- length(response) & nr < 3
  output[[nr]] <- response
  nr <- nr + 1
}

output %>% 
  lapply(FUN = data.frame) %>% 
  do.call(what = rbind) %>% 
  DT::datatable()

Open issue would be that 'td[4]/a' is actually `'td[4]/a/strong'

That could yield issues in rbind with different colnames. Could do sthg along:

  output[[nr]] <- response %>% do.call(what = cbind)
  nr <- nr + 1
}

output %>% 
  do.call(what = rbind) %>% 
  datatable()

or better also provide colnames (the xpathes).