Open Toniiiio opened 4 years ago
I could do sthg like:
options(stringsAsFactors = FALSE)
library(xml2)
library(httr)
library(DT)
urlGen <- function(nr) 'https://www.spielemax.de/jobs'
nr <- 1
hasResult <- TRUE
output <- list()
while(hasResult){
print(nr)
url <- urlGen(nr)
response <- url %>% httr::GET() %>% httr::content(type = "text")
html_href <- function(x) rvest::html_attr(x = x, name = "href")
xpathes <- list(
data.frame(XPath = 'td[1]/a/strong', type = "html_text"),
data.frame(XPath = 'td[2]', type = "html_text"),
data.frame(XPath = 'td[3]', type = "html_text"),
data.frame(XPath = 'td[4]/a', type = "html_href")
)
nodes <- response %>% xml2::read_html() %>% rvest::html_nodes(xpath =
"/html/body/div/div/div/main/div/article/div/div/div/table/tbody/tr"
)
xpath <- xpathes[[4]]
node <- nodes
response <- lapply(xpathes, function(xpath){
lapply(nodes, function(node) rvest::html_nodes(x = node, xpath = xpath$XPath) %>% {ifelse(length(.), yes = get(xpath$type)(.), no = NA)}) %>% unlist
})
hasResult <- length(response) & nr < 3
output[[nr]] <- response
nr <- nr + 1
}
output %>%
lapply(FUN = data.frame) %>%
do.call(what = rbind) %>%
DT::datatable()
Open issue would be that 'td[4]/a'
is actually `'td[4]/a/strong'
Shiny would look like:
options(stringsAsFactors = FALSE)
library(shiny)
library(DT)
ui <- fluidPage(
DT::dataTableOutput('foo'),
verbatimTextOutput('sel')
)
server <- function(input, output, session) {
data <- head(iris[, 1:4], 4)
colnames(data) <- c("td[1]/a/strong", "td[2]", "td[3]", "td[4]/a/strong")
rep <- rep(NA, ncol(data))
for (i in 1:ncol(data)) {
rep[i] <- as.character(selectInput(paste0("sel", i), "", choices = c("text", "link", "both"), width = "100px"))
}
data <- rbind(rep, data)
rownames(data) <- as.numeric(rownames(data)) - 1
rownames(data)[1] <- "extr. attribute"
output$foo = DT::renderDataTable(
data, escape = FALSE, selection = 'none', server = FALSE,
options = list(dom = 't', paging = FALSE, ordering = FALSE),
callback = JS("table.rows().every(function(i, tab, row) {
var $this = $(this.node());
$this.attr('id', this.data()[0]);
$this.addClass('shiny-input-container');
});
Shiny.unbindAll(table.table().node());
Shiny.bindAll(table.table().node());")
)
output$sel = renderPrint({
str(sapply(1:ncol(data), function(i) input[[paste0("sel", i)]]))
})
}
shinyApp(ui, server)
Also added the opportunity for extracting both, text and link.
I could do sthg like:
options(stringsAsFactors = FALSE) library(xml2) library(httr) library(DT) urlGen <- function(nr) 'https://www.spielemax.de/jobs' nr <- 1 hasResult <- TRUE output <- list() while(hasResult){ print(nr) url <- urlGen(nr) response <- url %>% httr::GET() %>% httr::content(type = "text") html_href <- function(x) rvest::html_attr(x = x, name = "href") xpathes <- list( data.frame(XPath = 'td[1]/a/strong', type = "html_text"), data.frame(XPath = 'td[2]', type = "html_text"), data.frame(XPath = 'td[3]', type = "html_text"), data.frame(XPath = 'td[4]/a', type = "html_href") ) nodes <- response %>% xml2::read_html() %>% rvest::html_nodes(xpath = "/html/body/div/div/div/main/div/article/div/div/div/table/tbody/tr" ) xpath <- xpathes[[4]] node <- nodes response <- lapply(xpathes, function(xpath){ lapply(nodes, function(node) rvest::html_nodes(x = node, xpath = xpath$XPath) %>% {ifelse(length(.), yes = get(xpath$type)(.), no = NA)}) %>% unlist }) hasResult <- length(response) & nr < 3 output[[nr]] <- response nr <- nr + 1 } output %>% lapply(FUN = data.frame) %>% do.call(what = rbind) %>% DT::datatable()
Open issue would be that
'td[4]/a'
is actually `'td[4]/a/strong'
That could yield issues in rbind with different colnames. Could do sthg along:
output[[nr]] <- response %>% do.call(what = cbind)
nr <- nr + 1
}
output %>%
do.call(what = rbind) %>%
datatable()
or better also provide colnames (the xpathes).
Type: Feature Request
Homepage: https://www.handelsblatt.com/
Target data (Mostly static): Articles. With preview text and then the link "Mehr,..."
Amount of occurences: Should be on many pages. All Pages that have links within selected texts.