Closed: MarkGannon closed this issue 7 years ago.
Forgot to include sessionInfo()
> sessionInfo()
R version 3.3.2 (2016-10-31)
Platform: x86_64-pc-linux-gnu (64-bit)
Running under: Gentoo/Linux
locale:
[1] LC_CTYPE=en_US.utf8 LC_NUMERIC=C
[3] LC_TIME=en_US.utf8 LC_COLLATE=en_US.utf8
[5] LC_MONETARY=en_US.utf8 LC_MESSAGES=en_US.utf8
[7] LC_PAPER=en_US.utf8 LC_NAME=en_US.utf8
[9] LC_ADDRESS=en_US.utf8 LC_TELEPHONE=en_US.utf8
[11] LC_MEASUREMENT=en_US.utf8 LC_IDENTIFICATION=en_US.utf8
attached base packages:
[1] stats graphics grDevices utils datasets methods base
other attached packages:
[1] tm.plugin.webmining_1.3.2 plyr_1.8.4
loaded via a namespace (and not attached):
[1] tools_3.3.2 parallel_3.3.2 NLP_0.1-10 RCurl_1.95-4.8
[5] Rcpp_0.12.4 slam_0.1-40 RJSONIO_1.3-0 tm_0.6-2
[9] rJava_0.9-8 boilerpipeR_1.3 bitops_1.0-6 XML_3.98-1.5
I cannot understand why you get a parsing error without using the WebCorpus function. Please clarify. Closing the issue (feel free to comment to re-open).
> query <- "deviousdoll endometriosis awareness month million women suffer disease"
> results <- GoogleNewsSource(query, params=list(hl="en", q=query, ie="utf-8", num=5, output="rss", as_min_date='03/16/2013', as_max_date='03/17/2013') )
> results
$encoding
[1] "UTF-8"
$length
[1] 0
$names
[1] NA
$position
[1] 0
$reader
function (elem, language, id)
{
tree <- parser(elem$content)
content(doc) <- if ("content" %in% names(spec)) {
content <- contentparser(tree, spec[["content"]])
}
else {
character(0)
}
for (n in setdiff(names(spec), "content")) {
meta(doc, n) <- contentparser(tree, spec[[n]])
}
if (!is.null(freeFUN)) {
freeFUN(tree)
}
doc
}
<bytecode: 0x255ed18>
<environment: 0x259f610>
$content
list()
$feedurls
[1] "http://news.google.com/news?hl=en&q=deviousdoll%20endometriosis%20awareness%20month%20million%20women%20suffer%20disease&ie=utf-8&num=5&output=rss&as_min_date=03%2F16%2F2013&as_max_date=03%2F17%2F2013"
$parser
function (cr)
{
tree <- parse(cr, type = "XML", asText = TRUE)
nodes <- xpathSApply(tree, path = "//item")
xmlns1 <- lapply(nodes, newXMLNamespace, "http://purl.org/dc/elements/1.1/",
"dc")
nodes
}
<environment: 0x9da7008>
$curlOpts
$ssl.verifypeer
[1] FALSE
$ssl.verifyhost
[1] FALSE
$connecttimeout
[1] 30
$timeout
[1] 30
$maxredirs
[1] 20
$maxconnects
[1] 5
$followlocation
[1] TRUE
attr(,"class")
[1] "CURLOptions"
$postFUN
function (corpus, links = sapply(corpus, meta, "origin"), timeout.request = 30,
chunksize = 20, verbose = getOption("verbose"), curlOpts = curlOptions(verbose = FALSE,
followlocation = TRUE, maxconnects = 5, maxredirs = 20,
timeout = timeout.request, connecttimeout = timeout.request,
ssl.verifyhost = FALSE, ssl.verifypeer = FALSE, useragent = "R",
cookiejar = tempfile()), retry.empty = 3, sleep.time = 3,
extractor = ArticleExtractor, .encoding = integer(), ...)
{
if (length(corpus) != length(links))
stop("Corpus length not equal to links length\n")
if (verbose) {
cat("Starting URL Download ...\n")
}
retries <- 0
while (any(empty <- sapply(corpus, function(x) identical(content(x),
character(0)))) & (retries <= retry.empty)) {
retries <- retries + 1
emptycontent.ids <- which(empty)
if (verbose) {
cat("Run ", retries, ", retrieving ", length(emptycontent.ids),
" content items\n")
}
for (cstart in seq(from = 1, to = length(emptycontent.ids),
by = chunksize)) {
if (sleep.time > 0) {
if (verbose) {
cat("Sleeping ", sleep.time, " seconds...\n")
}
Sys.sleep(sleep.time)
}
cend <- min(cstart[1] + chunksize - 1, length(emptycontent.ids))
chunk.ids <- emptycontent.ids[cstart:cend]
chunk <- links[chunk.ids]
content <- tryCatch({
getURL(chunk, .opts = curlOpts, .encoding = .encoding,
...)
}, error = function(e) {
print(e)
cat("\nError on retrieval, single retrieval fallback... \n")
content <- list()
for (i in 1:length(chunk)) {
content[[i]] <- tryCatch({
getURL(chunk[i], .opts = curlOpts, .encoding = .encoding,
...)
}, error = function(f) {
print(f)
""
})
}
do.call(c, content)
})
extract <- sapply(content, extractor)
for (i in 1:length(chunk.ids)) {
cid <- chunk.ids[i]
content(corpus[[cid]]) <- extract[i]
}
if (verbose) {
progress <- floor(cend/length(links) * 100)
cat(paste(progress, "% (", cend, "/", length(emptycontent.ids),
") ", Sys.time(), "\n", sep = ""))
}
}
}
corpus
}
<environment: namespace:tm.plugin.webmining>
$retrieveFeedURL
[1] TRUE
attr(,"class")
[1] "WebXMLSource" "WebSource" "SimpleSource"
> out <- WebCorpus(results)
> out
<<WebCorpus>>
Metadata: corpus specific: 3, document level (indexed): 0
Content: documents: 0
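One way to check whether the feed itself returns any items for this query and date range (the corpus above is empty) is to fetch the feed URL from the source object and count the `<item>` nodes directly. A minimal sketch using RCurl and XML, the same packages the source's parser relies on; `results$feedurls` is taken from the output above, and the strict XML parse is only roughly what the package does internally:

```r
library(RCurl)
library(XML)

# Fetch the raw RSS feed that GoogleNewsSource built (shown in results$feedurls above).
feed <- getURL(results$feedurls, followlocation = TRUE, ssl.verifypeer = FALSE)

# Strict XML parse: approximately what the source's parser does, so a malformed
# response (e.g. an HTML error page) would fail here with libxml errors.
doc   <- xmlParse(feed, asText = TRUE)
items <- xpathSApply(doc, "//item")
length(items)   # 0 suggests the query/date range simply returned no results
```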
```r
query <- "deviousdoll endometriosis awareness month million women suffer disease"
results <- GoogleNewsSource(query, params=list(hl="en", q=query, ie="utf-8", num=5, output="rss", as_min_date='03/16/2013', as_max_date='03/17/2013'))
```
Produces this output:
```
Space required after the Public Identifier
SystemLiteral " or ' expected
SYSTEM or PUBLIC, the URI is missing
Opening and ending tag mismatch: meta line 3 and head
Specification mandate value for attribute noshade
attributes construct error
Couldn't find end of Start Tag hr line 6
Opening and ending tag mismatch: br line 12 and form
Specification mandate value for attribute noshade
attributes construct error
Couldn't find end of Start Tag hr line 14
Entity 'mdash' not defined
Opening and ending tag mismatch: br line 19 and div
Opening and ending tag mismatch: br line 22 and div
Opening and ending tag mismatch: br line 22 and div
Opening and ending tag mismatch: br line 22 and body
Opening and ending tag mismatch: br line 19 and html
Premature end of data in tag br line 19
Premature end of data in tag br line 19
Premature end of data in tag div line 18
Premature end of data in tag br line 17
Premature end of data in tag br line 17
Premature end of data in tag br line 17
Premature end of data in tag br line 17
Premature end of data in tag div line 16
Premature end of data in tag br line 12
Premature end of data in tag br line 12
Premature end of data in tag input line 12
Premature end of data in tag input line 11
Premature end of data in tag input line 11
Premature end of data in tag input line 10
Premature end of data in tag br line 9
Premature end of data in tag br line 9
Premature end of data in tag img line 9
Premature end of data in tag br line 8
Premature end of data in tag br line 8
Premature end of data in tag form line 7
Premature end of data in tag br line 6
Premature end of data in tag div line 5
Premature end of data in tag body line 4
Premature end of data in tag meta line 3
Premature end of data in tag head line 3
Premature end of data in tag html line 2
Error: 1: Space required after the Public Identifier
2: SystemLiteral " or ' expected
3: SYSTEM or PUBLIC, the URI is missing
4: Opening and ending tag mismatch: meta line 3 and head
5: Specification mandate value for attribute noshade
6: attributes construct error
7: Couldn't find end of Start Tag hr line 6
8: Opening and ending tag mismatch: br line 12 and form
9: Specification mandate value for attribute noshade
10: attributes construct error
11: Couldn't find end of Start Tag hr line 14
12: Entity 'mdash' not defined
13: Opening and ending tag mismatch: br line 19 and div
14: Opening and ending tag mismatch: br line 22 and div
15: Opening and ending tag mismatch: br line 22 and div
16: Opening and ending tag mismatch: br line 22 and body
17: Opening and ending tag mismatch: br line 19 and html
18: Premature end of data in tag br line 19
19: Premature end of data in tag br line 19
20: Premature end of data in tag div line 18
21: Premature end of data in tag br line 17
22: Premature
```

Here is the intended URL when entered manually in a browser. I've seen similar output from libxml when using Perl to process web pages where the page didn't close its HTML tags. In that case, I solved the problem by running the page through html-tidy (which adds the missing end tags).
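As a rough sketch of that workaround (my own assumption, not the package's code): libxml's HTML parser, exposed in R as XML::htmlParse, recovers from unclosed tags much like html-tidy does, so the failing page can be parsed leniently; alternatively the markup can be cleaned with the external tidy tool first. The URL below is a hypothetical placeholder standing in for the page that triggers the errors above.

```r
library(RCurl)
library(XML)

# Hypothetical URL, standing in for the feed/page that produces the errors above.
url  <- "http://news.google.com/news?hl=en&q=...&output=rss"
page <- getURL(url, followlocation = TRUE, ssl.verifypeer = FALSE)

# Lenient parse: htmlParse() supplies the missing end tags instead of erroring out,
# unlike the strict xmlParse() used for RSS feeds.
doc <- htmlParse(page, asText = TRUE)

# Or clean the markup with the external html-tidy tool first (assumes `tidy` is on the PATH):
# clean <- system2("tidy", args = c("-q", "-asxml"), input = page, stdout = TRUE)
```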