Rewrite extractions for speed

Solution by Jim: store multi-valued fields (such as authors) in a list column and treat the values all at once downstream. Then we can use vectorized functions, which should speed up the process quite a bit.

library(xml2)
library(magrittr)

# Minimal example document: a <ref-list> holding two <ref> entries.
# ref1 is a book citation with a title and a publisher (no volume, no authors);
# ref2 is a journal citation with a title, a volume and two authors.
# Together they cover both missing fields and a repeated field.
xml <- read_xml('<ref-list>
  <ref id="ref1">
      <citation type="book">
        <title>Some title</title>
        <publisher>Some publisher</publisher>
      </citation>
   </ref>
   <ref id="ref2">
      <citation type="journal">
        <title>Another title</title>
        <volume>1</volume>
        <author>Author 1/1</author>
        <author>Author 1/2</author>
      </citation>
   </ref>
</ref-list>')

# Every <ref> element becomes one row of the result.
refs <- xml_find_all(xml, ".//ref")

# Single-valued fields (title, volume, publisher) are looked up on the whole
# node set in one call; a reference without the field shows up as NA.
# The repeated field (author) is gathered per reference into a list column, so
# a reference can hold zero, one or several authors.
res <- tibble::tibble(
  title = xml_text(xml_find_first(refs, "./citation/title")),
  volume = xml_text(xml_find_first(refs, "./citation/volume")),
  publisher = xml_text(xml_find_first(refs, "./citation/publisher")),
  author = purrr::map(
    refs,
    function(ref) xml_text(xml_find_all(ref, "./citation/author"))
  )
)

res
#> # A tibble: 2 x 4
#>   title         volume publisher      author   
#>   <chr>         <chr>  <chr>          <list>   
#> 1 Some title    <NA>   Some publisher <chr [0]>
#> 2 Another title 1      <NA>           <chr [2]>

res$author
#> [[1]]
#> character(0)
#> 
#> [[2]]
#> [1] "Author 1/1" "Author 1/2"

ropensci / jstor

Rewrite extractions for speed #74