Closed DyfanJones closed 3 years ago
To over come this can utilise json parser.
Possible solutions:
library(data.table)
x = 1e6
dt1 = data.table(
var1 = 1:x,
var2 = rep(list(list("var3"= 1:3, "var4" = list("var5"= letters[1:5]))), x)
)
dt2 = data.table(
var1 = 1:x,
var2 = rep(list(list("var3"= 1:3, "var4" = list("var5"= letters[1:5]))), x)
)
col = "var2"
system.time(set(dt1, j=col, value=sapply(dt1[[col]], jsonlite::toJSON, auto_unbox = T)))
# user system elapsed
# 196.737 1.269 199.344
system.time(set(dt2, j=col, value=noctua:::list_to_json(dt2[[col]])))
# user system elapsed
# 6.409 0.094 6.582
head(dt1)
var1 var2
1: 1 {"var3":[1,2,3],"var4":{"var5":["a","b","c","d","e"]}}
2: 2 {"var3":[1,2,3],"var4":{"var5":["a","b","c","d","e"]}}
3: 3 {"var3":[1,2,3],"var4":{"var5":["a","b","c","d","e"]}}
4: 4 {"var3":[1,2,3],"var4":{"var5":["a","b","c","d","e"]}}
5: 5 {"var3":[1,2,3],"var4":{"var5":["a","b","c","d","e"]}}
6: 6 {"var3":[1,2,3],"var4":{"var5":["a","b","c","d","e"]}}
head(dt2)
var1 var2
1: 1 {"var3":[1,2,3],"var4":{"var5":["a","b","c","d","e"]}}
2: 2 {"var3":[1,2,3],"var4":{"var5":["a","b","c","d","e"]}}
3: 3 {"var3":[1,2,3],"var4":{"var5":["a","b","c","d","e"]}}
4: 4 {"var3":[1,2,3],"var4":{"var5":["a","b","c","d","e"]}}
5: 5 {"var3":[1,2,3],"var4":{"var5":["a","b","c","d","e"]}}
6: 6 {"var3":[1,2,3],"var4":{"var5":["a","b","c","d","e"]}}
Need a solution batch method for jsonlite for a fair comparison
tmp1 <- tempfile()
tmp2 <- tempfile()
con <- file(tmp1)
system.time(jsonlite::stream_out(dt[, .(var2)], con))
user system elapsed
230.323 9.498 259.813
system.time(data.table::fwrite(
x=as.list(jsonify::to_ndjson(dt$var2,unbox = T)),
file="test.jsonl",
quote =F,
col.names=F)
)
# user system elapsed
# 3.547 0.444 4.191
If possible should switch from jsonlite to jsonify. will need batch method for writing table out to file but current method is pretty fast.
Possible alternatives using jsonlite.
col_to_json_raw_1 <- function(dt, col, batch = 1e4){
max_len = nrow(dt)
start <- seq(1, max_len, batch)
end <- c(start[-1]-1, max_len)
output <- unlist(
lapply(seq_along(start), function(i) {
con <- rawConnection(raw(), open = "w")
jsonlite::stream_out(subset(dt[start[i]:end[i],], select = col), con, verbose = F, pagesize = batch)
str = rawToChar(rawConnectionValue(con))
close(con)
strsplit(str, split = "\n")[[1]]
}),
recursive = FALSE
)
return(output)
}
col_to_json_raw_2 <- function(dt, col){
con <- rawConnection(raw(), open = "w")
on.exit(close(con))
jsonlite::stream_out(subset(dt, select = col), con, verbose = F)
obj = rawConnectionValue(con)
end <- which(obj == charToRaw("\n"))
start <- c(1, end[-length(end)]+1)
return(sapply(seq_along(start), function(i) rawToChar(obj[start[i]:end[i]])))
}
col_to_json_raw_3 <- function(dt, col){
con <- rawConnection(raw(), open = "w")
on.exit(close(con))
jsonlite::stream_out(subset(dt, select = col), con, verbose = F)
return(readr::read_lines(rawConnectionValue(con),progress=F))
}
col_to_json_raw_4 <- function(dt, col){
con_raw <- rawConnection(raw(), open = "w")
jsonlite::stream_out(subset(dt, select = col), con_raw, verbose = F)
con_out <- rawConnection(rawConnectionValue(con_raw))
on.exit({
close(con_raw)
close(con_out)
})
return(readLines(con_out))
}
col_to_json_text <- function(dt, col){
con <- textConnection("character", open = "w")
on.exit(close(con))
jsonlite::stream_out(subset(dt, select = col), con, verbose = F)
return(textConnectionValue(con))
}
library(data.table)
n = c(1e1, 1e2,1e3,1e4, 1e5)
bench_list = lapply(n, function(x){
dt = data.table::data.table(
var1 = 1:x,
var2 = rep(list(list("var3"= 1:3, "var4" = list("var5"= letters[1:5]))), x)
)
microbenchmark::microbenchmark(
"split_text" = col_to_json_raw_1(dt,"var2"),
"split_raw" = col_to_json_raw_2(dt,"var2"),
"raw_readr" = col_to_json_raw_3(dt,"var2"),
"raw_base" = col_to_json_raw_4(dt,"var2"),
"text_base" = col_to_json_text(dt,"var2"),
times = 10
)
})
benchplot(bench_list, n)
These methods seem promising. Plus if a solution with jsonlite could be found then the overall dependencies would be able to be kept low :)
col_to_json_jsonify <- function(dt, col, batch = 1e4){
max_len <- nrow(dt)
start <- seq(1, max_len, batch)
end <- c(start[-1]-1, max_len)
splits <- lapply(seq_along(start), function(i) dt[[col]][start[i]:end[i]])
output <- lapply(splits, function(i) {
strsplit(as.character(jsonify::to_ndjson(i,unbox = T, numeric_dates = F)), split = "\n")[[1]]
})[[1]]
return(output)
}
library(data.table)
n = c(1e1, 1e2,1e3,1e4, 1e5)
bench_list = lapply(n, function(x){
dt = data.table::data.table(
var1 = 1:x,
var2 = rep(list(list("var3"= 1:3, "var4" = list("var5"= letters[1:5]))), x)
)
microbenchmark::microbenchmark(
"split_text" = col_to_json_raw_1(dt,"var2"),
"split_raw" = col_to_json_raw_2(dt,"var2"),
"raw_readr" = col_to_json_raw_3(dt,"var2"),
"raw_base" = col_to_json_raw_4(dt,"var2"),
"text_jsonify" = col_to_json_jsonify(dt,"var2"),
times = 10
)
})
Even with the new jsonlite functions it looks like jsonify is still faster.
col_to_json_raw_4 <- function(dt, col, batch = 500){
con_raw <- rawConnection(raw(), open = "w")
jsonlite::stream_out(subset(dt, select = col), con_raw, verbose = F, pagesize = batch)
con_out <- rawConnection(rawConnectionValue(con_raw))
on.exit({
close(con_raw)
close(con_out)
})
return(readLines(con_out))
}
library(data.table)
n = c(1e3,1e4, 1e5)
bench_list = lapply(n, function(x){
dt = data.table::data.table(
var1 = 1:x,
var2 = rep(list(list("var3"= 1:3, "var4" = list("var5"= letters[1:5]))), x)
)
microbenchmark::microbenchmark(
"raw_base_500" = col_to_json_raw_4(dt,"var2"),
"raw_base_1000" = col_to_json_raw_4(dt,"var2", 1e3),
"raw_base_10000" = col_to_json_raw_4(dt,"var2", 1e4),
"raw_base_100000" = col_to_json_raw_4(dt,"var2", 1e5),
"text_jsonify" = col_to_json_jsonify(dt,"var2"),
times = 10
)
})
Increasing the pagesize with jsonlite::stream_out
doesn't seem to be any performance improvements.
Currently noctua just collapses list using
paste
. This is incorrect as it Athena won't be able to interpret it, for example.