Open OfekShilon opened 1 year ago
Thanks for opening this!
I'm not surprised that very small objects have this property...Arrow's columnar format exploits that there are frequently many more rows than columns, and there are some places in the R package where we loop over columns in R. Mostly that is fine, although looping in R for 500 columns, as you've seen, can result in some overhead. If you look at the absolute times (instead of the relative times), I imagine that what you're seeing is still very small (maybe 0.1s) overhead...R can just do that much faster.
Is there a workflow where you're seeing this impact analysis time?
> Is there a workflow where you're seeing this impact analysis time?
While transitioning from native-R storage to arrow, we saw substantial performance degradation in a workflow that generates many (10K+) small files in a cluster and then reads them back.
Make sure you're writing using `compression = "uncompressed"`! It's not perfect, but is about 2x faster. I'll look into it to see if there's any way to skip some R code here to more directly call the C++ writer...even the level of overhead with no compression that you've highlighted is confusing to me.
Using no compression:
# Benchmark: compare the time to read small tables back from disk when they
# were stored with save()/load() versus as uncompressed Feather files, over a
# grid of row counts and column counts.
tmpdir <- tempfile()
dir.create(tmpdir)
colnums <- c(10, 20, 30, 100, 150, 200, 300, 500)
rownums <- c(
  1, 2, 3, 4, 5, 10, 20, 30, 40, 50, 60, 70, 100, 200, 300, 400, 500,
  1000, 2000, 3000, 4000, 5000, 10000
)

# Generate files: one .RData-style file and one Feather file per grid cell
for (colnum in colnums) {
  for (rownum in rownums) {
    fn.robj <- file.path(tmpdir, paste0("robj.", rownum, "x", colnum))
    fn.arrow <- file.path(tmpdir, paste0("arrow.", rownum, "x", colnum))
    dat <- as.data.frame(
      matrix(runif(rownum * colnum), nrow = rownum, ncol = colnum)
    )
    save(dat, file = fn.robj)
    arrow::write_feather(x = dat, sink = fn.arrow, compression = "uncompressed")
  }
}

# Elapsed read time in seconds, one cell per (rows, cols) combination
times.robj <- matrix(0, nrow = length(rownums), ncol = length(colnums))
rownames(times.robj) <- paste(rownums, "rows")
colnames(times.robj) <- paste(colnums, "cols")
times.arrow <- times.robj

for (i in seq_along(rownums)) {
  for (j in seq_along(colnums)) {
    rownum <- rownums[i]
    colnum <- colnums[j]
    fn.robj <- file.path(tmpdir, paste0("robj.", rownum, "x", colnum))
    fn.arrow <- file.path(tmpdir, paste0("arrow.", rownum, "x", colnum))

    # measure 2nd load to account for cold caches
    load(fn.robj)
    start <- Sys.time()
    load(fn.robj)
    # Pin the units to seconds: `Sys.time() - start` chooses its units
    # automatically, and storing a difftime in a numeric matrix drops them.
    times.robj[i, j] <- as.numeric(difftime(Sys.time(), start, units = "secs"))

    tst <- arrow::read_feather(fn.arrow)
    start <- Sys.time()
    tst <- arrow::read_feather(fn.arrow)
    times.arrow[i, j] <- as.numeric(difftime(Sys.time(), start, units = "secs"))
  }
}

# Ratio > 1 means the Feather read was slower than load()
times.arrow / times.robj
#> 10 cols 20 cols 30 cols 100 cols 150 cols 200 cols
#> 1 rows 14.4725275 17.9795082 18.4140625 21.90818859 47.65606362 22.84116694
#> 2 rows 15.1983806 16.2460317 16.9053030 18.67129630 20.76380952 37.10859729
#> 3 rows 21.7117117 15.6601562 15.0646259 17.03752759 17.77000000 19.34379458
#> 4 rows 15.7056277 16.7242798 14.8692810 16.17453799 16.61224490 18.86018642
#> 5 rows 13.1034483 14.4306050 14.9470199 14.90576923 17.99046105 18.01030928
#> 10 rows 12.5816327 12.9710611 13.5114943 12.35703002 28.33454988 13.22032289
#> 20 rows 12.0430464 10.7642276 10.1307339 9.10829493 8.45411765 9.29576547
#> 30 rows 11.1220238 9.6205251 8.8284024 6.56949960 6.90670927 7.49974529
#> 40 rows 10.7088235 9.0176600 8.0673953 6.57269790 6.01518560 6.51640071
#> 50 rows 8.8784119 8.7257384 7.2162162 5.68754448 5.36519115 5.89375727
#> 60 rows 9.7962963 8.1595960 6.8823529 5.16987179 10.22431958 4.99090247
#> 70 rows 8.4882075 8.1819961 6.6296296 5.04599761 4.74102564 4.54345654
#> 100 rows 8.2778993 6.3507692 5.5512821 3.87919776 3.18816885 3.65419847
#> 200 rows 6.9781818 4.6319149 11.3175395 2.39477680 2.22712351 2.23399873
#> 300 rows 5.9528875 3.4087948 2.8162523 2.28367392 1.53755051 1.65800866
#> 400 rows 4.7578419 3.0028986 2.2602876 2.15348917 1.26760074 1.21309890
#> 500 rows 4.1558308 2.5225768 2.2711656 1.41115560 1.05550257 1.02989052
#> 1000 rows 2.2786585 1.3790087 3.0056259 0.60250798 0.53179530 0.53369967
#> 2000 rows 1.3539916 1.5805147 0.5737926 0.30327838 0.27820840 0.27057028
#> 3000 rows 1.1347815 0.5374048 0.3965298 0.20412111 0.19350023 0.45714431
#> 4000 rows 0.7417894 0.4128671 3.5819726 0.24726677 0.14699569 0.14043276
#> 5000 rows 0.6041413 0.3378337 0.8593773 0.19491538 0.12437216 0.11456206
#> 10000 rows 0.3014837 0.1828018 0.1201612 0.02665133 0.05724913 0.05461478
#> 300 cols 500 cols
#> 1 rows 27.20939086 48.20383912
#> 2 rows 25.13126492 34.15562914
#> 3 rows 24.11811024 30.89401968
#> 4 rows 21.79393939 26.18478261
#> 5 rows 20.94679803 26.48522653
#> 10 rows 14.96833216 25.12523191
#> 20 rows 10.51369216 15.84330318
#> 30 rows 7.43155288 11.73603952
#> 40 rows 6.62136223 10.43135770
#> 50 rows 5.99006711 9.25798485
#> 60 rows 5.04369274 6.14095785
#> 70 rows 4.75809650 5.70886076
#> 100 rows 5.00190311 4.54890153
#> 200 rows 4.50396996 2.68490953
#> 300 rows 2.99969424 1.89673687
#> 400 rows 2.34352282 1.48038762
#> 500 rows 2.03165384 1.20663080
#> 1000 rows 0.70601711 0.63683243
#> 2000 rows 0.27909992 0.43289769
#> 3000 rows 0.18386126 0.20415949
#> 4000 rows 0.29411463 0.16423265
#> 5000 rows 0.11312960 0.12045428
#> 10000 rows 0.05825836 0.06443037
Created on 2023-01-10 with reprex v2.0.2
Using default compression:
# Benchmark: compare the time to read small tables back from disk when they
# were stored with save()/load() versus as Feather files written with
# write_feather()'s default compression, over a grid of row and column counts.
tmpdir <- tempfile()
dir.create(tmpdir)
colnums <- c(10, 20, 30, 100, 150, 200, 300, 500)
rownums <- c(
  1, 2, 3, 4, 5, 10, 20, 30, 40, 50, 60, 70, 100, 200, 300, 400, 500,
  1000, 2000, 3000, 4000, 5000, 10000
)

# Generate files: one .RData-style file and one Feather file per grid cell
for (colnum in colnums) {
  for (rownum in rownums) {
    fn.robj <- file.path(tmpdir, paste0("robj.", rownum, "x", colnum))
    fn.arrow <- file.path(tmpdir, paste0("arrow.", rownum, "x", colnum))
    dat <- as.data.frame(
      matrix(runif(rownum * colnum), nrow = rownum, ncol = colnum)
    )
    save(dat, file = fn.robj)
    arrow::write_feather(x = dat, sink = fn.arrow)
  }
}

# Elapsed read time in seconds, one cell per (rows, cols) combination
times.robj <- matrix(0, nrow = length(rownums), ncol = length(colnums))
rownames(times.robj) <- paste(rownums, "rows")
colnames(times.robj) <- paste(colnums, "cols")
times.arrow <- times.robj

for (i in seq_along(rownums)) {
  for (j in seq_along(colnums)) {
    rownum <- rownums[i]
    colnum <- colnums[j]
    fn.robj <- file.path(tmpdir, paste0("robj.", rownum, "x", colnum))
    fn.arrow <- file.path(tmpdir, paste0("arrow.", rownum, "x", colnum))

    # measure 2nd load to account for cold caches
    load(fn.robj)
    start <- Sys.time()
    load(fn.robj)
    # Pin the units to seconds: `Sys.time() - start` chooses its units
    # automatically, and storing a difftime in a numeric matrix drops them.
    times.robj[i, j] <- as.numeric(difftime(Sys.time(), start, units = "secs"))

    tst <- arrow::read_feather(fn.arrow)
    start <- Sys.time()
    tst <- arrow::read_feather(fn.arrow)
    times.arrow[i, j] <- as.numeric(difftime(Sys.time(), start, units = "secs"))
  }
}

# Ratio > 1 means the Feather read was slower than load()
times.arrow / times.robj
#> 10 cols 20 cols 30 cols 100 cols 150 cols 200 cols
#> 1 rows 16.9572954 19.6031746 19.4701754 26.5231144 56.01642710 33.39605735
#> 2 rows 19.0990991 20.9177489 20.8730769 23.7868481 26.38644689 45.43119266
#> 3 rows 21.1547619 19.2469136 21.4253731 21.8924051 24.21588946 25.88827586
#> 4 rows 18.4112554 18.8007663 18.4275862 21.3195021 22.97166667 26.45885635
#> 5 rows 15.8395522 17.8750000 16.3880597 22.0901804 22.89716841 26.51943005
#> 10 rows 15.3061224 13.0547945 14.9222520 16.8244767 34.23970944 17.79330709
#> 20 rows 14.3840830 13.6781609 12.3011236 11.2735528 11.33975904 11.87954111
#> 30 rows 13.5421687 11.0495283 10.1816514 9.3316370 9.57760314 9.62248996
#> 40 rows 12.6453488 9.8964059 9.9819168 7.7601744 8.33240067 8.37088608
#> 50 rows 11.8975069 10.1530612 10.3616000 7.4708579 7.31219272 7.15629522
#> 60 rows 11.3643836 8.9316081 8.3958991 7.0183366 12.72128146 6.76300578
#> 70 rows 11.0265252 9.6686869 8.1184408 6.6577017 6.38455080 6.92413793
#> 100 rows 10.3680556 8.0369748 6.4965116 5.1863354 4.83441670 5.06206362
#> 200 rows 12.3647059 6.8830275 4.9482612 3.2896631 3.24210312 3.27877754
#> 300 rows 5.7400000 4.5351986 3.5697161 2.3988402 2.14011906 2.06634286
#> 400 rows 5.0799087 2.9543702 2.8629648 1.7690058 1.72880966 1.76503533
#> 500 rows 4.4447884 2.8496770 2.3769231 1.4735886 1.35359428 1.52543420
#> 1000 rows 2.7072555 1.5854657 1.3616873 0.7840171 0.76427293 0.72445101
#> 2000 rows 1.5208333 0.8911792 0.6701459 0.4350788 0.37124991 0.37946588
#> 3000 rows 1.0453862 0.6643997 0.5169999 0.2656266 0.24755968 0.25853659
#> 4000 rows 0.8616682 0.4784442 0.4127477 0.2119238 0.19982264 0.19844568
#> 5000 rows 0.8958047 0.3799294 0.3235682 0.1832789 0.16097686 0.16914301
#> 10000 rows 0.3733628 0.2193108 0.1665289 0.1076588 0.09350925 0.08932051
#> 300 cols 500 cols
#> 1 rows 35.87483176 62.1506196
#> 2 rows 32.28801843 44.1924342
#> 3 rows 31.48050459 39.4118098
#> 4 rows 29.49416755 36.0374823
#> 5 rows 28.25379171 34.6821192
#> 10 rows 20.96233383 29.6552511
#> 20 rows 12.59460738 21.9988169
#> 30 rows 10.30442541 15.0805057
#> 40 rows 9.17821473 13.3024585
#> 50 rows 7.90048940 10.9834538
#> 60 rows 7.22199747 8.0121655
#> 70 rows 7.09084699 7.5827408
#> 100 rows 5.27838565 5.9264278
#> 200 rows 5.55643482 3.2979336
#> 300 rows 3.63902649 2.3820292
#> 400 rows 3.04591480 1.9261239
#> 500 rows 2.45318492 1.5959291
#> 1000 rows 1.27772319 0.8132839
#> 2000 rows 0.70657236 0.4209621
#> 3000 rows 0.50213646 0.2835666
#> 4000 rows 0.20044236 0.2147253
#> 5000 rows 0.14406603 0.1745972
#> 10000 rows 0.08071889 0.1012044
Created on 2023-01-10 with reprex v2.0.2
Ok! It seems like the problem is metadata. On write, we stick some R metadata into the schema and use it to do some stuff when we recreate the data frame on the way out. Most of the time that metadata is unused, and because it involves an R loop we see performance issues.
If you write the file without the R metadata, it looks like reading it is much faster (but definitely test locally to confirm!).
If this works for you, we could add a flag to disable writing the R metadata (or disable loading it).
# Benchmark: compare the time to read small tables back from disk when they
# were stored with save()/load() versus as uncompressed Feather files whose
# schema metadata was removed before writing, over a grid of row and column
# counts.
tmpdir <- tempfile()
dir.create(tmpdir)
colnums <- c(10, 20, 30, 100, 150, 200, 300, 500)
rownums <- c(
  1, 2, 3, 4, 5, 10, 20, 30, 40, 50, 60, 70, 100, 200, 300, 400, 500,
  1000, 2000, 3000, 4000, 5000, 10000
)

# Generate files: one .RData-style file and one Feather file per grid cell
for (colnum in colnums) {
  for (rownum in rownums) {
    fn.robj <- file.path(tmpdir, paste0("robj.", rownum, "x", colnum))
    fn.arrow <- file.path(tmpdir, paste0("arrow.", rownum, "x", colnum))
    dat <- as.data.frame(
      matrix(runif(rownum * colnum), nrow = rownum, ncol = colnum)
    )
    save(dat, file = fn.robj)

    # create the table manually to avoid metadata: clear the schema's
    # metadata and cast the table to that schema before writing
    dat_table <- arrow::as_arrow_table(dat)
    schema <- dat_table$schema
    schema$metadata <- NULL
    dat_table <- dat_table$cast(schema)
    arrow::write_feather(
      x = dat_table,
      sink = fn.arrow,
      compression = "uncompressed"
    )
  }
}

# Elapsed read time in seconds, one cell per (rows, cols) combination
times.robj <- matrix(0, nrow = length(rownums), ncol = length(colnums))
rownames(times.robj) <- paste(rownums, "rows")
colnames(times.robj) <- paste(colnums, "cols")
times.arrow <- times.robj

for (i in seq_along(rownums)) {
  for (j in seq_along(colnums)) {
    rownum <- rownums[i]
    colnum <- colnums[j]
    fn.robj <- file.path(tmpdir, paste0("robj.", rownum, "x", colnum))
    fn.arrow <- file.path(tmpdir, paste0("arrow.", rownum, "x", colnum))

    # measure 2nd load to account for cold caches
    load(fn.robj)
    start <- Sys.time()
    load(fn.robj)
    # Pin the units to seconds: `Sys.time() - start` chooses its units
    # automatically, and storing a difftime in a numeric matrix drops them.
    times.robj[i, j] <- as.numeric(difftime(Sys.time(), start, units = "secs"))

    tst <- arrow::read_feather(fn.arrow)
    start <- Sys.time()
    tst <- arrow::read_feather(fn.arrow, as_data_frame = TRUE, mmap = TRUE)
    times.arrow[i, j] <- as.numeric(difftime(Sys.time(), start, units = "secs"))
  }
}

# Ratio > 1 means the Feather read was slower than load()
times.arrow / times.robj
#> 10 cols 20 cols 30 cols 100 cols 150 cols 200 cols
#> 1 rows 14.0952381 12.2730769 14.97500000 9.20437956 8.75479744 8.13718412
#> 2 rows 16.0696517 14.9234234 14.28278689 9.90533981 8.56250000 8.56160000
#> 3 rows 13.7713004 13.4891775 11.53790614 8.57407407 8.35842294 7.07703704
#> 4 rows 14.7380952 95.9319149 11.25517241 8.21645022 6.92554992 6.31000000
#> 5 rows 14.4626168 12.9609375 11.72664360 7.66060606 6.76986755 5.90463576
#> 10 rows 12.3790323 10.7172414 9.49712644 6.16776316 5.58681876 4.66462793
#> 20 rows 11.2867647 62.1293103 7.77804296 4.57604790 3.61700263 3.06232877
#> 30 rows 10.3590604 8.4000000 7.05376344 3.62488129 2.55404571 11.87710970
#> 40 rows 9.9206349 7.3310185 6.19379845 3.17002417 2.39525463 1.98561465
#> 50 rows 11.6686567 6.9299781 5.71708185 2.71903751 2.14587738 1.81172220
#> 60 rows 8.8262032 6.7301255 5.47731092 16.19293478 1.97486961 1.62612613
#> 70 rows 8.8347339 6.4109312 5.19554849 2.44055069 1.78809932 1.56611431
#> 100 rows 7.7412935 5.3079526 4.49799197 2.03780242 1.44817927 1.29230357
#> 200 rows 6.7373358 3.8204819 3.00714286 1.16359795 0.87507926 0.72829531
#> 300 rows 4.9736842 2.9963603 2.74172185 0.85562541 0.63074822 0.51833064
#> 400 rows 3.9795134 2.4449307 1.77052632 0.82852432 0.54286035 0.40358784
#> 500 rows 3.4116356 2.0236613 1.46481876 0.55421516 0.40433317 1.58486533
#> 1000 rows 1.9754717 1.1283404 0.85135779 0.28743853 0.21571464 0.17574634
#> 2000 rows 1.1457113 0.5982890 0.41544440 0.15338311 0.10889462 0.09171419
#> 3000 rows 0.7994512 0.4206546 0.28310156 0.10914713 0.07384605 0.06096175
#> 4000 rows 0.6511236 0.3175360 0.23418670 0.07628486 0.05940748 0.04918424
#> 5000 rows 0.4762331 0.2692693 0.17628306 0.07026943 0.04680908 0.03953978
#> 10000 rows 0.2431953 0.1263146 0.08880676 0.03294864 0.02410858 0.02036180
#> 300 cols 500 cols
#> 1 rows 7.25616438 6.53256705
#> 2 rows 6.45868263 5.33515199
#> 3 rows 6.25084364 5.28482972
#> 4 rows 5.76898396 4.93076374
#> 5 rows 5.70020121 4.40679095
#> 10 rows 3.84901532 3.18668529
#> 20 rows 2.55096154 2.00625000
#> 30 rows 1.87665830 1.49675397
#> 40 rows 1.68254466 1.29062263
#> 50 rows 1.72490914 1.25016578
#> 60 rows 1.38690327 1.04475309
#> 70 rows 1.43247588 0.94965370
#> 100 rows 0.96589063 0.79135421
#> 200 rows 0.58168371 0.45303118
#> 300 rows 0.42742552 0.32061561
#> 400 rows 0.31737134 0.24586616
#> 500 rows 0.25752199 0.20255645
#> 1000 rows 0.55231620 0.10501935
#> 2000 rows 0.07068172 0.05902058
#> 3000 rows 0.04852037 0.13285303
#> 4000 rows 0.03674992 0.02832831
#> 5000 rows 0.01554990 0.02325099
#> 10000 rows 0.01585784 0.01236076
Created on 2023-01-10 with reprex v2.0.2
@paleolimbot Thanks for the suggestion. I'm aware of the overhead of metadata from this discussion, but there is no metadata to speak of in the files in this example (not even row names) - and indeed I don't see any definite win by dropping it:
# Benchmark: compare the time to read small tables back from disk for three
# storage variants -- save()/load(), uncompressed Feather with the schema
# metadata kept, and uncompressed Feather with the schema metadata removed --
# over a grid of row and column counts.
tmpdir <- tempfile()
dir.create(tmpdir)
colnums <- c(10, 20, 30, 100, 150, 200)
rownums <- c(
  1, 2, 3, 4, 5, 10, 20, 30, 40, 50, 60, 70, 100, 200, 300, 400, 500,
  1000, 2000, 3000, 4000, 5000, 10000
)

# Generate files: three files per grid cell
for (colnum in colnums) {
  for (rownum in rownums) {
    cell <- paste0(rownum, "x", colnum)
    fn.robj <- file.path(tmpdir, paste0("robj.", cell))
    fn.arrow <- file.path(tmpdir, paste0("arrow.", cell))
    fn.arrow.nometa <- file.path(tmpdir, paste0("arrow.nometa.", cell))
    dat <- as.data.frame(
      matrix(runif(rownum * colnum), nrow = rownum, ncol = colnum)
    )
    save(dat, file = fn.robj)

    # First write the table as converted, with its schema metadata intact
    dat_table <- arrow::as_arrow_table(dat)
    arrow::write_feather(
      x = dat_table,
      sink = fn.arrow,
      compression = "uncompressed"
    )

    # Then clear the schema metadata and write the metadata-free variant
    schema <- dat_table$schema
    schema$metadata <- NULL
    dat_table <- dat_table$cast(schema)
    arrow::write_feather(
      x = dat_table,
      sink = fn.arrow.nometa,
      compression = "uncompressed"
    )
  }
}

# Elapsed read time in seconds, one cell per (rows, cols) combination
times.robj <- matrix(0, nrow = length(rownums), ncol = length(colnums))
rownames(times.robj) <- paste(rownums, "rows")
colnames(times.robj) <- paste(colnums, "cols")
times.arrow <- times.robj
times.arrow.nometa <- times.robj

for (i in seq_along(rownums)) {
  for (j in seq_along(colnums)) {
    rownum <- rownums[i]
    colnum <- colnums[j]
    cell <- paste0(rownum, "x", colnum)
    fn.robj <- file.path(tmpdir, paste0("robj.", cell))
    fn.arrow <- file.path(tmpdir, paste0("arrow.", cell))
    fn.arrow.nometa <- file.path(tmpdir, paste0("arrow.nometa.", cell))

    # measure 2nd load to account for cold caches
    load(fn.robj)
    start <- Sys.time()
    load(fn.robj)
    # Pin the units to seconds: `Sys.time() - start` chooses its units
    # automatically, and storing a difftime in a numeric matrix drops them.
    times.robj[i, j] <- as.numeric(difftime(Sys.time(), start, units = "secs"))

    tst <- arrow::read_feather(fn.arrow)
    start <- Sys.time()
    tst <- arrow::read_feather(fn.arrow, as_data_frame = TRUE, mmap = TRUE)
    times.arrow[i, j] <- as.numeric(difftime(Sys.time(), start, units = "secs"))

    tst <- arrow::read_feather(fn.arrow.nometa)
    start <- Sys.time()
    tst <- arrow::read_feather(
      fn.arrow.nometa,
      as_data_frame = TRUE,
      mmap = TRUE
    )
    times.arrow.nometa[i, j] <- as.numeric(
      difftime(Sys.time(), start, units = "secs")
    )
  }
}
Gives -
> times.arrow.nometa / times.robj
10 cols 20 cols 30 cols 100 cols 150 cols 200 cols
1 rows 70.0114504 58.2468085 46.1319444 21.80683403 11.17302053 17.57530120
2 rows 43.8119658 35.5418327 44.6910569 14.93035480 92.98226950 15.06676136
3 rows 59.6066351 35.9829060 17.2069672 20.61194030 16.86906710 15.61495845
4 rows 236.3318182 44.0948905 31.1320755 16.19062500 18.24731183 8.38811445
5 rows 38.0276498 30.7560976 17.5539419 15.84103512 13.33577713 11.82111801
10 rows 29.7992278 25.0996785 13.4232082 13.74528302 10.35152838 8.64968153
20 rows 25.3423729 19.8398950 17.2170022 9.75327511 7.33414833 8.22432262
30 rows 16.1743697 11.8511628 19.6003824 7.47927032 5.79861111 5.35154017
40 rows 30.6280992 24.8726236 20.2042105 19.87437811 6.08097028 4.42742382
50 rows 29.8060000 22.1587838 18.2661499 3.84229508 6.51073729 3.17507246
60 rows 20.1960298 16.0766610 12.6747851 5.38434983 3.86645595 16.07189542
70 rows 19.9536585 13.5328597 15.0110345 4.48984526 3.51769231 2.88766452
100 rows 17.3659091 11.7341577 7.4166054 4.57267189 3.18088012 2.27087242
200 rows 16.2354892 10.2235047 6.3983116 4.09790752 1.63446432 1.69634703
300 rows 7.3573854 7.1700787 4.5906849 1.63513514 5.10005897 1.15804737
400 rows 6.7309689 4.7252280 4.1573647 1.17293525 0.92025293 0.95345718
500 rows 7.0257590 4.1061644 2.8501041 1.03354651 0.70476702 0.59290404
1000 rows 5.9288681 3.4319209 1.4538153 0.51121076 0.37171409 0.34528247
2000 rows 2.2429879 1.1519025 0.7536303 0.26461660 0.18316701 0.13906527
3000 rows 1.4939711 0.8129300 0.5481969 0.17433917 0.13261505 0.11275994
4000 rows 1.3325031 0.6623176 0.5343539 0.13499381 0.09493322 0.07435454
5000 rows 0.8804653 3.8379121 0.3498308 0.13787353 0.07957967 0.06604075
10000 rows 0.7404644 0.2288824 0.1536465 0.06676482 0.04172911 0.03254246
My measurements differ from yours, but even yours show robj wins by a wide margin for <2000 lines.
Describe the bug, including details regarding any error messages, version, and platform.
Test script that measures R/arrow load time for various sizes:
Results:
Is this known overhead? It seems rather large...
Component(s)
R