h2oai / db-benchmark

reproducible benchmark of database-like ops
https://h2oai.github.io/db-benchmark
Mozilla Public License 2.0
322 stars 85 forks source link

check if cuDF join materializes answer #155

Closed jangorecki closed 4 years ago

jangorecki commented 4 years ago

join timings for 1e7 rows data

                 question 0.9.0 0.10.0 0.11.0 0.12.0 0.13.0
1:     small inner on int 2.029  1.998  1.962  2.119  0.031
2:    medium inner on int 0.819  0.753  0.760  0.235  0.037
3:    medium outer on int 0.908  0.881  1.384  0.785  0.040
4: medium inner on factor 0.303  0.270  0.269  0.213  0.035
5:       big inner on int 4.170  3.936  4.185  2.212  0.194

Time of join questions for cuDF has been greatly reduced in the recent 0.13.0 version. We should ensure it actually materializes the joined data, and does not just compute the matching rows on both sides of the join. This is the same check we did recently for python datatable, as described in #141.

jangorecki commented 4 years ago

I don't see any deferred evaluation here. Legend: t = time of join query; chk = time of head+tail+sum over joined columns; csv = time of writing to csv.

#q1 - csv later
#t:   0.02915720595046878
#chk: 0.05908817891031504
#csv: 6.139759267214686
#q1 - csv sooner
#t:   0.029867439065128565
#csv: 6.12119265506044
#chk: 0.06874015228822827
#q5 - csv later
#t:   0.20077825523912907
#chk: 0.07884587720036507
#csv: 8.895040195900947
#q5 - csv sooner
#t:   0.19877093704417348
#csv: 9.080209706909955
#chk: 0.09811642114073038

script for future reference

#!/usr/bin/env python

print("# test-join-cudf.py", flush=True)

import os
import gc
import timeit
import cudf as cu

# Load shared helper functions (join_to_tbls) into this script's namespace.
# Use a context manager so the helper file handle is closed deterministically
# (the previous open(...).read() leaked the handle until GC).
# NOTE(review): exec-ing a file is the project's established pattern for
# sharing helpers across benchmark scripts.
with open("./_helpers/helpers.py") as helpers_file:
    exec(helpers_file.read())

# Strip any local-version suffix, e.g. "0.13.0+abc123" -> "0.13.0".
ver = cu.__version__.split("+", 1)[0]

data_name = "J1_1e7_NA_0_0"
src_jn_x = os.path.join("data", data_name+".csv")
# join_to_tbls is expected to yield the three RHS table names (small/medium/big).
y_data_name = join_to_tbls(data_name)
src_jn_y = [os.path.join("data", nm + ".csv") for nm in y_data_name]
# With the comprehension above this check now genuinely validates the
# output of join_to_tbls (before, src_jn_y was a literal of length 3).
if len(src_jn_y) != 3:
    raise Exception("Something went wrong in preparing files used for join")

print("loading datasets " + data_name + ", " + y_data_name[0] + ", " + y_data_name[1] + ", " + y_data_name[2], flush=True)

# Column dtypes: the LHS and big RHS share the wide 7-column layout
# (3 int keys, 3 string keys, 1 float value); the small RHS has 3 columns.
wide_dtypes = ['int32','int32','int32','str','str','str','float64']
narrow_dtypes = ['int32','str','float64']

x = cu.read_csv(src_jn_x, header=0, dtype=wide_dtypes)
small = cu.read_csv(src_jn_y[0], header=0, dtype=narrow_dtypes)
big = cu.read_csv(src_jn_y[2], header=0, dtype=wide_dtypes)

# Log the row count of each table so the expected data sizes can be confirmed.
for frame in (x, small, big):
    print(len(frame.index), flush=True)

task_init = timeit.default_timer()
print("joining...", flush=True)

# q1: inner join of the 1e7-row LHS to the small RHS on the int key id1.
question = "small inner on int" # q1
gc.collect()
t_start = timeit.default_timer()
ans = x.merge(small, on='id1')
# NOTE(review): the shape print sits inside the timed region, presumably so
# the timing cannot exclude any work needed to produce the result — confirm.
print(ans.shape, flush=True)
t = timeit.default_timer() - t_start
print(t, flush=True)

# chk: head + tail + sum over the joined value columns. This is cheap when
# the join already materialized its answer, and expensive if evaluation
# was deferred — the signal this experiment is looking for.
t_start = timeit.default_timer()
print(ans.head(3), flush=True)
print(ans.tail(3), flush=True)
chk = [ans['v1'].sum(), ans['v2'].sum()]
chkt = timeit.default_timer() - t_start
print(chkt, flush=True)

# csv: write the full joined answer to disk (dominates total runtime; the
# earlier commented-out copy of this write — the "csv sooner" variant of the
# experiment — has been removed as dead code).
t_start = timeit.default_timer()
ans.to_csv("cudf_join_1e7_q1_ans.csv")
csvt = timeit.default_timer() - t_start
print(csvt, flush=True)

# Drop the answer so its GPU memory can be reclaimed before q5.
del ans

# q5: inner join of the LHS to the big RHS on the int key id3.
question = "big inner on int" # q5
gc.collect()
t_start = timeit.default_timer()
ans = x.merge(big, on='id3')
# Shape print inside the timed region, mirroring q1.
print(ans.shape, flush=True)
t = timeit.default_timer() - t_start
print(t, flush=True)

# chk: head + tail + sum over the joined value columns (see q1). The
# commented-out "csv sooner" copy that used to sit here was removed: it was
# a copy-paste from q1 still pointing at cudf_join_1e7_q1_ans.csv and would
# have clobbered the q1 answer if uncommented.
t_start = timeit.default_timer()
print(ans.head(3), flush=True)
print(ans.tail(3), flush=True)
chk = [ans['v1'].sum(), ans['v2'].sum()]
chkt = timeit.default_timer() - t_start
print(chkt, flush=True)

# csv: write the full joined answer to disk.
t_start = timeit.default_timer()
ans.to_csv("cudf_join_1e7_q5_ans.csv")
csvt = timeit.default_timer() - t_start
print(csvt, flush=True)

# Release the answer, consistent with the q1 section's cleanup.
del ans