Closed jangorecki closed 4 years ago
I don't see any deferred evaluation here. Legend: `t` — time of the join query; `chk` — time of head+tail+sum over the joined column; `csv` — time of writing the result to CSV.
#q1 - csv later
#t: 0.02915720595046878
#chk: 0.05908817891031504
#csv: 6.139759267214686
#q1 - csv sooner
#t: 0.029867439065128565
#csv: 6.12119265506044
#chk: 0.06874015228822827
#q5 - csv later
#t: 0.20077825523912907
#chk: 0.07884587720036507
#csv: 8.895040195900947
#q5 - csv sooner
#t: 0.19877093704417348
#csv: 9.080209706909955
#chk: 0.09811642114073038
Script kept below for future reference:
#!/usr/bin/env python
# cuDF join benchmark — setup: imports, helper loading, and input-file paths.
print("# test-join-cudf.py", flush=True)
import os
import gc
import timeit
import cudf as cu

# NOTE(review): executes a project-local helper file; it is expected to define
# join_to_tbls (used below) — confirm the relative path when running.
exec(open("./_helpers/helpers.py").read())

# Keep only the release part of the version (drop any "+<build>" local suffix).
ver = cu.__version__.split("+", 1)[0]

data_name = "J1_1e7_NA_0_0"
src_jn_x = os.path.join("data", data_name + ".csv")
# join_to_tbls presumably yields the three right-hand-side table names
# (small/medium/big) — verify against the helper.
y_data_name = join_to_tbls(data_name)
src_jn_y = [os.path.join("data", y_data_name[i] + ".csv") for i in range(3)]
if len(src_jn_y) != 3:
    raise Exception("Something went wrong in preparing files used for join")
# Announce the inputs, then load them. Note: only the small and big RHS tables
# are read — the medium table (src_jn_y[1]) is unused since only q1/q5 run here.
print("loading datasets " + data_name + ", " + y_data_name[0] + ", " + y_data_name[1] + ", " + y_data_name[2], flush=True)

# x and big share the same 7-column schema; small has 3 columns.
wide_dtypes = ['int32', 'int32', 'int32', 'str', 'str', 'str', 'float64']
x = cu.read_csv(src_jn_x, header=0, dtype=wide_dtypes)
small = cu.read_csv(src_jn_y[0], header=0, dtype=['int32', 'str', 'float64'])
big = cu.read_csv(src_jn_y[2], header=0, dtype=wide_dtypes)

# Print the row counts as a sanity check on the loaded data.
for frame in (x, small, big):
    print(len(frame.index), flush=True)
# --- q1: join x with the small table, timing join / check / csv separately. ---
# Statement order is deliberate: each timed window must contain exactly the
# work it measures, so do not reorder lines inside this section.
task_init = timeit.default_timer()
print("joining...", flush=True)
question = "small inner on int" # q1  (label only; not printed by this script)
gc.collect()
# Timed window 1: the join itself. ans.shape is printed inside the window,
# which forces materialization before `t` is taken (the point of this issue).
t_start = timeit.default_timer()
ans = x.merge(small, on='id1')
print(ans.shape, flush=True)
t = timeit.default_timer() - t_start
print(t, flush=True)
# Alternative ordering kept for comparison: writing the csv BEFORE the check
# (the "csv sooner" numbers quoted above). Leave commented out to run the
# "csv later" ordering below.
#t_start = timeit.default_timer()
#ans.to_csv("cudf_join_1e7_q1_ans.csv")
#csvt = timeit.default_timer() - t_start
#print(csvt, flush=True)
# Timed window 2: head + tail + sum over both value columns ("chk").
t_start = timeit.default_timer()
print(ans.head(3), flush=True)
print(ans.tail(3), flush=True)
chk = [ans['v1'].sum(), ans['v2'].sum()]
chkt = timeit.default_timer() - t_start
print(chkt, flush=True)
# Timed window 3: writing the joined result to csv ("csv later" ordering).
t_start = timeit.default_timer()
ans.to_csv("cudf_join_1e7_q1_ans.csv")
csvt = timeit.default_timer() - t_start
print(csvt, flush=True)
# Free the q1 result before the larger q5 join to keep GPU memory headroom.
del ans
# --- q5: join x with the big table; same three timed windows as q1. ---
question = "big inner on int" # q5  (label only; not printed by this script)
gc.collect()
# Timed window 1: the join. Printing ans.shape inside the window forces
# materialization before `t` is recorded.
t_start = timeit.default_timer()
ans = x.merge(big, on='id3')
print(ans.shape, flush=True)
t = timeit.default_timer() - t_start
print(t, flush=True)
# Alternative "csv sooner" ordering, kept for comparison.
# NOTE(review): copy-paste leftover from q1 — if re-enabled, the filename
# should be cudf_join_1e7_q5_ans.csv, not q1.
#t_start = timeit.default_timer()
#ans.to_csv("cudf_join_1e7_q1_ans.csv")
#csvt = timeit.default_timer() - t_start
#print(csvt, flush=True)
# Timed window 2: head + tail + sum over both value columns ("chk").
t_start = timeit.default_timer()
print(ans.head(3), flush=True)
print(ans.tail(3), flush=True)
chk = [ans['v1'].sum(), ans['v2'].sum()]
chkt = timeit.default_timer() - t_start
print(chkt, flush=True)
# Timed window 3: writing the joined result to csv ("csv later" ordering).
t_start = timeit.default_timer()
ans.to_csv("cudf_join_1e7_q5_ans.csv")
csvt = timeit.default_timer() - t_start
print(csvt, flush=True)
Join timings for 1e7-row data.
The time of the join questions for cuDF has been greatly reduced in the recent 0.13.0 version. We should ensure it actually materializes the joined data, and does not just compute the matching rows on both sides of the join — the same as we did recently for Python datatable, as described in #141.