import numpy as np
import janitor
import pandas as pd

def east_west():
    num_rows_left, num_rows_right = 5_000_000, 5_000_000
    rng = np.random.default_rng(42)
    # Generate two separate datasets where revenue/cost are linearly related to
    # duration/time, but add some noise to the west table so that there are some
    # rows where the cost for the same or greater time is less than in the east table.
    east_dur = rng.integers(1_000, 10_000_000, num_rows_left)
    east_rev = (east_dur * 0.123).astype(np.int32)
    west_time = rng.integers(1_000, 500_000, num_rows_right)
    west_cost = west_time * 0.123
    west_cost += rng.normal(0.0, 1.0, num_rows_right)
    west_cost = west_cost.astype(np.int32)
    east = pd.DataFrame(
        {
            "id": np.arange(0, num_rows_left),
            "dur": east_dur,
            "rev": east_rev,
            "cores": rng.integers(1, 10, num_rows_left),
        }
    )
    west = pd.DataFrame(
        {
            "t_id": np.arange(0, num_rows_right),
            "time": west_time,
            "cost": west_cost,
            "cores": rng.integers(1, 10, num_rows_right),
        }
    )
    return east, west
east, west = east_west()
%timeit east.conditional_join(west, ('dur', 'time', '<='), ('rev', 'cost', '>='), ('id', 't_id', '>'), use_numba=False)
2.61 s ± 21 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
%timeit east.conditional_join(west, ('dur', 'time', '<='), ('rev', 'cost', '>='), ('id', 't_id', '>'), use_numba=True)
11.3 s ± 55.8 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
Can we get some gains via the join optimizer idea from the DuckDB blog? At the very least, could we get some gains from early filtering based on min/max stats?
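A minimal sketch of the min/max pre-filter idea, assuming the east/west frames from east_west() above (east_f, west_f, and out are illustrative names). Each bound is a necessary condition derived from one of the three join predicates, so rows failing it cannot match anything and can be dropped before the join:

# Prune rows that can never satisfy one of the range conditions, using the
# min/max of the other table's join columns, before running conditional_join.
east_f = east[
    (east["dur"] <= west["time"].max())    # dur <= time requires dur <= max(time)
    & (east["rev"] >= west["cost"].min())  # rev >= cost requires rev >= min(cost)
    & (east["id"] > west["t_id"].min())    # id > t_id requires id > min(t_id)
]
west_f = west[
    (west["time"] >= east["dur"].min())
    & (west["cost"] <= east["rev"].max())
    & (west["t_id"] < east["id"].max())
]
out = east_f.conditional_join(
    west_f, ("dur", "time", "<="), ("rev", "cost", ">="), ("id", "t_id", ">"), use_numba=False
)

This only shrinks the inputs; any gain depends on how selective the bounds are. On the synthetic data above, east.dur goes up to 10M while west.time tops out at 500k, so the dur bound alone should drop most of the east rows.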
adapted from here
The speed difference between use_numba=True and use_numba=False is large; worth looking into.