NickCrews / mismo

The SQL/Ibis powered sklearn of record linkage
https://nickcrews.github.io/mismo/
GNU Lesser General Public License v3.0
12 stars 3 forks source link

joining on arrays is slow #29

Closed NickCrews closed 4 months ago

NickCrews commented 4 months ago
import mismo
import ibis

from ibis import _

ibis.options.interactive = True

n = 10_000
k = 5
t = ibis.memtable(
    {
        "record_id": list(range(n)),
        "vals": [list(range(i, i + k)) for i in range(n)],
    }
)
be = ibis.get_backend()
t = be.create_table("t", t, overwrite=True)
# results in ~n blocking keys, each with ~k values,
# for a total of ~n*k naive pairs.
# Some of these pairs will be redundant, but the total
t
# ┏━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━┓
# ┃ record_id ┃ vals            ┃
# ┡━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━┩
# │ int64     │ array<int64>    │
# ├───────────┼─────────────────┤
# │         0 │ [0, 1, ... +3]  │
# │         1 │ [1, 2, ... +3]  │
# │         2 │ [2, 3, ... +3]  │
# │         3 │ [3, 4, ... +3]  │
# │         4 │ [4, 5, ... +3]  │
# │         5 │ [5, 6, ... +3]  │
# │         6 │ [6, 7, ... +3]  │
# │         7 │ [7, 8, ... +3]  │
# │         8 │ [8, 9, ... +3]  │
# │         9 │ [9, 10, ... +3] │
# │         … │ …               │
# └───────────┴─────────────────┘

def old(left, right):
    l_keyed = left.mutate(_key=_.vals.unnest())
    r_keyed = right.mutate(_key=_.vals.unnest())
    j = ibis.join(l_keyed, r_keyed, "_key", lname="{name}_l", rname="{name}_r").drop(
        "_key"
    )
    j = j.distinct(on=["record_id_l", "record_id_r"])
    return j

def new(left, right):
    l_keyed = left.mutate(_key=_.vals.unnest())
    r_keyed = right.mutate(_key=_.vals.unnest())
    j = ibis.join(l_keyed, r_keyed, "_key", lname="{name}_l", rname="{name}_r")
    j = j["record_id_l", "record_id_r"].distinct()
    j = j.left_join(left, j.record_id_l == left.record_id)
    j = j.left_join(right, j.record_id_r == right.record_id)
    return j

o = old(t, t)
n = new(t, t)

from mismo.block import _sql_analyze as sa

# print(sa._explain_str(o, analyze=True))
# print(sa._explain_str(n, analyze=True))
# ibis.to_sql(o)
# ibis.to_sql(n)
NickCrews commented 4 months ago

The old was producing SQL like

WITH "t1" AS (
  SELECT
    "t0"."record_id",
    "t0"."vals",
    UNNEST("t0"."vals") AS "_key"
  FROM "t" AS "t0"
)
SELECT
  "t7"."record_id_l",
  "t7"."vals_l",
  "t7"."record_id_r",
  "t7"."vals_r"
FROM (
  SELECT
    "t6"."record_id_l",
    "t6"."record_id_r",
    FIRST("t6"."vals_l") AS "vals_l",
    FIRST("t6"."vals_r") AS "vals_r"
  FROM (
    SELECT
      "t5"."record_id_l",
      "t5"."vals_l",
      "t5"."record_id_r",
      "t5"."vals_r"
    FROM (
      SELECT
        "t3"."record_id" AS "record_id_l",
        "t3"."vals" AS "vals_l",
        "t3"."_key",
        "t4"."record_id" AS "record_id_r",
        "t4"."vals" AS "vals_r"
      FROM "t1" AS "t3"
      INNER JOIN "t1" AS "t4"
        ON "t3"."_key" = "t4"."_key"
    ) AS "t5"
  ) AS "t6"
  GROUP BY
    1,
    2
) AS "t7"

EXPLAIN ANALYZE shows

┌─────────────────────────────────────┐
│┌───────────────────────────────────┐│
││    Query Profiling Information    ││
│└───────────────────────────────────┘│
└─────────────────────────────────────┘
EXPLAIN ANALYZE WITH "t1" AS (   SELECT     "t0"."record_id",     "t0"."vals",     UNNEST("t0"."vals") AS "_key"   FROM "t" AS "t0" ) SELECT   "t7"."record_id_l",   "t7"."vals_l",   "t7"."record_id_r",   "t7"."vals_r" FROM (   SELECT     "t6"."record_id_l",     "t6"."record_id_r",     FIRST("t6"."vals_l") AS "vals_l",     FIRST("t6"."vals_r") AS "vals_r"   FROM (     SELECT       "t5"."record_id_l",       "t5"."vals_l",       "t5"."record_id_r",       "t5"."vals_r"     FROM (       SELECT         "t3"."record_id" AS "record_id_l",         "t3"."vals" AS "vals_l",         "t3"."_key",         "t4"."record_id" AS "record_id_r",         "t4"."vals" AS "vals_r"       FROM "t1" AS "t3"       INNER JOIN "t1" AS "t4"         ON "t3"."_key" = "t4"."_key"     ) AS "t5"   ) AS "t6"   GROUP BY     1,     2 ) AS "t7"
┌─────────────────────────────────────┐
│┌───────────────────────────────────┐│
││         Total Time: 1.09s         ││
│└───────────────────────────────────┘│
└─────────────────────────────────────┘
┌───────────────────────────┐                             
│      RESULT_COLLECTOR     │                             
│   ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─   │                             
│             0             │                             
│          (0.00s)          │                             
└─────────────┬─────────────┘                                                          
┌─────────────┴─────────────┐                             
│      EXPLAIN_ANALYZE      │                             
│   ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─   │                             
│             0             │                             
│          (0.00s)          │                             
└─────────────┬─────────────┘                                                          
┌─────────────┴─────────────┐                             
│         PROJECTION        │                             
│   ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─   │                             
│        record_id_l        │                             
│           vals_l          │                             
│        record_id_r        │                             
│           vals_r          │                             
│   ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─   │                             
│           89980           │                             
│          (0.00s)          │                             
└─────────────┬─────────────┘                                                          
┌─────────────┴─────────────┐                             
│         PROJECTION        │                             
│   ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─   │                             
│__internal_decompress_integ│                             
│     ral_bigint(#0, 0)     │                             
│__internal_decompress_integ│                             
│     ral_bigint(#1, 0)     │                             
│             #2            │                             
│             #3            │                             
│   ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─   │                             
│           89980           │                             
│          (0.00s)          │                             
└─────────────┬─────────────┘                                                          
┌─────────────┴─────────────┐                             
│       HASH_GROUP_BY       │                             
│   ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─   │                             
│             #0            │                             
│             #1            │                             
│         first(#2)         │                             
│         first(#3)         │                             
│   ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─   │                             
│           89980           │                             
│          (2.16s)          │                             
└─────────────┬─────────────┘                                                          
┌─────────────┴─────────────┐                             
│         PROJECTION        │                             
│   ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─   │                             
│        record_id_l        │                             
│        record_id_r        │                             
│           vals_l          │                             
│           vals_r          │                             
│   ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─   │                             
│           249960          │                             
│          (0.01s)          │                             
└─────────────┬─────────────┘                                                          
┌─────────────┴─────────────┐                             
│         PROJECTION        │                             
│   ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─   │                             
│__internal_compress_integra│                             
│     l_usmallint(#0, 0)    │                             
│             #1            │                             
│__internal_compress_integra│                             
│     l_usmallint(#2, 0)    │                             
│             #3            │                             
│   ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─   │                             
│           249960          │                             
│          (0.04s)          │                             
└─────────────┬─────────────┘                                                          
┌─────────────┴─────────────┐                             
│         PROJECTION        │                             
│   ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─   │                             
│        record_id_l        │                             
│           vals_l          │                             
│        record_id_r        │                             
│           vals_r          │                             
│   ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─   │                             
│           249960          │                             
│          (0.01s)          │                             
└─────────────┬─────────────┘                                                          
┌─────────────┴─────────────┐                             
│         HASH_JOIN         │                             
│   ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─   │                             
│           INNER           │                             
│        _key = _key        │                             
│   ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─   ├──────────────┐              
│         EC: 10364         │              │              
│   ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─   │              │              
│           249960          │              │              
│          (0.14s)          │              │              
└─────────────┬─────────────┘              │                                           
┌─────────────┴─────────────┐┌─────────────┴─────────────┐
│           UNNEST          ││           UNNEST          │
│   ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─   ││   ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─   │
│           50000           ││           50000           │
│          (0.00s)          ││          (0.00s)          │
└─────────────┬─────────────┘└─────────────┬─────────────┘                             
┌─────────────┴─────────────┐┌─────────────┴─────────────┐
│         SEQ_SCAN          ││         SEQ_SCAN          │
│   ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─   ││   ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─   │
│             t             ││             t             │
│   ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─   ││   ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─   │
│         record_id         ││         record_id         │
│            vals           ││            vals           │
│   ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─   ││   ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─   │
│         EC: 10000         ││         EC: 10000         │
│   ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─   ││   ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─   │
│           10000           ││           10000           │
│          (0.00s)          ││          (0.00s)          │
└───────────────────────────┘└───────────────────────────┘   

was saying this was slow because the outermost groupby agg. I'm guessing it is somehow being slow at dealing with all of the array elements.

NickCrews commented 4 months ago

The new SQL is

WITH "t3" AS (
  SELECT
    "t0"."record_id",
    "t0"."vals",
    UNNEST("t0"."vals") AS "_key"
  FROM "t" AS "t0"
)
SELECT
  "t9"."record_id_l",
  "t9"."record_id_r",
  "t1"."record_id",
  "t1"."vals",
  "t2"."record_id" AS "record_id_right",
  "t2"."vals" AS "vals_right"
FROM (
  SELECT DISTINCT
    *
  FROM (
    SELECT
      "t5"."record_id" AS "record_id_l",
      "t6"."record_id" AS "record_id_r"
    FROM "t3" AS "t5"
    INNER JOIN "t3" AS "t6"
      ON "t5"."_key" = "t6"."_key"
  ) AS "t7"
) AS "t9"
LEFT OUTER JOIN "t" AS "t1"
  ON "t9"."record_id_l" = "t1"."record_id"
LEFT OUTER JOIN "t" AS "t2"
  ON "t9"."record_id_r" = "t2"."record_id"

which has a query plan of

┌─────────────────────────────────────┐
│┌───────────────────────────────────┐│
││    Query Profiling Information    ││
│└───────────────────────────────────┘│
└─────────────────────────────────────┘
EXPLAIN ANALYZE WITH "t3" AS (   SELECT     "t0"."record_id",     "t0"."vals",     UNNEST("t0"."vals") AS "_key"   FROM "t" AS "t0" ) SELECT   "t9"."record_id_l",   "t9"."record_id_r",   "t1"."record_id",   "t1"."vals",   "t2"."record_id" AS "record_id_right",   "t2"."vals" AS "vals_right" FROM (   SELECT DISTINCT     *   FROM (     SELECT       "t5"."record_id" AS "record_id_l",       "t6"."record_id" AS "record_id_r"     FROM "t3" AS "t5"     INNER JOIN "t3" AS "t6"       ON "t5"."_key" = "t6"."_key"   ) AS "t7" ) AS "t9" LEFT OUTER JOIN "t" AS "t1"   ON "t9"."record_id_l" = "t1"."record_id" LEFT OUTER JOIN "t" AS "t2"   ON "t9"."record_id_r" = "t2"."record_id"
┌─────────────────────────────────────┐
│┌───────────────────────────────────┐│
││        Total Time: 0.0899s        ││
│└───────────────────────────────────┘│
└─────────────────────────────────────┘
┌───────────────────────────┐                                                                                       
│      RESULT_COLLECTOR     │                                                                                       
│   ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─   │                                                                                       
│             0             │                                                                                       
│          (0.00s)          │                                                                                       
└─────────────┬─────────────┘                                                                                                                    
┌─────────────┴─────────────┐                                                                                       
│      EXPLAIN_ANALYZE      │                                                                                       
│   ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─   │                                                                                       
│             0             │                                                                                       
│          (0.00s)          │                                                                                       
└─────────────┬─────────────┘                                                                                                                    
┌─────────────┴─────────────┐                                                                                       
│         HASH_JOIN         │                                                                                       
│   ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─   │                                                                                       
│            LEFT           │                                                                                       
│  record_id_r = record_id  │                                                                                       
│   ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─   ├────────────────────────────────────────────────────────────────────────┐              
│         EC: 10364         │                                                                        │              
│   ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─   │                                                                        │              
│           89980           │                                                                        │              
│          (0.00s)          │                                                                        │              
└─────────────┬─────────────┘                                                                        │                                           
┌─────────────┴─────────────┐                                                          ┌─────────────┴─────────────┐
│         HASH_JOIN         │                                                          │         SEQ_SCAN          │
│   ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─   │                                                          │   ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─   │
│            LEFT           │                                                          │             t             │
│  record_id_l = record_id  │                                                          │   ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─   │
│   ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─   ├───────────────────────────────────────────┐              │   ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─   │
│         EC: 10364         │                                           │              │         EC: 10000         │
│   ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─   │                                           │              │   ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─   │
│           89980           │                                           │              │           10000           │
│          (0.00s)          │                                           │              │          (0.00s)          │
└─────────────┬─────────────┘                                           │              └───────────────────────────┘                             
┌─────────────┴─────────────┐                             ┌─────────────┴─────────────┐                             
│         PROJECTION        │                             │         SEQ_SCAN          │                             
│   ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─   │                             │   ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─   │                             
│__internal_decompress_integ│                             │             t             │                             
│     ral_bigint(#0, 0)     │                             │   ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─   │                             
│__internal_decompress_integ│                             │   ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─   │                             
│     ral_bigint(#1, 0)     │                             │         EC: 10000         │                             
│   ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─   │                             │   ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─   │                             
│           89980           │                             │           10000           │                             
│          (0.00s)          │                             │          (0.00s)          │                             
└─────────────┬─────────────┘                             └───────────────────────────┘                                                          
┌─────────────┴─────────────┐                                                                                       
│       HASH_GROUP_BY       │                                                                                       
│   ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─   │                                                                                       
│             #0            │                                                                                       
│             #1            │                                                                                       
│   ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─   │                                                                                       
│           89980           │                                                                                       
│          (0.01s)          │                                                                                       
└─────────────┬─────────────┘                                                                                                                    
┌─────────────┴─────────────┐                                                                                       
│         PROJECTION        │                                                                                       
│   ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─   │                                                                                       
│       t7.record_id_l      │                                                                                       
│       t7.record_id_r      │                                                                                       
│   ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─   │                                                                                       
│           249960          │                                                                                       
│          (0.00s)          │                                                                                       
└─────────────┬─────────────┘                                                                                                                    
┌─────────────┴─────────────┐                                                                                       
│         PROJECTION        │                                                                                       
│   ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─   │                                                                                       
│__internal_compress_integra│                                                                                       
│     l_usmallint(#0, 0)    │                                                                                       
│__internal_compress_integra│                                                                                       
│     l_usmallint(#1, 0)    │                                                                                       
│   ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─   │                                                                                       
│           249960          │                                                                                       
│          (0.00s)          │                                                                                       
└─────────────┬─────────────┘                                                                                                                    
┌─────────────┴─────────────┐                                                                                       
│         PROJECTION        │                                                                                       
│   ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─   │                                                                                       
│        record_id_l        │                                                                                       
│        record_id_r        │                                                                                       
│   ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─   │                                                                                       
│           249960          │                                                                                       
│          (0.00s)          │                                                                                       
└─────────────┬─────────────┘                                                                                                                    
┌─────────────┴─────────────┐                                                                                       
│         HASH_JOIN         │                                                                                       
│   ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─   │                                                                                       
│           INNER           │                                                                                       
│        _key = _key        │                                                                                       
│   ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─   ├──────────────┐                                                                        
│         EC: 10364         │              │                                                                        
│   ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─   │              │                                                                        
│           249960          │              │                                                                        
│          (0.07s)          │              │                                                                        
└─────────────┬─────────────┘              │                                                                                                     
┌─────────────┴─────────────┐┌─────────────┴─────────────┐                                                          
│         PROJECTION        ││         PROJECTION        │                                                          
│   ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─   ││   ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─   │                                                          
│         record_id         ││         record_id         │                                                          
│            _key           ││            _key           │                                                          
│   ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─   ││   ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─   │                                                          
│           50000           ││           50000           │                                                          
│          (0.00s)          ││          (0.00s)          │                                                          
└─────────────┬─────────────┘└─────────────┬─────────────┘                                                                                       
┌─────────────┴─────────────┐┌─────────────┴─────────────┐                                                          
│           UNNEST          ││           UNNEST          │                                                          
│   ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─   ││   ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─   │                                                          
│           50000           ││           50000           │                                                          
│          (0.00s)          ││          (0.00s)          │                                                          
└─────────────┬─────────────┘└─────────────┬─────────────┘                                                                                       
┌─────────────┴─────────────┐┌─────────────┴─────────────┐                                                          
│         SEQ_SCAN          ││         SEQ_SCAN          │                                                          
│   ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─   ││   ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─   │                                                          
│             t             ││             t             │                                                          
│   ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─   ││   ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─   │                                                          
│         record_id         ││         record_id         │                                                          
│            vals           ││            vals           │                                                          
│   ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─   ││   ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─   │                                                          
│         EC: 10000         ││         EC: 10000         │                                                          
│   ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─   ││   ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─   │                                                          
│           10000           ││           10000           │                                                          
│          (0.00s)          ││          (0.00s)          │                                                          
└───────────────────────────┘└───────────────────────────┘  
NickCrews commented 4 months ago

closing this because there isn't an action here, I just needed to write down my experiments