pola-rs / polars

Dataframes powered by a multithreaded, vectorized query engine, written in Rust
https://docs.pola.rs
Other
28.14k stars 1.74k forks source link

Entered unreachable code on valid SQL #17381

Open guilherme-wisdom opened 2 weeks ago

guilherme-wisdom commented 2 weeks ago

Checks

Reproducible example

from datetime import datetime

import polars as pl

users = pl.DataFrame(
    schema=["id", "email"],
    data=[
        ("1", "john.doe@company.com"),
        ("2", "jane.doe@company.com"),
        ("3", "doe.smith@company.com"),
        ("4", "emily.johnsom@company.com"),
        ("5", "alex.brown@company.com"),
        ("6", "michael.davies@company.com"),
    ],
)

user_groups = pl.DataFrame(
    schema=["user_id", "group_id"],
    data=[
        ("1", "1"),
        ("2", "1"),
        ("3", "2"),
        ("4", "2"),
        ("5", "3"),
        ("6", "4"),
    ],
)

group_group = pl.DataFrame(
    schema=["parent_id", "child_id"],
    data=[
        ("3", "1"),
        ("3", "2"),
        ("4", "3"),
    ],
)

deals = pl.DataFrame(
    schema=[
        "id",
        "region",
        "owner_id",
        "product",
        "date",
        "units",
        "price",
    ],
    data=[
        (
            1,
            "East",
            "1",
            "Tee",
            datetime(year=2005, month=1, day=27, hour=1, minute=20),
            12,
            11.04,
        ),
        (
            2,
            "East",
            "1",
            "Golf",
            datetime(year=2005, month=1, day=27, hour=2, minute=30),
            12,
            13.00,
        ),
        (
            3,
            "East",
            "1",
            "Fancy",
            datetime(year=2005, month=1, day=27, hour=3, minute=40),
            12,
            11.96,
        ),
        (4, "East", "2", "Tee", datetime(year=2005, month=1, day=28), 10, 11.27),
        (5, "East", "2", "Golf", datetime(year=2005, month=1, day=28), 10, 12.12),
        (6, "East", "2", "Fancy", datetime(year=2005, month=1, day=29), 10, 13.74),
        (7, "West", "3", "Tee", datetime(year=2005, month=1, day=29), 11, 11.44),
        (8, "West", "3", "Golf", datetime(year=2005, month=1, day=29), 11, 12.63),
        (9, "West", "3", "Fancy", datetime(year=2005, month=1, day=30), 12, 12.06),
        (10, "West", "4", "Tee", datetime(year=2005, month=1, day=31), 12, 13.42),
        (11, "West", "4", "Golf", datetime(year=2005, month=1, day=31), 11, 11.48),
        (12, "West", "5", "Fancy", datetime(year=2005, month=1, day=31), 11, 11.48),
    ],
)

ctx = pl.SQLContext({
    "users": users,
    "user_groups": user_groups,
    "group_group": group_group,
    "deals": deals,
})

a = ctx.execute(
    'WITH "user_by_email" AS (SELECT "users"."id" FROM "users" WHERE'
    ' "email" IN (\'john.doe@company.com\')), "user_child" AS (SELECT'
    ' "user_groups"."user_id" AS "input_user_id",'
    ' "group_group"."child_id" FROM "user_groups" INNER JOIN'
    ' "user_by_email" ON "user_groups"."user_id" = "user_by_email"."id" INNER'
    ' JOIN "group_group" ON "user_groups"."group_id" ='
    ' "group_group"."parent_id"), "deals_authz" AS (SELECT * FROM "deals"'
    ' WHERE (("deals"."owner_id" IN (SELECT "users"."id" FROM "users"'
    ' WHERE "email" IN (\'john.doe@company.com\'))) OR ("deals"."owner_id" IN'
    ' (SELECT DISTINCT "right"."user_id" FROM "user_groups" AS "left" INNER'
    ' JOIN "user_by_email" ON "user_groups"."user_id" = "user_by_email"."id"'
    ' INNER JOIN "user_groups" AS "right" ON "left"."group_id" ='
    ' "right"."group_id"))) OR ("deals"."owner_id" IN (SELECT DISTINCT'
    ' "user_groups"."user_id" FROM "user_groups" INNER JOIN'
    ' "user_child" ON "user_groups"."group_id" = "user_child"."child_id")))'
    ' SELECT "id" FROM "deals_authz" AS "deals" ORDER BY "id" ASC'
).collect()

Log output

% POLARS_VERBOSE=1 RUST_BACKTRACE=1 PYTHONPATH=python:$PYTHONPATH python 'python/wisdom/core/dataframe_transforms/tests/repro.py'
found multiple sources; run comm_subplan_elim
thread '<unnamed>' panicked at crates/polars-plan/src/logical_plan/optimizer/predicate_pushdown/mod.rs:711:24:
internal error: entered unreachable code
stack backtrace:
   0: _rust_begin_unwind
   1: core::panicking::panic_fmt
   2: core::panicking::panic
   3: polars_plan::logical_plan::optimizer::predicate_pushdown::PredicatePushDown::push_down::{{closure}}
   4: polars_plan::logical_plan::optimizer::predicate_pushdown::PredicatePushDown::push_down
   5: polars_plan::logical_plan::optimizer::predicate_pushdown::PredicatePushDown::push_down::{{closure}}
   6: polars_plan::logical_plan::optimizer::predicate_pushdown::PredicatePushDown::push_down
   7: polars_plan::logical_plan::optimizer::predicate_pushdown::PredicatePushDown::pushdown_and_continue
   8: polars_plan::logical_plan::optimizer::predicate_pushdown::PredicatePushDown::push_down::{{closure}}
   9: polars_plan::logical_plan::optimizer::predicate_pushdown::PredicatePushDown::push_down
  10: polars_plan::logical_plan::optimizer::predicate_pushdown::PredicatePushDown::pushdown_and_assign
  11: polars_plan::logical_plan::optimizer::predicate_pushdown::PredicatePushDown::push_down::{{closure}}
  12: polars_plan::logical_plan::optimizer::predicate_pushdown::PredicatePushDown::push_down
  13: polars_plan::logical_plan::optimizer::predicate_pushdown::PredicatePushDown::pushdown_and_continue
  14: polars_plan::logical_plan::optimizer::predicate_pushdown::PredicatePushDown::push_down::{{closure}}
  15: polars_plan::logical_plan::optimizer::predicate_pushdown::PredicatePushDown::push_down
  16: polars_plan::logical_plan::optimizer::cache_states::set_cache_states
  17: polars_plan::logical_plan::optimizer::optimize
  18: polars_lazy::frame::LazyFrame::optimize_with_scratch
  19: polars_lazy::frame::LazyFrame::collect
  20: polars::lazyframe::PyLazyFrame::__pymethod_collect__
  21: pyo3::impl_::trampoline::trampoline
  22: polars::lazyframe::_::__INVENTORY::trampoline
  23: _method_vectorcall_VARARGS_KEYWORDS
  24: __PyEval_EvalFrameDefault
  25: _PyEval_EvalCode
  26: _run_eval_code_obj
  27: _run_mod
  28: _pyrun_file
  29: __PyRun_SimpleFileObject
  30: __PyRun_AnyFileObject
  31: _pymain_run_file_obj
  32: _pymain_run_file
  33: _Py_RunMain
  34: _Py_BytesMain
note: Some details are omitted, run with `RUST_BACKTRACE=full` for a verbose backtrace.
Traceback (most recent call last):
  File "/Users/guilherme/Workspace/wisdom/python/wisdom/core/dataframe_transforms/tests/repro.py", line 112, in <module>
    ).collect()
      ^^^^^^^^^
  File "/Users/guilherme/Workspace/wisdom/.venv/lib/python3.11/site-packages/polars/lazyframe/frame.py", line 1967, in collect
    return wrap_df(ldf.collect(callback))
                   ^^^^^^^^^^^^^^^^^^^^^
pyo3_runtime.PanicException: internal error: entered unreachable code

Issue description

My application is failing on the example provided. I could not pinpoint exactly what is the combination that is causing the issue. The outer query has three OR operands and if I remove the last OR component it works:

    'WITH "user_by_email" AS (SELECT "users"."id" FROM "users" WHERE'
    ' "email" IN (\'john.doe@company.com\')), "user_child" AS (SELECT'
    ' "user_groups"."user_id" AS "input_user_id",'
    ' "group_group"."child_id" FROM "user_groups" INNER JOIN'
    ' "user_by_email" ON "user_groups"."user_id" = "user_by_email"."id" INNER'
    ' JOIN "group_group" ON "user_groups"."group_id" ='
    ' "group_group"."parent_id"), "deals_authz" AS (SELECT * FROM "deals"'
    ' WHERE (("deals"."owner_id" IN (SELECT "users"."id" FROM "users"'
    ' WHERE "email" IN (\'john.doe@company.com\'))) OR ("deals"."owner_id" IN'
    ' (SELECT DISTINCT "right"."user_id" FROM "user_groups" AS "left" INNER'
    ' JOIN "user_by_email" ON "user_groups"."user_id" = "user_by_email"."id"'
    ' INNER JOIN "user_groups" AS "right" ON "left"."group_id" ='
    ' "right"."group_id"))))'
    ' SELECT "id" FROM "deals_authz" AS "deals" ORDER BY "id" ASC'

So it has something to do with that last component, or how that component interacts with everything else.

I also ran this against the newest Polars 1.0.0 and it also failed.

Expected behavior

The right output is

1 2 3 4 5 6

Installed versions

``` --------Version info--------- Polars: 0.20.31 Index type: UInt32 Platform: macOS-13.3-arm64-arm-64bit Python: 3.11.7 (main, Dec 4 2023, 18:10:11) [Clang 15.0.0 (clang-1500.1.0.2.5)] ----Optional dependencies---- adbc_driver_manager: cloudpickle: connectorx: 0.3.2 deltalake: fastexcel: fsspec: 2023.6.0 gevent: hvplot: matplotlib: 3.8.4 nest_asyncio: 1.6.0 numpy: 1.26.1 openpyxl: pandas: 2.0.3 pyarrow: 10.0.1 pydantic: 1.10.13 pyiceberg: pyxlsb: 1.0.10 sqlalchemy: 2.0.13 torch: 2.0.1 xlsx2csv: xlsxwriter: ```
ritchie46 commented 2 weeks ago

pinging @alexander-beedie here.

henryharbeck commented 2 weeks ago

@alexander-beedie , potentially simpler repro / root-cause. I know it is not the same error, but kept trying to reduce the issue and got here. If you think it is a separate issue, I will create one

import polars as pl
pl.__version__ # 1.0.0

users = pl.DataFrame({"id": "1"})
user_groups = pl.DataFrame({"user_id": "1"})

pl.sql("""
WITH user_by_email AS (SELECT id FROM users)

SELECT user_groups.user_id AS input_user_id
FROM user_groups
INNER JOIN user_by_email ON user_groups.user_id = user_by_email.id
-- users    <--- if you remove this line, you will get an error
-- It is commented out, but it must have to do with global variable discovery
"""
).collect()
# Raises SQLInterfaceError: relation 'users' was not found when `-- users` line is removed

EDIT: separate issue, I will raise one, please ignore

henryharbeck commented 2 weeks ago

Tidier repro

# removing the email field fixes it
users = pl.DataFrame({"id": "1", "email": "abc"}) 
user_groups = pl.DataFrame({"id": "1"})
group_group = pl.DataFrame({"id": "1"})
deals = pl.DataFrame({"id": "1"})

ctx = pl.SQLContext({
    "users": users,
    "user_groups": user_groups,
    "group_group": group_group,
    "deals": deals,
})

ctx.execute("""
    WITH user_by_email AS (SELECT id FROM users), 
    user_child AS (
        SELECT group_group.id 
        FROM user_groups
        -- removing this join fixes it
        INNER JOIN user_by_email ON user_groups.id = user_by_email.id 
        INNER JOIN group_group ON user_groups.id = group_group.id
    )
    SELECT * 
    FROM deals 
    WHERE (
        deals.id IN (
            SELECT id 
            FROM users 
            -- removing the below line changes the error from 
            -- `PanicException: internal error: entered unreachable code` to
            -- `PanicException: called `Option::unwrap()` on a `None` value`
            WHERE id = '1'
        )
        OR deals.id IN (
            SELECT DISTINCT user_groups.id 
            FROM user_groups AS left
            -- removing this join also fixes it
            INNER JOIN user_by_email ON user_groups.id = user_by_email.id 
        ) 
        OR deals.id IN (
            SELECT DISTINCT user_groups.id 
            FROM user_groups 
            -- removing this join also fixes it
            INNER JOIN user_child ON user_groups.id = user_child.id
        )
    )
    """,
    eager=True
)
# PanicException: internal error: entered unreachable code

And have been still able to get a panic (thought not the same "entered unreachable code" error) with a smaller repro again

a = pl.DataFrame({"id": "1"})

ctx = pl.SQLContext({"a": a})

ctx.execute("""
    -- changing this from `SELECT a.id` to `SELECT id` fixes it
    -- even though this CTE is never used
    -- removing the CTE also fixes the issue
    WITH c AS (SELECT a.id FROM a)

    SELECT *
    FROM a 
    WHERE id IN (
        SELECT id 
        FROM a
        -- removing this join fixes the issue
        INNER JOIN a AS a2 ON a.id = a2.id
    )
    """,
    eager=True
)
# PanicException: called `Option::unwrap()` on a `None` value
cmdlineluser commented 2 weeks ago

I did reproduce the unreachable code error, but it happens elsewhere so not entirely sure if it is the same underlying issue.

users = groups = deals = pl.DataFrame({"id": [1]})

pl.sql("""
with 
   A as ( select id from groups join users using (id) )
select * from deals
where
   id in ( select id from A join A as B using (id) )
""").collect()

# thread '<unnamed>' panicked at crates/polars-plan/src/plans/ir/schema.rs:106:24:
# internal error: entered unreachable code
ritchie46 commented 2 weeks ago

@alexander-beedie I see that in the example above by @cmdlineluser it creates an external context. I really suspect this to be the culprit. Can we use horizontal concat here instead?

alexander-beedie commented 2 weeks ago

@alexander-beedie I see that in the example above by @cmdlineluser it creates an external context. I really suspect this to be the culprit. Can we use horizontal concat here instead?

I'll find out (though I'm on vacation at the moment so can't dig in properly yet - am only able to sneak the time for small PRs at the moment 😉)

ritchie46 commented 2 weeks ago

Oh yeah.. enjoy!