bug: various NotImplementedError failures w/ an odd workaround

lostmygithubaccount commented 1 month ago

What happened?

This is not terribly important for what I'm doing, but I did find it surprising. I'm generating some synthetic data of ibis sightings, and I have it saved in a Parquet file:

import ibis

# Interactive mode: expressions are executed when they are displayed.
ibis.options.interactive = True

# Bind the table to `ibii`, the name the compile step below refers to.
ibii = ibis.read_parquet("ibii.parquet")

the schema is a timestamp + array of structs:

ibis.Schema {
  timestamp  timestamp(6)
  ibii       array<struct<species: string, name: string, height: float64, weight: float64, colors: array<string>, description: string, location: array<string>, reporter: struct<name: string, email: string, ip: string>>>
}

I just wanted to demonstrate that you could do operations on the table and export a Substrait plan:

# Compile the table expression into a Substrait plan.
from ibis_substrait.compiler.core import SubstraitCompiler

compiler = SubstraitCompiler()
proto = compiler.compile(ibii)  # this call raises NotImplementedError (see traceback)
proto

resulting in:

---------------------------------------------------------------------------
NotImplementedError                       Traceback (most recent call last)
File /Users/cody/repos/Icarus/eda.qmd:2
      [1](https://file+.vscode-resource.vscode-cdn.net/Users/cody/repos/Icarus/eda.qmd:1) compiler = SubstraitCompiler()
----> [2](https://file+.vscode-resource.vscode-cdn.net/Users/cody/repos/Icarus/eda.qmd:2) proto = compiler.compile(ibii)
      [3](https://file+.vscode-resource.vscode-cdn.net/Users/cody/repos/Icarus/eda.qmd:3) proto

File ~/repos/Icarus/.venv/lib/python3.12/site-packages/ibis_substrait/compiler/core.py:222, in SubstraitCompiler.compile(self, expr, **kwargs)
    [217](https://file+.vscode-resource.vscode-cdn.net/Users/cody/repos/Icarus/~/repos/Icarus/.venv/lib/python3.12/site-packages/ibis_substrait/compiler/core.py:217) from .translate import translate
    [219](https://file+.vscode-resource.vscode-cdn.net/Users/cody/repos/Icarus/~/repos/Icarus/.venv/lib/python3.12/site-packages/ibis_substrait/compiler/core.py:219) expr_schema = expr.schema()
    [220](https://file+.vscode-resource.vscode-cdn.net/Users/cody/repos/Icarus/~/repos/Icarus/.venv/lib/python3.12/site-packages/ibis_substrait/compiler/core.py:220) rel = stp.PlanRel(
    [221](https://file+.vscode-resource.vscode-cdn.net/Users/cody/repos/Icarus/~/repos/Icarus/.venv/lib/python3.12/site-packages/ibis_substrait/compiler/core.py:221)     root=stalg.RelRoot(
--> [222](https://file+.vscode-resource.vscode-cdn.net/Users/cody/repos/Icarus/~/repos/Icarus/.venv/lib/python3.12/site-packages/ibis_substrait/compiler/core.py:222)         input=translate(expr.op(), compiler=self, **kwargs),
    [223](https://file+.vscode-resource.vscode-cdn.net/Users/cody/repos/Icarus/~/repos/Icarus/.venv/lib/python3.12/site-packages/ibis_substrait/compiler/core.py:223)         names=translate(expr_schema).names,
    [224](https://file+.vscode-resource.vscode-cdn.net/Users/cody/repos/Icarus/~/repos/Icarus/.venv/lib/python3.12/site-packages/ibis_substrait/compiler/core.py:224)     )
    [225](https://file+.vscode-resource.vscode-cdn.net/Users/cody/repos/Icarus/~/repos/Icarus/.venv/lib/python3.12/site-packages/ibis_substrait/compiler/core.py:225) )
    [226](https://file+.vscode-resource.vscode-cdn.net/Users/cody/repos/Icarus/~/repos/Icarus/.venv/lib/python3.12/site-packages/ibis_substrait/compiler/core.py:226) ver = vparse(__substrait_version__)
    [227](https://file+.vscode-resource.vscode-cdn.net/Users/cody/repos/Icarus/~/repos/Icarus/.venv/lib/python3.12/site-packages/ibis_substrait/compiler/core.py:227) return stp.Plan(
    [228](https://file+.vscode-resource.vscode-cdn.net/Users/cody/repos/Icarus/~/repos/Icarus/.venv/lib/python3.12/site-packages/ibis_substrait/compiler/core.py:228)     version=stp.Version(
    [229](https://file+.vscode-resource.vscode-cdn.net/Users/cody/repos/Icarus/~/repos/Icarus/.venv/lib/python3.12/site-packages/ibis_substrait/compiler/core.py:229)         major_number=ver.major,
   (...)
    [256](https://file+.vscode-resource.vscode-cdn.net/Users/cody/repos/Icarus/~/repos/Icarus/.venv/lib/python3.12/site-packages/ibis_substrait/compiler/core.py:256)     relations=[rel],
    [257](https://file+.vscode-resource.vscode-cdn.net/Users/cody/repos/Icarus/~/repos/Icarus/.venv/lib/python3.12/site-packages/ibis_substrait/compiler/core.py:257) )

File ~/.local/share/uv/python/cpython-3.12.5-macos-aarch64-none/lib/python3.12/functools.py:907, in singledispatch.<locals>.wrapper(*args, **kw)
    [903](https://file+.vscode-resource.vscode-cdn.net/Users/cody/repos/Icarus/~/.local/share/uv/python/cpython-3.12.5-macos-aarch64-none/lib/python3.12/functools.py:903) if not args:
    [904](https://file+.vscode-resource.vscode-cdn.net/Users/cody/repos/Icarus/~/.local/share/uv/python/cpython-3.12.5-macos-aarch64-none/lib/python3.12/functools.py:904)     raise TypeError(f'{funcname} requires at least '
    [905](https://file+.vscode-resource.vscode-cdn.net/Users/cody/repos/Icarus/~/.local/share/uv/python/cpython-3.12.5-macos-aarch64-none/lib/python3.12/functools.py:905)                     '1 positional argument')
--> [907](https://file+.vscode-resource.vscode-cdn.net/Users/cody/repos/Icarus/~/.local/share/uv/python/cpython-3.12.5-macos-aarch64-none/lib/python3.12/functools.py:907) return dispatch(args[0].__class__)(*args, **kw)

File ~/repos/Icarus/.venv/lib/python3.12/site-packages/ibis_substrait/compiler/translate.py:55, in translate(*args, **kwargs)
     [53](https://file+.vscode-resource.vscode-cdn.net/Users/cody/repos/Icarus/~/repos/Icarus/.venv/lib/python3.12/site-packages/ibis_substrait/compiler/translate.py:53) @functools.singledispatch
     [54](https://file+.vscode-resource.vscode-cdn.net/Users/cody/repos/Icarus/~/repos/Icarus/.venv/lib/python3.12/site-packages/ibis_substrait/compiler/translate.py:54) def translate(*args: Any, **kwargs: Any) -> Any:
---> [55](https://file+.vscode-resource.vscode-cdn.net/Users/cody/repos/Icarus/~/repos/Icarus/.venv/lib/python3.12/site-packages/ibis_substrait/compiler/translate.py:55)     raise NotImplementedError(*args)

NotImplementedError: <ibis.expr.operations.relations.DatabaseTable object at 0x1371a87d0>

I can hack around this by creating an unbound table with the same schema:

# Unbound table that reuses the schema of `ibii` under the name "ts".
ts = ibis.table(schema=ibii.schema(), name="ts")
# expr = ts.group_by("timestamp").agg(ibis._.count())

# Compiling the unbound table succeeds where the Parquet-backed one failed.
compiler = SubstraitCompiler()
proto = compiler.compile(ts)
proto

and then basic operations (like the commented-out group_by shown above) just work

I can share the data file/code to generate it internally

What version of ibis-substrait are you using?

latest release

What substrait consumer(s) are you using, if any?

none

Relevant log output

No response

lostmygithubaccount commented 1 month ago

I'll just dump the code used to generate ibii here -- requires a few extra installs (faker mainly):

import os
import ibis
import ibis.selectors as s
import ibis.expr.datatypes as dt

from faker import Faker
from ibis_substrait.compiler.core import SubstraitCompiler

# Display settings: interactive mode with a compact preview.
ibis.options.interactive = True
ibis.options.repr.interactive.max_rows = 20
ibis.options.repr.interactive.max_length = 5

# Shared Faker instance used by the record-generating UDF.
fake = Faker()

# One timestamp per second over the last day.
lookback = ibis.interval(days=1)
step = ibis.interval(seconds=1)

source = (
    ibis.range(ibis.now() - lookback, ibis.now(), step=step)
    .unnest()
    .name("timestamp")
    .as_table()
    .mutate(
        index=ibis.row_number().over(order_by="timestamp"),
        # Columns "a", "b", "c": random() rescaled as 2 * (x - 0.5).
        **{letter: 2 * (ibis.random() - 0.5) for letter in "abc"},
    )
    .relocate("index", "timestamp")
    .order_by("timestamp")
)

# Ibis species names; `ibii_sighting` picks the `species` field of each
# fake record from this list.
species_choices = [
    "Glossy ibis",
    "Scarlet ibis",
    "American white ibis",
    "Australian white ibis",
    "African sacred ibis",
    "Green ibis",
    "Madagascan ibis",
    "Hadada ibis",
    "Bare-faced ibis",
    "Plumbeous ibis",
    "Puna ibis",
    "Buff-necked ibis",
]

# Struct datatype describing a single sighting record. `ibii_sighting` is
# declared to return an array of this struct, and builds dicts with these keys.
ibii_schema = dt.Struct(
    {
        "species": str,
        "name": str,
        "height": float,
        "weight": float,
        "colors": list[str],
        "description": str,
        "location": list[str],
        # Nested struct identifying who reported the sighting.
        "reporter": dt.Struct(
            {
                "name": str,
                "email": str,
                "ip": str,
            }
        ),
    }
)

@ibis.udf.scalar.python
def ibii_sighting(
    a: float,
    b: float,
    c: float,
) -> dt.Array(ibii_schema):
    """
    Build a batch of fake ibis sighting records.

    The number of records is drawn with ``fake.random_int(min=1, max=42)``.
    Every record is a dict with the keys ``species``, ``name``, ``height``,
    ``weight``, ``colors``, ``description``, ``location`` and ``reporter``.

    Args:
        a: ``species`` is filled from ``species_choices`` only when
            ``a >= 0.3``; otherwise it is ``None``.
        b: Averaged with ``c`` to choose the reporter's IP flavour.
        c: When ``(b + c) / 2 >= 0`` the reporter gets an IPv4 address,
            otherwise an IPv6 address.

    Returns:
        The list of generated record dicts.
    """
    sightings = []

    for _ in range(fake.random_int(min=1, max=42)):
        # Faker calls are kept in field order, one after another.
        if a >= 0.3:
            species = fake.random_element(elements=species_choices)
        else:
            species = None

        name = fake.first_name_nonbinary()
        height = fake.random_int(min=10, max=7500) / 100
        weight = fake.random_int(min=700, max=2700) / 1000

        color_count = fake.random_int(min=1, max=5)
        colors = [fake.color_name() for _ in range(color_count)]

        description = fake.sentence()
        location = fake.location_on_land()

        reporter_name = fake.name()
        reporter_email = fake.email()
        if (b + c) / 2 >= 0:
            reporter_ip = fake.ipv4()
        else:
            reporter_ip = fake.ipv6()

        sightings.append(
            {
                "species": species,
                "name": name,
                "height": height,
                "weight": weight,
                "colors": colors,
                "description": description,
                "location": location,
                "reporter": {
                    "name": reporter_name,
                    "email": reporter_email,
                    "ip": reporter_ip,
                },
            }
        )

    return sightings

filename = "ibii.parquet"

if os.path.exists(filename):
    # A previous run already wrote the data; load it from disk.
    ibii = ibis.read_parquet(filename)
else:
    # First run: generate the sightings, cache them, and write the Parquet file.
    ibii = source.select(
        "timestamp", ibii=ibii_sighting(source["a"], source["b"], source["c"])
    ).cache()
    ibii.to_parquet(filename, overwrite=True)

ibii

lostmygithubaccount commented 1 month ago

also if this only works on unbound tables I will feel silly

tokoko commented 1 month ago

It currently only works for unbound tables :smile:

gforsyth commented 1 month ago

Yeah -- we could make it work with DatabaseTable, now that that's also a more predictable thing.

lostmygithubaccount commented 1 month ago

thanks! perhaps good user feedback for improving the docs (though I also didn't read the docs so 😂 maybe this is already clear)

ibis-project / ibis-substrait