ibis-project / ibis

the portable Python dataframe library
https://ibis-project.org
Apache License 2.0
4.5k stars 548 forks source link

bug: ibis.decompile(expr) returns SQL code #8597

Open lostmygithubaccount opened 3 months ago

lostmygithubaccount commented 3 months ago

What happened?

feel free to subsume this issue w/ a general one for improving the decompiler

import ibis

con = ibis.connect("duckdb://imdb.ddb")
ex_str = "imdb"

for name, obj in inspect.getmembers(ibis.examples):
    if ex_str in name:
        con.create_table(name, obj.fetch().to_pyarrow())

con = ibis.connect("duckdb://imdb.ddb")
con.list_tables()
['imdb_name_basics',
 'imdb_title_akas',
 'imdb_title_basics',
 'imdb_title_crew',
 'imdb_title_episode',
 'imdb_title_principals',
 'imdb_title_ratings']
# SQL from Ibis Birdbrain
sql = res.attachments[0].open()
print(sql)
SELECT 
  tb.tconst, 
  tb.titleType, 
  tb.primaryTitle, 
  tb.originalTitle, 
  tb.isAdult, 
  tb.startYear, 
  tb.endYear, 
  tb.runtimeMinutes, 
  tb.genres, 
  tr.averageRating, 
  tr.numVotes
FROM imdb_title_basics AS tb
JOIN imdb_title_ratings AS tr ON tb.tconst = tr.tconst
JOIN imdb_title_principals AS tp ON tp.tconst = tb.tconst
JOIN imdb_name_basics AS nb ON nb.nconst = tp.nconst
WHERE tb.titleType = 'movie'
  AND nb.primaryName = 'Johnny Depp'
  AND tr.numVotes ">=" 10000
ORDER BY tr.averageRating DESC
t = con.sql(sql)
t
SQLQueryResult
  query:
    SELECT tb.tconst, tb.titleType, tb.primaryTitle, tb.originalTitle, tb.isAdult, …
  schema:
    tconst         string
    titleType      string
    primaryTitle   string
    originalTitle  string
    isAdult        int64
    startYear      int64
    endYear        string
    runtimeMinutes int64
    genres         string
    averageRating  float64
    numVotes       int64

decompile the expression to Ibis code:

ibis_code = ibis.decompile(t)
print(ibis_code)
import ibis

result = SELECT 
  tb.tconst, 
  tb.titleType, 
  tb.primaryTitle, 
  tb.originalTitle, 
  tb.isAdult, 
  tb.startYear, 
  tb.endYear, 
  tb.runtimeMinutes, 
  tb.genres, 
  tr.averageRating, 
  tr.numVotes
FROM imdb_title_basics AS tb
JOIN imdb_title_ratings AS tr ON tb.tconst = tr.tconst
JOIN imdb_title_principals AS tp ON tb.tconst = tp.tconst
JOIN imdb_name_basics AS nb ON nb.nconst = tp.nconst
WHERE tb.titleType = 'movie'
  AND nb.primaryName = 'Johnny Depp'
  AND tr.numVotes >= 10000
ORDER BY tr.averageRating DESC.s_q_l_query_result(schema=ibis.Schema {
  tconst          string
  titleType       string
  primaryTitle    string
  originalTitle   string
  isAdult         int64
  startYear       int64
  endYear         string
  runtimeMinutes  int64
  genres          string
  averageRating   float64
  numVotes        int64
}, source=<ibis.backends.duckdb.Backend object at 0x14880abd0>)

What version of ibis are you using?

main

What backend(s) are you using, if any?

duckdb

Relevant log output

No response

Code of Conduct

lostmygithubaccount commented 3 months ago

mapping out how I got here with a little more context:

[ins] In [1]: import ibis

[ins] In [2]: con = ibis.connect("duckdb://penguins.ddb")

[ins] In [3]: con.create_table("penguins", ibis.examples.penguins.fetch().to_pyarrow(), overwrite=True)
Out[3]:
DatabaseTable: penguins
  species           string
  island            string
  bill_length_mm    float64
  bill_depth_mm     float64
  flipper_length_mm int64
  body_mass_g       int64
  sex               string
  year              int64

[ins] In [4]: from ibis.expr.sql import parse_sql

[ins] In [5]: sql = "select species, island, count(*) as count from penguins"

[ins] In [6]: expr = parse_sql(sql, catalog={con.table(table).get_name(): con.table(table).schema() for table in con.list_tables()})
---------------------------------------------------------------------------
KeyError                                  Traceback (most recent call last)
Cell In[6], line 1
----> 1 expr = parse_sql(sql, catalog={con.table(table).get_name(): con.table(table).schema() for table in con.list_tables()})

File ~/repos/ibis/ibis/expr/sql.py:307, in parse_sql(sqlstring, catalog, dialect)
    304 tree = sgo.optimize(expr, catalog.to_sqlglot(), rules=sgo.RULES)
    305 plan = sgp.Plan(tree)
--> 307 return convert(plan.root, catalog=catalog)

File ~/.local/share/mise/installs/python/3.11/lib/python3.11/functools.py:909, in singledispatch.<locals>.wrapper(*args, **kw)
    905 if not args:
    906     raise TypeError(f'{funcname} requires at least '
    907                     '1 positional argument')
--> 909 return dispatch(args[0].__class__)(*args, **kw)

File ~/repos/ibis/ibis/expr/sql.py:163, in convert_aggregate(agg, catalog)
    161 table = catalog[agg.source]
    162 if agg.aggregations:
--> 163     metrics = [convert(a, catalog=catalog) for a in agg.aggregations]
    164     groups = [convert(g, catalog=catalog) for k, g in agg.group.items()]
    165     table = table.aggregate(metrics, by=groups)

File ~/repos/ibis/ibis/expr/sql.py:163, in <listcomp>(.0)
    161 table = catalog[agg.source]
    162 if agg.aggregations:
--> 163     metrics = [convert(a, catalog=catalog) for a in agg.aggregations]
    164     groups = [convert(g, catalog=catalog) for k, g in agg.group.items()]
    165     table = table.aggregate(metrics, by=groups)

File ~/.local/share/mise/installs/python/3.11/lib/python3.11/functools.py:909, in singledispatch.<locals>.wrapper(*args, **kw)
    905 if not args:
    906     raise TypeError(f'{funcname} requires at least '
    907                     '1 positional argument')
--> 909 return dispatch(args[0].__class__)(*args, **kw)

File ~/repos/ibis/ibis/expr/sql.py:194, in convert_alias(alias, catalog)
    192 @convert.register(sge.Alias)
    193 def convert_alias(alias, catalog):
--> 194     this = convert(alias.this, catalog=catalog)
    195     return this.name(alias.alias)

File ~/.local/share/mise/installs/python/3.11/lib/python3.11/functools.py:909, in singledispatch.<locals>.wrapper(*args, **kw)
    905 if not args:
    906     raise TypeError(f'{funcname} requires at least '
    907                     '1 positional argument')
--> 909 return dispatch(args[0].__class__)(*args, **kw)

File ~/repos/ibis/ibis/expr/sql.py:268, in convert_sum(reduction, catalog)
    265 @convert.register(sge.AggFunc)
    266 def convert_sum(reduction, catalog):
    267     method = _reduction_methods[type(reduction)]
--> 268     this = convert(reduction.this, catalog=catalog)
    269     return getattr(this, method)()

File ~/.local/share/mise/installs/python/3.11/lib/python3.11/functools.py:909, in singledispatch.<locals>.wrapper(*args, **kw)
    905 if not args:
    906     raise TypeError(f'{funcname} requires at least '
    907                     '1 positional argument')
--> 909 return dispatch(args[0].__class__)(*args, **kw)

File ~/repos/ibis/ibis/expr/sql.py:200, in convert_column(column, catalog)
    198 @convert.register(sge.Column)
    199 def convert_column(column, catalog):
--> 200     table = catalog[column.table]
    201     return table[column.name]

KeyError: ''

[ins] In [7]: sql = "select species, island from penguins"

[ins] In [8]: expr = parse_sql(sql, catalog={con.table(table).get_name(): con.table(table).schema() for table in con.list_tables()})

[ins] In [9]: print(ibis.decompile(expr))
import ibis

penguins = ibis.table(name='penguins', schema={'species': 'string', 'island': 'string', 'bill_length_mm': 'float64', 'bill_depth_mm': 'float64', 'flipper_length_mm': 'int64', 'body_mass_g': 'int64', 'sex': 'string', 'year': 'int64'})

result = penguins.select(penguins.species, penguins.island)