Closed lostmygithubaccount closed 1 month ago
I'll just dump the code used to generate `ibii` here -- it requires a few extra installs (mainly faker):
import os
import ibis
import ibis.selectors as s
import ibis.expr.datatypes as dt
from faker import Faker
from ibis_substrait.compiler.core import SubstraitCompiler
# Turn on ibis interactive mode and set the limits used by the interactive repr.
# NOTE(review): max_rows presumably caps the number of displayed rows and
# max_length the number of displayed elements of nested values -- confirm
# against the ibis options documentation.
ibis.options.interactive = True
ibis.options.repr.interactive.max_rows = 20
ibis.options.repr.interactive.max_length = 5
# Shared Faker instance; ibii_sighting uses it to produce every fake field value.
fake = Faker()
# Time window covered by the synthetic table: the last day, one row per second.
lookback = ibis.interval(days=1)
step = ibis.interval(seconds=1)

# Columns "a", "b" and "c", each defined as 2 * (ibis.random() - 0.5).
random_columns = {letter: 2 * (ibis.random() - 0.5) for letter in "abc"}

# Range from (now - lookback) to now in `step` increments, unnested so that
# every element becomes its own row in a single "timestamp" column.
timestamps = ibis.range(ibis.now() - lookback, ibis.now(), step=step)

source = (
    timestamps.unnest()
    .name("timestamp")
    .as_table()
    .mutate(
        # Row number over the timestamp ordering.
        index=ibis.row_number().over(order_by="timestamp"),
        **random_columns,
    )
    .relocate("index", "timestamp")
    .order_by("timestamp")
)
# Common names of ibis species. ibii_sighting picks one of these at random
# (via fake.random_element) for the "species" field of a generated record.
species_choices = [
"Glossy ibis",
"Scarlet ibis",
"American white ibis",
"Australian white ibis",
"African sacred ibis",
"Green ibis",
"Madagascan ibis",
"Hadada ibis",
"Bare-faced ibis",
"Plumbeous ibis",
"Puna ibis",
"Buff-necked ibis",
]
# Struct type describing a single sighting record. ibii_sighting is declared
# to return an array of this struct, and the keys here match the keys of the
# dicts it builds.
# Field types are written as plain Python type hints.
# NOTE(review): presumably ibis coerces these hints to its own data types --
# confirm against the ibis datatypes documentation.
ibii_schema = dt.Struct(
{
"species": str,
"name": str,
"height": float,
"weight": float,
"colors": list[str],
"description": str,
"location": list[str],
# Nested struct describing who reported the sighting.
"reporter": dt.Struct(
{
"name": str,
"email": str,
"ip": str,
}
),
}
)
@ibis.udf.scalar.python
def ibii_sighting(
    a: float,
    b: float,
    c: float,
) -> dt.Array(ibii_schema):
    """Generate a randomly sized batch of fake ibis sighting records.

    Every field value comes from the module-level ``fake`` Faker instance;
    the three inputs only steer two of the fields.

    Args:
        a: When ``a >= 0.3`` each record gets a species drawn from
            ``species_choices``; otherwise ``species`` is ``None`` for the
            whole batch.
        b: Averaged with ``c`` to choose the reporter's IP flavour.
        c: See ``b``. When ``(b + c) / 2 >= 0`` the reporter IP comes from
            ``fake.ipv4()``, otherwise from ``fake.ipv6()``.

    Returns:
        A list of dicts keyed like ``ibii_schema``. The list length is
        whatever ``fake.random_int(min=1, max=42)`` yields.
    """
    record_count = fake.random_int(min=1, max=42)
    # `a` is constant for the call, so the species decision is the same for
    # every record in the batch.
    has_species = a >= 0.3

    sightings = []
    for _ in range(record_count):
        # NOTE: the order of the fake.* calls below mirrors the field order
        # of the record so the random stream is consumed deterministically.
        species = (
            fake.random_element(elements=species_choices) if has_species else None
        )
        name = fake.first_name_nonbinary()
        height = fake.random_int(min=10, max=7500) / 100
        weight = fake.random_int(min=700, max=2700) / 1000

        color_count = fake.random_int(min=1, max=5)
        colors = [fake.color_name() for _ in range(color_count)]

        description = fake.sentence()
        # NOTE(review): location_on_land() presumably returns a tuple of
        # strings rather than a list -- confirm against the Faker docs if the
        # backend is strict about the declared list[str] type.
        location = fake.location_on_land()

        reporter = {
            "name": fake.name(),
            "email": fake.email(),
            "ip": fake.ipv4() if ((b + c) / 2) >= 0 else fake.ipv6(),
        }

        sightings.append(
            {
                "species": species,
                "name": name,
                "height": height,
                "weight": weight,
                "colors": colors,
                "description": description,
                "location": location,
                "reporter": reporter,
            }
        )
    return sightings
# Generate the data set once and cache it on disk; later runs just read it.
filename = "ibii.parquet"
if os.path.exists(filename):
    # A previous run already wrote the file: load it instead of regenerating.
    ibii = ibis.read_parquet(filename)
else:
    # Apply the UDF to the three random columns of `source`, keep only the
    # timestamp next to the generated array, and cache the result before
    # writing it out.
    sightings = ibii_sighting(source["a"], source["b"], source["c"])
    ibii = source.select("timestamp", ibii=sightings).cache()
    ibii.to_parquet(filename, overwrite=True)
# Bare expression: displays the table when run in a notebook/REPL cell.
ibii
also if this only works on unbound tables I will feel silly
It currently only works for unbound tables :smile:
Yeah -- we could make it work with `DatabaseTable`, now that that's also a more predictable thing.
thanks! perhaps good user feedback for improving the docs (though I also didn't read the docs so 😂 maybe this is already clear)
What happened?
this is not terribly important for what I'm doing, but I did find it surprising. I'm generating some synthetic data of ibis spottings. I have it saved in a Parquet file:
the schema is a timestamp + array of structs:
I just wanted to demonstrate that you could do operations on the table and export a Substrait plan:
resulting in:
I can hack around this by creating an unbound (?) table w/ the same schema:
and then basic operations (like the group_by shown) just work
I can share the data file/code to generate it internally
What version of ibis-substrait are you using?
latest release
What substrait consumer(s) are you using, if any?
none
Relevant log output
No response