opendp / opendp

The core library of differential privacy algorithms powering the OpenDP Project.
https://opendp.org
MIT License
284 stars 46 forks source link

Polars: filter #1534

Open Shoeboxam opened 2 weeks ago

Shoeboxam commented 2 weeks ago

Closes #899. Example usage:

import opendp.prelude as dp
import polars as pl

# TODO: a version of this should be in the library proper
def placeholder(schema):
    """Build an empty LazyFrame that carries only the given schema.

    No data is passed to the constructor, so the result can be used to
    describe a query plan without touching the real data.

    :param schema: column schema forwarded to ``pl.DataFrame``
        (the example passes ``lf.schema``)
    :return: the empty frame after calling ``.lazy()`` on it
    """
    empty_frame = pl.DataFrame(data=None, schema=schema, orient="row")
    return empty_frame.lazy()

# Opt in to the "contrib" feature flag before using the API below.
dp.enable_features("contrib")

# Example data: four columns of 50 rows each; the last value of "C" is None.
example_columns = [
    pl.Series("A", [1.0] * 50, dtype=pl.Float64),
    pl.Series("B", [1, 2, 3, 4, 5] * 10, dtype=pl.Int32),
    pl.Series("C", ["1"] * 49 + [None], dtype=pl.String),
    pl.Series("D", [2] * 50, dtype=pl.Int32),
]
lf = pl.LazyFrame(example_columns)

# Describe each column of the lazyframe by its domain (no data involved).
# "C" is wrapped in an option domain; the other columns use plain atom domains.
column_domains = [
    dp.series_domain("A", dp.atom_domain(T=dp.f64)),
    dp.series_domain("B", dp.atom_domain(T=dp.i32)),
    dp.series_domain("C", dp.option_domain(dp.atom_domain(T=dp.String))),
    dp.series_domain("D", dp.atom_domain(T=dp.i32)),
]
lf_domain = dp.lazyframe_domain(column_domains)

# TODO: support imputation, then replace the above with this shortened version:
# lf_domain = dp.infer_lazyframe_domain(lf)

# Attach properties of the data when grouped by "C".
lf_domain = dp.with_margin(
    lf_domain,
    by=["C"],
    public_info="keys",
    max_partition_length=50,
)

# USER STARTS HERE
# The plan is written against an empty frame that only carries lf's schema.
plan_base = placeholder(lf.schema)
# Keep rows where "A" is less than "B".
row_filter = pl.col("A") < pl.col("B")
# Sum of "D" through the `dp` expression namespace, with bounds and scale.
dp_sum_of_d = pl.col("D").dp.sum(bounds=(1, 5), scale=1)

filtered = plan_base.filter(row_filter)
grouped = filtered.group_by("C")
proposed_plan = grouped.agg(dp_sum_of_d)

# Context API to look like this:
# release = (
#     context.query()
#     .filter(pl.col("A") < pl.col("B"))
#     .group_by("C")
#     .agg(pl.col("D").dp.sum(bounds=(1, 5), scale=1))
#     .release()
# )

# IN SERVER
# Metric and measure handed to the constructor, named for readability.
input_metric = dp.symmetric_distance()
output_measure = dp.max_divergence(T=float)

# Build the measurement from the domain descriptors and the proposed plan.
m_lf = dp.m.make_private_lazyframe(
    input_domain=lf_domain,
    input_metric=input_metric,
    output_measure=output_measure,
    lazyframe=proposed_plan,
)

# Invoke the measurement on the actual data, then collect the result.
lazy_release = m_lf(lf)
df_release = lazy_release.collect()

print(df_release)
Shoeboxam commented 2 weeks ago

[!WARNING] This pull request is not mergeable via GitHub because a downstack PR is open. Once all requirements are satisfied, merge this PR as a stack on Graphite. Learn more

This stack of pull requests is managed by Graphite. Learn more about stacking.

Join @Shoeboxam and the rest of your teammates on Graphite.