pola-rs / polars

Dataframes powered by a multithreaded, vectorized query engine, written in Rust
https://docs.pola.rs
Other
28.23k stars 1.75k forks source link

Polars issue with optimizer (CSE) #16138

Closed rlartiga closed 2 months ago

rlartiga commented 2 months ago

Checks

Reproducible example

holdings.csv

import polars as pl

print(pl.__version__)

holdings = pl.read_csv("holdings.csv")

usd = ["USD"]
clp = ["CLP"]
eur = ["EUR"]
uf = ["CLF", "UF"]

# usd = pl.Series(values=usd, dtype=pl.Utf8)
# clp = pl.Series(values=clp, dtype=pl.Utf8)
# eur = pl.Series(values=eur, dtype=pl.Utf8)
# uf = pl.Series(values=uf, dtype=pl.Utf8)

asia = [
    "Australia",
    "Hong Kong",
    "Nueva Zelanda",
    "Singapur",
    "China",
    "Corea del Sur",
    "Filipinas",
    "India",
    "Indonesia",
    "Malasia",
    "Tailandia",
    "Taiwan",
    "Vietnam",
    "Japon",
    "Japón",
    "Israel",
    "Pakistán",
    "Pakistan",
    "Bahrain",
    "Brunei",
    "Iraq",
    "Jordan",
    "Kazakhstan",
    "Kuwait",
    "Lebanon",
    "Macau",
    "Malaysia",
    "Mongolia",
    "Oman",
    "Philippines",
    "Qatar",
    "Arabia Saudita",
    "Singapur",
    "Sri Lanka",
    "Thailand",
    "Vietnam",
    "Emiratos Arabes",
]

asia = pl.Series(values=asia, dtype=pl.Utf8)

local_fixed_income = [
    "DPC",
    "DPL",
    "BB",
    "BE",
    "PE",
    "PDBC",
    "BCU",
    "BTU",
    "BCP",
    "BTP",
    "BS",
    "BNEE",
    "BBNEE",
    "PRC",
    "LH",
    "BEE",
    "BVL",
    "BU",
    "OTROF",
    "BBFE",
]
rvi = ["CFME", "ETFA", "ETFB"]

rvi = pl.Series(values=rvi, dtype=pl.Utf8)

rvl = [
    "RVL",
    "RVL_AERO",
    "RVL_ALIM",
    "RVL_AZUC",
    "RVL_BAN",
    "RVL_BEB",
    "RVL_CEM",
    "RVL_CONG",
    "RVL_CONST",
    "RVL_CONS",
    "DER_PREF",
    "DPFI",
    "DPFIE",
    "DPSAE",
    "DPSA",
    "RVL_ELAT",
    "RVL_ELOC",
    "RVL_ENTR",
    "RVL_FIN",
    "RVL_HLD",
    "RVL_IND",
    "RVL_TELECO",
    "RVL_MET",
    "RVL_MIN",
    "RVL_PYC",
    "RVL_PESCA",
    "RVL_RE",
]

rvl = pl.Series(values=rvl, dtype=pl.Utf8)

nemos = ["CFI-GRBCIE", "CFI-SEBCIE", "CFI-PGTPBE"]

nemos = pl.Series(values=nemos, dtype=pl.Utf8)

latam = [
    "Argentina",
    "Brasil",
    "Colombia",
    "México",
    "Panamá",
    "Perú",
    "Venezuela",
    "Bolivia",
    "Costa Rica",
    "Ecuador",
    "Uruguay",
    "El Salvador",
    "Belize",
    "Paraguay",
    "Trinidad y Tobago",
]

latam = pl.Series(values=latam, dtype=pl.Utf8)

europe = [
    "Alemania",
    "Austria",
    "Belgica",
    "Bélgica",
    "Dinamarca",
    "España",
    "Finlandia",
    "Francia",
    "Gran Bretaña",
    "Grecia",
    "Holanda",
    "Irlanda",
    "Italia",
    "Luxemburgo",
    "Noruega",
    "Portugal",
    "Reino Unido",
    "Suecia",
    "Suiza",
    "Islas Caimán",
    "Islas Caiman",
    "Islas Vírgenes Britá",
    "Islas Virgenes Brita",
    "Hungría",
    "Polonia",
    "Republica Checa",
    "República Checa",
    "Rusia",
    "Eslovaquia",
    "Estonia",
    "Latvia",
    "Lituania",
    "Turquia",
    "Turquía",
    "Cyprus",
    "Gibraltar",
    "Guernsey",
    "Isle of Man",
    "Jersey",
    "Liechtenstein",
    "Luxembourg",
    "Malta",
    "Romania",
    "Eslovenia",
    "Ucrania",
    "Serbia",
    "Islandia",
    "Croacia",
    "Albania",
    "Monaco",
    "Georgia",
]

europe = pl.Series(values=europe, dtype=pl.Utf8)

north_america = ["Canada", "U.S.A.", "Bahamas", "Bermuda", "Jamaica", "Sudáfrica"]

north_america = pl.Series(values=north_america, dtype=pl.Utf8)

chile = pl.Series(values=["Chile", "chile"], dtype=pl.Utf8)

factor_query_dict = {}
factor_query_dict["EQUITY_ASIA"] = pl.col("asset_type").is_in(rvi) & pl.col(
    "pais"
).is_in(asia)
factor_query_dict["EQUITY_CHILE"] = (
    pl.col("asset_class_code").is_in(rvl)
    | pl.col("asset_type").is_in(rvi) & pl.col("pais").is_in(["Chile", "chile"])
    | pl.col("asset_nemo").is_in(nemos)
)
factor_query_dict["EQUITY_LATAM_EX_CHILE"] = pl.col("asset_type").is_in(rvi) & pl.col(
    "pais"
).is_in(latam)

factor_query_dict["EQUITY_EUROPE"] = pl.col("pais").is_in(europe) & pl.col(
    "asset_type"
).is_in(rvi)
factor_query_dict["EQUITY_NORTH_AMERICA"] = pl.col("pais").is_in(
    north_america
) & pl.col("asset_type").is_in(rvi)
factor_query_dict["RATE_CLP_DUR_LTE_1"] = (
    pl.col("asset_type").is_in(local_fixed_income)
    & pl.col("asset_currency").is_in(clp)
    & pl.col("duracion").le(pl.lit(float(1)))
)
factor_query_dict["RATE_CLP_DUR_GT_1_LTE_3"] = (
    pl.col("asset_type").is_in(local_fixed_income)
    & pl.col("asset_currency").is_in(clp)
    & pl.col("duracion").gt(pl.lit(float(1)))
    & pl.col("duracion").le(pl.lit(float(3)))
)
factor_query_dict["RATE_CLP_DUR_GT_3_LTE_5"] = (
    pl.col("asset_type").is_in(local_fixed_income)
    & pl.col("asset_currency").is_in(clp)
    & pl.col("duracion").gt(pl.lit(float(3)))
    & pl.col("duracion").le(pl.lit(float(5)))
)
factor_query_dict["RATE_CLP_DUR_GT_5_LTE_7"] = (
    pl.col("asset_type").is_in(local_fixed_income)
    & pl.col("asset_currency").is_in(clp)
    & pl.col("duracion").gt(pl.lit(float(5)))
    & pl.col("duracion").le(pl.lit(float(5)))
)
factor_query_dict["RATE_CLP_DUR_GT_7"] = (
    pl.col("asset_type").is_in(local_fixed_income)
    & pl.col("asset_currency").is_in(clp)
    & pl.col("duracion").gt(pl.lit(float(7)))
)
factor_query_dict["RATE_UF_DUR_LTE_1"] = (
    pl.col("asset_type").is_in(local_fixed_income)
    & pl.col("asset_currency").is_in(uf)
    & pl.col("duracion").le(pl.lit(float(1)))
)
factor_query_dict["RATE_UF_DUR_GT_1_LTE_3"] = (
    pl.col("asset_type").is_in(local_fixed_income)
    & pl.col("asset_currency").is_in(["CLF", "UF"])
    & pl.col("duracion").gt(pl.lit(1))
    & pl.col("duracion").le(pl.lit(3))
)
factor_query_dict["RATE_UF_DUR_GT_3_LTE_5"] = (
    pl.col("asset_type").is_in(local_fixed_income)
    & pl.col("asset_currency").is_in(["CLF", "UF"])
    & pl.col("duracion").gt(pl.lit(3))
    & pl.col("duracion").le(pl.lit(5))
)
factor_query_dict["RATE_UF_DUR_GT_5_LTE_7"] = (
    pl.col("asset_type").is_in(local_fixed_income)
    & pl.col("asset_currency").is_in(["CLF", "UF"])
    & pl.col("duracion").gt(pl.lit(5))
    & pl.col("duracion").le(pl.lit(7))
)
factor_query_dict["RATE_UF_DUR_GT_7"] = (
    pl.col("asset_type").is_in(local_fixed_income)
    & pl.col("asset_currency").is_in(["CLF", "UF"])
    & pl.col("duracion").gt(pl.lit(7))
)
factor_query_dict["RATE_USD"] = pl.col("asset_type").is_in(local_fixed_income) & pl.col(
    "asset_currency"
).is_in(usd)

currency_factor_query_dict = {
    "CURRENCY_USD_FUND_CLP": pl.col("asset_currency").is_in(usd)
    & pl.col("fund_currency").is_in(clp),
    "CURRENCY_EUR_FUND_CLP": pl.col("asset_currency").is_in(eur)
    & pl.col("fund_currency").is_in(clp),
    "CURRENCY_EUR_FUND_USD": pl.col("asset_currency").is_in(eur)
    & pl.col("fund_currency").is_in(usd),
    "CURRENCY_CLP_FUND_CLP": pl.col("asset_currency").is_in(clp)
    & pl.col("fund_currency").is_in(clp),
    "CURRENCY_USD_FUND_USD": pl.col("asset_currency").is_in(usd)
    & pl.col("fund_currency").is_in(usd),
    "CURRENCY_UF_FUND_CLP": pl.col("asset_currency").is_in(uf)
    & pl.col("fund_currency").is_in(clp),
    "CURRENCY_UF_FUND_USD": pl.col("asset_currency").is_in(uf)
    & pl.col("fund_currency").is_in(usd),
}

factor_holdings = holdings.lazy().with_columns(
        [
            pl.coalesce(
                pl.when(polars_query).then(pl.lit(factor))
                for factor, polars_query in factor_query_dict.items()
            ).alias("factor"),
            pl.coalesce(
                pl.when(polars_query).then(pl.lit(factor))
                for factor, polars_query in currency_factor_query_dict.items()
            ).alias("currency_factor"),
        ]
    )

print(factor_holdings.collect())
print(factor_holdings.collect(comm_subexpr_elim=False))

Log output

It doesn't generate an exception, the output is different using the optimization.

Issue description

When running the code if I use the optimizer one of the columns created returns only null values, if I turn off the optimizer with .collect(comm_subexpr_elim=False) or .collect(no_optimization=True) as was suggested on stackoverflow the column is populated

Expected behavior

I was expecting the same result even if the optimizer is turned on.

Installed versions

``` pl.show_versions() --------Version info--------- Polars: 0.20.25 Index type: UInt32 Platform: macOS-14.4.1-arm64-arm-64bit Python: 3.10.4 (v3.10.4:9d38120e33, Mar 23 2022, 17:29:05) [Clang 13.0.0 (clang-1300.0.29.30)] ----Optional dependencies---- adbc_driver_manager: cloudpickle: connectorx: deltalake: fastexcel: fsspec: gevent: hvplot: matplotlib: 3.5.1 nest_asyncio: 1.5.6 numpy: 1.22.3 openpyxl: 3.0.9 pandas: 2.0.0 pyarrow: pydantic: pyiceberg: pyxlsb: sqlalchemy: torch: xlsx2csv: xlsxwriter: ```
MarcoGorelli commented 2 months ago

thanks for the report, can confirm this reproduces

to expedite resolution, it would probably be very helpful if you could narrow down the example so it's as small as possible

rlartiga commented 2 months ago

@MarcoGorelli I will try

Also a company friend told me that the issue started at the version 0.19 if we try with 0.18.15 the result with the optimization is the same as without. I tested and confirmed that.

MarcoGorelli commented 2 months ago

It took some effort, but here's a small-ish example which reproduces the same issue:

import polars as pl

print(pl.__version__)

holdings = pl.DataFrame({
    'fund_currency': ['CLP', 'CLP'],
    'asset_currency': ['EUR', 'USA'],
})

usd = ["USD"]
eur = ["EUR"]
clp = ['CLP']

factor_query_dict = {}

currency_factor_query_dict = {
    "CURRENCY_EUR_FUND_CLP": pl.col("asset_currency").is_in(eur)
    & pl.col("fund_currency").is_in(clp),
    "CURRENCY_EUR_FUND_USD": pl.col("asset_currency").is_in(eur)
    & pl.col("fund_currency").is_in(usd),
    "CURRENCY_CLP_FUND_CLP": pl.col("asset_currency").is_in(clp)
    & pl.col("fund_currency").is_in(clp),
    "CURRENCY_USD_FUND_USD": pl.col("asset_currency").is_in(usd)
    & pl.col("fund_currency").is_in(usd),
}

factor_holdings = holdings.lazy().with_columns(
        [
            pl.coalesce(
                pl.when(polars_query).then(pl.lit(factor))
                for factor, polars_query in currency_factor_query_dict.items()
            ).alias("currency_factor"),
        ]
    )

print(factor_holdings.collect())
print(factor_holdings.collect(comm_subexpr_elim=False))

Output:

0.20.25
shape: (2, 3)
┌───────────────┬────────────────┬─────────────────┐
│ fund_currency ┆ asset_currency ┆ currency_factor │
│ ---           ┆ ---            ┆ ---             │
│ str           ┆ str            ┆ str             │
╞═══════════════╪════════════════╪═════════════════╡
│ CLP           ┆ EUR            ┆ null            │
│ CLP           ┆ USA            ┆ null            │
└───────────────┴────────────────┴─────────────────┘
shape: (2, 3)
┌───────────────┬────────────────┬───────────────────────┐
│ fund_currency ┆ asset_currency ┆ currency_factor       │
│ ---           ┆ ---            ┆ ---                   │
│ str           ┆ str            ┆ str                   │
╞═══════════════╪════════════════╪═══════════════════════╡
│ CLP           ┆ EUR            ┆ CURRENCY_EUR_FUND_CLP │
│ CLP           ┆ USA            ┆ null                  │
└───────────────┴────────────────┴───────────────────────┘
MarcoGorelli commented 2 months ago

going to nerd-snipe tag @ritchie46 on this one

etiennebacher commented 2 months ago

Here's a slightly more minimal version:

import polars as pl

print(pl.__version__)

holdings = pl.DataFrame(
    {
        "fund_currency": ["CLP", "CLP"],
        "asset_currency": ["EUR", "USA"],
    }
)

usd = ["USD"]
eur = ["EUR"]
clp = ["CLP"]

currency_factor_query_dict = [
    pl.col("asset_currency").is_in(eur) & pl.col("fund_currency").is_in(clp),
    pl.col("asset_currency").is_in(eur) & pl.col("fund_currency").is_in(usd),
    pl.col("asset_currency").is_in(clp) & pl.col("fund_currency").is_in(clp),
    pl.col("asset_currency").is_in(usd) & pl.col("fund_currency").is_in(usd),
]

factor_holdings = holdings.lazy().with_columns(
    [
        pl.coalesce(currency_factor_query_dict).alias("currency_factor"),
    ]
)

print(factor_holdings.collect())
print(factor_holdings.collect(comm_subexpr_elim=False))

Output:

0.20.25
shape: (2, 3)
┌───────────────┬────────────────┬─────────────────┐
│ fund_currency ┆ asset_currency ┆ currency_factor │
│ ---           ┆ ---            ┆ ---             │
│ str           ┆ str            ┆ bool            │
╞═══════════════╪════════════════╪═════════════════╡
│ CLP           ┆ EUR            ┆ false           │
│ CLP           ┆ USA            ┆ false           │
└───────────────┴────────────────┴─────────────────┘

shape: (2, 3)
┌───────────────┬────────────────┬─────────────────┐
│ fund_currency ┆ asset_currency ┆ currency_factor │
│ ---           ┆ ---            ┆ ---             │
│ str           ┆ str            ┆ bool            │
╞═══════════════╪════════════════╪═════════════════╡
│ CLP           ┆ EUR            ┆ true            │
│ CLP           ┆ USA            ┆ false           │
└───────────────┴────────────────┴─────────────────┘