pola-rs / polars

Dataframes powered by a multithreaded, vectorized query engine, written in Rust
https://docs.pola.rs
Other
30.52k stars 1.98k forks source link

`LazyFrame::cross_join` + `concat_list` error #18587

Open kgv opened 2 months ago

kgv commented 2 months ago

Checks

Reproducible example

// manual cartesian product (OK)
//
// Baseline case: the eight rows are written out by hand so that the struct
// columns "1", "2" and "3" together enumerate every combination of the two
// values {0,"a"} and {1,"b"}. `concat_list` over these columns succeeds
// (see the log output).
{
    // Each of the three columns is built from its own two-field frame
    // ("u32", "str") that is converted into a struct column.
    let mut lazy_frame = df! {
        "1" => df! {
            "u32" => &[0u32, 0, 0, 0, 1, 1, 1, 1],
            "str" => &["a", "a", "a", "a", "b", "b", "b", "b"],
        }
        .unwrap()
        .into_struct(""),
        "2" => df! {
            "u32" => &[0u32, 0, 1, 1, 0, 0, 1, 1],
            "str" => &["a", "a", "b", "b", "a", "a", "b", "b"],
        }
        .unwrap()
        .into_struct(""),
        "3" => df! {
            "u32" => &[0u32, 1, 0, 1, 0, 1, 0, 1],
            "str" => &["a", "b", "a", "b", "a", "b", "a", "b"],
        }
        .unwrap()
        .into_struct(""),
    }
    .unwrap()
    .lazy();
    // Print the 8x3 frame of structs before any list concatenation.
    println!(
        "manual cartesian product data_frame: {}",
        lazy_frame.clone().collect().unwrap()
    );
    // Combine the three struct columns into a single list column named "LIST".
    lazy_frame = lazy_frame.select([concat_list(["1", "2", "3"]).unwrap().alias("LIST")]);
    println!(
        "manual cartesian product concat_list data_frame: {}",
        lazy_frame.clone().collect().unwrap()
    );
}

// `cross_join` cartesian product (ERROR)
//
// Failing case: the same 8x3 frame of structs is produced by cross-joining a
// two-row frame with itself twice. Collecting the joined frame works, but
// adding `concat_list` on top fails with `ShapeMismatch` (see the log output).
{
    // Two-row source frame; every struct column below is derived from it.
    let mut lazy_frame = df! {
        "u32" => &[0u32, 1],
        "str" => &["a", "b"],
    }
    .unwrap()
    .lazy();
    // Pack ("u32", "str") into a struct column three times, under the names
    // "1", "2" and "3", and cross-join the three single-column frames.
    lazy_frame = lazy_frame
        .clone()
        .select([as_struct(vec![col("u32"), col("str")]).alias("1")])
        .cross_join(
            lazy_frame
                .clone()
                .select([as_struct(vec![col("u32"), col("str")]).alias("2")]),
            None,
        )
        .cross_join(
            lazy_frame.select([as_struct(vec![col("u32"), col("str")]).alias("3")]),
            None,
        );
    // This collect succeeds and prints the same 8x3 frame as the manual case.
    println!(
        "cross_join cartesian product data_frame: {}",
        lazy_frame.clone().collect().unwrap()
    );
    // The error occurs after this line.
    // NOTE(review): the frame is lazy, so the `ShapeMismatch` presumably
    // surfaces in the `collect().unwrap()` below rather than in this
    // `select` itself - confirm with a backtrace.
    lazy_frame = lazy_frame.select([concat_list(["1", "2", "3"]).unwrap().alias("LIST")]);
    println!(
        "cross_join cartesian product concat_list data_frame: {}",
        lazy_frame.clone().collect().unwrap()
    );
}

Log output

manual cartesian product data_frame: shape: (8, 3)
┌───────────┬───────────┬───────────┐
│ 1         ┆ 2         ┆ 3         │
│ ---       ┆ ---       ┆ ---       │
│ struct[2] ┆ struct[2] ┆ struct[2] │
╞═══════════╪═══════════╪═══════════╡
│ {0,"a"}   ┆ {0,"a"}   ┆ {0,"a"}   │
│ {0,"a"}   ┆ {0,"a"}   ┆ {1,"b"}   │
│ {0,"a"}   ┆ {1,"b"}   ┆ {0,"a"}   │
│ {0,"a"}   ┆ {1,"b"}   ┆ {1,"b"}   │
│ {1,"b"}   ┆ {0,"a"}   ┆ {0,"a"}   │
│ {1,"b"}   ┆ {0,"a"}   ┆ {1,"b"}   │
│ {1,"b"}   ┆ {1,"b"}   ┆ {0,"a"}   │
│ {1,"b"}   ┆ {1,"b"}   ┆ {1,"b"}   │
└───────────┴───────────┴───────────┘
manual cartesian product concat_list data_frame: shape: (8, 1)
┌─────────────────────────────┐
│ LIST                        │
│ ---                         │
│ list[struct[2]]             │
╞═════════════════════════════╡
│ [{0,"a"}, {0,"a"}, {0,"a"}] │
│ [{0,"a"}, {0,"a"}, {1,"b"}] │
│ [{0,"a"}, {1,"b"}, {0,"a"}] │
│ [{0,"a"}, {1,"b"}, {1,"b"}] │
│ [{1,"b"}, {0,"a"}, {0,"a"}] │
│ [{1,"b"}, {0,"a"}, {1,"b"}] │
│ [{1,"b"}, {1,"b"}, {0,"a"}] │
│ [{1,"b"}, {1,"b"}, {1,"b"}] │
└─────────────────────────────┘
cross_join cartesian product data_frame: shape: (8, 3)
┌───────────┬───────────┬───────────┐
│ 1         ┆ 2         ┆ 3         │
│ ---       ┆ ---       ┆ ---       │
│ struct[2] ┆ struct[2] ┆ struct[2] │
╞═══════════╪═══════════╪═══════════╡
│ {0,"a"}   ┆ {0,"a"}   ┆ {0,"a"}   │
│ {0,"a"}   ┆ {0,"a"}   ┆ {1,"b"}   │
│ {0,"a"}   ┆ {1,"b"}   ┆ {0,"a"}   │
│ {0,"a"}   ┆ {1,"b"}   ┆ {1,"b"}   │
│ {1,"b"}   ┆ {0,"a"}   ┆ {0,"a"}   │
│ {1,"b"}   ┆ {0,"a"}   ┆ {1,"b"}   │
│ {1,"b"}   ┆ {1,"b"}   ┆ {0,"a"}   │
│ {1,"b"}   ┆ {1,"b"}   ┆ {1,"b"}   │
└───────────┴───────────┴───────────┘

called `Result::unwrap()` on an `Err` value: ShapeMismatch(ErrString("series length 2 does not match expected length of 8"))

Issue description

`concat_list(["1", "2"])` - OK; `concat_list(["1", "3"])` and `concat_list(["2", "3"])` - ERROR.

Expected behavior

A cartesian product created with `cross_join` should behave the same as the "manual" cartesian product.

Installed versions

polars = { version = "0.42.0", features = [ "abs", "concat_str", "cross_join", "cum_agg", "diagonal_concat", "dtype-array", "dtype-i8", "dtype-struct", "dtype-u8", "is_in", "lazy", "list_any_all", "list_count", "list_eval", "regex", "round_series", "serde", "strings", ] }
cmdlineluser commented 2 months ago

Python repro:

import polars as pl

# Two-row source frame; every struct column below is derived from it.
df = pl.LazyFrame({
    "u32": [0, 1],
    "str": ["a", "b"],
})

# Pack all columns into a struct three times (named "1", "2", "3"),
# cross-join the three single-column frames, then concatenate the struct
# columns into one list column. The final collect raises the error below.
(df.select(pl.struct(pl.all()).alias("1"))
   .join(
       df.select(pl.struct(pl.all()).alias("2")),
       how = "cross"
   )
   .join(
       df.select(pl.struct(pl.all()).alias("3")),
       how = "cross"
   )
   .select(pl.concat_list("1", "2", "3"))
   .collect()
)
# ShapeError: series length 2 does not match expected length of 8
kgv commented 2 months ago

Adding `DataFrame::as_single_chunk_par` after the last `cross_join`, before `concat_list`, fixes this in Rust.