pola-rs / polars

Dataframes powered by a multithreaded, vectorized query engine, written in Rust
https://docs.pola.rs
Other
30.7k stars 2k forks source link

InvalidOperationError: expected 'struct' dtype, got Unknown(Any) #20043

Open wukan1986 opened 6 days ago

wukan1986 commented 6 days ago

Checks

Reproducible example

# Reproduction script: take the first field of a struct column that is
# produced by a `map_batches` UDF. The UDF wraps the result of `ta.MACD` in a
# DataFrame and converts it to a struct Series via `to_struct()`.
import polars as pl
import talib as ta

df = pl.DataFrame({"a": [1, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, ]})

# Case 1: `map_batches` runs in one `with_columns`, and `.struct[0]` is applied
# in a separate, later `with_columns`. This works; the printed result (below)
# shows column `a` as a 3-field struct and the extracted field as `column_0`.
a = df.with_columns([
    pl.col('a').map_batches(lambda x: pl.DataFrame(ta.MACD(x, 3, 4, 5)).to_struct())
]).with_columns(
    pl.col('a').struct[0]
)
print(a)
"""
shape: (11, 2)
┌──────────────────────────────┬──────────┐
│ a                            ┆ column_0 │
│ ---                          ┆ ---      │
│ struct[3]                    ┆ f64      │
╞══════════════════════════════╪══════════╡
│ {NaN,NaN,NaN}                ┆ NaN      │
│ {NaN,NaN,NaN}                ┆ NaN      │
│ {NaN,NaN,NaN}                ┆ NaN      │
│ {NaN,NaN,NaN}                ┆ NaN      │
│ {NaN,NaN,NaN}                ┆ NaN      │
│ …                            ┆ …        │
│ {NaN,NaN,NaN}                ┆ NaN      │
│ {0.4676,0.38472,0.08288}     ┆ 0.4676   │
│ {0.48056,0.416667,0.063893}  ┆ 0.48056  │
│ {0.488336,0.440556,0.04778}  ┆ 0.488336 │
│ {0.493002,0.458038,0.034963} ┆ 0.493002 │
└──────────────────────────────┴──────────┘
"""

# Case 2: `.struct[0]` is chained directly onto `map_batches` inside the same
# expression, with no `return_dtype`. Unlike case 1, the UDF also passes
# `nan_to_null=True`. This raises the error quoted in the comment below.
a = df.with_columns([
    pl.col('a').map_batches(lambda x: pl.DataFrame(ta.MACD(x, 3, 4, 5), nan_to_null=True).to_struct()).struct[0]
])
print(a)
# polars.exceptions.InvalidOperationError: expected 'struct' dtype, got Unknown(Any)

# Case 3: same chained expression, now with an explicit `return_dtype` whose
# fields are named "a", "b", "c". This raises StructFieldNotFoundError for "a".
# NOTE(review): presumably because the struct actually produced has fields named
# `column_0`, ... (see the case 1 output header) - confirm.
dtype = pl.Struct([pl.Field("a", pl.Float64), pl.Field("b", pl.Float64), pl.Field("c", pl.Float64)])
a = df.with_columns([
    pl.col('a').map_batches(lambda x: pl.DataFrame(ta.MACD(x, 3, 4, 5), nan_to_null=True).to_struct(), return_dtype=dtype).struct[0]
])
print(a)
# polars.exceptions.StructFieldNotFoundError: a

# Case 4: same chained expression, with `return_dtype` fields named
# "column_0", "column_1", "column_2". This runs; the printed result (below)
# keeps `a` as i64 and adds the extracted field as a new `column_0` column.
# NOTE(review): the output still shows NaN rather than null even though the
# UDF passes `nan_to_null=True` - confirm whether that is expected.
dtype = pl.Struct([pl.Field("column_0", pl.Float64), pl.Field("column_1", pl.Float64), pl.Field("column_2", pl.Float64)])
a = df.with_columns([
    pl.col('a').map_batches(lambda x: pl.DataFrame(ta.MACD(x, 3, 4, 5), nan_to_null=True).to_struct(), return_dtype=dtype).struct[0]
])
print(a)
"""
shape: (11, 2)
┌─────┬──────────┐
│ a   ┆ column_0 │
│ --- ┆ ---      │
│ i64 ┆ f64      │
╞═════╪══════════╡
│ 1   ┆ NaN      │
│ 1   ┆ NaN      │
│ 2   ┆ NaN      │
│ 3   ┆ NaN      │
│ 4   ┆ NaN      │
│ …   ┆ …        │
│ 6   ┆ NaN      │
│ 7   ┆ 0.4676   │
│ 8   ┆ 0.48056  │
│ 9   ┆ 0.488336 │
│ 10  ┆ 0.493002 │
└─────┴──────────┘
"""

Log output

No response

Issue description

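`map_batches` cannot infer a struct return dtype. Chaining `.struct[0]` directly onto `map_batches` fails with `expected 'struct' dtype, got Unknown(Any)` unless `return_dtype` is passed explicitly with field names that exactly match the struct the function returns.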

Expected behavior

I expect the struct dtype returned by `map_batches` to be inferred, so that `.struct[0]` works without an explicit `return_dtype`.

Installed versions

--------Version info---------
Polars: 1.15.0
Index type: UInt32
Platform: Windows-10-10.0.22631-SP0
Python: 3.11.3 | packaged by Anaconda, Inc. | (main, Apr 19 2023, 23:46:34) [MSC v.1916 64 bit (AMD64)]
LTS CPU: False

----Optional dependencies----
adbc_driver_manager
altair 5.0.1
boto3
cloudpickle
connectorx
deltalake
fastexcel
fsspec
gevent
google.auth 2.26.1
great_tables
matplotlib 3.7.1
nest_asyncio 1.5.6
numpy 1.26.4
openpyxl 3.1.2
pandas 2.2.3
pyarrow 12.0.0
pydantic 1.10.7
pyiceberg
sqlalchemy 2.0.13
torch 2.0.1+cu118
xlsx2csv
xlsxwriter 3.2.0
wukan1986 commented 5 days ago

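The only change here is switching to lazy mode: the query that works on an eager DataFrame fails when the same expressions are run through `df.lazy()` and collected.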

# Reproduction script: the same two-step query (a `map_batches` UDF returning
# a struct, then `.struct[0]` in a separate `with_columns`) run first on an
# eager DataFrame and then on a LazyFrame.
import polars as pl
import talib as ta

df = pl.DataFrame({"a": [1, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, ]})

# Eager: works. The printed result (below) shows `a` as a 3-field struct and
# the extracted first field as `column_0`.
a = df.with_columns([
    pl.col('a').map_batches(lambda x: pl.DataFrame(ta.MACD(x, 3, 4, 5)).to_struct())
]).with_columns(
    pl.col('a').struct[0]
)
print(a)
"""
shape: (11, 2)
┌──────────────────────────────┬──────────┐
│ a                            ┆ column_0 │
│ ---                          ┆ ---      │
│ struct[3]                    ┆ f64      │
╞══════════════════════════════╪══════════╡
│ {NaN,NaN,NaN}                ┆ NaN      │
│ {NaN,NaN,NaN}                ┆ NaN      │
│ {NaN,NaN,NaN}                ┆ NaN      │
│ {NaN,NaN,NaN}                ┆ NaN      │
│ {NaN,NaN,NaN}                ┆ NaN      │
│ …                            ┆ …        │
│ {NaN,NaN,NaN}                ┆ NaN      │
│ {0.4676,0.38472,0.08288}     ┆ 0.4676   │
│ {0.48056,0.416667,0.063893}  ┆ 0.48056  │
│ {0.488336,0.440556,0.04778}  ┆ 0.488336 │
│ {0.493002,0.458038,0.034963} ┆ 0.493002 │
└──────────────────────────────┴──────────┘
"""
# Lazy: identical expressions, but built on `df.lazy()`. The reported error
# (quoted below) is the result of this script's `print(a.collect())` line.
a = df.lazy().with_columns([
    pl.col('a').map_batches(lambda x: pl.DataFrame(ta.MACD(x, 3, 4, 5)).to_struct())
]).with_columns(
    pl.col('a').struct[0]
)
print(a.collect())
"""
polars.exceptions.InvalidOperationError: expected 'struct' dtype, got Unknown(Any)
"""