Open wence- opened 5 months ago
One could support this on the python side with something like:
diff --git a/py-polars/polars/datatypes/_parse.py b/py-polars/polars/datatypes/_parse.py
index 55345909c..fcbdd9376 100644
--- a/py-polars/polars/datatypes/_parse.py
+++ b/py-polars/polars/datatypes/_parse.py
@@ -37,7 +37,7 @@ else: # pragma: no cover
UnionType = UnionTypeOld
-def parse_into_dtype(input: Any) -> PolarsDataType:
+def parse_into_dtype(input: Any, *, include_unknown: bool = False) -> PolarsDataType:
"""
Parse an input into a Polars data type.
@@ -46,7 +46,7 @@ def parse_into_dtype(input: Any) -> PolarsDataType:
TypeError
If the input cannot be parsed into a Polars data type.
"""
- if is_polars_dtype(input):
+ if is_polars_dtype(input, include_unknown=include_unknown):
return input
elif isinstance(input, ForwardRef):
return _parse_forward_ref_into_dtype(input)
diff --git a/py-polars/polars/datatypes/classes.py b/py-polars/polars/datatypes/classes.py
index 08aeb53c5..68cc7be18 100644
--- a/py-polars/polars/datatypes/classes.py
+++ b/py-polars/polars/datatypes/classes.py
@@ -604,7 +604,7 @@ class List(NestedType):
inner: PolarsDataType
def __init__(self, inner: PolarsDataType | PythonDataType):
- self.inner = polars.datatypes.parse_into_dtype(inner)
+ self.inner = polars.datatypes.parse_into_dtype(inner, include_unknown=True)
def __eq__(self, other: PolarsDataType) -> bool: # type: ignore[override]
# This equality check allows comparison of type classes and type instances.
@@ -675,7 +675,7 @@ class Array(NestedType):
msg = "Array constructor is missing the required argument `shape`"
raise TypeError(msg)
- inner_parsed = polars.datatypes.parse_into_dtype(inner)
+ inner_parsed = polars.datatypes.parse_into_dtype(inner, include_unknown=True)
inner_shape = inner_parsed.shape if isinstance(inner_parsed, Array) else ()
if isinstance(shape, int):
@@ -754,7 +754,7 @@ class Field:
def __init__(self, name: str, dtype: PolarsDataType):
self.name = name
- self.dtype = polars.datatypes.parse_into_dtype(dtype)
+ self.dtype = polars.datatypes.parse_into_dtype(dtype, include_unknown=True)
def __eq__(self, other: Field) -> bool: # type: ignore[override]
return (self.name == other.name) & (self.dtype == other.dtype)
Checks
Reproducible example
Log output
No response
Issue description
When calling `map_batches` without providing a `return_dtype` in a grouped context, the resulting dtype for the batch is inferred by looking at the first value. Without doing this, it is therefore `Unknown`. When we ask for `collect_schema()`, however, the grouped column's schema will be `List(Unknown)` and `parse_into_dtype` will be called via the `List` dtype constructor on the Python side. This raises `TypeError` (by default) for `Unknown` values.
On the Rust conversion side, we call `unwrap` on this error result and therefore get a (difficult to catch) `PanicException` from pyo3:
Expected behavior
No panic, and a re-raised `TypeError`.
Perhaps also one could accept returning a schema that has `List(Unknown)` as the dtype for the column.
Installed versions