pola-rs / polars

Dataframes powered by a multithreaded, vectorized query engine, written in Rust
https://docs.pola.rs
Other
29.38k stars 1.86k forks source link

Incorrect rounding of `Decimal` values during casting in window aggregation with `drop_nulls()` #14785

Closed avimallu closed 6 months ago

avimallu commented 6 months ago

Checks

Reproducible example

import polars as pl
from decimal import Decimal
pl.DataFrame({
    "a": [Decimal(100), Decimal(80.5), None, None],
    "b": [1, 2, 2, 2]
}).with_columns(
    pl.col("a").drop_nulls().cast(pl.Float64).min().over("b"))

Log output

shape: (4, 2)
┌────────┬─────┐
│ a      ┆ b   │
│ ---    ┆ --- │
│ f64    ┆ i64 │
╞════════╪═════╡
│ 1000.0 ┆ 1   │
│ 805.0  ┆ 2   │
│ 805.0  ┆ 2   │
│ 805.0  ┆ 2   │
└────────┴─────┘

Issue description

The output should be 80.5, not 805.0 (and likewise 100.0, not 1000.0, for the first row): the values come out scaled by a factor of 10.

I was unable to reduce the example further - triggering the bug seems to require dropping nulls inside a window aggregation.

Expected behavior

Produce the following DF:

shape: (4, 2)
┌────────┬─────┐
│ a      ┆ b   │
│ ---    ┆ --- │
│ f64    ┆ i64 │
╞════════╪═════╡
│ 100.0  ┆ 1   │
│ 80.5   ┆ 2   │
│ 80.5   ┆ 2   │
│ 80.5   ┆ 2   │
└────────┴─────┘

Installed versions

Have installed 0.20.13 just before running the example, but pl.show_versions() produces an error, which is also placed below (LMK if that requires a separate issue).

```py --------------------------------------------------------------------------- ValueError Traceback (most recent call last) Cell In[5], line 1 ----> 1 pl.show_versions() File ~/miniconda3/lib/python3.10/site-packages/polars/meta/versions.py:45, in show_versions() 41 # Note: we import 'platform' here (rather than at the top of the 42 # module) as a micro-optimization for polars' initial import 43 import platform ---> 45 deps = _get_dependency_info() 46 core_properties = ("Polars", "Index type", "Platform", "Python") 47 keylen = max(len(x) for x in [*core_properties, *deps.keys()]) + 1 File ~/miniconda3/lib/python3.10/site-packages/polars/meta/versions.py:82, in _get_dependency_info() 60 def _get_dependency_info() -> dict[str, str]: 61 # see the list of dependencies in pyproject.toml 62 opt_deps = [ 63 "adbc_driver_manager", 64 "cloudpickle", (...) 80 "xlsxwriter", 81 ] ---> 82 return {f"{name}:": _get_dependency_version(name) for name in opt_deps} File ~/miniconda3/lib/python3.10/site-packages/polars/meta/versions.py:82, in (.0) 60 def _get_dependency_info() -> dict[str, str]: 61 # see the list of dependencies in pyproject.toml 62 opt_deps = [ 63 "adbc_driver_manager", 64 "cloudpickle", (...) 
80 "xlsxwriter", 81 ] ---> 82 return {f"{name}:": _get_dependency_version(name) for name in opt_deps} File ~/miniconda3/lib/python3.10/site-packages/polars/meta/versions.py:91, in _get_dependency_version(dep_name) 88 import importlib.metadata 90 try: ---> 91 module = importlib.import_module(dep_name) 92 except ImportError: 93 return "" File ~/miniconda3/lib/python3.10/importlib/__init__.py:126, in import_module(name, package) 124 break 125 level += 1 --> 126 return _bootstrap._gcd_import(name[level:], package, level) File :1050, in _gcd_import(name, package, level) File :1027, in _find_and_load(name, import_) File :1006, in _find_and_load_unlocked(name, import_) File :688, in _load_unlocked(spec) File :883, in exec_module(self, module) File :241, in _call_with_frames_removed(f, *args, **kwds) File ~/miniconda3/lib/python3.10/site-packages/hvplot/__init__.py:69 65 import holoviews as _hv 67 from holoviews import Store, render # noqa ---> 69 from .converter import HoloViewsConverter 70 from .interactive import Interactive 71 from .ui import explorer # noqa File ~/miniconda3/lib/python3.10/site-packages/hvplot/converter.py:23 16 from holoviews.core.util import max_range 17 from holoviews.element import ( 18 Curve, Scatter, Area, Bars, BoxWhisker, Dataset, Distribution, 19 Table, HeatMap, Image, HexTiles, QuadMesh, Bivariate, Histogram, 20 Violin, Contours, Polygons, Points, Path, Labels, RGB, ErrorBars, 21 VectorField, Rectangles, Segments 22 ) ---> 23 from holoviews.plotting.bokeh import OverlayPlot, colormap_generator 24 from holoviews.plotting.util import process_cmap 25 from holoviews.operation import histogram File ~/miniconda3/lib/python3.10/site-packages/holoviews/plotting/bokeh/__init__.py:34 32 from .graphs import GraphPlot, NodePlot, TriMeshPlot, ChordPlot 33 from .heatmap import HeatMapPlot, RadialHeatMapPlot ---> 34 from .hex_tiles import HexTilesPlot 35 from .links import LinkCallback # noqa (API import) 36 from .path import PathPlot, PolygonPlot, 
ContourPlot File ~/miniconda3/lib/python3.10/site-packages/holoviews/plotting/bokeh/hex_tiles.py:18 14 from .selection import BokehOverlaySelectionDisplay 15 from .styles import base_properties, line_properties, fill_properties ---> 18 class hex_binning(Operation): 19 """ 20 Applies hex binning by computing aggregates on a hexagonal grid. 21 22 Should not be user facing as the returned element is not directly 23 useable. 24 """ 26 aggregator = param.ClassSelector( 27 default=np.size, class_=(types.FunctionType, tuple), doc=""" 28 Aggregation function or dimension transform used to compute bin 29 values. Defaults to np.size to count the number of values 30 in each bin.""") File ~/miniconda3/lib/python3.10/site-packages/holoviews/plotting/bokeh/hex_tiles.py:26, in hex_binning() 18 class hex_binning(Operation): 19 """ 20 Applies hex binning by computing aggregates on a hexagonal grid. 21 22 Should not be user facing as the returned element is not directly 23 useable. 24 """ ---> 26 aggregator = param.ClassSelector( 27 default=np.size, class_=(types.FunctionType, tuple), doc=""" 28 Aggregation function or dimension transform used to compute bin 29 values. 
Defaults to np.size to count the number of values 30 in each bin.""") 32 gridsize = param.ClassSelector(default=50, class_=(int, tuple)) 34 invert_axes = param.Boolean(default=False) File ~/miniconda3/lib/python3.10/site-packages/param/__init__.py:1331, in ClassSelector.__init__(self, class_, default, instantiate, is_instance, **params) 1329 self.is_instance = is_instance 1330 super(ClassSelector,self).__init__(default=default,instantiate=instantiate,**params) -> 1331 self._validate(default) File ~/miniconda3/lib/python3.10/site-packages/param/__init__.py:1335, in ClassSelector._validate(self, val) 1333 def _validate(self, val): 1334 super(ClassSelector, self)._validate(val) -> 1335 self._validate_class_(val, self.class_, self.is_instance) File ~/miniconda3/lib/python3.10/site-packages/param/__init__.py:1347, in ClassSelector._validate_class_(self, val, class_, is_instance) 1345 if is_instance: 1346 if not (isinstance(val, class_)): -> 1347 raise ValueError( 1348 "%s parameter %r value must be an instance of %s, not %r." % 1349 (param_cls, self.name, class_name, val)) 1350 else: 1351 if not (issubclass(val, class_)): ValueError: ClassSelector parameter None value must be an instance of (function, tuple), not . ```
Julian-J-S commented 6 months ago

this is possibly related to #13848

It seems like grouping (`group_by` in my example, and `over` here) could cause this.

flisky commented 6 months ago

I can reproduce this on polars v0.20.15, and cannot reproduce on main branch. Hopefully, it's fixed by next release!

avimallu commented 6 months ago

Thanks @flisky, I can confirm that it's fixed in 0.20.16rc1.