mwaskom / seaborn

Statistical data visualization in Python
https://seaborn.pydata.org
BSD 3-Clause "New" or "Revised" License
12.6k stars 1.93k forks source link

Polars error for plotting when a datetime column is present, even when that column is not plotted #3781

Closed zacharygibbs closed 2 weeks ago

zacharygibbs commented 2 weeks ago

https://github.com/mwaskom/seaborn/blob/b4e5f8d261d6d5524a00b7dd35e00a40e4855872/seaborn/_core/data.py#L313C9-L313C55

import polars as pl
import seaborn as sns

df = pl.LazyFrame({
     'col1': [1,2,3],
     'col2': [1,2,3],
     'duration_col': [1,2,3],
})

df = df.with_columns(pl.duration(days=pl.col('duration_col')).alias('duration_col')).collect()
df

sns.scatterplot(df, x='duration_col', y='col1')

which gives the error:


NotImplementedError                       Traceback (most recent call last)
File [/opt/conda/envs/ds/lib/python3.12/site-packages/seaborn/_core/data.py:313](http://localhost:8888/opt/conda/envs/ds/lib/python3.12/site-packages/seaborn/_core/data.py#line=312), in convert_dataframe_to_pandas(data)
    306 try:
    307     # This is going to convert all columns in the input dataframe, even though
    308     # we may only need one or two of them. It would be more efficient to select
   (...)
    311     # interface where variables passed in Plot() may only be referenced later
    312     # in Plot.add(). But noting here in case this seems to be a bottleneck.
--> 313     return pd.api.interchange.from_dataframe(data)
    314 except Exception as err:

File [/opt/conda/envs/ds/lib/python3.12/site-packages/pandas/core/interchange/from_dataframe.py:71](http://localhost:8888/opt/conda/envs/ds/lib/python3.12/site-packages/pandas/core/interchange/from_dataframe.py#line=70), in from_dataframe(df, allow_copy)
     69     raise ValueError("`df` does not support __dataframe__")
---> 71 return _from_dataframe(
     72     df.__dataframe__(allow_copy=allow_copy), allow_copy=allow_copy
     73 )

File [/opt/conda/envs/ds/lib/python3.12/site-packages/pandas/core/interchange/from_dataframe.py:94](http://localhost:8888/opt/conda/envs/ds/lib/python3.12/site-packages/pandas/core/interchange/from_dataframe.py#line=93), in _from_dataframe(df, allow_copy)
     93 for chunk in df.get_chunks():
---> 94     pandas_df = protocol_df_chunk_to_pandas(chunk)
     95     pandas_dfs.append(pandas_df)

File [/opt/conda/envs/ds/lib/python3.12/site-packages/pandas/core/interchange/from_dataframe.py:150](http://localhost:8888/opt/conda/envs/ds/lib/python3.12/site-packages/pandas/core/interchange/from_dataframe.py#line=149), in protocol_df_chunk_to_pandas(df)
    149 elif dtype == DtypeKind.DATETIME:
--> 150     columns[name], buf = datetime_column_to_ndarray(col)
    151 else:

File [/opt/conda/envs/ds/lib/python3.12/site-packages/pandas/core/interchange/from_dataframe.py:396](http://localhost:8888/opt/conda/envs/ds/lib/python3.12/site-packages/pandas/core/interchange/from_dataframe.py#line=395), in datetime_column_to_ndarray(col)
    384 data = buffer_to_ndarray(
    385     dbuf,
    386     (
   (...)
    393     length=col.size(),
    394 )
--> 396 data = parse_datetime_format_str(format_str, data)  # type: ignore[assignment]
    397 data = set_nulls(data, col, buffers["validity"])

File [/opt/conda/envs/ds/lib/python3.12/site-packages/pandas/core/interchange/from_dataframe.py:361](http://localhost:8888/opt/conda/envs/ds/lib/python3.12/site-packages/pandas/core/interchange/from_dataframe.py#line=360), in parse_datetime_format_str(format_str, data)
    359     return data
--> 361 raise NotImplementedError(f"DateTime kind is not supported: {format_str}")

NotImplementedError: DateTime kind is not supported: tDu

The above exception was the direct cause of the following exception:

RuntimeError                              Traceback (most recent call last)
Cell In[20], line 10
      7 df = df.with_columns(pl.duration(days=pl.col('duration_col')).alias('duration_col')).collect()
      8 df
---> 10 sns.scatterplot(df, x='duration_col', y='col1')

File [/opt/conda/envs/ds/lib/python3.12/site-packages/seaborn/relational.py:615](http://localhost:8888/opt/conda/envs/ds/lib/python3.12/site-packages/seaborn/relational.py#line=614), in scatterplot(data, x, y, hue, size, style, palette, hue_order, hue_norm, sizes, size_order, size_norm, markers, style_order, legend, ax, **kwargs)
    606 def scatterplot(
    607     data=None, *,
    608     x=None, y=None, hue=None, size=None, style=None,
   (...)
    612     **kwargs
    613 ):
--> 615     p = _ScatterPlotter(
    616         data=data,
    617         variables=dict(x=x, y=y, hue=hue, size=size, style=style),
    618         legend=legend
    619     )
    621     p.map_hue(palette=palette, order=hue_order, norm=hue_norm)
    622     p.map_size(sizes=sizes, order=size_order, norm=size_norm)

File [/opt/conda/envs/ds/lib/python3.12/site-packages/seaborn/relational.py:396](http://localhost:8888/opt/conda/envs/ds/lib/python3.12/site-packages/seaborn/relational.py#line=395), in _ScatterPlotter.__init__(self, data, variables, legend)
    387 def __init__(self, *, data=None, variables={}, legend=None):
    388 
    389     # TODO this is messy, we want the mapping to be agnostic about
    390     # the kind of plot to draw, but for the time being we need to set
    391     # this information so the SizeMapping can use it
    392     self._default_size_range = (
    393         np.r_[.5, 2] * np.square(mpl.rcParams["lines.markersize"])
    394     )
--> 396     super().__init__(data=data, variables=variables)
    398     self.legend = legend

File [/opt/conda/envs/ds/lib/python3.12/site-packages/seaborn/_base.py:634](http://localhost:8888/opt/conda/envs/ds/lib/python3.12/site-packages/seaborn/_base.py#line=633), in VectorPlotter.__init__(self, data, variables)
    629 # var_ordered is relevant only for categorical axis variables, and may
    630 # be better handled by an internal axis information object that tracks
    631 # such information and is set up by the scale_* methods. The analogous
    632 # information for numeric axes would be information about log scales.
    633 self._var_ordered = {"x": False, "y": False}  # alt., used DefaultDict
--> 634 self.assign_variables(data, variables)
    636 # TODO Lots of tests assume that these are called to initialize the
    637 # mappings to default values on class initialization. I'd prefer to
    638 # move away from that and only have a mapping when explicitly called.
    639 for var in ["hue", "size", "style"]:

File [/opt/conda/envs/ds/lib/python3.12/site-packages/seaborn/_base.py:679](http://localhost:8888/opt/conda/envs/ds/lib/python3.12/site-packages/seaborn/_base.py#line=678), in VectorPlotter.assign_variables(self, data, variables)
    674 else:
    675     # When dealing with long-form input, use the newer PlotData
    676     # object (internal but introduced for the objects interface)
    677     # to centralize / standardize data consumption logic.
    678     self.input_format = "long"
--> 679     plot_data = PlotData(data, variables)
    680     frame = plot_data.frame
    681     names = plot_data.names

File [/opt/conda/envs/ds/lib/python3.12/site-packages/seaborn/_core/data.py:57](http://localhost:8888/opt/conda/envs/ds/lib/python3.12/site-packages/seaborn/_core/data.py#line=56), in PlotData.__init__(self, data, variables)
     51 def __init__(
     52     self,
     53     data: DataSource,
     54     variables: dict[str, VariableSpec],
     55 ):
---> 57     data = handle_data_source(data)
     58     frame, names, ids = self._assign_variables(data, variables)
     60     self.frame = frame

File [/opt/conda/envs/ds/lib/python3.12/site-packages/seaborn/_core/data.py:275](http://localhost:8888/opt/conda/envs/ds/lib/python3.12/site-packages/seaborn/_core/data.py#line=274), in handle_data_source(data)
    271 """Convert the data source object to a common union representation."""
    272 if isinstance(data, pd.DataFrame) or hasattr(data, "__dataframe__"):
    273     # Check for pd.DataFrame inheritance could be removed once
    274     # minimal pandas version supports dataframe interchange (1.5.0).
--> 275     data = convert_dataframe_to_pandas(data)
    276 elif data is not None and not isinstance(data, Mapping):
    277     err = f"Data source must be a DataFrame or Mapping, not {type(data)!r}."

File [/opt/conda/envs/ds/lib/python3.12/site-packages/seaborn/_core/data.py:319](http://localhost:8888/opt/conda/envs/ds/lib/python3.12/site-packages/seaborn/_core/data.py#line=318), in convert_dataframe_to_pandas(data)
    314 except Exception as err:
    315     msg = (
    316         "Encountered an exception when converting data source "
    317         "to a pandas DataFrame. See traceback above for details."
    318     )
--> 319     raise RuntimeError(msg) from err

RuntimeError: Encountered an exception when converting data source to a pandas DataFrame. See traceback above for details.
mwaskom commented 2 weeks ago

Seaborn converts input data to pandas dataframes using the dataframe interchange protocol. If there's some type that's unable to be converted using that protocol, there's nothing seaborn can do about it.

zacharygibbs commented 2 weeks ago

@mwaskom I do understand your take here; pandas does seem like the right place to raise this issue. The only way around this would be to pre-select only the columns used in hue/x/y/col/rows, ... Is that route too complex to consider?