dask / dask

Parallel computing with task scheduling
https://dask.org
BSD 3-Clause "New" or "Revised" License

dask.dataframe.describe error with nullable data types #8530

Open scharlottej13 opened 2 years ago

scharlottej13 commented 2 years ago

What happened: `dask.dataframe.describe()` raises an error when columns contain nullable data types.

What you expected to happen: The same output as `pandas.DataFrame.describe()`, which works.

Minimal Complete Verifiable Example:

```python
import pandas as pd
import dask.dataframe as dd

n = 10
test = pd.DataFrame({
    '1': pd.Series(['a', pd.NA] * n, dtype=pd.StringDtype()),
    '2': pd.Series([1, pd.NA] * n, dtype=pd.Int64Dtype()),
    '3': pd.Series([0.56, pd.NA] * n, dtype=pd.Float64Dtype())
})
test.describe()  # works no problem

ddf = dd.from_pandas(test, npartitions=2)
ddf.describe().compute()  # error
```
full traceback:

```python
---------------------------------------------------------------------------
AttributeError                            Traceback (most recent call last)
AttributeError: 'NAType' object has no attribute 'conjugate'

The above exception was the direct cause of the following exception:

TypeError                                 Traceback (most recent call last)
/var/folders/hf/2s7qjx7j5ndc5220_qxv8y800000gn/T/ipykernel_4176/3509056714.py in <module>
----> 1 dd_df[['2']].describe().compute()

~/mambaforge/envs/dask-mini-tutorial/lib/python3.9/site-packages/dask/dataframe/core.py in describe(self, split_every, percentiles, percentiles_method, include, exclude, datetime_is_numeric)
   2522         bools_and_times = self._meta.select_dtypes(include=_include)
   2523         if len(bools_and_times.columns) == 0:
-> 2524             return self._describe_numeric(
   2525                 self,
   2526                 split_every,

~/mambaforge/envs/dask-mini-tutorial/lib/python3.9/site-packages/dask/dataframe/core.py in _describe_numeric(self, data, split_every, percentiles, percentiles_method, is_timedelta_column, is_datetime_column)
   2630             num.count(split_every=split_every),
   2631             num.mean(split_every=split_every),
-> 2632             num.std(split_every=split_every),
   2633             num.min(split_every=split_every),
   2634             num.quantile(percentiles, method=percentiles_method),

~/mambaforge/envs/dask-mini-tutorial/lib/python3.9/site-packages/dask/dataframe/core.py in wrapper(self, *args, **kwargs)
     94         elif kwargs.get("numeric_only") is True:
     95             self = self._get_numeric_data()
---> 96         return func(self, *args, **kwargs)
     97
     98     return wrapper

~/mambaforge/envs/dask-mini-tutorial/lib/python3.9/site-packages/dask/dataframe/core.py in std(self, axis, skipna, ddof, split_every, dtype, out, numeric_only)
   2148             return handle_out(out, result)
   2149         else:
-> 2150             v = self.var(skipna=skipna, ddof=ddof, split_every=split_every)
   2151             name = self._token_prefix + "std"
   2152             result = map_partitions(

~/mambaforge/envs/dask-mini-tutorial/lib/python3.9/site-packages/dask/dataframe/core.py in wrapper(self, *args, **kwargs)
     94         elif kwargs.get("numeric_only") is True:
     95             self = self._get_numeric_data()
---> 96         return func(self, *args, **kwargs)
     97
     98     return wrapper

~/mambaforge/envs/dask-mini-tutorial/lib/python3.9/site-packages/dask/dataframe/core.py in var(self, axis, skipna, ddof, split_every, dtype, out, numeric_only)
   2007
   2008         # pandas 1.0+ does not implement var on timedelta
-> 2009         result = self._var_numeric(skipna, ddof, split_every)
   2010
   2011         if isinstance(self, DataFrame):

~/mambaforge/envs/dask-mini-tutorial/lib/python3.9/site-packages/dask/dataframe/core.py in _var_numeric(self, skipna, ddof, split_every)
   2028         cols = num._meta.columns if is_dataframe_like(num) else None
   2029
-> 2030         var_shape = num._meta_nonempty.values.var(axis=0).shape
   2031         array_var_name = (array_var._name,) + (0,) * len(var_shape)
   2032

~/mambaforge/envs/dask-mini-tutorial/lib/python3.9/site-packages/numpy/core/_methods.py in _var(a, axis, dtype, out, ddof, keepdims, where)
    239     # numbers and complex types with non-native byteorder
    240     else:
--> 241         x = um.multiply(x, um.conjugate(x), out=x).real
    242
    243     ret = umr_sum(x, axis, dtype, out, keepdims=keepdims, where=where)

TypeError: loop of ufunc does not support argument 0 of type NAType which has no callable conjugate method
```

Anything else we need to know?: Individually calling the methods used by describe, `std` and `quantile`, throws `TypeError: Cannot interpret 'Int64Dtype()' as a data type`.

full traceback:

```python
---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
/var/folders/hf/2s7qjx7j5ndc5220_qxv8y800000gn/T/ipykernel_4176/418681936.py in <module>
----> 1 ddf['2'].quantile().compute()

~/mambaforge/envs/dask-mini-tutorial/lib/python3.9/site-packages/dask/base.py in compute(self, **kwargs)
    286         dask.base.compute
    287         """
--> 288         (result,) = compute(self, traverse=False, **kwargs)
    289         return result
    290

~/mambaforge/envs/dask-mini-tutorial/lib/python3.9/site-packages/dask/base.py in compute(traverse, optimize_graph, scheduler, get, *args, **kwargs)
    569         postcomputes.append(x.__dask_postcompute__())
    570
--> 571     results = schedule(dsk, keys, **kwargs)
    572     return repack([f(r, *a) for r, (f, a) in zip(results, postcomputes)])
    573

~/mambaforge/envs/dask-mini-tutorial/lib/python3.9/site-packages/dask/threaded.py in get(dsk, result, cache, num_workers, pool, **kwargs)
     77             pool = MultiprocessingPoolExecutor(pool)
     78
---> 79     results = get_async(
     80         pool.submit,
     81         pool._max_workers,

~/mambaforge/envs/dask-mini-tutorial/lib/python3.9/site-packages/dask/local.py in get_async(submit, num_workers, dsk, result, cache, get_id, rerun_exceptions_locally, pack_exception, raise_exception, callbacks, dumps, loads, chunksize, **kwargs)
    505                             _execute_task(task, data)  # Re-execute locally
    506                         else:
--> 507                             raise_exception(exc, tb)
    508                     res, worker_id = loads(res_info)
    509                     state["cache"][key] = res

~/mambaforge/envs/dask-mini-tutorial/lib/python3.9/site-packages/dask/local.py in reraise(exc, tb)
    313     if exc.__traceback__ is not tb:
    314         raise exc.with_traceback(tb)
--> 315     raise exc
    316
    317

~/mambaforge/envs/dask-mini-tutorial/lib/python3.9/site-packages/dask/local.py in execute_task(key, task_info, dumps, loads, get_id, pack_exception)
    218     try:
    219         task, data = loads(task_info)
--> 220         result = _execute_task(task, data)
    221         id = get_id()
    222         result = dumps((result, id))

~/mambaforge/envs/dask-mini-tutorial/lib/python3.9/site-packages/dask/core.py in _execute_task(arg, cache, dsk)
    117         # temporaries by their reference count and can execute certain
    118         # operations in-place.
--> 119         return func(*(_execute_task(a, cache) for a in args))
    120     elif not ishashable(arg):
    121         return arg

~/mambaforge/envs/dask-mini-tutorial/lib/python3.9/site-packages/dask/utils.py in __call__(self, arg, *args, **kwargs)
    619         """
    620         meth = self.dispatch(type(arg))
--> 621         return meth(arg, *args, **kwargs)
    622
    623     @property

~/mambaforge/envs/dask-mini-tutorial/lib/python3.9/site-packages/dask/dataframe/backends.py in percentile(a, q, interpolation)
    531 @percentile_lookup.register((pd.Series, pd.Index))
    532 def percentile(a, q, interpolation="linear"):
--> 533     return _percentile(a, q, interpolation)
    534
    535

~/mambaforge/envs/dask-mini-tutorial/lib/python3.9/site-packages/dask/array/percentile.py in _percentile(a, q, interpolation)
     29         a = a.values
     30
---> 31     if np.issubdtype(a.dtype, np.datetime64):
     32         values = a
     33         a2 = values.view("i8")

~/mambaforge/envs/dask-mini-tutorial/lib/python3.9/site-packages/numpy/core/numerictypes.py in issubdtype(arg1, arg2)
    417     """
    418     if not issubclass_(arg1, generic):
--> 419         arg1 = dtype(arg1).type
    420     if not issubclass_(arg2, generic):
    421         arg2 = dtype(arg2).type

TypeError: Cannot interpret 'Int64Dtype()' as a data type
```
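The quantile failure appears to bottom out in NumPy's dtype coercion, which has no mapping for pandas extension dtypes. A minimal sketch that reproduces the same error outside of dask:

```python
import numpy as np
import pandas as pd

s = pd.Series([1, pd.NA], dtype=pd.Int64Dtype())

# np.issubdtype coerces its first argument with np.dtype(...), which
# does not understand pandas extension dtypes such as Int64Dtype:
np.issubdtype(s.dtype, np.datetime64)
# TypeError: Cannot interpret 'Int64Dtype()' as a data type
```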

Environment:

jsignell commented 2 years ago

Yeah, this isn't terribly surprising. The pandas nullable dtypes aren't very well supported yet. I think you can get past your first error with this change:

```diff
diff --git a/dask/dataframe/core.py b/dask/dataframe/core.py
index 3a1e517b..d5a1f910 100644
--- a/dask/dataframe/core.py
+++ b/dask/dataframe/core.py
@@ -2068,7 +2068,7 @@ Dask Name: {name}, {task} tasks"""
         name = self._token_prefix + "var-numeric" + tokenize(num, split_every)
         cols = num._meta.columns if is_dataframe_like(num) else None
 
-        var_shape = num._meta_nonempty.values.var(axis=0).shape
+        var_shape = num._meta_nonempty.var(axis=0).shape
         array_var_name = (array_var._name,) + (0,) * len(var_shape)
 
         layer = {(name, 0): (methods.wrap_var_reduction, array_var_name, cols)}
```
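For context on why dropping `.values` helps: on nullable dtypes, `.values` produces an object ndarray with `pd.NA` in it, which NumPy's reductions can't handle, whereas calling `var` on the pandas object itself goes through pandas' NA-aware reductions. A minimal sketch (the column name `'2'` just mirrors the example above):

```python
import pandas as pd

df = pd.DataFrame({'2': pd.Series([1, pd.NA], dtype=pd.Int64Dtype())})

df.values.dtype       # dtype('O'): pd.NA forces an object ndarray, so
                      # df.values.var(axis=0) hits the NAType error above
df.var(axis=0).shape  # (1,): pandas' NA-aware reduction works, and the
                      # patched code only needs the shape
```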

but I haven't been able to get to the bottom of the next error yet. Are you planning on looking into this?

scharlottej13 commented 2 years ago

Thanks @jsignell! Yup, I can keep looking into this.
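In the meantime, casting the nullable columns to NumPy-native dtypes looks like a possible workaround (an untested sketch, assuming NaN in place of `pd.NA` is acceptable):

```python
# Untested sketch: cast the nullable numeric columns to float64
# (pd.NA becomes NaN) before calling describe, using `ddf` from the
# example above.
ddf_float = ddf[['2', '3']].astype('float64')
ddf_float.describe().compute()
```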