pandas-dev / pandas

Flexible and powerful data analysis / manipulation library for Python, providing labeled data structures similar to R data.frame objects, statistical functions, and much more
https://pandas.pydata.org
BSD 3-Clause "New" or "Revised" License
43.61k stars 17.9k forks source link

BUG: ArrowNotImplementedError when setting multiindex with dictionary[pyarrow] columns #52657

Closed aprandin closed 10 months ago

aprandin commented 1 year ago

Pandas version checks

Reproducible Example

import pandas as pd
import numpy as np

# Four integer columns of 20 random values each, drawn from disjoint ranges.
data = {
    'x_1':np.random.randint(0,5,size=20),
    'x_2':np.random.randint(5,10,size=20),
    'x_3':np.random.randint(10,15,size=20),
    'x_4':np.random.randint(15,20,size=20),

}
df = pd.DataFrame(data=data)
# Only x_1 and x_2 are cast to categorical; x_3 and x_4 stay as created.
for col in ['x_1','x_2']:
    df[col] = df[col].astype('category')
# Write the frame to a Feather file in the current working directory ...
df.to_feather('example.fh')

# ... and read it back with the pyarrow dtype backend. Per the issue title,
# the categorical columns come back as dictionary[pyarrow] columns.
dff = pd.read_feather('example.fh', dtype_backend='pyarrow')
# NOTE(review): `display` is not imported in this snippet; presumably it is
# run in an IPython/Jupyter session where `display` is a builtin - confirm.
display(dff.dtypes)

# Setting only one "pyarrow categorical" column as index works fine
display(dff.set_index('x_1').head())

# Setting multiple NOT "pyarrow categorical" columns as index works fine
display(dff.set_index(['x_3','x_4']).head())

# Setting multiple "pyarrow categorical" columns as index DOES NOT WORK
# (this is the call that raises ArrowNotImplementedError in the report)
display(dff.set_index(['x_1','x_2']).head())

Issue Description


ArrowNotImplementedError Traceback (most recent call last) Cell In[8], line 2 1 # Setting multiple "pyarrow categorical" columns as index DOES NOT WORK ----> 2 display(dff.set_index(['x_1','x_2']).head())

File ~\AppData\Local\anaconda_gpc\envs\pandas-test\lib\site-packages\pandas\core\frame.py:5923, in DataFrame.set_index(self, keys, drop, append, inplace, verify_integrity) 5915 if len(arrays[-1]) != len(self): 5916 # check newest element against length of calling frame, since 5917 # ensure_index_from_sequences would not raise for append=False. 5918 raise ValueError( 5919 f"Length mismatch: Expected {len(self)} rows, " 5920 f"received array of length {len(arrays[-1])}" 5921 ) -> 5923 index = ensure_index_from_sequences(arrays, names) 5925 if verify_integrity and not index.is_unique: 5926 duplicates = index[index.duplicated()].unique()

File ~\AppData\Local\anaconda_gpc\envs\pandas-test\lib\site-packages\pandas\core\indexes\base.py:7066, in ensure_index_from_sequences(sequences, names) 7064 return Index(sequences[0], name=names) 7065 else: -> 7066 return MultiIndex.from_arrays(sequences, names=names)

File ~\AppData\Local\anaconda_gpc\envs\pandas-test\lib\site-packages\pandas\core\indexes\multi.py:505, in MultiIndex.from_arrays(cls, arrays, sortorder, names) 502 if len(arrays[i]) != len(arrays[i - 1]): 503 raise ValueError("all arrays must be same length") --> 505 codes, levels = factorize_from_iterables(arrays) 506 if names is lib.no_default: 507 names = [getattr(arr, "name", None) for arr in arrays]

File ~\AppData\Local\anaconda_gpc\envs\pandas-test\lib\site-packages\pandas\core\arrays\categorical.py:2603, in factorize_from_iterables(iterables) 2599 if len(iterables) == 0: 2600 # For consistency, it should return two empty lists. 2601 return [], [] -> 2603 codes, categories = zip(*(factorize_from_iterable(it) for it in iterables)) 2604 return list(codes), list(categories)

File ~\AppData\Local\anaconda_gpc\envs\pandas-test\lib\site-packages\pandas\core\arrays\categorical.py:2603, in <genexpr>(.0) 2599 if len(iterables) == 0: 2600 # For consistency, it should return two empty lists. 2601 return [], [] -> 2603 codes, categories = zip(*(factorize_from_iterable(it) for it in iterables)) 2604 return list(codes), list(categories)

File ~\AppData\Local\anaconda_gpc\envs\pandas-test\lib\site-packages\pandas\core\arrays\categorical.py:2576, in factorize_from_iterable(values) 2571 codes = values.codes 2572 else: 2573 # The value of ordered is irrelevant since we don't use cat as such, 2574 # but only the resulting categories, the order of which is independent 2575 # from ordered. Set ordered to False as default. See GH #15457 -> 2576 cat = Categorical(values, ordered=False) 2577 categories = cat.categories 2578 codes = cat.codes

File ~\AppData\Local\anaconda_gpc\envs\pandas-test\lib\site-packages\pandas\core\arrays\categorical.py:425, in Categorical.__init__(self, values, categories, ordered, dtype, fastpath, copy) 423 if dtype.categories is None: 424 try: --> 425 codes, categories = factorize(values, sort=True) 426 except TypeError as err: 427 codes, categories = factorize(values, sort=False)

File ~\AppData\Local\anaconda_gpc\envs\pandas-test\lib\site-packages\pandas\core\algorithms.py:747, in factorize(values, sort, use_na_sentinel, size_hint) 738 # Implementation notes: This method is responsible for 3 things 739 # 1.) coercing data to array-like (ndarray, Index, extension array) 740 # 2.) factorizing codes and uniques (...) 744 # responsible only for factorization. All data coercion, sorting and boxing 745 # should happen here. 746 if isinstance(values, (ABCIndex, ABCSeries)): --> 747 return values.factorize(sort=sort, use_na_sentinel=use_na_sentinel) 749 values = _ensure_arraylike(values) 750 original = values

File ~\AppData\Local\anaconda_gpc\envs\pandas-test\lib\site-packages\pandas\core\base.py:1164, in IndexOpsMixin.factorize(self, sort, use_na_sentinel) 1146 @doc( 1147 algorithms.factorize, 1148 values="", (...) 1162 use_na_sentinel: bool = True, 1163 ) -> tuple[npt.NDArray[np.intp], Index]: -> 1164 codes, uniques = algorithms.factorize( 1165 self._values, sort=sort, use_na_sentinel=use_na_sentinel 1166 ) 1167 if uniques.dtype == np.float16: 1168 uniques = uniques.astype(np.float32)

File ~\AppData\Local\anaconda_gpc\envs\pandas-test\lib\site-packages\pandas\core\algorithms.py:763, in factorize(values, sort, use_na_sentinel, size_hint) 759 return codes, uniques 761 elif not isinstance(values, np.ndarray): 762 # i.e. ExtensionArray --> 763 codes, uniques = values.factorize(use_na_sentinel=use_na_sentinel) 765 else: 766 values = np.asarray(values) # convert DTA/TDA/MultiIndex

File ~\AppData\Local\anaconda_gpc\envs\pandas-test\lib\site-packages\pandas\core\arrays\arrow\array.py:841, in ArrowExtensionArray.factorize(self, use_na_sentinel) 838 else: 839 data = self._data --> 841 encoded = data.dictionary_encode(null_encoding=null_encoding) 842 if encoded.length() == 0: 843 indices = np.array([], dtype=np.intp)

File ~\AppData\Local\anaconda_gpc\envs\pandas-test\lib\site-packages\pyarrow\table.pxi:586, in pyarrow.lib.ChunkedArray.dictionary_encode()

File ~\AppData\Local\anaconda_gpc\envs\pandas-test\lib\site-packages\pyarrow\_compute.pyx:560, in pyarrow._compute.call_function()

File ~\AppData\Local\anaconda_gpc\envs\pandas-test\lib\site-packages\pyarrow\_compute.pyx:355, in pyarrow._compute.Function.call()

File ~\AppData\Local\anaconda_gpc\envs\pandas-test\lib\site-packages\pyarrow\error.pxi:144, in pyarrow.lib.pyarrow_internal_check_status()

File ~\AppData\Local\anaconda_gpc\envs\pandas-test\lib\site-packages\pyarrow\error.pxi:121, in pyarrow.lib.check_status()

ArrowNotImplementedError: Function 'dictionary_encode' has no kernel matching input types (dictionary)

Expected Behavior

image

Installed Versions

INSTALLED VERSIONS

commit : 478d340667831908b5b4bf09a2787a11a14560c9 python : 3.8.16.final.0 python-bits : 64 OS : Windows OS-release : 10 Version : 10.0.19045 machine : AMD64 processor : Intel64 Family 6 Model 142 Stepping 12, GenuineIntel byteorder : little LC_ALL : None LANG : None LOCALE : English_United States.1252

pandas : 2.0.0 numpy : 1.23.5 pytz : 2023.3 dateutil : 2.8.2 setuptools : 67.6.1 pip : 23.0.1 Cython : None pytest : None hypothesis : None sphinx : None blosc : None feather : None xlsxwriter : 3.0.9 lxml.etree : 4.9.2 html5lib : None pymysql : None psycopg2 : None jinja2 : 3.1.2 IPython : 8.12.0 pandas_datareader: None bs4 : 4.12.2 bottleneck : None brotli : fastparquet : None fsspec : None gcsfs : None matplotlib : 3.7.1 numba : 0.56.4 numexpr : None odfpy : None openpyxl : 3.1.2 pandas_gbq : None pyarrow : 11.0.0 pyreadstat : 1.2.1 pyxlsb : None s3fs : None scipy : 1.10.1 snappy : None sqlalchemy : 2.0.9 tables : None tabulate : None xarray : None xlrd : 1.2.0 zstandard : None tzdata : 2023.3 qtpy : None pyqt5 : None

phofl commented 1 year ago

An xfailed test is welcome

aprandin commented 1 year ago

Hi @phofl many thanks for working on the issue.

Do you mean a test like the one below?

import io

import numpy as np
import pandas as pd
import pyarrow as pa
import pytest

# Module-level fixture data shared by the tests below: four integer columns
# of 20 random values each, with x_1 and x_2 converted to categorical.
data = {
    'x_1': np.random.randint(0, 5, size=20),
    'x_2': np.random.randint(5, 10, size=20),
    'x_3': np.random.randint(10, 15, size=20),
    'x_4': np.random.randint(15, 20, size=20),
}
df = pd.DataFrame(data=data)
for col in ['x_1', 'x_2']:
    df[col] = df[col].astype('category')

# Round-trip through an in-memory Feather buffer instead of writing
# 'example.fh' into the current working directory: importing a test module
# should not leave files behind on disk.
feather_buffer = io.BytesIO()
df.to_feather(feather_buffer)
feather_buffer.seek(0)  # rewind so the reader starts at the beginning

# Re-read with the pyarrow dtype backend; `dff` is the frame under test.
dff = pd.read_feather(feather_buffer, dtype_backend='pyarrow')

def test_index_one_cat():
    """Set a single pyarrow-categorical column as the index.

    The issue report states that this case works, so it is a plain passing
    test. Marking it ``xfail`` would assert the opposite and could only ever
    produce an unexpected pass.
    """
    result = dff.set_index('x_1')
    assert result.index.name == 'x_1'

def test_index_two_non_cat():
    """Set two non-categorical pyarrow columns as a MultiIndex.

    The issue report states that this case works, so it is a plain passing
    test. Marking it ``xfail`` would assert the opposite and could only ever
    produce an unexpected pass.
    """
    result = dff.set_index(['x_3', 'x_4'])
    assert list(result.index.names) == ['x_3', 'x_4']

@pytest.mark.xfail(
    raises=pa.lib.ArrowNotImplementedError,
    reason="Unable to set multiple pyarrow-categorical columns to index",
)
def test_index_two_cat():
    """Set both pyarrow-categorical columns as a MultiIndex.

    This is the failing call from the report, hence the expected-failure
    marker above.
    """
    index_columns = ['x_1', 'x_2']
    dff.set_index(index_columns)

Please let me know if this is in line with your expectations.

Thank you.

mroeschke commented 10 months ago

Looks like this needs to be fixed upstream in pyarrow so closing