Closed aprandin closed 10 months ago
An xfailed test is welcome
Hi @phofl many thanks for working on the issue.
Do you mean a test like the one below?
import pandas as pd
import numpy as np
import pyarrow as pa
import pytest
# Four 20-element integer columns, each drawn from its own half-open
# range of width 5: x_1 in [0, 5), x_2 in [5, 10), x_3 in [10, 15),
# x_4 in [15, 20). The columns are generated in x_1..x_4 order.
data = {
    name: np.random.randint(low, low + 5, size=20)
    for name, low in zip(('x_1', 'x_2', 'x_3', 'x_4'), range(0, 20, 5))
}
df = pd.DataFrame(data=data)

# Only the first two columns are converted to categoricals; x_3 and x_4
# keep their integer dtype so the tests can compare both kinds of column.
for col in ('x_1', 'x_2'):
    df[col] = df[col].astype('category')

# Round-trip through a Feather file and read it back with the pyarrow
# dtype backend; `dff` is the frame the tests below operate on.
df.to_feather('example.fh')
dff = pd.read_feather('example.fh', dtype_backend='pyarrow')
@pytest.mark.xfail(
    raises=pa.lib.ArrowNotImplementedError,
    reason="Unable to set a pyarrow-categorical column to index",
)
def test_index_one_cat():
    """Set the single column 'x_1' of the module-level ``dff`` as the index.

    Marked as an expected failure raising ``pa.lib.ArrowNotImplementedError``.
    """
    dff.set_index('x_1')
@pytest.mark.xfail(
    raises=pa.lib.ArrowNotImplementedError,
    reason="Unable to set multiple non-pyarrow-categorical columns to index",
)
def test_index_two_non_cat():
    """Set the columns 'x_3' and 'x_4' of the module-level ``dff`` as the index.

    Marked as an expected failure raising ``pa.lib.ArrowNotImplementedError``.
    """
    dff.set_index(['x_3', 'x_4'])
@pytest.mark.xfail(
    raises=pa.lib.ArrowNotImplementedError,
    reason="Unable to set multiple pyarrow-categorical columns to index",
)
def test_index_two_cat():
    """Set the columns 'x_1' and 'x_2' of the module-level ``dff`` as the index.

    Marked as an expected failure raising ``pa.lib.ArrowNotImplementedError``.
    """
    dff.set_index(['x_1', 'x_2'])
Please let me know if this is in line with your expectations.
Thank you.
Looks like this needs to be fixed upstream in pyarrow so closing
Pandas version checks
[X] I have checked that this issue has not already been reported.
[X] I have confirmed this bug exists on the latest version of pandas.
[ ] I have confirmed this bug exists on the main branch of pandas.
Reproducible Example
Issue Description
ArrowNotImplementedError Traceback (most recent call last) Cell In[8], line 2 1 # Setting multiple "pyarrow categorical" columns as index DOES NOT WORK ----> 2 display(dff.set_index(['x_1','x_2']).head())
File ~\AppData\Local\anaconda_gpc\envs\pandas-test\lib\site-packages\pandas\core\frame.py:5923, in DataFrame.set_index(self, keys, drop, append, inplace, verify_integrity) 5915 if len(arrays[-1]) != len(self): 5916 # check newest element against length of calling frame, since 5917 # ensure_index_from_sequences would not raise for append=False. 5918 raise ValueError( 5919 f"Length mismatch: Expected {len(self)} rows, " 5920 f"received array of length {len(arrays[-1])}" 5921 ) -> 5923 index = ensure_index_from_sequences(arrays, names) 5925 if verify_integrity and not index.is_unique: 5926 duplicates = index[index.duplicated()].unique()
File ~\AppData\Local\anaconda_gpc\envs\pandas-test\lib\site-packages\pandas\core\indexes\base.py:7066, in ensure_index_from_sequences(sequences, names) 7064 return Index(sequences[0], name=names) 7065 else: -> 7066 return MultiIndex.from_arrays(sequences, names=names)
File ~\AppData\Local\anaconda_gpc\envs\pandas-test\lib\site-packages\pandas\core\indexes\multi.py:505, in MultiIndex.from_arrays(cls, arrays, sortorder, names) 502 if len(arrays[i]) != len(arrays[i - 1]): 503 raise ValueError("all arrays must be same length") --> 505 codes, levels = factorize_from_iterables(arrays) 506 if names is lib.no_default: 507 names = [getattr(arr, "name", None) for arr in arrays]
File ~\AppData\Local\anaconda_gpc\envs\pandas-test\lib\site-packages\pandas\core\arrays\categorical.py:2603, in factorize_from_iterables(iterables) 2599 if len(iterables) == 0: 2600 # For consistency, it should return two empty lists. 2601 return [], [] -> 2603 codes, categories = zip(*(factorize_from_iterable(it) for it in iterables)) 2604 return list(codes), list(categories)
File ~\AppData\Local\anaconda_gpc\envs\pandas-test\lib\site-packages\pandas\core\arrays\categorical.py:2603, in <genexpr>(.0)
2599 if len(iterables) == 0:
2600 # For consistency, it should return two empty lists.
2601 return [], []
-> 2603 codes, categories = zip(*(factorize_from_iterable(it) for it in iterables))
2604 return list(codes), list(categories)
File ~\AppData\Local\anaconda_gpc\envs\pandas-test\lib\site-packages\pandas\core\arrays\categorical.py:2576, in factorize_from_iterable(values) 2571 codes = values.codes 2572 else: 2573 # The value of ordered is irrelevant since we don't use cat as such, 2574 # but only the resulting categories, the order of which is independent 2575 # from ordered. Set ordered to False as default. See GH #15457 -> 2576 cat = Categorical(values, ordered=False) 2577 categories = cat.categories 2578 codes = cat.codes
File ~\AppData\Local\anaconda_gpc\envs\pandas-test\lib\site-packages\pandas\core\arrays\categorical.py:425, in Categorical.__init__(self, values, categories, ordered, dtype, fastpath, copy) 423 if dtype.categories is None: 424 try: --> 425 codes, categories = factorize(values, sort=True) 426 except TypeError as err: 427 codes, categories = factorize(values, sort=False)
File ~\AppData\Local\anaconda_gpc\envs\pandas-test\lib\site-packages\pandas\core\algorithms.py:747, in factorize(values, sort, use_na_sentinel, size_hint) 738 # Implementation notes: This method is responsible for 3 things 739 # 1.) coercing data to array-like (ndarray, Index, extension array) 740 # 2.) factorizing codes and uniques (...) 744 # responsible only for factorization. All data coercion, sorting and boxing 745 # should happen here. 746 if isinstance(values, (ABCIndex, ABCSeries)): --> 747 return values.factorize(sort=sort, use_na_sentinel=use_na_sentinel) 749 values = _ensure_arraylike(values) 750 original = values
File ~\AppData\Local\anaconda_gpc\envs\pandas-test\lib\site-packages\pandas\core\base.py:1164, in IndexOpsMixin.factorize(self, sort, use_na_sentinel) 1146 @doc( 1147 algorithms.factorize, 1148 values="", (...) 1162 use_na_sentinel: bool = True, 1163 ) -> tuple[npt.NDArray[np.intp], Index]: -> 1164 codes, uniques = algorithms.factorize( 1165 self._values, sort=sort, use_na_sentinel=use_na_sentinel 1166 ) 1167 if uniques.dtype == np.float16: 1168 uniques = uniques.astype(np.float32)
File ~\AppData\Local\anaconda_gpc\envs\pandas-test\lib\site-packages\pandas\core\algorithms.py:763, in factorize(values, sort, use_na_sentinel, size_hint) 759 return codes, uniques 761 elif not isinstance(values, np.ndarray): 762 # i.e. ExtensionArray --> 763 codes, uniques = values.factorize(use_na_sentinel=use_na_sentinel) 765 else: 766 values = np.asarray(values) # convert DTA/TDA/MultiIndex
File ~\AppData\Local\anaconda_gpc\envs\pandas-test\lib\site-packages\pandas\core\arrays\arrow\array.py:841, in ArrowExtensionArray.factorize(self, use_na_sentinel) 838 else: 839 data = self._data --> 841 encoded = data.dictionary_encode(null_encoding=null_encoding) 842 if encoded.length() == 0: 843 indices = np.array([], dtype=np.intp)
File ~\AppData\Local\anaconda_gpc\envs\pandas-test\lib\site-packages\pyarrow\table.pxi:586, in pyarrow.lib.ChunkedArray.dictionary_encode()
File ~\AppData\Local\anaconda_gpc\envs\pandas-test\lib\site-packages\pyarrow\_compute.pyx:560, in pyarrow._compute.call_function()
File ~\AppData\Local\anaconda_gpc\envs\pandas-test\lib\site-packages\pyarrow\_compute.pyx:355, in pyarrow._compute.Function.call()
File ~\AppData\Local\anaconda_gpc\envs\pandas-test\lib\site-packages\pyarrow\error.pxi:144, in pyarrow.lib.pyarrow_internal_check_status()
File ~\AppData\Local\anaconda_gpc\envs\pandas-test\lib\site-packages\pyarrow\error.pxi:121, in pyarrow.lib.check_status()
ArrowNotImplementedError: Function 'dictionary_encode' has no kernel matching input types (dictionary)
Expected Behavior
Installed Versions
INSTALLED VERSIONS
commit : 478d340667831908b5b4bf09a2787a11a14560c9 python : 3.8.16.final.0 python-bits : 64 OS : Windows OS-release : 10 Version : 10.0.19045 machine : AMD64 processor : Intel64 Family 6 Model 142 Stepping 12, GenuineIntel byteorder : little LC_ALL : None LANG : None LOCALE : English_United States.1252
pandas : 2.0.0 numpy : 1.23.5 pytz : 2023.3 dateutil : 2.8.2 setuptools : 67.6.1 pip : 23.0.1 Cython : None pytest : None hypothesis : None sphinx : None blosc : None feather : None xlsxwriter : 3.0.9 lxml.etree : 4.9.2 html5lib : None pymysql : None psycopg2 : None jinja2 : 3.1.2 IPython : 8.12.0 pandas_datareader: None bs4 : 4.12.2 bottleneck : None brotli : fastparquet : None fsspec : None gcsfs : None matplotlib : 3.7.1 numba : 0.56.4 numexpr : None odfpy : None openpyxl : 3.1.2 pandas_gbq : None pyarrow : 11.0.0 pyreadstat : 1.2.1 pyxlsb : None s3fs : None scipy : 1.10.1 snappy : None sqlalchemy : 2.0.9 tables : None tabulate : None xarray : None xlrd : 1.2.0 zstandard : None tzdata : 2023.3 qtpy : None pyqt5 : None