vaexio / vaex

Out-of-Core hybrid Apache Arrow/NumPy DataFrame for Python, ML, visualization and exploration of big tabular data at a billion rows per second 🚀
https://vaex.io
MIT License
8.28k stars 590 forks source link

[BUG-REPORT] `vaex.from_arrow_dataset` throws `'pyarrow._dataset.FileFragment' object has no attribute 'row_groups'` #2219

Closed jmakov closed 2 years ago

jmakov commented 2 years ago

Description: I'm trying to read Feather files as a dataset into one dataframe. This works with pyarrow, but raises an exception in vaex:

import pyarrow
import pyarrow.dataset
import pyarrow.feather
import pandas
import vaex

# pandas version
df1 = pandas.DataFrame({"a": [1,2,3], "b": [4,5,6]})
df2 = pandas.DataFrame({"a": [10,20,30], "b": [40,50,60]})

# write independent Feather files
path_base = "/home/toaster/Downloads/"
pyarrow.feather.write_feather(df1, path_base + "123.feather")
pyarrow.feather.write_feather(df2, path_base + "456.feather")

# read Feather files as Arrow dataset
ds = pyarrow.dataset.dataset(path_base, format="feather")
ds.files  # lists both "123.feather" and "456.feather"
ds.to_table().to_pandas()  # outputs expected dataframe

# trying to import Arrow dataset in Vaex
vaex.from_arrow_dataset(ds)
---------------------------------------------------------------------------
AttributeError                            Traceback (most recent call last)
Cell In [48], line 1
----> 1 df_vaex = vaex.from_arrow_dataset(ds)

File ~/mambaforge-pypy3/envs/puma-lab/lib/python3.8/site-packages/vaex/__init__.py:384, in from_arrow_dataset(arrow_dataset)
    382 '''Create a DataFrame from an Apache Arrow dataset.'''
    383 import vaex.arrow.dataset
--> 384 return from_dataset(vaex.arrow.dataset.DatasetArrow(arrow_dataset))

File ~/mambaforge-pypy3/envs/puma-lab/lib/python3.8/site-packages/vaex/arrow/dataset.py:258, in DatasetArrow.__init__(self, ds, max_rows_read)
    256 def __init__(self, ds, max_rows_read=1024**2*10):
    257     self._arrow_ds = ds
--> 258     super().__init__(max_rows_read=max_rows_read)

File ~/mambaforge-pypy3/envs/puma-lab/lib/python3.8/site-packages/vaex/arrow/dataset.py:26, in DatasetArrowBase.__init__(self, max_rows_read)
     24 super().__init__()
     25 self.max_rows_read = max_rows_read
---> 26 self._create_columns()

File ~/mambaforge-pypy3/envs/puma-lab/lib/python3.8/site-packages/vaex/arrow/dataset.py:268, in DatasetArrow._create_columns(self)
    267 def _create_columns(self):
--> 268     super()._create_columns()
    269     # self._ids = frozendict({name: vaex.cache.fingerprint(self._fingerprint, name) for name in self._columns})
    270     self._ids = frozendict()

File ~/mambaforge-pypy3/envs/puma-lab/lib/python3.8/site-packages/vaex/arrow/dataset.py:35, in DatasetArrowBase._create_columns(self)
     33     if hasattr(fragment, "ensure_complete_metadata"):
     34         fragment.ensure_complete_metadata()
---> 35     for rg in fragment.row_groups:
     36         row_count += rg.num_rows
     37 self._row_count = row_count

AttributeError: 'pyarrow._dataset.FileFragment' object has no attribute 'row_groups'
---------------------------------------------------------------------------

# check if we have to write using `pyarrow.dataset.write_dataset` instead of independent Feather files
table_pa = ds.to_table(filter=pyarrow.dataset.field("a") > 2) 
pyarrow.dataset.write_dataset(table_pa, "test", format="feather")

ds_test = pyarrow.dataset.dataset("test", format="feather")
vaex.from_arrow_dataset(ds_test)
---------------------------------------------------------------------------
AttributeError                            Traceback (most recent call last)
Cell In [30], line 1
----> 1 vaex.from_arrow_dataset(ds_test)

File ~/mambaforge-pypy3/envs/puma-lab/lib/python3.8/site-packages/vaex/__init__.py:384, in from_arrow_dataset(arrow_dataset)
    382 '''Create a DataFrame from an Apache Arrow dataset.'''
    383 import vaex.arrow.dataset
--> 384 return from_dataset(vaex.arrow.dataset.DatasetArrow(arrow_dataset))

File ~/mambaforge-pypy3/envs/puma-lab/lib/python3.8/site-packages/vaex/arrow/dataset.py:258, in DatasetArrow.__init__(self, ds, max_rows_read)
    256 def __init__(self, ds, max_rows_read=1024**2*10):
    257     self._arrow_ds = ds
--> 258     super().__init__(max_rows_read=max_rows_read)

File ~/mambaforge-pypy3/envs/puma-lab/lib/python3.8/site-packages/vaex/arrow/dataset.py:26, in DatasetArrowBase.__init__(self, max_rows_read)
     24 super().__init__()
     25 self.max_rows_read = max_rows_read
---> 26 self._create_columns()

File ~/mambaforge-pypy3/envs/puma-lab/lib/python3.8/site-packages/vaex/arrow/dataset.py:268, in DatasetArrow._create_columns(self)
    267 def _create_columns(self):
--> 268     super()._create_columns()
    269     # self._ids = frozendict({name: vaex.cache.fingerprint(self._fingerprint, name) for name in self._columns})
    270     self._ids = frozendict()

File ~/mambaforge-pypy3/envs/puma-lab/lib/python3.8/site-packages/vaex/arrow/dataset.py:35, in DatasetArrowBase._create_columns(self)
     33     if hasattr(fragment, "ensure_complete_metadata"):
     34         fragment.ensure_complete_metadata()
---> 35     for rg in fragment.row_groups:
     36         row_count += rg.num_rows
     37 self._row_count = row_count

AttributeError: 'pyarrow._dataset.FileFragment' object has no attribute 'row_groups'
---------------------------------------------------------------------------
​

# Workarounds
# as a workaround this seems to work, but it loads all the data into memory!
table_pa = ds.to_table()
df_vaex = vaex.from_arrow_table(table_pa)

# this also works
df_vaex = vaex.open(path_base + "*.feather")

Software information