vaexio / vaex

Out-of-Core hybrid Apache Arrow/NumPy DataFrame for Python, ML, visualization and exploration of big tabular data at a billion rows per second 🚀
https://vaex.io
MIT License

[BUG-REPORT] loading a sub dataset from a complex hdf5 file #2296

Open · satra opened this issue 1 year ago

satra commented 1 year ago

Description

Based on #833, I am trying to read a 2D array from a nested structure; the file is an AnnData file, and I was hoping to do this without extracting the array into a new HDF5 file. Reading the array directly with h5py works:

```python
In [7]: f["obsm"]["spatial"][:10, :]
Out[7]: 
array([[1419.98832894,  825.15090888],
       [1432.79529662,  822.45169245],
       [1401.86558569,  825.04345513],
       [1384.86873138,  829.65679024],
       [1369.14299852,  836.31881653],
       [1320.11758083,  848.98107624],
       [1342.12711877,  847.71693131],
       [1357.25932077,  846.88102846],
       [1362.12752974,  852.21970886],
       [1400.3948844 ,  855.91202043]])
```
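For reference, the workaround I can see today is to pull the array into memory with h5py and wrap the two columns with `vaex.from_arrays`. This is only a sketch of the intent: it avoids writing a new HDF5 file, but it gives up memory mapping, and the column names below are made up.

```python
import h5py
import vaex

# Workaround sketch (assumed layout: /obsm/spatial is an (n_obs, 2) float dataset).
# Note this copies the array into RAM rather than memory-mapping it.
with h5py.File(filename, "r") as f:
    spatial = f["obsm"]["spatial"][:]

# Made-up column names, purely for illustration.
df = vaex.from_arrays(x=spatial[:, 0], y=spatial[:, 1])
print(df.head())
```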

Reading this file with vaex:

```python
In [6]: f = vaex.open(filename, group="/obsm/spatial/")
[12/06/22 20:19:07] ERROR    error opening <filename>
```

results in:

complete traceback

```python
Traceback (most recent call last):
  File "/Users/satra/software/mambaforge/envs/cubiekb/lib/python3.10/site-packages/vaex/__init__.py", line 244, in open
    ds = vaex.dataset.open(path, fs_options=fs_options, fs=fs, **kwargs)
  File "/Users/satra/software/mambaforge/envs/cubiekb/lib/python3.10/site-packages/vaex/dataset.py", line 81, in open
    return opener.open(path, fs_options=fs_options, fs=fs, *args, **kwargs)
  File "/Users/satra/software/mambaforge/envs/cubiekb/lib/python3.10/site-packages/vaex/dataset.py", line 1457, in open
    return cls(path, *args, **kwargs)
  File "/Users/satra/software/mambaforge/envs/cubiekb/lib/python3.10/site-packages/vaex/hdf5/dataset.py", line 71, in __init__
    self._load()
  File "/Users/satra/software/mambaforge/envs/cubiekb/lib/python3.10/site-packages/vaex/hdf5/dataset.py", line 214, in _load
    self._load_columns(self.h5file[self.group])
  File "/Users/satra/software/mambaforge/envs/cubiekb/lib/python3.10/site-packages/vaex/hdf5/dataset.py", line 291, in _load_columns
    h5columns = h5data if self._version == 1 else h5data['columns']
  File "h5py/_objects.pyx", line 54, in h5py._objects.with_phil.wrapper
  File "h5py/_objects.pyx", line 55, in h5py._objects.with_phil.wrapper
  File "/Users/satra/software/mambaforge/envs/cubiekb/lib/python3.10/site-packages/h5py/_hl/dataset.py", line 760, in __getitem__
    return self.fields(names, _prior_dtype=new_dtype)[args]
  File "/Users/satra/software/mambaforge/envs/cubiekb/lib/python3.10/site-packages/h5py/_hl/dataset.py", line 418, in fields
    return FieldsWrapper(self, _prior_dtype, names)
  File "/Users/satra/software/mambaforge/envs/cubiekb/lib/python3.10/site-packages/h5py/_hl/dataset.py", line 254, in __init__
    self.read_dtype = readtime_dtype(prior_dtype, names)
  File "/Users/satra/software/mambaforge/envs/cubiekb/lib/python3.10/site-packages/h5py/_hl/dataset.py", line 273, in readtime_dtype
    raise ValueError("Field names only allowed for compound types")
ValueError: Field names only allowed for compound types

---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
Cell In[6], line 1
----> 1 f = vaex.open("atlas_brain_638850.hdf5", group="/obsm/spatial/")

File ~/software/mambaforge/envs/cubiekb/lib/python3.10/site-packages/vaex/__init__.py:244, in open(path, convert, progress, shuffle, fs_options, fs, *args, **kwargs)
    242     ds = vaex.dataset.open(path_output, fs_options=fs_options, fs=fs)
    243 else:
--> 244     ds = vaex.dataset.open(path, fs_options=fs_options, fs=fs, **kwargs)
    245 df = vaex.from_dataset(ds)
    246 if df is None:

File ~/software/mambaforge/envs/cubiekb/lib/python3.10/site-packages/vaex/dataset.py:81, in open(path, fs_options, fs, *args, **kwargs)
     79 if opener.quick_test(path, fs_options=fs_options, fs=fs):
     80     if opener.can_open(path, fs_options=fs_options, fs=fs, *args, **kwargs):
---> 81         return opener.open(path, fs_options=fs_options, fs=fs, *args, **kwargs)
     83 # otherwise try all openers
     84 for opener in opener_classes:

File ~/software/mambaforge/envs/cubiekb/lib/python3.10/site-packages/vaex/dataset.py:1457, in DatasetFile.open(cls, path, *args, **kwargs)
   1455 @classmethod
   1456 def open(cls, path, *args, **kwargs):
-> 1457     return cls(path, *args, **kwargs)

File ~/software/mambaforge/envs/cubiekb/lib/python3.10/site-packages/vaex/hdf5/dataset.py:71, in Hdf5MemoryMapped.__init__(self, path, write, fs_options, fs, nommap, group, _fingerprint)
     69 self.group = group
     70 self._version = 1
---> 71 self._load()
     72 if not write:  # in write mode, call freeze yourself, so the hashes are computed
     73     self._freeze()

File ~/software/mambaforge/envs/cubiekb/lib/python3.10/site-packages/vaex/hdf5/dataset.py:214, in Hdf5MemoryMapped._load(self)
    212 else:
    213     self._version = 2
--> 214 self._load_columns(self.h5file[self.group])
    216 if "properties" in self.h5file:
    217     self._load_variables(self.h5file["/properties"])  # old name, kept for portability

File ~/software/mambaforge/envs/cubiekb/lib/python3.10/site-packages/vaex/hdf5/dataset.py:291, in Hdf5MemoryMapped._load_columns(self, h5data, first)
    289     self.description = ensure_string(h5data.attrs["description"])
    290 # hdf5, or h5py doesn't keep the order of columns, so manually track that, also enables reordering later
--> 291 h5columns = h5data if self._version == 1 else h5data['columns']
    292 if "column_order" in h5columns.attrs:
    293     column_order = ensure_string(h5columns.attrs["column_order"]).split(",")

File h5py/_objects.pyx:54, in h5py._objects.with_phil.wrapper()

File h5py/_objects.pyx:55, in h5py._objects.with_phil.wrapper()

File ~/software/mambaforge/envs/cubiekb/lib/python3.10/site-packages/h5py/_hl/dataset.py:760, in Dataset.__getitem__(self, args, new_dtype)
    758     names = names[0]  # Read with simpler dtype of this field
    759     args = tuple(x for x in args if not isinstance(x, str))
--> 760     return self.fields(names, _prior_dtype=new_dtype)[args]
    762 if new_dtype is None:
    763     new_dtype = self.dtype

File ~/software/mambaforge/envs/cubiekb/lib/python3.10/site-packages/h5py/_hl/dataset.py:418, in Dataset.fields(self, names, _prior_dtype)
    416 if _prior_dtype is None:
    417     _prior_dtype = self.dtype
--> 418 return FieldsWrapper(self, _prior_dtype, names)

File ~/software/mambaforge/envs/cubiekb/lib/python3.10/site-packages/h5py/_hl/dataset.py:254, in FieldsWrapper.__init__(self, dset, prior_dtype, names)
    252     self.extract_field = names
    253     names = [names]
--> 254 self.read_dtype = readtime_dtype(prior_dtype, names)

File ~/software/mambaforge/envs/cubiekb/lib/python3.10/site-packages/h5py/_hl/dataset.py:273, in readtime_dtype(basetype, names)
    271 """Make a NumPy compound dtype with a subset of available fields"""
    272 if basetype.names is None:  # Names provided, but not compound
--> 273     raise ValueError("Field names only allowed for compound types")
    275 for name in names:  # Check all names are legal
    276     if not name in basetype.names:

ValueError: Field names only allowed for compound types
```
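If I read the traceback correctly, `/obsm/spatial` is a plain 2D float dataset rather than a vaex-style group containing a `columns` subgroup, so `h5data['columns']` in `_load_columns` ends up doing string (field-name) indexing on a non-compound dataset, which is what h5py rejects. A minimal sketch that reproduces the same h5py error under that assumption:

```python
import h5py
import numpy as np

# Build a toy file whose /obsm/spatial is a plain 2D float dataset
# (my assumption of how the AnnData file stores obsm arrays).
with h5py.File("repro.hdf5", "w") as f:
    f.create_dataset("obsm/spatial", data=np.random.rand(10, 2))

with h5py.File("repro.hdf5", "r") as f:
    spatial = f["obsm/spatial"]
    try:
        # Roughly what vaex does internally: h5data['columns']
        spatial["columns"]
    except ValueError as err:
        print(err)  # Field names only allowed for compound types
```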

Software information