LSSTDESC / tables_io

A small package to provide tools to read / write and convert tabular data for DESC
MIT License

`getGroupInputDataLength` runtime error #89

Open drewoldag opened 10 months ago

drewoldag commented 10 months ago

Bug report: Running `iterHdf5ToDict` with a non-None `groupname` parameter results in a runtime error when it executes the `getGroupInputDataLength(f)` line. This is because `f` is the object named by `groupname` (in this example an hdf5 dataset), not the file itself.

In arrayUtils.py line 88 we call `hg.keys()`, which is valid for an hdf5 file or group but not for a dataset.
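
The distinction is easy to see directly in h5py. Here is a minimal, self-contained sketch using an in-memory file (separate from the reproduction script below; the object names are made up for illustration):

import h5py
import numpy as np

# in-memory HDF5 file (core driver, nothing is written to disk)
with h5py.File("inmem.hdf5", "w", driver="core", backing_store=False) as f:
    grp = f.create_group("a_group")
    dset = f.create_dataset("a_dataset", data=np.arange(5))

    print(hasattr(f, "keys"))     # True  - File exposes .keys()
    print(hasattr(grp, "keys"))   # True  - Group exposes .keys()
    print(hasattr(dset, "keys"))  # False - Dataset does not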

The following code reproduces the runtime error. To run this you'll need an environment with h5py and tables_io installed.

import h5py
import json

from tables_io.ioUtils import iterHdf5ToDict

GROUPNAME = 'example_group'

# write an example hdf5 file using variable length strings
with h5py.File('bug_example.hdf5', 'w') as file:
    dicts = [
        {'a':0, 'b':[[8,3]]},
        {'a':1, 'b':[[1,2,3]]},
        {'a':2, 'b':[[1,2,3,4,5],[7,8,9]]},
    ]

    # convert the dictionaries to json strings
    data = [json.dumps(this_dict) for this_dict in dicts]
    dt = h5py.special_dtype(vlen=str)

    # store the JSON strings as a single dataset named GROUPNAME
    # (note: this creates an h5py Dataset, not a Group)
    dataset = file.create_dataset(GROUPNAME, data=data, dtype=dt)

SHOW_BUG = True
if SHOW_BUG:
    # get an iterator into the hdf5 file
    buggy_iter = iterHdf5ToDict(
        "bug_example.hdf5",
        chunk_size=1,
        groupname=GROUPNAME,
        rank=0,
        parallel_size=1)

    # use the iterator to read the lines, convert to dictionaries, and print
    for start, end, data in buggy_iter:
        dicts = [json.loads(this_line) for this_line in data]
        print(f"Start, end: {start, end}")
        print(dicts)

else:
    # get an iterator into the hdf5 file
    good_iter = iterHdf5ToDict(
        "bug_example.hdf5",
        chunk_size=1,
        groupname=None,
        rank=0,
        parallel_size=1)

    # use the iterator to read the lines, convert to dictionaries, and print
    for start, end, data in good_iter:
        dicts = [json.loads(this_line) for this_line in data[GROUPNAME]]
        print(f"Start, end: {start, end}")
        print(dicts)

Running this with SHOW_BUG = True produces the following error:

Traceback (most recent call last):
  File "/home/drew/code/hdf5-test/bug_example.py", line 35, in <module>
    for start, end, data in buggy_iter:
  File "/home/drew/miniconda3/envs/hdf5/lib/python3.9/site-packages/tables_io/ioUtils.py", line 379, in iterHdf5ToDict
    num_rows = getGroupInputDataLength(f)
  File "/home/drew/miniconda3/envs/hdf5/lib/python3.9/site-packages/tables_io/arrayUtils.py", line 88, in getGroupInputDataLength
    firstkey = list(hg.keys())[0]
AttributeError: 'Dataset' object has no attribute 'keys'

I believe this can be addressed with the following code in arrayUtils.py:

def getGroupInputDataLength(hg):
    if isinstance(hg, h5py.File):
        return _getHdf5FileLength(hg)
    elif isinstance(hg, h5py.Group):
        return _getHdf5GroupLength(hg)

def _getHdf5FileLength(hg):
    firstkey = list(hg.keys())[0]
    nrows = len(hg[firstkey])
    firstname = hg[firstkey].name
    for value in hg.values():
        if len(value) != nrows:
            raise ValueError(
                f"Group does not represent a table. Length ({len(value)})"
                f"of column {value.name} not not match length ({nrows}) of"
                f"first column {firstname}"
            )
    return nrows

def _getHdf5GroupLength(hg):
    return len(hg)
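
As a side note (purely my own sketch, not necessarily the fix tables_io should adopt): in the reproduction above the object named by `groupname` is an h5py Dataset rather than a Group, so the isinstance checks above would fall through and return None. A variant that also covers that case could look like this, relying on the fact that h5py.File is a subclass of h5py.Group:

import h5py

def getGroupInputDataLength(hg):
    """Return the number of rows in an h5py File, Group, or Dataset."""
    if isinstance(hg, h5py.Dataset):
        # a single dataset: len() gives the length of its first axis
        return len(hg)
    if isinstance(hg, h5py.Group):
        # h5py.File is a subclass of h5py.Group, so this covers both;
        # every member dataset must have the same number of rows
        firstkey = list(hg.keys())[0]
        nrows = len(hg[firstkey])
        firstname = hg[firstkey].name
        for value in hg.values():
            if len(value) != nrows:
                raise ValueError(
                    f"Group does not represent a table. Length ({len(value)}) "
                    f"of column {value.name} does not match length ({nrows}) "
                    f"of first column {firstname}"
                )
        return nrows
    raise TypeError(f"Unsupported type for getGroupInputDataLength: {type(hg)}")

Here len(hg) on a Dataset returns the length of its first axis, which matches the "number of rows" semantics the iterator expects, and the final raise avoids silently returning None for unexpected input types.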