deshaw / versioned-hdf5

Versioned HDF5 provides a versioned abstraction on top of h5py
https://deshaw.github.io/versioned-hdf5/
Other
76 stars 20 forks source link

Handle strings in _verify_new_chunk_reuse (PyInf#11487) #338

Closed ArvidJB closed 3 months ago

ArvidJB commented 3 months ago

The check in _verify_new_chunk_reuse does not handle strings correctly:

In [3]: with TempDirCtx() as d:
   ...:     filename = d / 'data.h5'
   ...:     with h5py.File(filename, mode="w") as f:
   ...:         vf = VersionedHDF5File(f)
   ...:         with vf.stage_version("r0") as group:
   ...:             group.create_dataset(
   ...:                 "values",
   ...:                 data=np.array(['a', 'b', 'c', 'a', 'b', 'c'], dtype='O'),
   ...:                 dtype=h5py.string_dtype(length=None),
   ...:                 maxshape=(None,),
   ...:                 chunks=(3,),
   ...:             )
   ...:
---------------------------------------------------------------------------
AssertionError                            Traceback (most recent call last)
File /codemill/bessen/ndindex_venv/lib64/python3.11/site-packages/versioned_hdf5/backend.py:305, in _verify_new_chunk_reuse(raw_dataset, new_data, data_hash, hashed_slice, chunk_being_written, slices_to_write, data_to_write)
    304 try:
--> 305     assert_array_equal(reused_chunk, to_be_written)
    306 except AssertionError as e:

    [... skipping hidden 1 frame]

File /opt/python/python-3.11/lib64/python3.11/contextlib.py:81, in ContextDecorator.__call__.<locals>.inner(*args, **kwds)
     80 with self._recreate_cm():
---> 81     return func(*args, **kwds)

File /usr/local/python/python-3.11/std/lib64/python3.11/site-packages/numpy/testing/_private/utils.py:862, in assert_array_compare(comparison, x, y, err_msg, verbose, header, precision, equal_nan, equal_inf, strict)
    859         msg = build_err_msg([ox, oy], err_msg,
    860                             verbose=verbose, header=header,
    861                             names=('x', 'y'), precision=precision)
--> 862         raise AssertionError(msg)
    863 except ValueError:

AssertionError:
Arrays are not equal

Mismatched elements: 3 / 3 (100%)
 x: array(['a', 'b', 'c'], dtype=object)
 y: array([b'a', b'b', b'c'], dtype='|S1')

The above exception was the direct cause of the following exception:

ValueError                                Traceback (most recent call last)
Cell In[3], line 5
      3 with h5py.File(filename, mode="w") as f:
      4     vf = VersionedHDF5File(f)
----> 5     with vf.stage_version("r0") as group:
      6         group.create_dataset(
      7             "values",
      8             data=np.array(['a', 'b', 'c', 'a', 'b', 'c'], dtype='O'),
   (...)
     11             chunks=(3,),
     12         )

File /opt/python/python-3.11/lib64/python3.11/contextlib.py:144, in _GeneratorContextManager.__exit__(self, typ, value, traceback)
    142 if typ is None:
    143     try:
--> 144         next(self.gen)
    145     except StopIteration:
    146         return False

File /codemill/bessen/ndindex_venv/lib64/python3.11/site-packages/versioned_hdf5/api.py:307, in VersionedHDF5File.stage_version(self, version_name, prev_version, make_current, timestamp, verify_chunk_reuse)
    305     yield group
    306     group.close()
--> 307     commit_version(
    308         group,
    309         group.datasets(),
    310         make_current=make_current,
    311         chunks=group.chunks,
    312         compression=group.compression,
    313         compression_opts=group.compression_opts,
    314         timestamp=timestamp,
    315         verify_chunk_reuse=verify_chunk_reuse,
    316     )
    317 except:
    318     delete_version(self.f, version_name, old_current)

File /codemill/bessen/ndindex_venv/lib64/python3.11/site-packages/versioned_hdf5/versions.py:164, in commit_version(version_group, datasets, make_current, chunks, compression, compression_opts, timestamp, verify_chunk_reuse)
    162     slices = write_dataset_chunks(f, name, data.data_dict, verify_chunk_reuse=verify_chunk_reuse)
    163 else:
--> 164     slices = write_dataset(
    165         f,
    166         name,
    167         data,
    168         chunks=chunks[name],
    169         compression=compression[name],
    170         compression_opts=compression_opts[name],
    171         fillvalue=fillvalue,
    172         verify_chunk_reuse=verify_chunk_reuse,
    173     )
    174 if shape is None:
    175     if isinstance(data, dict):

File /codemill/bessen/ndindex_venv/lib64/python3.11/site-packages/versioned_hdf5/backend.py:126, in write_dataset(f, name, data, chunks, dtype, compression, compression_opts, fillvalue, verify_chunk_reuse)
    114 def write_dataset(
    115     f,
    116     name,
   (...)
    123     verify_chunk_reuse=True,
    124 ):
    125     if name not in f["_version_data"]:
--> 126         return create_base_dataset(
    127             f,
    128             name,
    129             data=data,
    130             dtype=dtype,
    131             chunks=chunks,
    132             compression=compression,
    133             compression_opts=compression_opts,
    134             fillvalue=fillvalue,
    135         )
    137     ds = f["_version_data"][name]["raw_data"]
    138     if isinstance(chunks, int) and not isinstance(chunks, bool):

File /codemill/bessen/ndindex_venv/lib64/python3.11/site-packages/versioned_hdf5/backend.py:111, in create_base_dataset(f, name, shape, data, dtype, chunks, compression, compression_opts, fillvalue)
    100 dataset = group.create_dataset(
    101     "raw_data",
    102     shape=(0,) + chunks[1:],
   (...)
    108     fillvalue=fillvalue,
    109 )
    110 dataset.attrs["chunks"] = chunks
--> 111 return write_dataset(f, name, data, chunks=chunks)

File /codemill/bessen/ndindex_venv/lib64/python3.11/site-packages/versioned_hdf5/backend.py:193, in write_dataset(f, name, data, chunks, dtype, compression, compression_opts, fillvalue, verify_chunk_reuse)
    190     slices[data_slice] = hashed_slice
    192     if verify_chunk_reuse:
--> 193         _verify_new_chunk_reuse(
    194             raw_dataset=ds,
    195             new_data=data,
    196             data_hash=data_hash,
    197             hashed_slice=hashed_slice,
    198             chunk_being_written=data_s,
    199             slices_to_write=slices_to_write,
    200         )
    202     chunks_reused += 1
    204 else:

File /codemill/bessen/ndindex_venv/lib64/python3.11/site-packages/versioned_hdf5/backend.py:307, in _verify_new_chunk_reuse(raw_dataset, new_data, data_hash, hashed_slice, chunk_being_written, slices_to_write, data_to_write)
    305     assert_array_equal(reused_chunk, to_be_written)
    306 except AssertionError as e:
--> 307     raise ValueError(
    308         f"Hash {data_hash} of existing data chunk {reused_chunk} "
    309         f"matches the hash of new data chunk {chunk_being_written}, "
    310         "but data does not."
    311     ) from e

ValueError: Hash b'\x8b\x8a\xc9Zi\xbd\xa8\xcc\xa8\xcd:~\xf2\x1d\x81\xeb\n\xd7\xa0F9\xdb\x10\xbe\x8d[s\xc0\xf00\xb1}' of existing data chunk ['a' 'b' 'c'] matches the hash of new data chunk ['a' 'b' 'c'], but data does not.
peytondmurray commented 3 months ago

Hmm, so it looks like reused slices that are read from the underlying dataset contain bytes inside an object dtype array, whereas reused slices that are yet to be written to the file contain strings inside an object dtype array.

 x: array(['a', 'b', 'c'], dtype=object)
 y: array([b'a', b'b', b'c'], dtype='|S1')

I think we should be able to coerce both the data to be reused and the data the user is trying to write to bytes, and that should take care of this.