fsspec / kerchunk

Cloud-friendly access to archival data
https://fsspec.github.io/kerchunk/
MIT License

Geoh5py reading in s3 #138

RichardScottOZ commented 2 years ago

An example of an HDF5-based file format:

https://geoh5py.readthedocs.io/en/stable/

"The geoh5py library has been created for the manipulation and storage of a wide range of geoscientific data (points, curve, surface, 2D and 3D grids) in geoh5 file format. Users will be able to directly leverage the powerful visualization capabilities of Geoscience ANALYST along with open-source code from the Python ecosystem."

So I use it quite a bit.

In fact, I now have 200K similar files in a couple of company S3 buckets.


So last night I started wondering if I could access them quickly from S3, as opposed to reading each one in full, grabbing what I want, and having several terabytes lying around.

I remembered the Pangeo netCDF/HDF5 discussions, so I started having a look.

This would be pretty useful.

e.g. something like this? @rsignell-usgs

import kerchunk.hdf
import s3fs

# filesystem-level options; 'first' block caching suits reading HDF5 metadata
so = dict(
    anon=False, default_fill_cache=False, default_cache_type='first'
)
s3 = s3fs.S3FileSystem(profile='appropriateprofile', **so)

urls = [
    's3://bananasplits/100075.geoh5',
]
singles = []
for u in urls:
    with s3.open(u, 'rb') as inf:
        h5chunks = kerchunk.hdf.SingleHdf5ToZarr(inf, u, inline_threshold=100)
        singles.append(h5chunks.translate())
---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
~\AppData\Local\Temp/ipykernel_24276/251853116.py in <module>
     13     with s3.open(u, **so) as inf:
     14         h5chunks = kerchunk.hdf.SingleHdf5ToZarr(inf, u, inline_threshold=100)
---> 15         singles.append(h5chunks.translate())

~\AppData\Local\Continuum\anaconda3\envs\stuartshelf\lib\site-packages\kerchunk\hdf.py in translate(self)
     71         lggr.debug('Translation begins')
     72         self._transfer_attrs(self._h5f, self._zroot)
---> 73         self._h5f.visititems(self._translator)
     74         if self.inline > 0:
     75             self._do_inline(self.inline)

~\AppData\Local\Continuum\anaconda3\envs\stuartshelf\lib\site-packages\h5py\_hl\group.py in visititems(self, func)
    611                 name = self._d(name)
    612                 return func(name, self[name])
--> 613             return h5o.visit(self.id, proxy)
    614 
    615     @with_phil

h5py\_objects.pyx in h5py._objects.with_phil.wrapper()

h5py\_objects.pyx in h5py._objects.with_phil.wrapper()

h5py\h5o.pyx in h5py.h5o.visit()

h5py\h5o.pyx in h5py.h5o.cb_obj_simple()

~\AppData\Local\Continuum\anaconda3\envs\stuartshelf\lib\site-packages\h5py\_hl\group.py in proxy(name)
    610                 """ Use the text name of the object, not bytes """
    611                 name = self._d(name)
--> 612                 return func(name, self[name])
    613             return h5o.visit(self.id, proxy)
    614 

~\AppData\Local\Continuum\anaconda3\envs\stuartshelf\lib\site-packages\kerchunk\hdf.py in _translator(self, name, h5obj)
    188 
    189             # Create a Zarr array equivalent to this HDF5 dataset...
--> 190             za = self._zroot.create_dataset(h5obj.name, shape=h5obj.shape,
    191                                             dtype=h5obj.dtype,
    192                                             chunks=h5obj.chunks or False,

~\AppData\Local\Continuum\anaconda3\envs\stuartshelf\lib\site-packages\zarr\hierarchy.py in create_dataset(self, name, **kwargs)
    806         """
    807 
--> 808         return self._write_op(self._create_dataset_nosync, name, **kwargs)
    809 
    810     def _create_dataset_nosync(self, name, data=None, **kwargs):

~\AppData\Local\Continuum\anaconda3\envs\stuartshelf\lib\site-packages\zarr\hierarchy.py in _write_op(self, f, *args, **kwargs)
    659 
    660         with lock:
--> 661             return f(*args, **kwargs)
    662 
    663     def create_group(self, name, overwrite=False):

~\AppData\Local\Continuum\anaconda3\envs\stuartshelf\lib\site-packages\zarr\hierarchy.py in _create_dataset_nosync(self, name, data, **kwargs)
    818         # create array
    819         if data is None:
--> 820             a = create(store=self._store, path=path, chunk_store=self._chunk_store,
    821                        **kwargs)
    822 

~\AppData\Local\Continuum\anaconda3\envs\stuartshelf\lib\site-packages\zarr\creation.py in create(shape, chunks, dtype, compressor, fill_value, order, store, synchronizer, overwrite, path, chunk_store, filters, cache_metadata, cache_attrs, read_only, object_codec, dimension_separator, **kwargs)
    134 
    135     # initialize array metadata
--> 136     init_array(store, shape=shape, chunks=chunks, dtype=dtype, compressor=compressor,
    137                fill_value=fill_value, order=order, overwrite=overwrite, path=path,
    138                chunk_store=chunk_store, filters=filters, object_codec=object_codec,

~\AppData\Local\Continuum\anaconda3\envs\stuartshelf\lib\site-packages\zarr\storage.py in init_array(store, shape, chunks, dtype, compressor, fill_value, order, overwrite, path, chunk_store, filters, object_codec, dimension_separator)
    350     _require_parent_group(path, store=store, chunk_store=chunk_store, overwrite=overwrite)
    351 
--> 352     _init_array_metadata(store, shape=shape, chunks=chunks, dtype=dtype,
    353                          compressor=compressor, fill_value=fill_value,
    354                          order=order, overwrite=overwrite, path=path,

~\AppData\Local\Continuum\anaconda3\envs\stuartshelf\lib\site-packages\zarr\storage.py in _init_array_metadata(store, shape, chunks, dtype, compressor, fill_value, order, overwrite, path, chunk_store, filters, object_codec, dimension_separator)
    427             if not filters:
    428                 # there are no filters so we can be sure there is no object codec
--> 429                 raise ValueError('missing object_codec for object array')
    430             else:
    431                 # one of the filters may be an object codec, issue a warning rather

ValueError: missing object_codec for object array
RichardScottOZ commented 2 years ago

In this case:

h5py = 3.6
numcodecs = 0.9.1
zarr = 2.8.1
RichardScottOZ commented 2 years ago

As far as docs go, Rich, maybe an example of opening a non-public cloud file?

RichardScottOZ commented 2 years ago

Throwing in this: https://github.com/fsspec/kerchunk/pull/46/commits/d7336c403fd04036ed741562eace01119f9432ed

I get

HDF5 variable length strings are not yet supported: </GEOSCIENCE/Data/{55f27889-51c1-46e8-8655-7280f9eb806b}/Data
shape: (1,) type: object bytes: 8 bytes>
HDF5 variable length strings are not yet supported: </GEOSCIENCE/Data/{5872cf18-7cba-4678-8ec1-c19db103c407}/Data
shape: (1,) type: object bytes: 8 bytes>
HDF5 variable length strings are not yet supported: </GEOSCIENCE/Data/{e5d54f72-e7e8-46c9-93fe-5f1ea9d1a792}/Data
shape: (1,) type: object bytes: 8 bytes>
C:\Users\rscott\AppData\Local\Continuum\anaconda3\envs\stuartshelf\lib\site-packages\kerchunk\hdf.py:141: FutureWarning: elementwise == comparison failed and returning scalar instead; this will raise an error or perform elementwise comparison in the future.
  if v == 'DIMENSION_SCALE':
Caught TypeError: Origin@/GEOSCIENCE/Groups/{4577958a-de14-4ee5-b600-bc05e09c0bb0}/Objects/{4548f77a-9f78-4fb7-9aef-4105372a1355} = (1111385.2998561, -1518334.14711173, 12356.1) (<class 'numpy.void'>)
Traceback (most recent call last):
  File "C:\Users\rscott\AppData\Local\Continuum\anaconda3\envs\stuartshelf\lib\site-packages\kerchunk\hdf.py", line 144, in _transfer_attrs
    zobj.attrs[n] = v
  File "C:\Users\rscott\AppData\Local\Continuum\anaconda3\envs\stuartshelf\lib\site-packages\zarr\attrs.py", line 79, in __setitem__
    self._write_op(self._setitem_nosync, item, value)
  File "C:\Users\rscott\AppData\Local\Continuum\anaconda3\envs\stuartshelf\lib\site-packages\zarr\attrs.py", line 73, in _write_op
    return f(*args, **kwargs)
  File "C:\Users\rscott\AppData\Local\Continuum\anaconda3\envs\stuartshelf\lib\site-packages\zarr\attrs.py", line 90, in _setitem_nosync
    self._put_nosync(d)
  File "C:\Users\rscott\AppData\Local\Continuum\anaconda3\envs\stuartshelf\lib\site-packages\zarr\attrs.py", line 112, in _put_nosync
    self.store[self.key] = json_dumps(d)
  File "C:\Users\rscott\AppData\Local\Continuum\anaconda3\envs\stuartshelf\lib\site-packages\zarr\util.py", line 29, in json_dumps
    return json.dumps(o, indent=4, sort_keys=True, ensure_ascii=True,
  File "C:\Users\rscott\AppData\Local\Continuum\anaconda3\envs\stuartshelf\lib\json\__init__.py", line 234, in dumps
    return cls(
  File "C:\Users\rscott\AppData\Local\Continuum\anaconda3\envs\stuartshelf\lib\json\encoder.py", line 201, in encode
    chunks = list(chunks)
  File "C:\Users\rscott\AppData\Local\Continuum\anaconda3\envs\stuartshelf\lib\json\encoder.py", line 431, in _iterencode
    yield from _iterencode_dict(o, _current_indent_level)
  File "C:\Users\rscott\AppData\Local\Continuum\anaconda3\envs\stuartshelf\lib\json\encoder.py", line 405, in _iterencode_dict
    yield from chunks
  File "C:\Users\rscott\AppData\Local\Continuum\anaconda3\envs\stuartshelf\lib\json\encoder.py", line 438, in _iterencode
    o = _default(o)
  File "C:\Users\rscott\AppData\Local\Continuum\anaconda3\envs\stuartshelf\lib\json\encoder.py", line 179, in default
    raise TypeError(f'Object of type {o.__class__.__name__} '
TypeError: Object of type void is not JSON serializable
Traceback (most recent call last):
  File "C:\Users\rscott\AppData\Local\Continuum\anaconda3\envs\stuartshelf\lib\site-packages\zarr\util.py", line 279, in normalize_fill_value
    fill_value = np.array(fill_value, dtype=dtype)[()]
ValueError: invalid literal for int() with base 10: ' '

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "C:\Users\rscott\OneDrive - OZ Minerals\Exploration2022\Block-Golem\src\Xarray-hdf5.py", line 29, in <module>
    singles.append(h5chunks.translate())
  File "C:\Users\rscott\AppData\Local\Continuum\anaconda3\envs\stuartshelf\lib\site-packages\kerchunk\hdf.py", line 73, in translate
    self._h5f.visititems(self._translator)
  File "C:\Users\rscott\AppData\Local\Continuum\anaconda3\envs\stuartshelf\lib\site-packages\h5py\_hl\group.py", line 613, in visititems
    return h5o.visit(self.id, proxy)
  File "h5py\_objects.pyx", line 54, in h5py._objects.with_phil.wrapper
  File "h5py\_objects.pyx", line 55, in h5py._objects.with_phil.wrapper
  File "h5py\h5o.pyx", line 355, in h5py.h5o.visit
  File "h5py\h5o.pyx", line 302, in h5py.h5o.cb_obj_simple
  File "C:\Users\rscott\AppData\Local\Continuum\anaconda3\envs\stuartshelf\lib\site-packages\h5py\_hl\group.py", line 612, in proxy
    return func(name, self[name])
  File "C:\Users\rscott\AppData\Local\Continuum\anaconda3\envs\stuartshelf\lib\site-packages\kerchunk\hdf.py", line 198, in _translator
    za = self._zroot.create_dataset(h5obj.name, shape=h5obj.shape,
  File "C:\Users\rscott\AppData\Local\Continuum\anaconda3\envs\stuartshelf\lib\site-packages\zarr\hierarchy.py", line 808, in create_dataset
    return self._write_op(self._create_dataset_nosync, name, **kwargs)
  File "C:\Users\rscott\AppData\Local\Continuum\anaconda3\envs\stuartshelf\lib\site-packages\zarr\hierarchy.py", line 661, in _write_op
    return f(*args, **kwargs)
  File "C:\Users\rscott\AppData\Local\Continuum\anaconda3\envs\stuartshelf\lib\site-packages\zarr\hierarchy.py", line 820, in _create_dataset_nosync
    a = create(store=self._store, path=path, chunk_store=self._chunk_store,
  File "C:\Users\rscott\AppData\Local\Continuum\anaconda3\envs\stuartshelf\lib\site-packages\zarr\creation.py", line 136, in create
    init_array(store, shape=shape, chunks=chunks, dtype=dtype, compressor=compressor,
  File "C:\Users\rscott\AppData\Local\Continuum\anaconda3\envs\stuartshelf\lib\site-packages\zarr\storage.py", line 352, in init_array
    _init_array_metadata(store, shape=shape, chunks=chunks, dtype=dtype,
  File "C:\Users\rscott\AppData\Local\Continuum\anaconda3\envs\stuartshelf\lib\site-packages\zarr\storage.py", line 393, in _init_array_metadata
    fill_value = normalize_fill_value(fill_value, dtype)
  File "C:\Users\rscott\AppData\Local\Continuum\anaconda3\envs\stuartshelf\lib\site-packages\zarr\util.py", line 283, in normalize_fill_value
    raise ValueError('fill_value {!r} is not valid for dtype {}; nested '
ValueError: fill_value ' ' is not valid for dtype [('I', '<i4'), ('J', '<i4'), ('K', '<i4'), ('NCells', '<i4')]; nested exception: invalid literal for int() with base 10: ' '
RichardScottOZ commented 2 years ago

So is that a numpy dtypes and JSON problem, for one?

RichardScottOZ commented 2 years ago

https://geoh5py.readthedocs.io/en/stable/content/geoh5_format/index.html#standards

martindurant commented 2 years ago

The HDF5/strings story isn't totally clear in kerchunk yet, as hdf5 has a few ways of storing them, and the most general one (heap allocation) doesn't match well with reading discrete blocks of bytes. https://github.com/fsspec/kerchunk/pull/40 was one attempt to cope with it, but wasn't complete and we didn't really need it.

I don't know the specifics of geoh5/geoh5py; I would be interested to learn. The typical usage of this library, currently, is to make zarr- and netCDF-like datasets from the input, so if it's not that, then some more work would be required.
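
For reference, spotting which datasets in a given file hit the variable-length string case is straightforward with h5py alone; a minimal sketch, assuming a local copy of one of these files:

import h5py

# flag datasets stored as variable-length strings, i.e. the heap-allocated
# layout that kerchunk cannot address as plain byte ranges
def flag_vlen(name, obj):
    if isinstance(obj, h5py.Dataset) and h5py.check_string_dtype(obj.dtype):
        print('variable-length strings:', name)

with h5py.File('100075.geoh5', 'r') as f:  # hypothetical local copy
    f.visititems(flag_vlen)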

RichardScottOZ commented 2 years ago

Yes, I figured that was the case. Not netCDF, no. It is focused on data used for digging stuff up.


RichardScottOZ commented 2 years ago

So I should probably start with a smaller, simpler file and see if anything is doable at my level.

RichardScottOZ commented 2 years ago

So I took one of those files from before all the algorithms run and the inversion creates data, so it just has a couple of things to look at, with x, y, z values.

---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
~\AppData\Local\Continuum\anaconda3\envs\stuartshelf\lib\site-packages\zarr\util.py in normalize_fill_value(fill_value, dtype)
    278             else:
--> 279                 fill_value = np.array(fill_value, dtype=dtype)[()]
    280 

ValueError: could not convert string to float: ''

During handling of the above exception, another exception occurred:

ValueError                                Traceback (most recent call last)
~\AppData\Local\Temp/ipykernel_41732/2531647224.py in <module>
     15     with s3.open(u, **so) as inf:
     16         h5chunks = kerchunk.hdf.SingleHdf5ToZarr(inf, u, inline_threshold=100)
---> 17         singles.append(h5chunks.translate())

~\AppData\Local\Continuum\anaconda3\envs\stuartshelf\lib\site-packages\kerchunk\hdf.py in translate(self)
     71         lggr.debug('Translation begins')
     72         self._transfer_attrs(self._h5f, self._zroot)
---> 73         self._h5f.visititems(self._translator)
     74         if self.inline > 0:
     75             self._do_inline(self.inline)

~\AppData\Local\Continuum\anaconda3\envs\stuartshelf\lib\site-packages\h5py\_hl\group.py in visititems(self, func)
    611                 name = self._d(name)
    612                 return func(name, self[name])
--> 613             return h5o.visit(self.id, proxy)
    614 
    615     @with_phil

h5py\_objects.pyx in h5py._objects.with_phil.wrapper()

h5py\_objects.pyx in h5py._objects.with_phil.wrapper()

h5py\h5o.pyx in h5py.h5o.visit()

h5py\h5o.pyx in h5py.h5o.cb_obj_simple()

~\AppData\Local\Continuum\anaconda3\envs\stuartshelf\lib\site-packages\h5py\_hl\group.py in proxy(name)
    610                 """ Use the text name of the object, not bytes """
    611                 name = self._d(name)
--> 612                 return func(name, self[name])
    613             return h5o.visit(self.id, proxy)
    614 

~\AppData\Local\Continuum\anaconda3\envs\stuartshelf\lib\site-packages\kerchunk\hdf.py in _translator(self, name, h5obj)
    196 
    197             # Create a Zarr array equivalent to this HDF5 dataset...
--> 198             za = self._zroot.create_dataset(h5obj.name, shape=h5obj.shape,
    199                                             dtype=h5obj.dtype,
    200                                             chunks=h5obj.chunks or False,

~\AppData\Local\Continuum\anaconda3\envs\stuartshelf\lib\site-packages\zarr\hierarchy.py in create_dataset(self, name, **kwargs)
    806         """
    807 
--> 808         return self._write_op(self._create_dataset_nosync, name, **kwargs)
    809 
    810     def _create_dataset_nosync(self, name, data=None, **kwargs):

~\AppData\Local\Continuum\anaconda3\envs\stuartshelf\lib\site-packages\zarr\hierarchy.py in _write_op(self, f, *args, **kwargs)
    659 
    660         with lock:
--> 661             return f(*args, **kwargs)
    662 
    663     def create_group(self, name, overwrite=False):

~\AppData\Local\Continuum\anaconda3\envs\stuartshelf\lib\site-packages\zarr\hierarchy.py in _create_dataset_nosync(self, name, data, **kwargs)
    818         # create array
    819         if data is None:
--> 820             a = create(store=self._store, path=path, chunk_store=self._chunk_store,
    821                        **kwargs)
    822 

~\AppData\Local\Continuum\anaconda3\envs\stuartshelf\lib\site-packages\zarr\creation.py in create(shape, chunks, dtype, compressor, fill_value, order, store, synchronizer, overwrite, path, chunk_store, filters, cache_metadata, cache_attrs, read_only, object_codec, dimension_separator, **kwargs)
    134 
    135     # initialize array metadata
--> 136     init_array(store, shape=shape, chunks=chunks, dtype=dtype, compressor=compressor,
    137                fill_value=fill_value, order=order, overwrite=overwrite, path=path,
    138                chunk_store=chunk_store, filters=filters, object_codec=object_codec,

~\AppData\Local\Continuum\anaconda3\envs\stuartshelf\lib\site-packages\zarr\storage.py in init_array(store, shape, chunks, dtype, compressor, fill_value, order, overwrite, path, chunk_store, filters, object_codec, dimension_separator)
    350     _require_parent_group(path, store=store, chunk_store=chunk_store, overwrite=overwrite)
    351 
--> 352     _init_array_metadata(store, shape=shape, chunks=chunks, dtype=dtype,
    353                          compressor=compressor, fill_value=fill_value,
    354                          order=order, overwrite=overwrite, path=path,

~\AppData\Local\Continuum\anaconda3\envs\stuartshelf\lib\site-packages\zarr\storage.py in _init_array_metadata(store, shape, chunks, dtype, compressor, fill_value, order, overwrite, path, chunk_store, filters, object_codec, dimension_separator)
    391     chunks = normalize_chunks(chunks, shape, dtype.itemsize)
    392     order = normalize_order(order)
--> 393     fill_value = normalize_fill_value(fill_value, dtype)
    394 
    395     # optional array metadata

~\AppData\Local\Continuum\anaconda3\envs\stuartshelf\lib\site-packages\zarr\util.py in normalize_fill_value(fill_value, dtype)
    281         except Exception as e:
    282             # re-raise with our own error message to be helpful
--> 283             raise ValueError('fill_value {!r} is not valid for dtype {}; nested '
    284                              'exception: {}'.format(fill_value, dtype, e))
    285 

ValueError: fill_value ' ' is not valid for dtype [('x', '<f8'), ('y', '<f8'), ('z', '<f8')]; nested exception: could not convert string to float: ''

So, similar sorts of issues.

RichardScottOZ commented 2 years ago

NAME /GEOSCIENCE/Data/{35d34a64-6158-4a4e-92db-ef89c783ab40}/Data SHAPE (650,) TYPE float64
FILL 0.0 FILTER []
NAME /GEOSCIENCE/Groups/{cde1051f-49d2-4269-95d4-ec888c5b322a}/Objects/{35490e4f-bc36-41e5-ad98-51a9d6d36b9b}/Vertices SHAPE (650,) TYPE [('x', '<f8'), ('y', '<f8'), ('z', '<f8')]
FILL   FILTER []
RichardScottOZ commented 2 years ago

If we naively do this for the sake of the exercise (we would likely want something better than zero):

            if fill == " ":
                fill = 0.0
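
A slightly safer variant of that guard, as an untested sketch (anticipating the fill_value point below): map the blank sentinel to None rather than inventing a zero.

            # hypothetical tweak at the same spot: a blank string fill means
            # 'no usable fill value', so record None instead of a made-up 0.0
            if isinstance(fill, str) and not fill.strip():
                fill = None
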
RichardScottOZ commented 2 years ago

then

print(singles)
[{'version': 1, 'templates': {'u': 's3://s3://bananasplits/100075.geoh5'}, 'refs': {'.zgroup': '{\n    "zarr_format": 2\n}', 'GEOSCIENCE/.zgroup': '{\n    "zarr_format": 2\n}', 'GEOSCIENCE/.zattrs': '{\n    "Contributors": "UserName",\n    "Distance unit": "meter",\n    "GA Version": "1",\n    "Version": 1.0\n}', 'GEOSCIENCE/Data/.zgroup': '{\n    "zarr_format": 2\n}', 'GEOSCIENCE/Data/{35d34a64-6158-4a4e-92db-ef89c783ab40}/.zgroup': '{\n    "zarr_format": 2\n}', 'GEOSCIENCE/Data/{35d34a64-6158-4a4e-92db-ef89c783ab40}/.zattrs': '{\n    "Allow delete": 1,\n    "Allow move": 1,\n    "Allow rename": 1,\n    "Association": "Vertex",\n    "ID": "{35d34a64-6158-4a4e-92db-ef89c783ab40}",\n    "Name": "gz",\n    "Public": 1,\n    "Visible": 0\n}', 'GEOSCIENCE/Data/{35d34a64-6158-4a4e-92db-ef89c783ab40}/Data/.zarray': '{\n    "chunks": [\n        650\n    ],\n    "compressor": {\n        "id": "zlib",\n        "level": 9\n    },\n    "dtype": "<f8",\n    "fill_value": 0.0,\n    "filters": null,\n    "order": "C",\n    "shape": [\n        650\n    ],\n    "zarr_format": 2\n}', 'GEOSCIENCE/Data/{35d34a64-6158-4a4e-92db-ef89c783ab40}/Data/.zattrs': '{\n    "_ARRAY_DIMENSIONS": [\n        "phony_dim_0"\n    ]\n}', 'GEOSCIENCE/Data/{35d34a64-6158-4a4e-92db-ef89c783ab40}/Data/0': ['{{u}}', 85984, 734], 'GEOSCIENCE/Data/{35d34a64-6158-4a4e-92db-ef89c783ab40}/Type/.zgroup': '{\n    "zarr_format": 2\n}', 'GEOSCIENCE/Data/{35d34a64-6158-4a4e-92db-ef89c783ab40}/Type/.zattrs': '{\n    "Description": "gz",\n    "Hidden": 0,\n    "ID": "{c68ee320-02fc-4f7a-8fb1-4426970fc2cb}",\n    "Mapping": "equal_area",\n    "Name": "gz",\n    "Number of bins": 50,\n    "Primitive type": "Float",\n    "Transparent no data": 1\n}', 'GEOSCIENCE/Groups/.zgroup': '{\n    "zarr_format": 2\n}', 'GEOSCIENCE/Groups/{cde1051f-49d2-4269-95d4-ec888c5b322a}/.zgroup': '{\n    "zarr_format": 2\n}', 'GEOSCIENCE/Groups/{cde1051f-49d2-4269-95d4-ec888c5b322a}/.zattrs': '{\n    "Allow delete": 0,\n    "Allow move": 0,\n    "Allow rename": 0,\n    "ID": "{cde1051f-49d2-4269-95d4-ec888c5b322a}",\n    "Name": "Workspace",\n    "Public": 1,\n    "Visible": 1\n}', 'GEOSCIENCE/Groups/{cde1051f-49d2-4269-95d4-ec888c5b322a}/Data/.zgroup': '{\n    "zarr_format": 2\n}', 'GEOSCIENCE/Groups/{cde1051f-49d2-4269-95d4-ec888c5b322a}/Groups/.zgroup': '{\n    "zarr_format": 2\n}', 'GEOSCIENCE/Groups/{cde1051f-49d2-4269-95d4-ec888c5b322a}/Objects/.zgroup': '{\n    "zarr_format": 2\n}', 'GEOSCIENCE/Groups/{cde1051f-49d2-4269-95d4-ec888c5b322a}/Objects/{35490e4f-bc36-41e5-ad98-51a9d6d36b9b}/.zgroup': '{\n    "zarr_format": 2\n}', 'GEOSCIENCE/Groups/{cde1051f-49d2-4269-95d4-ec888c5b322a}/Objects/{35490e4f-bc36-41e5-ad98-51a9d6d36b9b}/.zattrs': '{\n    "Allow delete": 1,\n    "Allow move": 1,\n    "Allow rename": 1,\n    "ID": "{35490e4f-bc36-41e5-ad98-51a9d6d36b9b}",\n    "Last focus": "None",\n    "Name": "gravity",\n    "Public": 1,\n    "Visible": 1\n}', 'GEOSCIENCE/Groups/{cde1051f-49d2-4269-95d4-ec888c5b322a}/Objects/{35490e4f-bc36-41e5-ad98-51a9d6d36b9b}/Data/.zgroup': '{\n    "zarr_format": 2\n}', 'GEOSCIENCE/Groups/{cde1051f-49d2-4269-95d4-ec888c5b322a}/Objects/{35490e4f-bc36-41e5-ad98-51a9d6d36b9b}/PropertyGroups/.zgroup': '{\n    "zarr_format": 2\n}', 'GEOSCIENCE/Groups/{cde1051f-49d2-4269-95d4-ec888c5b322a}/Objects/{35490e4f-bc36-41e5-ad98-51a9d6d36b9b}/Type/.zgroup': '{\n    "zarr_format": 2\n}', 'GEOSCIENCE/Groups/{cde1051f-49d2-4269-95d4-ec888c5b322a}/Objects/{35490e4f-bc36-41e5-ad98-51a9d6d36b9b}/Type/.zattrs': '{\n    "Description": "Points",\n    
"ID": "{202c5db1-a56d-4004-9cad-baafd8899406}",\n    "Name": "Points"\n}', 'GEOSCIENCE/Groups/{cde1051f-49d2-4269-95d4-ec888c5b322a}/Objects/{35490e4f-bc36-41e5-ad98-51a9d6d36b9b}/Vertices/.zarray': '{\n    "chunks": [\n        325\n    ],\n    "compressor": {\n        "id": "zlib",\n        "level": 9\n    },\n    "dtype": [\n        [\n            "x",\n            "<f8"\n        ],\n        [\n            "y",\n            "<f8"\n        ],\n        [\n            "z",\n            "<f8"\n        ]\n    ],\n    "fill_value": "AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA",\n    "filters": null,\n    "order": "C",\n    "shape": [\n        650\n    ],\n    "zarr_format": 2\n}', 'GEOSCIENCE/Groups/{cde1051f-49d2-4269-95d4-ec888c5b322a}/Objects/{35490e4f-bc36-41e5-ad98-51a9d6d36b9b}/Vertices/.zattrs': '{\n    "_ARRAY_DIMENSIONS": [\n        "phony_dim_0"\n    ]\n}', 'GEOSCIENCE/Groups/{cde1051f-49d2-4269-95d4-ec888c5b322a}/Objects/{35490e4f-bc36-41e5-ad98-51a9d6d36b9b}/Vertices/0': ['{{u}}', 72902, 1420], 'GEOSCIENCE/Groups/{cde1051f-49d2-4269-95d4-ec888c5b322a}/Objects/{35490e4f-bc36-41e5-ad98-51a9d6d36b9b}/Vertices/1': ['{{u}}', 74322, 1374], 'GEOSCIENCE/Groups/{cde1051f-49d2-4269-95d4-ec888c5b322a}/Objects/{499a56ba-5845-43cd-ab30-c84ba7fbc3e9}/.zgroup': '{\n    "zarr_format": 2\n}', 'GEOSCIENCE/Groups/{cde1051f-49d2-4269-95d4-ec888c5b322a}/Objects/{499a56ba-5845-43cd-ab30-c84ba7fbc3e9}/.zattrs': '{\n    "Allow delete": 1,\n    "Allow move": 1,\n    "Allow rename": 1,\n    "ID": "{499a56ba-5845-43cd-ab30-c84ba7fbc3e9}",\n    "Last focus": "None",\n    "Name": "Surface",\n    "Public": 1,\n    "Visible": 1\n}', 'GEOSCIENCE/Groups/{cde1051f-49d2-4269-95d4-ec888c5b322a}/Objects/{499a56ba-5845-43cd-ab30-c84ba7fbc3e9}/Cells/.zarray': '{\n    "chunks": [\n        1201,\n        2\n    ],\n    "compressor": {\n        "id": "zlib",\n        "level": 9\n    },\n    "dtype": "<i4",\n    "fill_value": 0,\n    "filters": null,\n    "order": "C",\n    "shape": [\n        4802,\n        3\n    ],\n    "zarr_format": 2\n}', 'GEOSCIENCE/Groups/{cde1051f-49d2-4269-95d4-ec888c5b322a}/Objects/{499a56ba-5845-43cd-ab30-c84ba7fbc3e9}/Cells/.zattrs': '{\n    "_ARRAY_DIMENSIONS": [\n        "phony_dim_0",\n        "phony_dim_1"\n    ]\n}', 'GEOSCIENCE/Groups/{cde1051f-49d2-4269-95d4-ec888c5b322a}/Objects/{499a56ba-5845-43cd-ab30-c84ba7fbc3e9}/Cells/0.0': ['{{u}}', 42329, 2960], 'GEOSCIENCE/Groups/{cde1051f-49d2-4269-95d4-ec888c5b322a}/Objects/{499a56ba-5845-43cd-ab30-c84ba7fbc3e9}/Cells/0.1': ['{{u}}', 45289, 2002], 'GEOSCIENCE/Groups/{cde1051f-49d2-4269-95d4-ec888c5b322a}/Objects/{499a56ba-5845-43cd-ab30-c84ba7fbc3e9}/Cells/1.0': ['{{u}}', 47291, 3000], 'GEOSCIENCE/Groups/{cde1051f-49d2-4269-95d4-ec888c5b322a}/Objects/{499a56ba-5845-43cd-ab30-c84ba7fbc3e9}/Cells/1.1': ['{{u}}', 50291, 1954], 'GEOSCIENCE/Groups/{cde1051f-49d2-4269-95d4-ec888c5b322a}/Objects/{499a56ba-5845-43cd-ab30-c84ba7fbc3e9}/Cells/2.0': ['{{u}}', 52245, 2943], 'GEOSCIENCE/Groups/{cde1051f-49d2-4269-95d4-ec888c5b322a}/Objects/{499a56ba-5845-43cd-ab30-c84ba7fbc3e9}/Cells/2.1': ['{{u}}', 55188, 1940], 'GEOSCIENCE/Groups/{cde1051f-49d2-4269-95d4-ec888c5b322a}/Objects/{499a56ba-5845-43cd-ab30-c84ba7fbc3e9}/Cells/3.0': ['{{u}}', 57128, 2937], 'GEOSCIENCE/Groups/{cde1051f-49d2-4269-95d4-ec888c5b322a}/Objects/{499a56ba-5845-43cd-ab30-c84ba7fbc3e9}/Cells/3.1': ['{{u}}', 60065, 1949], 'GEOSCIENCE/Groups/{cde1051f-49d2-4269-95d4-ec888c5b322a}/Objects/{499a56ba-5845-43cd-ab30-c84ba7fbc3e9}/Data/.zgroup': '{\n    "zarr_format": 2\n}', 
'GEOSCIENCE/Groups/{cde1051f-49d2-4269-95d4-ec888c5b322a}/Objects/{499a56ba-5845-43cd-ab30-c84ba7fbc3e9}/PropertyGroups/.zgroup': '{\n    "zarr_format": 2\n}', 'GEOSCIENCE/Groups/{cde1051f-49d2-4269-95d4-ec888c5b322a}/Objects/{499a56ba-5845-43cd-ab30-c84ba7fbc3e9}/Type/.zgroup': '{\n    "zarr_format": 2\n}', 'GEOSCIENCE/Groups/{cde1051f-49d2-4269-95d4-ec888c5b322a}/Objects/{499a56ba-5845-43cd-ab30-c84ba7fbc3e9}/Type/.zattrs': '{\n    "Description": "Surface",\n    "ID": "{f26feba3-aded-494b-b9e9-b2bbcbe298e1}",\n    "Name": "Surface"\n}', 'GEOSCIENCE/Groups/{cde1051f-49d2-4269-95d4-ec888c5b322a}/Objects/{499a56ba-5845-43cd-ab30-c84ba7fbc3e9}/Vertices/.zarray': '{\n    "chunks": [\n        313\n    ],\n    "compressor": {\n        "id": "zlib",\n        "level": 9\n    },\n    "dtype": [\n        [\n            "x",\n            "<f8"\n        ],\n        [\n            "y",\n            "<f8"\n        ],\n        [\n            "z",\n            "<f8"\n        ]\n    ],\n    "fill_value": "AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA",\n    "filters": null,\n    "order": "C",\n    "shape": [\n        2500\n    ],\n    "zarr_format": 2\n}', 'GEOSCIENCE/Groups/{cde1051f-49d2-4269-95d4-ec888c5b322a}/Objects/{499a56ba-5845-43cd-ab30-c84ba7fbc3e9}/Vertices/.zattrs': '{\n    "_ARRAY_DIMENSIONS": [\n        "phony_dim_0"\n    ]\n}', 'GEOSCIENCE/Groups/{cde1051f-49d2-4269-95d4-ec888c5b322a}/Objects/{499a56ba-5845-43cd-ab30-c84ba7fbc3e9}/Vertices/0': ['{{u}}', 28576, 1360], 'GEOSCIENCE/Groups/{cde1051f-49d2-4269-95d4-ec888c5b322a}/Objects/{499a56ba-5845-43cd-ab30-c84ba7fbc3e9}/Vertices/1': ['{{u}}', 29936, 1392], 'GEOSCIENCE/Groups/{cde1051f-49d2-4269-95d4-ec888c5b322a}/Objects/{499a56ba-5845-43cd-ab30-c84ba7fbc3e9}/Vertices/2': ['{{u}}', 31328, 1396], 'GEOSCIENCE/Groups/{cde1051f-49d2-4269-95d4-ec888c5b322a}/Objects/{499a56ba-5845-43cd-ab30-c84ba7fbc3e9}/Vertices/3': ['{{u}}', 32724, 1362], 'GEOSCIENCE/Groups/{cde1051f-49d2-4269-95d4-ec888c5b322a}/Objects/{499a56ba-5845-43cd-ab30-c84ba7fbc3e9}/Vertices/4': ['{{u}}', 34086, 1321], 'GEOSCIENCE/Groups/{cde1051f-49d2-4269-95d4-ec888c5b322a}/Objects/{499a56ba-5845-43cd-ab30-c84ba7fbc3e9}/Vertices/5': ['{{u}}', 35407, 1376], 'GEOSCIENCE/Groups/{cde1051f-49d2-4269-95d4-ec888c5b322a}/Objects/{499a56ba-5845-43cd-ab30-c84ba7fbc3e9}/Vertices/6': ['{{u}}', 36783, 1358], 'GEOSCIENCE/Groups/{cde1051f-49d2-4269-95d4-ec888c5b322a}/Objects/{499a56ba-5845-43cd-ab30-c84ba7fbc3e9}/Vertices/7': ['{{u}}', 38141, 1300], 'GEOSCIENCE/Groups/{cde1051f-49d2-4269-95d4-ec888c5b322a}/Type/.zgroup': '{\n    "zarr_format": 2\n}', 'GEOSCIENCE/Groups/{cde1051f-49d2-4269-95d4-ec888c5b322a}/Type/.zattrs': '{\n    "Allow delete contents": 1,\n    "Allow move contents": 1,\n    "Description": "NoType Group",\n    "ID": "{dd99b610-be92-48c0-873c-5b5946ea2840}",\n    "Name": "NoType Group"\n}', 'GEOSCIENCE/Objects/.zgroup': '{\n    "zarr_format": 2\n}', 'GEOSCIENCE/Types/.zgroup': '{\n    "zarr_format": 2\n}', 'GEOSCIENCE/Types/Data types/.zgroup': '{\n    "zarr_format": 2\n}', 'GEOSCIENCE/Types/Group types/.zgroup': '{\n    "zarr_format": 2\n}', 'GEOSCIENCE/Types/Object types/.zgroup': '{\n    "zarr_format": 2\n}'}}]
RichardScottOZ commented 2 years ago

So: 650 gravity points [x, y, z, grav], a surface with 2,500 vertices [x, y, z], and 4,802 cells [triangles].

martindurant commented 2 years ago

I think fill_value=None should be valid for any compound data type.
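
A quick check against zarr (v2 API) with the compound dtype that rejected the blank fill, as a sketch:

import numpy as np
import zarr

# confirm fill_value=None passes metadata validation for the Vertices dtype
dt = np.dtype([('x', '<f8'), ('y', '<f8'), ('z', '<f8')])
arr = zarr.create(shape=(650,), chunks=(325,), dtype=dt, fill_value=None)
print(arr.fill_value)  # None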

martindurant commented 2 years ago

Is it possible for me to get my hands on such a file, to see what can be done?

RichardScottOZ commented 2 years ago

Yeah, absolutely, will package a pair for you.

RichardScottOZ commented 2 years ago

@martindurant

https://drive.google.com/file/d/1QEQILk1dF4mbMskhYVdb8uLLLEFOKcwC/view?usp=sharing

10075.zip

The small file is the example used above; the bigger "post" file is the same thing, but with data added (e.g. Inversion Groups and Block Models) as an example of more complicated structure.

Thanks!

RichardScottOZ commented 2 years ago

It looks like the hdf5 function is doing well, though.

RichardScottOZ commented 2 years ago

[screenshot]

That is probably slightly more interesting... Points, Vertices, Cells etc.

martindurant commented 2 years ago

https://github.com/fsspec/kerchunk/pull/140 produces output for both of those examples. Here is the zarr tree view for the bigger "post" one: txt.txt

and I can confirm that zarr can read from the output, both data and attributes. So... what to do with this now?

RichardScottOZ commented 2 years ago

Nice! So the not-so-expert me should give it a try.

And I have two different sets of approximately 100K of them.

martindurant commented 2 years ago

Will you be looking to combine them into a single logical zarr dataset? I'm not sure how you would go about doing that when you don't have a netCDF-like coordinate to concatenate along.
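
For contrast, the usual combine step for netCDF-like inputs looks something like the following, as a sketch assuming a shared 'time' coordinate, which these files don't have:

from kerchunk.combine import MultiZarrToZarr

# `singles` is the list of per-file reference dicts built earlier
mzz = MultiZarrToZarr(singles, concat_dims=['time'], remote_protocol='s3')
combined = mzz.translate()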

RichardScottOZ commented 2 years ago

e.g. if I wanted to be able to examine, say, the two block models here and check whether the data matched some criteria (a rough sketch of such a check follows the tree):

 │   └── {cde1051f-49d2-4269-95d4-ec888c5b322a}
 │       ├── Data
 │       ├── Groups
 │       ├── Objects
 │       │   ├── {35490e4f-bc36-41e5-ad98-51a9d6d36b9b}
 │       │   │   ├── Data
 │       │   │   ├── PropertyGroups
 │       │   │   └── Vertices (650,) [('x', '<f8'), ('y', '<f8'), ('z', '<f8')]
 │       │   ├── {499a56ba-5845-43cd-ab30-c84ba7fbc3e9}
 │       │   │   ├── Cells (4802, 3) int32
 │       │   │   ├── Data
 │       │   │   ├── PropertyGroups
 │       │   │   ├── Type
 │       │   │   └── Vertices (2500,) [('x', '<f8'), ('y', '<f8'), ('z', '<f8')]
 │       │   ├── {5ffef508-fa02-4afa-8e5d-dcc3e28d1e12}
 │       │   │   ├── Data
 │       │   │   ├── PropertyGroups
 │       │   │   ├── Type
 │       │   │   ├── U cell delimiters (26,) float64
 │       │   │   ├── V cell delimiters (26,) float64
 │       │   │   └── Z cell delimiters (25,) float64
 │       │   └── {bbae74f2-3146-4f69-b836-c6423641b73c}
 │       │       ├── Data
 │       │       ├── PropertyGroups
 │       │       ├── U cell delimiters (26,) float64
 │       │       ├── V cell delimiters (26,) float64
 │       │       └── Z cell delimiters (25,) float64
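
A rough sketch of the kind of walk that check could use, run against the reference-backed group z opened later in the thread; the threshold and the float-only filter are made up for illustration:

import numpy as np

# visit every array in the hierarchy and flag float data beyond a criterion
def check(name, obj):
    if hasattr(obj, 'dtype') and obj.dtype.kind == 'f':
        vals = obj[:]
        if vals.size and np.nanmax(vals) > 3.0:  # hypothetical criterion
            print(name, float(np.nanmax(vals)))

z.visititems(check)
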
RichardScottOZ commented 2 years ago

Combine? Yes, possibly - I had started thinking about it, but had not come up with a good idea.

RichardScottOZ commented 2 years ago

Each one is a subsurface piece, or at the block-model level, a block.

If they could theoretically be combined (they do overlap), that would also make a good virtual reality 3D trial, if they could all be one zarr for access purposes.

RichardScottOZ commented 2 years ago

So data interrogation and whole model visualisation are two things I had been thinking about.

RichardScottOZ commented 2 years ago

It is this sort of situation [screenshot], with a Z dimension that is always the same [as long as that particular block worked]. Some fail because there is not enough data, e.g. a block that is 79% in the water, and some have probably failed for hardware/software reasons that I don't know about yet. Logs would check that, but being able to check here would also be good as part of data analysis.

RichardScottOZ commented 2 years ago

So you get it to read with zarr.open?

The actual setup/syntax for that is not necessarily clear to the novice user [me].

So we could make that an example? Reading from generic HDF5 file sets?

RichardScottOZ commented 2 years ago

Also, thanks very much for having a look!

RichardScottOZ commented 2 years ago

And later I guess I can ask the geoh5 devs: do you want a "reading projects from the cloud" example?

martindurant commented 2 years ago

I did:

import kerchunk.hdf
import fsspec
import zarr

f = open("10075/10007post.geoh5", "rb")  # a plain local file in this case
h = kerchunk.hdf.SingleHdf5ToZarr(f, url="10075/10007post.geoh5")
out = h.translate()
m = fsspec.get_mapper("reference://", fo=out)
z = zarr.open(m)
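
A tree view like the txt.txt attachment above can then be printed with zarr v2's built-in viewer:

print(z.tree())  # ascii rendering of the translated group/array hierarchy
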
RichardScottOZ commented 2 years ago

Thanks! And similarly, e.g. if grabbing another one?

f = open("s3://bananasplits/10074post.geoh5", "rb")
h = kerchunk.hdf.SingleHdf5ToZarr(f, url="s3://bananasplits/10074post.geoh5")
out = h.translate()
m = fsspec.get_mapper("reference://", fo=out)
z = zarr.open(m)
martindurant commented 2 years ago

Exactly.

RichardScottOZ commented 2 years ago

or, with "s3use" being an s3fs/fsspec filesystem instance:

f = s3use.open("s3://bananasplits/10074post.geoh5", "rb")
h = kerchunk.hdf.SingleHdf5ToZarr(f, url="s3://bananasplits/10074post.geoh5")
out = h.translate()
m = fsspec.get_mapper("reference://", fo=out)
z = zarr.open(m)
RichardScottOZ commented 2 years ago

@martindurant after putting in your changes, when I try the same file I sent, I get:

[screenshot]

RichardScottOZ commented 2 years ago

Does that seem right, going by your results?

RichardScottOZ commented 2 years ago

And I imagine with many-piece datasets, if running jobs to create them, adding the "make JSON for 10075" step as part of that task would be a good plan in an efficiency/cost sense.
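
As a sketch of that batch step (the bucket and profile names are the placeholders from earlier, and writing each JSON next to its file is just one choice):

import json
import s3fs
import kerchunk.hdf

# one reference JSON per geoh5 file, stored alongside the data so later
# readers can skip scanning HDF5 metadata over the network
s3 = s3fs.S3FileSystem(profile='appropriateprofile')
for path in s3.glob('bananasplits/*.geoh5'):
    url = 's3://' + path
    with s3.open(url, 'rb') as f:
        refs = kerchunk.hdf.SingleHdf5ToZarr(f, url=url, inline_threshold=100).translate()
    with s3.open(url.replace('.geoh5', '.json'), 'w') as out:
        out.write(json.dumps(refs))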

RichardScottOZ commented 2 years ago

And the other thing to check is: does this have numbers that look like the numbers in the project, e.g. fills etc.?

RichardScottOZ commented 2 years ago

Ok, so unless I misunderstand something [e.g. I am accessing the wrong thing]:

check = z['GEOSCIENCE']['Data']['{bc4a9ee4-aea6-4043-b585-ad84f0dcb31b}'].attrs.asdict()
check

{'Allow delete': 1,
 'Allow move': 1,
 'Allow rename': 1,
 'Association': 'Cell',
 'ID': '{bc4a9ee4-aea6-4043-b585-ad84f0dcb31b}',
 'Name': 'block_density',
 'Public': 1,
 'Visible': 0}

datacheck = z['GEOSCIENCE']['Data']['{bc4a9ee4-aea6-4043-b585-ad84f0dcb31b}']['Data'].attrs.asdict()
datacheck

data = z['GEOSCIENCE']['Data']['{bc4a9ee4-aea6-4043-b585-ad84f0dcb31b}']['Data']
data.shape

import numpy as np
np.histogram(data)

(array([    0,     0,     0,     0,     0, 15000,     0,     0,     0,
            0], dtype=int64),
 array([-0.5, -0.4, -0.3, -0.2, -0.1,  0. ,  0.1,  0.2,  0.3,  0.4,  0.5]))

then

[screenshot]

I have a zero problem by the looks.

martindurant commented 2 years ago

I see values like

array([3.55678700e+00, 3.55430724e+00, 3.55430724e+00, ...,
       1.17549435e-38, 1.17549435e-38, 1.17549435e-38])

and histogram

(array([4247,   21,   41,  158,  837, 3301, 6194,  189,    7,    5]),
 array([1.17549435e-38, 5.68615894e-01, 1.13723179e+00, 1.70584768e+00,
        2.27446357e+00, 2.84307947e+00, 3.41169536e+00, 3.98031126e+00,
        4.54892715e+00, 5.11754304e+00, 5.68615894e+00]))

so probably right, except that 1.17549435e-38 should become NaN. Maybe you are looking at the wrong path now?
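
(1.17549435e-38 is np.finfo(np.float32).tiny, which appears to be the no-data sentinel here; converting it on read could look like this, reusing the data array from the snippet above:)

import numpy as np

# swap the float32-tiny sentinel for NaN after loading
sentinel = np.finfo(np.float32).tiny  # 1.1754944e-38
vals = data[:].astype('float64')
vals[vals == sentinel] = np.nan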

RichardScottOZ commented 2 years ago

Thanks Martin, that is good. Would you be so kind as to post your example there and I will check what I have done wrong?

RichardScottOZ commented 2 years ago

Yours look good.

martindurant commented 2 years ago

I think your issue was in

fsspec.get_mapper("reference://", fo=out)

you needed

fsspec.get_mapper("reference://", fo=out, remote_protocol="s3")

and any S3 filesystem options in a dict passed as remote_options.

RichardScottOZ commented 2 years ago

Thank you!

So something like this?

r_opts = {
    "key": creds['AWS_ACCESS_KEY_ID'],
    "secret": creds['AWS_SECRET_ACCESS_KEY'],
    "client_kwargs": {"region_name": "us-west-2"},
}

f = open("s3://bananasplits/10074post.geoh5", "rb")
h = kerchunk.hdf.SingleHdf5ToZarr(f, url="s3://bananasplits/10074post.geoh5")
out = h.translate()
m = fsspec.get_mapper("reference://", fo=out, remote_protocol="s3", remote_options=r_opts)
z = zarr.open(m)
martindurant commented 2 years ago

Yes, assuming open is acting on an s3fs instance with the same options.

RichardScottOZ commented 2 years ago

So, updating:

s3fs_use = s3fs.S3FileSystem(profile='bananasplitsprofile')  # same credentials/options as r_opts

r_opts = {
    "key": creds['AWS_ACCESS_KEY_ID'],
    "secret": creds['AWS_SECRET_ACCESS_KEY'],
    "client_kwargs": {"region_name": "us-west-2"},
}

f = s3fs_use.open("s3://bananasplits/10074post.geoh5", "rb")
h = kerchunk.hdf.SingleHdf5ToZarr(f, url="s3://bananasplits/10074post.geoh5")
out = h.translate()
m = fsspec.get_mapper("reference://", fo=out, remote_protocol="s3", remote_options=r_opts)
z = zarr.open(m)

RichardScottOZ commented 2 years ago

Thanks again, Martin. Now I just need to make 100K more.

RichardScottOZ commented 2 years ago

Ok, so trying just as a test:

from kerchunk.combine import MultiZarrToZarr

mzz = MultiZarrToZarr(
    singles,  # the per-file reference sets from above (already a list)
    concat_dims=None,
    # concat_dims=["X", "Y", "Z"],
    coo_map={
        "X": "data:GEOSCIENCE/Groups/{cde1051f-49d2-4269-95d4-ec888c5b322a}/Objects/{5ffef508-fa02-4afa-8e5d-dcc3e28d1e12}/U cell delimiters",
        "Y": "data:GEOSCIENCE/Groups/{cde1051f-49d2-4269-95d4-ec888c5b322a}/Objects/{5ffef508-fa02-4afa-8e5d-dcc3e28d1e12}/V cell delimiters",
        "Z": "data:GEOSCIENCE/Groups/{cde1051f-49d2-4269-95d4-ec888c5b322a}/Objects/{5ffef508-fa02-4afa-8e5d-dcc3e28d1e12}/Z cell delimiters",
    },
    remote_protocol="s3",
    remote_options=r_opts,
)

out = mzz.translate()

gives an error

Traceback (most recent call last):
  File "C:\Users\rscott\AppData\Local\Continuum\anaconda3\envs\stuartshelf\lib\site-packages\IPython\core\interactiveshell.py", line 3457, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-2-c34072a7a6d6>", line 1, in <module>
    runfile('C:/Users/rscott/OneDrive - OZ Minerals/Exploration2022/Block-Golem/src/Xarray-hdf5.py', wdir='C:/Users/rscott/OneDrive - OZ Minerals/Exploration2022/Block-Golem/src')
  File "C:\Program Files\JetBrains\PyCharm Community Edition 2021.1.1\plugins\python-ce\helpers\pydev\_pydev_bundle\pydev_umd.py", line 197, in runfile
    pydev_imports.execfile(filename, global_vars, local_vars)  # execute the script
  File "C:\Program Files\JetBrains\PyCharm Community Edition 2021.1.1\plugins\python-ce\helpers\pydev\_pydev_imps\_pydev_execfile.py", line 18, in execfile
    exec(compile(contents+"\n", file, 'exec'), glob, loc)
  File "C:/Users/rscott/OneDrive - OZ Minerals/Exploration2022/Block-Golem/src/Xarray-hdf5.py", line 70, in <module>
    out = mzz.translate()
  File "C:\Users\rscott\AppData\Local\Continuum\anaconda3\envs\stuartshelf\lib\site-packages\kerchunk\combine.py", line 420, in translate
    self.second_pass()
  File "C:\Users\rscott\AppData\Local\Continuum\anaconda3\envs\stuartshelf\lib\site-packages\kerchunk\combine.py", line 305, in second_pass
    deps = [z[_].attrs["_ARRAY_DIMENSIONS"] for _ in z]
  File "C:\Users\rscott\AppData\Local\Continuum\anaconda3\envs\stuartshelf\lib\site-packages\kerchunk\combine.py", line 305, in <listcomp>
    deps = [z[_].attrs["_ARRAY_DIMENSIONS"] for _ in z]
  File "C:\Users\rscott\AppData\Local\Continuum\anaconda3\envs\stuartshelf\lib\site-packages\zarr\attrs.py", line 63, in __getitem__
    return self.asdict()[item]
KeyError: '_ARRAY_DIMENSIONS'

which presumably means it is looking for the attrs of the base, which in this case is just the GEOSCIENCE group. That does have some attrs, e.g.

{'Contributors': 'UserName', 'Distance unit': 'meter', 'GA Version': '1', 'Version': 1.0}

but they won't be arrays - so here's where you would need a custom version to handle this particular HDF5 case?
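
One possible route, as an untested sketch: patch each reference set before combining via MultiZarrToZarr's preprocess hook, giving every array a phony _ARRAY_DIMENSIONS attribute so the second pass has something to read. The nested geoh5 group layout would likely still need more handling than this:

import json

# ensure every array in a reference set carries _ARRAY_DIMENSIONS, mirroring
# the phony dimension names kerchunk.hdf generates for plain datasets
def add_phony_dims(refs):
    for key in list(refs):
        if key.endswith('/.zarray'):
            attrs_key = key[:-len('.zarray')] + '.zattrs'
            attrs = json.loads(refs.get(attrs_key, '{}'))
            if '_ARRAY_DIMENSIONS' not in attrs:
                ndim = len(json.loads(refs[key])['shape'])
                attrs['_ARRAY_DIMENSIONS'] = ['phony_dim_%d' % i for i in range(ndim)]
                refs[attrs_key] = json.dumps(attrs)
    return refs

# then: MultiZarrToZarr(singles, ..., preprocess=add_phony_dims)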