zarr-developers / zarr-python

An implementation of chunked, compressed, N-dimensional arrays for Python.
https://zarr.readthedocs.io
MIT License
1.54k stars 287 forks source link

[v3] updating attributes breaks zarr.json metadata (e.g. dim sep) #2111

Open joshmoore opened 3 months ago

joshmoore commented 3 months ago

Zarr version

3.0.0a1 and v3 branch currently

Numcodecs version

n/a

Python Version

3.10 and 3.12

Operating System

Mac

Installation

pip install in mamba env.

Description

Behavior:

Expected:

Steps to reproduce

see: https://github.com/ome/ome2024-ngff-challenge/blob/d25e1246b900b594c32c8214a0cb6390960fabbb/src/ome2024_ngff_challenge/resave.py#L16

    ## TODO: This is not working with v3 branch nor with released version
    ## zr_array = zarr.open_array(store=output_config.zr_store, mode="a", zarr_format=3)
    ## zr_array.update_attributes({
    ##     "_ome2024_ngff_challenge_stats": stats,
    ## })

Additional output

No response

jhamman commented 3 months ago

@joshmoore - if we can boil this down to something more bite-sized, I'd be happy to take a look.

joshmoore commented 3 months ago

Here's a reproducer with 3.0.0a0:

issue2111.py ``` #!/usr/bin/env python from __future__ import annotations import argparse import json import logging import math import multiprocessing import os import random import time import warnings from pathlib import Path import tensorstore as ts import tqdm import argparse import itertools import json import logging import shutil import time from importlib.metadata import version as lib_version from pathlib import Path import numpy as np import tensorstore as ts import zarr from zarr.api.synchronous import sync try: from zarr.core.buffer import Buffer, BufferPrototype except: from zarr.buffer import Buffer, BufferPrototype if __name__ == "__main__": my_array = "example.zarr" # Used later for opening to update the attributes store_class = zarr.store.LocalStore zr_store = store_class(my_array, mode="w") # Tensorstore configuration ts_store = { "driver": "file", "path": my_array, } ts_config = { "driver": "zarr3", "kvstore": ts_store, } dimension_names = ("y", "x") dtype = "int8" shape = (256, 256) chunks = (64, 64) shards = None # shards = (256, 256) if shards: chunk_grid = { "name": "regular", "configuration": {"chunk_shape": shards}, } # write size sharding_codec = { "name": "sharding_indexed", "configuration": { "chunk_shape": chunks, # read size "codecs": [ {"name": "bytes", "configuration": {"endian": "little"}}, {"name": "blosc", "configuration": {"cname": "zstd", "clevel": 5}}, ], "index_codecs": [ {"name": "bytes", "configuration": {"endian": "little"}}, {"name": "crc32c"}, ], "index_location": "end", }, } codecs = [sharding_codec] else: # Alternative without sharding... 
chunk_grid = {"name": "regular", "configuration": {"chunk_shape": chunks}} codecs = [ {"name": "bytes", "configuration": {"endian": "little"}}, {"name": "blosc", "configuration": {"cname": "zstd", "clevel": 5}}, ] ts_config["metadata"] = { "shape": shape, "chunk_grid": chunk_grid, "chunk_key_encoding": { "name": "default" }, # "configuration": {"separator": "/"}}, "codecs": codecs, "data_type": dtype, "dimension_names": dimension_names, "attributes": {"set-by": "tensorstore"} } # Create an array with tensorstore write_config = ts_config.copy() write_config["create"] = True write_config["delete_existing"] = True write = ts.open(write_config).result() with ts.Transaction() as txn: write_config[(0,0)] = 1 if False: ## This is the workaround that I put in place. stats = {"updated-by": "tensorstore"} metadata = write.kvstore["zarr.json"] metadata = json.loads(metadata) if "attributes" in metadata: attributes = metadata["attributes"] else: attributes = {} metadata["attributes"] = attributes attributes["_ome2024_ngff_challenge_stats"] = stats metadata = json.dumps(metadata) write.kvstore["zarr.json"] = metadata else: # Attempt to update the metadata with zarr-python (#2111) stats = {"updated-by": "zarr-python"} zr_array = zarr.open_array(store=zr_store, mode="a", zarr_format=3) zr_array.update_attributes({ "_ome2024_ngff_challenge_stats": stats, }) # Now try to re-open with tensorstore verify_config = ts_config.copy() verify = ts.open(verify_config).result() ```

The first error you will run into is likely https://github.com/zarr-developers/zarr-python/issues/2025

"Named configuration does not have a 'configuration' key" ``` Traceback (most recent call last): File "/private/tmp/issue2111.py", line 129, in zr_array = zarr.open_array(store=zr_store, mode="a", zarr_format=3) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ File "/opt/homebrew/Caskroom/mambaforge/base/envs/challenge4/lib/python3.12/site-packages/zarr/api/synchronous.py", line 231, in open_array return Array(sync(async_api.open_array(*args, **kwargs))) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ File "/opt/homebrew/Caskroom/mambaforge/base/envs/challenge4/lib/python3.12/site-packages/zarr/sync.py", line 92, in sync raise return_result File "/opt/homebrew/Caskroom/mambaforge/base/envs/challenge4/lib/python3.12/site-packages/zarr/sync.py", line 51, in _runner return await coro ^^^^^^^^^^ File "/opt/homebrew/Caskroom/mambaforge/base/envs/challenge4/lib/python3.12/site-packages/zarr/api/asynchronous.py", line 860, in open_array return await AsyncArray.open(store_path, zarr_format=zarr_format) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ File "/opt/homebrew/Caskroom/mambaforge/base/envs/challenge4/lib/python3.12/site-packages/zarr/array.py", line 358, in open metadata=ArrayV3Metadata.from_dict(json.loads(zarr_json_bytes.to_bytes())), ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ File "/opt/homebrew/Caskroom/mambaforge/base/envs/challenge4/lib/python3.12/site-packages/zarr/metadata.py", line 289, in from_dict return cls(**data) # type: ignore[arg-type] ^^^^^^^^^^^ File "/opt/homebrew/Caskroom/mambaforge/base/envs/challenge4/lib/python3.12/site-packages/zarr/metadata.py", line 193, in __init__ chunk_key_encoding_parsed = ChunkKeyEncoding.from_dict(chunk_key_encoding) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ File "/opt/homebrew/Caskroom/mambaforge/base/envs/challenge4/lib/python3.12/site-packages/zarr/chunk_key_encodings.py", line 41, in from_dict name_parsed, configuration_parsed = parse_named_configuration(data) 
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ File "/opt/homebrew/Caskroom/mambaforge/base/envs/challenge4/lib/python3.12/site-packages/zarr/common.py", line 132, in parse_named_configuration raise ValueError(f"Named configuration does not have a 'configuration' key. Got {data}.") ValueError: Named configuration does not have a 'configuration' key. Got {'name': 'default'}. ```

If you work around that (e.g., with my snippet there), you'll see the following error from the tensorstore verification step:

"Error opening "zarr3" driver:" ``` Traceback (most recent call last): File "/private/tmp/issue2111.py", line 136, in verify = ts.open(verify_config).result() ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ ValueError: FAILED_PRECONDITION: Error opening "zarr3" driver: Expected "chunk_key_encoding" of {"name":"default"} but received: {"configuration":{"separator":"."},"name":"default"} [tensorstore_spec='{\"context\":{\"cache_pool\":{},\"data_copy_concurrency\":{},\"file_io_concurrency\":{},\"file_io_sync\":true},\"driver\":\"zarr3\",\"dtype\":\"int8\",\"kvstore\":{\"driver\":\"file\",\"path\":\"example.zarr/\"},\"metadata\":{\"attributes\":{\"set-by\":\"tensorstore\"},\"chunk_grid\":{\"configuration\":{\"chunk_shape\":[64,64]},\"name\":\"regular\"},\"chunk_key_encoding\":{\"name\":\"default\"},\"codecs\":[{\"configuration\":{\"endian\":\"little\"},\"name\":\"bytes\"},{\"configuration\":{\"clevel\":5,\"cname\":\"zstd\"},\"name\":\"blosc\"}],\"data_type\":\"int8\",\"dimension_names\":[\"y\",\"x\"],\"node_type\":\"array\",\"shape\":[256,256]},\"transform\":{\"input_exclusive_max\":[[256],[256]],\"input_inclusive_min\":[0,0],\"input_labels\":[\"y\",\"x\"]}}'] [source locations='tensorstore/driver/zarr3/driver.cc:593\ntensorstore/driver/kvs_backed_chunk_driver.cc:1288\ntensorstore/driver/driver.cc:112'] ```

This comes from the fact that re-opening the array with zarr-python has changed the configuration in zarr.json (including the dimension separator):

zr.json ``` { "shape": [ 256, 256 ], "fill_value": 0, "chunk_grid": { "name": "regular", "configuration": { "chunk_shape": [ 64, 64 ] } }, "attributes": { "_ome2024_ngff_challenge_stats": { "updated-by": "zarr-python" } }, "zarr_format": 3, "data_type": "int8", "chunk_key_encoding": { "name": "default", "configuration": { "separator": "." } }, "codecs": [ { "name": "bytes", "configuration": { "endian": "little" } }, { "name": "blosc", "configuration": { "typesize": 1, "cname": "zstd", "clevel": 5, "shuffle": "bitshuffle", "blocksize": 0 } } ], "dimension_names": [ "y", "x" ], "node_type": "array" } ```

compared to the metadata written by tensorstore (if you change the `if False` to `if True`):

ts.json ``` { "attributes": { "set-by": "tensorstore", "_ome2024_ngff_challenge_stats": { "updated-by": "tensorstore" } }, "chunk_grid": { "configuration": { "chunk_shape": [ 64, 64 ] }, "name": "regular" }, "chunk_key_encoding": { "name": "default" }, "codecs": [ { "name": "bytes" }, { "configuration": { "blocksize": 0, "clevel": 5, "cname": "zstd", "shuffle": "bitshuffle", "typesize": 1 }, "name": "blosc" } ], "data_type": "int8", "dimension_names": [ "y", "x" ], "fill_value": 0, "node_type": "array", "shape": [ 256, 256 ], "zarr_format": 3 } ```

With 3.0.0a1 I get `TypeError: create() missing 1 required positional argument: 'shape'`, and I'll need to figure out how to migrate this code.