zarr-developers / zarr-python

An implementation of chunked, compressed, N-dimensional arrays for Python.
https://zarr.readthedocs.io
MIT License
1.51k stars 278 forks source link

v3: array written by tensorstore returns all nulls #2029

Open joshmoore opened 3 months ago

joshmoore commented 3 months ago

Arrays written by tensorstore are being returned by open_array, open, AsyncArray.open, etc. as having all fill values.

$ ./ts_info.py output.zarr/0
min=3 max=4095

$ ./zr_info.py output.zarr/0/
min=0 max=0
zarr.json
```
{
  "chunk_grid": {
    "configuration": {
      "chunk_shape": [1, 1, 275, 271]
    },
    "name": "regular"
  },
  "chunk_key_encoding": {
    "name": "default"
  },
  "codecs": [
    {
      "configuration": {
        "endian": "little"
      },
      "name": "bytes"
    },
    {
      "configuration": {
        "blocksize": 0,
        "clevel": 5,
        "cname": "lz4",
        "shuffle": "shuffle",
        "typesize": 2
      },
      "name": "blosc"
    }
  ],
  "data_type": "uint16",
  "dimension_names": ["c", "z", "y", "x"],
  "fill_value": 0,
  "node_type": "array",
  "shape": [2, 236, 275, 271],
  "zarr_format": 3
}
```
d-v-b commented 3 months ago

that's so bad that I'm hopeful that there's a simple fix!

joshmoore commented 3 months ago

Let me know if you would be interested in the creation code and/or the dataset itself.

d-v-b commented 3 months ago

that would be great!

joshmoore commented 3 months ago

Workflow:

ts_info.py
```
#!/usr/bin/env python
# Opens an array through tensorstore and prints the minimum and maximum of
# its full contents.
# NOTE(review): line breaks and indentation were reconstructed from a
# whitespace-collapsed paste; block nesting is inferred from syntax -- confirm
# against the original script.
import random
import numpy as np
import zarr
import sys
import os
import tensorstore as ts
import argparse

parser = argparse.ArgumentParser()
parser.add_argument("--input-bucket")
parser.add_argument("--input-endpoint")
parser.add_argument("--input-anon", action="store_true")
parser.add_argument("--input-region", default="us-east-1")
parser.add_argument("--input-driver", default="zarr3")
parser.add_argument("input_path")
ns = parser.parse_args()


def create_configs(ns):
    """Build one store dict per selection prefix from the parsed arguments.

    Only the "input" prefix is iterated. When a bucket is given the dict uses
    the 's3' driver (optionally with anonymous credentials and an endpoint);
    otherwise it uses the 'file' driver.

    Returns the list of store dicts.
    """
    configs = []
    for selection in ("input",):
        # Look up the "<selection>_*" attributes produced by argparse.
        anon = getattr(ns, f"{selection}_anon")
        bucket = getattr(ns, f"{selection}_bucket")
        endpoint = getattr(ns, f"{selection}_endpoint")
        region = getattr(ns, f"{selection}_region")
        if bucket:
            store = {
                'driver': 's3',
                'bucket': bucket,
                'aws_region': region,
            }
            # NOTE(review): these two checks are assumed to be nested under
            # `if bucket:` -- the collapsed source does not show indentation.
            if anon:
                store['aws_credentials'] = { 'anonymous': anon }
            if endpoint:
                store["endpoint"] = endpoint
        else:
            store = {
                'driver': 'file',
            }
        configs.append(store)
    return configs


CONFIGS = create_configs(ns)


def info(input_path: str):
    """Open ``input_path`` with tensorstore and return the opened handle.

    Mutates the module-level ``CONFIGS[0]`` by setting its "path" key, then
    passes it as the 'kvstore' of the spec, with the driver taken from
    ``ns.input_driver``.
    """
    CONFIGS[0]["path"] = input_path
    read = ts.open({
        'driver': ns.input_driver,
        'kvstore': CONFIGS[0],
    }).result()
    # shape and chunks are computed here but not used afterwards.
    shape = read.shape
    chunks = read.schema.chunk_layout.read_chunk.shape
    return read


read = info(ns.input_path)
# Read the whole array and report its value range.
arr = read[:].read().result()
print(np.min(arr), np.max(arr))
```
zr_info.py
```
#!/usr/bin/env python
# Opens an array through zarr-python and prints the minimum and maximum of
# the result.
# NOTE(review): line breaks and indentation were reconstructed from a
# whitespace-collapsed paste; block nesting and the extent of each comment
# are inferred from syntax -- confirm against the original script.
import random
import numpy as np
import zarr
import sys
import os
import logging
logging.basicConfig(level=0)
import argparse

parser = argparse.ArgumentParser()
parser.add_argument("--input-bucket")
parser.add_argument("--input-endpoint")
parser.add_argument("--input-anon", action="store_true")
parser.add_argument("--input-region", default="us-east-1")
parser.add_argument("input_path")
ns = parser.parse_args()


def create_configs(ns):
    """Build one store dict per selection prefix from the parsed arguments.

    Only the "input" prefix is iterated. When a bucket is given the dict
    describes an S3 location (optionally with anonymous credentials and an
    endpoint); otherwise it only carries the 'file' driver.

    Returns the list of store dicts.
    """
    configs = []
    for selection in ("input",):
        # Look up the "<selection>_*" attributes produced by argparse.
        anon = getattr(ns, f"{selection}_anon")
        bucket = getattr(ns, f"{selection}_bucket")
        endpoint = getattr(ns, f"{selection}_endpoint")
        region = getattr(ns, f"{selection}_region")
        if bucket:
            store = {
                'driver': 's3',
                'bucket': bucket,
                'aws_region': region,
            }
            # NOTE(review): these two checks are assumed to be nested under
            # `if bucket:` -- the collapsed source does not show indentation.
            if anon:
                store['aws_credentials'] = { 'anonymous': anon }
            if endpoint:
                store["endpoint"] = endpoint
        else:
            store = {
                'driver': 'file',
            }
        configs.append(store)
    return configs


CONFIGS = create_configs(ns)

# Turn each config dict into a zarr store object: a RemoteStore when the
# config names a bucket, a LocalStore otherwise.
STORES = []
for config, path, mode in (
    (CONFIGS[0], ns.input_path, "r"),
):
    if "bucket" in config:
        store_class = zarr.store.RemoteStore
        anon = config.get("aws_credentials", {}).get("anonymous", False)
        store = store_class(
            url=f's3://{config["bucket"]}/{path}',
            anon=anon,
            endpoint_url=config.get("endpoint", None),
            mode=mode,
        )
    else:
        store_class = zarr.store.LocalStore
        store = store_class(path, mode=mode)
    STORES.append(store)


async def info(input_path: str):
    """Open ``STORES[0]`` with ``zarr.open`` and return the result.

    ``input_path`` is not referenced in the body; the store was already built
    from it at module level.
    """
    # from zarr.api.synchronous import open
    # return open(store=STORES[0], zarr_version=3)
    # from zarr.api.synchronous import open_array
    # return open_array(store=STORES[0], zarr_version=3)
    import zarr
    return zarr.open(store=STORES[0], zarr_version=3)
    # Everything below is unreachable: it follows an unconditional return and
    # both branch conditions are the literal False.
    if False:
        from zarr.array import Array, AsyncArray
        from zarr.buffer import default_buffer_prototype, NDBuffer
        arr = await AsyncArray.open(store=STORES[0])
        whole = [ slice(0, x) for x in arr.shape ]
        out = NDBuffer.from_numpy_array(np.empty(arr.shape))
        # return await arr._get_selection([], out=out, prototype=default_buffer_prototype)
        return await arr.getitem(slice(None))  # 0, 0
    elif False:
        from zarr.api.asynchronous import open
        arr = await open(store=STORES[0], mode="r")
        return await arr.get_basic_selection((..., ..., ..., ..., ...))


import asyncio
# Drive the coroutine to completion on the event loop, then report the value
# range of whatever it returned.
loop = asyncio.get_event_loop()
arr = loop.run_until_complete(info(ns.input_path))
loop.close()
print(np.min(arr), np.max(arr))
```