pangeo-data / rechunker

Disk-to-disk chunk transformation for chunked arrays.
https://rechunker.readthedocs.io/
MIT License
163 stars 25 forks source link

Rechunking Dask Array with multiple variables #97

Closed roxyboy closed 3 years ago

roxyboy commented 3 years ago

I have the following xarray/zarr array that I'm trying to rechunk:

Screen Shot 2021-08-28 at 4 10 54 PM

I've tried the following but I'm getting a ContainsArrayError:

target_chunks = {'dzetaF': {'time':tchunk, 'YC':ychunk, 'XC':xchunk, 'delta_outer':schunk},
                 'dzetaFx': {'time':tchunk, 'YC':ychunk, 'XG':xchunk, 'deltax_outer':schunk},
                 'dzetaFy': {'time':tchunk, 'YG':ychunk, 'XC':xchunk, 'deltay_outer':schunk},
                 'zdz': {'time':tchunk, 'YC':ychunk, 'XC':xchunk, 'delta_outer':schunk},
                 'zxdz': {'time':tchunk, 'YC':ychunk, 'XG':xchunk, 'deltax_outer':schunk},
                 'zydz': {'time':tchunk, 'YG':ychunk, 'XC':xchunk, 'deltay_outer':schunk},
                 'time':None, 'delta_outer':None,
                 'deltay_outer':None, 'deltax_outer':None,
                 'YC':None, 'YG':None, 'XC':None, 'XG':None
                }
# target_chunks = (tchunk, ychunk, xchunk, schunk)

max_mem = '1GB'

target_store = tdgs+'5Dave/memb00/Zs_rechunked.zarr'
temp_store = tdgs+'5Dave/memb00/Zs_rechunked-tmp.zarr'

array_plan = rechunk(ds2_rechunked, target_chunks, max_mem, target_store, temp_store=temp_store)
array_plan
---------------------------------------------------------------------------
ContainsArrayError                        Traceback (most recent call last)
/tmp/ipykernel_29620/83261745.py in <module>
     16 temp_store = tdgs+'5Dave/memb00/Zs_rechunked-tmp.zarr'
     17 
---> 18 array_plan = rechunk(ds2_rechunked, target_chunks, max_mem, target_store, temp_store=temp_store)
     19 array_plan

~/miniconda3/envs/dimage/lib/python3.8/site-packages/rechunker/api.py in rechunk(source, target_chunks, max_mem, target_store, target_options, temp_store, temp_options, executor)
    299         executor = _get_executor(executor)
    300 
--> 301     copy_spec, intermediate, target = _setup_rechunk(
    302         source=source,
    303         target_chunks=target_chunks,

~/miniconda3/envs/dimage/lib/python3.8/site-packages/rechunker/api.py in _setup_rechunk(source, target_chunks, max_mem, target_store, target_options, temp_store, temp_options)
    380             variable_attrs[DIMENSION_KEY] = encode_zarr_attr_value(variable.dims)
    381 
--> 382             copy_spec = _setup_array_rechunk(
    383                 dask.array.asarray(variable),
    384                 variable_chunks,

~/miniconda3/envs/dimage/lib/python3.8/site-packages/rechunker/api.py in _setup_array_rechunk(source_array, target_chunks, max_mem, target_store_or_group, target_options, temp_store_or_group, temp_options, name)
    500     write_chunks = tuple(int(x) for x in write_chunks)
    501 
--> 502     target_array = _zarr_empty(
    503         shape,
    504         target_store_or_group,

~/miniconda3/envs/dimage/lib/python3.8/site-packages/rechunker/api.py in _zarr_empty(shape, store_or_group, chunks, dtype, name, **kwargs)
    150     if name is not None:
    151         assert isinstance(store_or_group, zarr.hierarchy.Group)
--> 152         return store_or_group.empty(
    153             name, shape=shape, chunks=chunks, dtype=dtype, **kwargs
    154         )

~/miniconda3/envs/dimage/lib/python3.8/site-packages/zarr/hierarchy.py in empty(self, name, **kwargs)
    899         """Create an array. Keyword arguments as per
    900         :func:`zarr.creation.empty`."""
--> 901         return self._write_op(self._empty_nosync, name, **kwargs)
    902 
    903     def _empty_nosync(self, name, **kwargs):

~/miniconda3/envs/dimage/lib/python3.8/site-packages/zarr/hierarchy.py in _write_op(self, f, *args, **kwargs)
    659 
    660         with lock:
--> 661             return f(*args, **kwargs)
    662 
    663     def create_group(self, name, overwrite=False):

~/miniconda3/envs/dimage/lib/python3.8/site-packages/zarr/hierarchy.py in _empty_nosync(self, name, **kwargs)
    905         kwargs.setdefault('synchronizer', self._synchronizer)
    906         kwargs.setdefault('cache_attrs', self.attrs.cache)
--> 907         return empty(store=self._store, path=path, chunk_store=self._chunk_store,
    908                      **kwargs)
    909 

~/miniconda3/envs/dimage/lib/python3.8/site-packages/zarr/creation.py in empty(shape, **kwargs)
    225 
    226     """
--> 227     return create(shape=shape, fill_value=None, **kwargs)
    228 
    229 

~/miniconda3/envs/dimage/lib/python3.8/site-packages/zarr/creation.py in create(shape, chunks, dtype, compressor, fill_value, order, store, synchronizer, overwrite, path, chunk_store, filters, cache_metadata, cache_attrs, read_only, object_codec, **kwargs)
    119 
    120     # initialize array metadata
--> 121     init_array(store, shape=shape, chunks=chunks, dtype=dtype, compressor=compressor,
    122                fill_value=fill_value, order=order, overwrite=overwrite, path=path,
    123                chunk_store=chunk_store, filters=filters, object_codec=object_codec)

~/miniconda3/envs/dimage/lib/python3.8/site-packages/zarr/storage.py in init_array(store, shape, chunks, dtype, compressor, fill_value, order, overwrite, path, chunk_store, filters, object_codec)
    346     _require_parent_group(path, store=store, chunk_store=chunk_store, overwrite=overwrite)
    347 
--> 348     _init_array_metadata(store, shape=shape, chunks=chunks, dtype=dtype,
    349                          compressor=compressor, fill_value=fill_value,
    350                          order=order, overwrite=overwrite, path=path,

~/miniconda3/envs/dimage/lib/python3.8/site-packages/zarr/storage.py in _init_array_metadata(store, shape, chunks, dtype, compressor, fill_value, order, overwrite, path, chunk_store, filters, object_codec)
    375             rmdir(chunk_store, path)
    376     elif contains_array(store, path):
--> 377         raise ContainsArrayError(path)
    378     elif contains_group(store, path):
    379         raise ContainsGroupError(path)

ContainsArrayError: path 'XC' contains an array

Could someone advise what I'm doing wrong here? The examples in the documentation seem to have only one variable per dataset; is the error stemming from my dataset having multiple variables?

roxyboy commented 3 years ago

I managed to get this to work, so I'm going to close this, but the information I found in the xarray issues may help other users.

The ContainsArrayError came from my not deleting the pre-existing temp_store and target_store, but I was then getting a strange error: ValueError: Final chunk of Zarr array must be the same size or smaller than the first.

Running

del ds2_rechunked['dzetaF'].encoding['chunks']
del ds2_rechunked['dzetaFx'].encoding['chunks']
del ds2_rechunked['dzetaFy'].encoding['chunks']
del ds2_rechunked['zdz'].encoding['chunks']
del ds2_rechunked['zxdz'].encoding['chunks']
del ds2_rechunked['zydz'].encoding['chunks']

did the trick for me.