pydata / xarray

N-D labeled arrays and datasets in Python
https://xarray.dev
Apache License 2.0
3.55k stars 1.07k forks source link

mfdataset fails at chunking after opening #896

Closed apatlpo closed 5 years ago

apatlpo commented 8 years ago

Hi all,

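We are trying to specify chunks after opening a dataset with `open_mfdataset`, but it does not work: computing values fails with a `MemoryError` (see the traceback below). The same approach works fine with a dataset opened with `open_dataset`. Is this behavior expected? Are we doing anything wrong?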

# - Modules
#
import sys, os
import xarray as xr

chunks = (1727, 2711)
xr_chunks = {'x': chunks[-1], 'y': chunks[-2], 'time_counter':1, 'deptht': 1}

# - Parameter
natl60_path = '/home7/pharos/othr/NATL60/'
filename = natl60_path+'NATL60-MJM155-S/5d/2008/NATL60-MJM155_y2008m01d09.5d_gridT.nc'
filenames = natl60_path+'NATL60-MJM155-S/5d/2008/NATL60-MJM155_y2008m01d0*gridT.nc'

### dataset
# open
ds = xr.open_dataset(filename,chunks=None)
# chunk
ds = ds.chunk(xr_chunks)
# plot
print 'With dataset:'
print ds['votemper'].isel(time_counter=0,deptht=0).values

### mfdataset
# open
ds = xr.open_mfdataset(filenames,chunks=None, lock=False)
# plot
print 'With mfdataset no chunks:'
print ds['votemper'].isel(time_counter=0,deptht=0).values
# chunk
print 'With mfdataset with chunks:'
ds = ds.chunk(xr_chunks)
print ds['votemper'].isel(time_counter=0,deptht=0)
print ds['votemper'].isel(time_counter=0,deptht=0).values

The output is:

With dataset:
[[ nan  nan  nan ...,  nan  nan  nan]
 [ nan  nan  nan ...,  nan  nan  nan]
 [ nan  nan  nan ...,  nan  nan  nan]
 ..., 
 [ nan  nan  nan ...,  nan  nan  nan]
 [ nan  nan  nan ...,  nan  nan  nan]
 [ nan  nan  nan ...,  nan  nan  nan]]
With mfdataset no chunks:
[[ nan  nan  nan ...,  nan  nan  nan]
 [ nan  nan  nan ...,  nan  nan  nan]
 [ nan  nan  nan ...,  nan  nan  nan]
 ..., 
 [ nan  nan  nan ...,  nan  nan  nan]
 [ nan  nan  nan ...,  nan  nan  nan]
 [ nan  nan  nan ...,  nan  nan  nan]]
With mfdataset with chunks:
<xarray.DataArray 'votemper' (y: 3454, x: 5422)>
dask.array<getitem..., shape=(3454, 5422), dtype=float64, chunksize=(1727, 2711)>
Coordinates:
    nav_lat        (y, x) float32 26.5648 26.5648 26.5648 26.5648 26.5648 ...
    nav_lon        (y, x) float32 -81.4512 -81.4346 -81.4179 -81.4012 ...
    deptht         float32 0.480455
  * x              (x) int64 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 ...
  * y              (y) int64 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 ...
    time_counter   datetime64[ns] 2008-01-02T12:00:00
    time_centered  datetime64[ns] 2008-01-02T12:00:00
Attributes:
    long_name: temperature
    units: degC
    online_operation: average
    interval_operation: 40s
    interval_write: 5d

The code hangs for a while and then spits out the following traceback:

Traceback (most recent call last):
  File "/home/slyne/aponte/natl60/python/natl60_dimup/overview/aurelien/plot_snapshot_2d_v4_break.py", line 44, in <module>
    print ds['votemper'].isel(time_counter=0,deptht=0).values
  File "/home1/homedir5/perso/aponte/miniconda2/envs/natl60/lib/python2.7/site-packages/xarray/core/dataarray.py", line 364, in values
    return self.variable.values
  File "/home1/homedir5/perso/aponte/miniconda2/envs/natl60/lib/python2.7/site-packages/xarray/core/variable.py", line 288, in values
    return _as_array_or_item(self._data_cached())
  File "/home1/homedir5/perso/aponte/miniconda2/envs/natl60/lib/python2.7/site-packages/xarray/core/variable.py", line 254, in _data_cached
    self._data = np.asarray(self._data)
  File "/home1/homedir5/perso/aponte/miniconda2/envs/natl60/lib/python2.7/site-packages/numpy/core/numeric.py", line 460, in asarray
    return array(a, dtype, copy=False, order=order)
  File "/home1/homedir5/perso/aponte/miniconda2/envs/natl60/lib/python2.7/site-packages/dask/array/core.py", line 867, in __array__
    x = self.compute()
  File "/home1/homedir5/perso/aponte/miniconda2/envs/natl60/lib/python2.7/site-packages/dask/base.py", line 37, in compute
    return compute(self, **kwargs)[0]
  File "/home1/homedir5/perso/aponte/miniconda2/envs/natl60/lib/python2.7/site-packages/dask/base.py", line 110, in compute
    results = get(dsk, keys, **kwargs)
  File "/home1/homedir5/perso/aponte/miniconda2/envs/natl60/lib/python2.7/site-packages/dask/threaded.py", line 57, in get
    **kwargs)
  File "/home1/homedir5/perso/aponte/miniconda2/envs/natl60/lib/python2.7/site-packages/dask/async.py", line 481, in get_async
    raise(remote_exception(res, tb))
dask.async.MemoryError: 

Traceback
---------
  File "/home1/homedir5/perso/aponte/miniconda2/envs/natl60/lib/python2.7/site-packages/dask/async.py", line 264, in execute_task
    result = _execute_task(task, data)
  File "/home1/homedir5/perso/aponte/miniconda2/envs/natl60/lib/python2.7/site-packages/dask/async.py", line 245, in _execute_task
    args2 = [_execute_task(a, cache) for a in args]
  File "/home1/homedir5/perso/aponte/miniconda2/envs/natl60/lib/python2.7/site-packages/dask/async.py", line 245, in _execute_task
    args2 = [_execute_task(a, cache) for a in args]
  File "/home1/homedir5/perso/aponte/miniconda2/envs/natl60/lib/python2.7/site-packages/dask/async.py", line 242, in _execute_task
    return [_execute_task(a, cache) for a in arg]
  File "/home1/homedir5/perso/aponte/miniconda2/envs/natl60/lib/python2.7/site-packages/dask/async.py", line 242, in _execute_task
    return [_execute_task(a, cache) for a in arg]
  File "/home1/homedir5/perso/aponte/miniconda2/envs/natl60/lib/python2.7/site-packages/dask/async.py", line 242, in _execute_task
    return [_execute_task(a, cache) for a in arg]
  File "/home1/homedir5/perso/aponte/miniconda2/envs/natl60/lib/python2.7/site-packages/dask/async.py", line 242, in _execute_task
    return [_execute_task(a, cache) for a in arg]
  File "/home1/homedir5/perso/aponte/miniconda2/envs/natl60/lib/python2.7/site-packages/dask/async.py", line 245, in _execute_task
    args2 = [_execute_task(a, cache) for a in args]
  File "/home1/homedir5/perso/aponte/miniconda2/envs/natl60/lib/python2.7/site-packages/dask/async.py", line 246, in _execute_task
    return func(*args2)
  File "/home1/homedir5/perso/aponte/miniconda2/envs/natl60/lib/python2.7/site-packages/dask/array/core.py", line 50, in getarray
    c = np.asarray(c)
  File "/home1/homedir5/perso/aponte/miniconda2/envs/natl60/lib/python2.7/site-packages/numpy/core/numeric.py", line 460, in asarray
    return array(a, dtype, copy=False, order=order)
  File "/home1/homedir5/perso/aponte/miniconda2/envs/natl60/lib/python2.7/site-packages/xarray/core/indexing.py", line 312, in __array__
    return np.asarray(array[self.key], dtype=None)
  File "/home1/homedir5/perso/aponte/miniconda2/envs/natl60/lib/python2.7/site-packages/xarray/conventions.py", line 359, in __getitem__
    self.scale_factor, self.add_offset, self._dtype)
  File "/home1/homedir5/perso/aponte/miniconda2/envs/natl60/lib/python2.7/site-packages/xarray/conventions.py", line 57, in mask_and_scale
    values = np.array(array, dtype=dtype, copy=True)

Cheers

Aurelien

shoyer commented 8 years ago

This error indicates that Python is running out of memory. Dask (which we use with mfdataset) can help with that, but it doesn't always solve the issue.

How is the data arranged in each of the input files? What version of dask are you using? This might be an issue with dask.array not fusing calls to __getitem__ and loading entire files into memory.

cc @mrocklin @jcrist

apatlpo commented 8 years ago

Thanks for your answer. Below is the output from ncdump, which should answer the question about data arrangement (I hope...). As for versions, we have:

dask                      0.8.2                    py27_0    defaults
xarray                    0.7.2                    py27_0    defaults
netcdf4                   1.1.1                np18py27_0    defaults

Looking forward to any suggestions. Cheers, Aurelien

netcdf NATL60-MJM155_y2008m01d09.5d_gridT {
dimensions:
    x = 5422 ;
    y = 3454 ;
    deptht = 300 ;
    time_counter = UNLIMITED ; // (1 currently)
    time_bounds = 2 ;
variables:
    float nav_lat(y, x) ;
        nav_lat:axis = "Y" ;
        nav_lat:standard_name = "latitude" ;
        nav_lat:long_name = "Latitude" ;
        nav_lat:units = "degrees_north" ;
        nav_lat:nav_model = "grid_T" ;
        nav_lat:_Storage = "chunked" ;
        nav_lat:_ChunkSizes = 12, 5422 ;
    float nav_lon(y, x) ;
        nav_lon:axis = "X" ;
        nav_lon:standard_name = "longitude" ;
        nav_lon:long_name = "Longitude" ;
        nav_lon:units = "degrees_east" ;
        nav_lon:nav_model = "grid_T" ;
        nav_lon:_Storage = "chunked" ;
        nav_lon:_ChunkSizes = 12, 5422 ;
    float deptht(deptht) ;
        deptht:axis = "Z" ;
        deptht:long_name = "Vertical T levels" ;
        deptht:units = "m" ;
        deptht:positive = "down" ;
        deptht:_Storage = "chunked" ;
        deptht:_ChunkSizes = 300 ;
    float votemper(time_counter, deptht, y, x) ;
        votemper:long_name = "temperature" ;
        votemper:units = "degC" ;
        votemper:online_operation = "average" ;
        votemper:interval_operation = "40s" ;
        votemper:interval_write = "5d" ;
        votemper:_FillValue = 0.f ;
        votemper:missing_value = 0.f ;
        votemper:coordinates = "time_centered deptht nav_lon nav_lat" ;
        votemper:_Storage = "chunked" ;
        votemper:_ChunkSizes = 1, 1, 12, 5422 ;
        votemper:_DeflateLevel = 1 ;
    double time_centered(time_counter) ;
        time_centered:standard_name = "time" ;
        time_centered:long_name = "Time axis" ;
        time_centered:title = "Time" ;
        time_centered:calendar = "gregorian" ;
        time_centered:units = "seconds since 1958-01-01 00:00:00" ;
        time_centered:time_origin = "1958-01-01 00:00:00" ;
        time_centered:bounds = "time_centered_bounds" ;
        time_centered:_Storage = "chunked" ;
        time_centered:_ChunkSizes = 1 ;
    double time_centered_bounds(time_counter, time_bounds) ;
        time_centered_bounds:_Storage = "chunked" ;
        time_centered_bounds:_ChunkSizes = 1, 2 ;
    double time_counter(time_counter) ;
        time_counter:axis = "T" ;
        time_counter:standard_name = "time" ;
        time_counter:long_name = "Time axis" ;
        time_counter:title = "Time" ;
        time_counter:calendar = "gregorian" ;
        time_counter:units = "seconds since 1958-01-01 00:00:00" ;
        time_counter:time_origin = "1958-01-01 00:00:00" ;
        time_counter:bounds = "time_counter_bounds" ;
        time_counter:_Storage = "chunked" ;
        time_counter:_ChunkSizes = 1 ;
    double time_counter_bounds(time_counter, time_bounds) ;
        time_counter_bounds:_Storage = "chunked" ;
        time_counter_bounds:_ChunkSizes = 1, 2 ;
    float vosaline(time_counter, deptht, y, x) ;
        vosaline:long_name = "salinity" ;
        vosaline:units = "psu" ;
        vosaline:online_operation = "average" ;
        vosaline:interval_operation = "40s" ;
        vosaline:interval_write = "5d" ;
        vosaline:_FillValue = 0.f ;
        vosaline:missing_value = 0.f ;
        vosaline:coordinates = "time_centered deptht nav_lon nav_lat" ;
        vosaline:_Storage = "chunked" ;
        vosaline:_ChunkSizes = 1, 1, 12, 5422 ;
        vosaline:_DeflateLevel = 1 ;
    float sossheig(time_counter, y, x) ;
        sossheig:long_name = "sea surface height" ;
        sossheig:units = "m" ;
        sossheig:online_operation = "average" ;
        sossheig:interval_operation = "40s" ;
        sossheig:interval_write = "5d" ;
        sossheig:_FillValue = 0.f ;
        sossheig:missing_value = 0.f ;
        sossheig:coordinates = "time_centered nav_lon nav_lat" ;
        sossheig:_Storage = "chunked" ;
        sossheig:_ChunkSizes = 1, 12, 5422 ;

// global attributes:
        :description = "ocean T grid variables" ;
        :conventions = "CF-1.1" ;
        :production = "An IPSL model" ;
        :start_date = 20040101 ;
        :output_frequency = "5d" ;
        :CONFIG = "NATL60" ;
        :CASE = "MJM155" ;
        :_Format = "netCDF-4" ;
}
shoyer commented 8 years ago

@apatlpo Along what axis do your multiple files differ?

apatlpo commented 8 years ago

Along time_counter

stale[bot] commented 5 years ago

In order to maintain a list of currently relevant issues, we mark issues as stale after a period of inactivity. If this issue remains relevant, please comment here; otherwise it will be marked as closed automatically.