Closed rabernat closed 11 months ago
I believe the last one in the notebook below is already fixed and the first two are mentioned above but here is a data point:
http://nbviewer.jupyter.org/gist/ocefpaf/1bf3b86359c459c89d44a81d3129f967
import xarray as xr
xr.open_dataset('http://thredds.ucar.edu/thredds/dodsC/grib/NCEP/GFS/Global_0p5deg/TwoD')
---------------------------------------------------------------------------
MissingDimensionsError Traceback (most recent call last)
<ipython-input-6-e2a87d803d99> in <module>()
----> 1 xr.open_dataset(gfs_cat.datasets[0].access_urls['OPENDAP'])
~/miniconda3/envs/py36/lib/python3.6/site-packages/xarray/backends/api.py in open_dataset(filename_or_obj, group, decode_cf, mask_and_scale, decode_times, autoclose, concat_characters, decode_coords, engine, chunks, lock, cache, drop_variables, backend_kwargs)
344 lock = _default_lock(filename_or_obj, engine)
345 with close_on_error(store):
--> 346 return maybe_decode_store(store, lock)
347 else:
348 if engine is not None and engine != 'scipy':
~/miniconda3/envs/py36/lib/python3.6/site-packages/xarray/backends/api.py in maybe_decode_store(store, lock)
256 store, mask_and_scale=mask_and_scale, decode_times=decode_times,
257 concat_characters=concat_characters, decode_coords=decode_coords,
--> 258 drop_variables=drop_variables)
259
260 _protect_dataset_variables_inplace(ds, cache)
~/miniconda3/envs/py36/lib/python3.6/site-packages/xarray/conventions.py in decode_cf(obj, concat_characters, mask_and_scale, decode_times, decode_coords, drop_variables)
428 vars, attrs, concat_characters, mask_and_scale, decode_times,
429 decode_coords, drop_variables=drop_variables)
--> 430 ds = Dataset(vars, attrs=attrs)
431 ds = ds.set_coords(coord_names.union(extra_coords).intersection(vars))
432 ds._file_obj = file_obj
~/miniconda3/envs/py36/lib/python3.6/site-packages/xarray/core/dataset.py in __init__(self, data_vars, coords, attrs, compat)
363 coords = {}
364 if data_vars is not None or coords is not None:
--> 365 self._set_init_vars_and_dims(data_vars, coords, compat)
366 if attrs is not None:
367 self.attrs = attrs
~/miniconda3/envs/py36/lib/python3.6/site-packages/xarray/core/dataset.py in _set_init_vars_and_dims(self, data_vars, coords, compat)
381
382 variables, coord_names, dims = merge_data_and_coords(
--> 383 data_vars, coords, compat=compat)
384
385 self._variables = variables
~/miniconda3/envs/py36/lib/python3.6/site-packages/xarray/core/merge.py in merge_data_and_coords(data, coords, compat, join)
363 indexes = dict(extract_indexes(coords))
364 return merge_core(objs, compat, join, explicit_coords=explicit_coords,
--> 365 indexes=indexes)
366
367
~/miniconda3/envs/py36/lib/python3.6/site-packages/xarray/core/merge.py in merge_core(objs, compat, join, priority_arg, explicit_coords, indexes)
433 coerced = coerce_pandas_values(objs)
434 aligned = deep_align(coerced, join=join, copy=False, indexes=indexes)
--> 435 expanded = expand_variable_dicts(aligned)
436
437 coord_names, noncoord_names = determine_coords(coerced)
~/miniconda3/envs/py36/lib/python3.6/site-packages/xarray/core/merge.py in expand_variable_dicts(list_of_variable_dicts)
209 var_dicts.append(coords)
210
--> 211 var = as_variable(var, name=name)
212 sanitized_vars[name] = var
213
~/miniconda3/envs/py36/lib/python3.6/site-packages/xarray/core/variable.py in as_variable(obj, name)
112 'dimensions %r. xarray disallows such variables because they '
113 'conflict with the coordinates used to label '
--> 114 'dimensions.' % (name, obj.dims))
115 obj = obj.to_index_variable()
116
MissingDimensionsError: 'time' has more than 1-dimension and the same name as one of its dimensions ('reftime', 'time'). xarray disallows such variables because they conflict with the coordinates used to label dimensions.
Here's a sample CDL for a file:
netcdf temp {
dimensions:
profile = 1 ;
station = 1 ;
isobaric = 31 ;
station_name_strlen = 10 ;
station_description_strlen = 33 ;
variables:
float isobaric(station, profile, isobaric) ;
isobaric:standard_name = "isobaric" ;
isobaric:long_name = "isobaric" ;
isobaric:units = "Pa" ;
isobaric:positive = "down" ;
isobaric:axis = "Z" ;
float Geopotential_height_isobaric(station, profile, isobaric) ;
Geopotential_height_isobaric:standard_name = "Geopotential_height_isobaric" ;
Geopotential_height_isobaric:long_name = "Geopotential_height_isobaric" ;
Geopotential_height_isobaric:units = "gpm" ;
Geopotential_height_isobaric:coordinates = "time longitude latitude isobaric" ;
char station_name(station, station_name_strlen) ;
station_name:long_name = "station name" ;
station_name:cf_role = "timeseries_id" ;
char station_description(station, station_description_strlen) ;
station_description:long_name = "station description" ;
station_description:standard_name = "platform_name" ;
double latitude(station) ;
latitude:units = "degrees_north" ;
latitude:long_name = "profile latitude" ;
double longitude(station) ;
longitude:units = "degrees_east" ;
longitude:long_name = "profile longitude" ;
double time(station, profile) ;
time:units = "Hour since 2018-08-15T12:00:00Z" ;
time:calendar = "proleptic_gregorian" ;
time:standard_name = "time" ;
time:long_name = "GRIB forecast or observation time" ;
// global attributes:
:Conventions = "CDM-Extended-CF" ;
:history = "Written by CFPointWriter" ;
:title = "Extract Points data from Grid file /data/ldm/pub/native/grid/NCEP/GFS/Global_0p5deg/GFS_Global_0p5deg_20180815_1200.grib2.ncx3#LatLon_361X720-p25S-180p0E" ;
:featureType = "timeSeriesProfile" ;
:time_coverage_start = "2018-08-15T18:00:00Z" ;
:time_coverage_end = "2018-08-15T18:00:00Z" ;
:geospatial_lat_min = 39.9995 ;
:geospatial_lat_max = 40.0005 ;
:geospatial_lon_min = -105.0005 ;
:geospatial_lon_max = -104.9995 ;
}
which gives:
---------------------------------------------------------------------------
MissingDimensionsError Traceback (most recent call last)
<ipython-input-10-d6f8d8651b9f> in <module>()
4 query.add_lonlat().accept('netcdf4')
5 nc = ncss.get_data(query)
----> 6 xr.open_dataset(NetCDF4DataStore(nc))
~/miniconda3/envs/py36/lib/python3.6/site-packages/xarray/backends/api.py in open_dataset(filename_or_obj, group, decode_cf, mask_and_scale, decode_times, autoclose, concat_characters, decode_coords, engine, chunks, lock, cache, drop_variables, backend_kwargs)
352 store = backends.ScipyDataStore(filename_or_obj)
353
--> 354 return maybe_decode_store(store)
355
356
~/miniconda3/envs/py36/lib/python3.6/site-packages/xarray/backends/api.py in maybe_decode_store(store, lock)
256 store, mask_and_scale=mask_and_scale, decode_times=decode_times,
257 concat_characters=concat_characters, decode_coords=decode_coords,
--> 258 drop_variables=drop_variables)
259
260 _protect_dataset_variables_inplace(ds, cache)
~/miniconda3/envs/py36/lib/python3.6/site-packages/xarray/conventions.py in decode_cf(obj, concat_characters, mask_and_scale, decode_times, decode_coords, drop_variables)
428 vars, attrs, concat_characters, mask_and_scale, decode_times,
429 decode_coords, drop_variables=drop_variables)
--> 430 ds = Dataset(vars, attrs=attrs)
431 ds = ds.set_coords(coord_names.union(extra_coords).intersection(vars))
432 ds._file_obj = file_obj
~/miniconda3/envs/py36/lib/python3.6/site-packages/xarray/core/dataset.py in __init__(self, data_vars, coords, attrs, compat)
363 coords = {}
364 if data_vars is not None or coords is not None:
--> 365 self._set_init_vars_and_dims(data_vars, coords, compat)
366 if attrs is not None:
367 self.attrs = attrs
~/miniconda3/envs/py36/lib/python3.6/site-packages/xarray/core/dataset.py in _set_init_vars_and_dims(self, data_vars, coords, compat)
381
382 variables, coord_names, dims = merge_data_and_coords(
--> 383 data_vars, coords, compat=compat)
384
385 self._variables = variables
~/miniconda3/envs/py36/lib/python3.6/site-packages/xarray/core/merge.py in merge_data_and_coords(data, coords, compat, join)
363 indexes = dict(extract_indexes(coords))
364 return merge_core(objs, compat, join, explicit_coords=explicit_coords,
--> 365 indexes=indexes)
366
367
~/miniconda3/envs/py36/lib/python3.6/site-packages/xarray/core/merge.py in merge_core(objs, compat, join, priority_arg, explicit_coords, indexes)
433 coerced = coerce_pandas_values(objs)
434 aligned = deep_align(coerced, join=join, copy=False, indexes=indexes)
--> 435 expanded = expand_variable_dicts(aligned)
436
437 coord_names, noncoord_names = determine_coords(coerced)
~/miniconda3/envs/py36/lib/python3.6/site-packages/xarray/core/merge.py in expand_variable_dicts(list_of_variable_dicts)
209 var_dicts.append(coords)
210
--> 211 var = as_variable(var, name=name)
212 sanitized_vars[name] = var
213
~/miniconda3/envs/py36/lib/python3.6/site-packages/xarray/core/variable.py in as_variable(obj, name)
112 'dimensions %r. xarray disallows such variables because they '
113 'conflict with the coordinates used to label '
--> 114 'dimensions.' % (name, obj.dims))
115 obj = obj.to_index_variable()
116
MissingDimensionsError: 'isobaric' has more than 1-dimension and the same name as one of its dimensions ('station', 'profile', 'isobaric'). xarray disallows such variables because they conflict with the coordinates used to label dimensions.
The two examples by @dopplershift are the same problem as in https://github.com/pydata/xarray/issues/2233
This is mentioned elsewhere (can't find the issue right now) and may be out of scope for this issue but I'm going to say it anyway: opening a NetCDF file with groups was not as easy as I wanted it to be when first starting out with xarray.
I ran into this problem too, a long time ago (see #457). Back then the workaround we implemented was to exclude the offending variable ("siglay" or "isobaric" in the examples above) with the optional "drop_variables" argument. Of course, this is not great if you actually want to use the values in the variable you are dropping.
I personally don't like the notion of a "two dimensional coordinate"; I find it confusing. However, these kinds of netCDF files are common, so fully supporting them in xarray would be nice. But I don't know how. Maybe just renaming the variable instead of dropping it, with a "rename_variables" argument? This is the only thing that comes to my mind.
@djhoese - it would be great if you could track down a more specific example of the issue you are referring to.
Excluding this possible problem with groups, my assessment of the feedback above is that, actually, the only problem is #2233: we can't have multidimensional variables that are also their own dimensions. This is a good thing. It means we have a specific problem to fix.
Right now this is ok:
dimensions:
x = 4
y = 3
variables:
int x(x);
int y(y);
float data(y, x)
But this is not
dimensions:
x = 4
y = 3
variables:
int x(x);
float y(y, x);
float data(y, x)
Personally I find this to be an incredibly confusing, recursive use of the concept of "dimensions". For me, dimensions should be orthogonal. In the second example, `y` is a [non-dimension] coordinate, not a dimension! The actual dimension is implicit, some sort of logical `y_index`. I wish that CF / netCDF had never chosen to accept this as a valid schema. But I admit that perhaps my internal mental model is too wrapped up with xarray!
So the question is: what can we do about it?
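I propose the following general outline:
1. Add a way to "decode" these datasets by renaming the recursive variable `y(y, x)` into something like `y_coordinate(y, x)`.
2. Add a new option to `open_dataset` called `decode_recursive_dimension`, which defaults to `False`.
3. When such a dataset is encountered, raise a more informative error that suggests calling `open_dataset` with `decode_recursive_dimension=True`.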
Finally, we might want to raise this upstream with netCDF or CF conventions to try to understand better why this sort of schema is being encouraged.
@rabernat For the groups NetCDF files I had in mind the NASA L1B data files for the satellite instrument VIIRS onboard Suomi-NPP and NOAA-20 satellites. You can see an example file here.
The summary of the ncdump is:
netcdf VNP02IMG.A2018008.0000.001.2018061001540 {
dimensions:
number_of_scans = 202 ;
number_of_lines = 6464 ;
number_of_pixels = 6400 ;
number_of_LUT_values = 65536 ;
... lots of global attributes ...
group: scan_line_attributes {
variables:
double scan_start_time(number_of_scans) ;
scan_start_time:long_name = "Scan start time (TAI93)" ;
scan_start_time:units = "seconds" ;
scan_start_time:_FillValue = -999.9 ;
scan_start_time:valid_min = 0. ;
scan_start_time:valid_max = 2000000000. ;
... lots of other variables in this group ...
group: observation_data {
variables:
ushort I04(number_of_lines, number_of_pixels) ;
I04:long_name = "I-band 04 earth view radiance" ;
I04:units = "Watts/meter^2/steradian/micrometer" ;
I04:_FillValue = 65535US ;
I04:valid_min = 0US ;
I04:valid_max = 65527US ;
I04:scale_factor = 6.104354e-05f ;
I04:add_offset = 0.0016703f ;
I04:flag_values = 65532US, 65533US, 65534US ;
I04:flag_meanings = "Missing_EV Bowtie_Deleted Cal_Fail" ;
When I first started out with xarray I assumed I would be able to do something like:
import xarray as xr
nc = xr.open_dataset('VNP02IMG.A2018008.0000.001.2018061001540.nc')
band_data = nc['observation_data/I04']
Which I can't do, but can do with the python netcdf4 library:
In [7]: from netCDF4 import Dataset
In [8]: nc = Dataset('VNP02IMG.A2018008.0000.001.2018061001540.nc')
In [9]: nc['observation_data/I04']
Out[9]:
<class 'netCDF4._netCDF4.Variable'>
I understand that I can provide the `group` keyword to `open_dataset`, but then I have to open the file twice if I want to get the global attributes. So any interface I had set up in my code to pass around one object with all of the file contents won't work. That isn't xarray's fault and shouldn't necessarily be something xarray has to solve, but it is a type of NetCDF4 file that is valid and can't be read "perfectly" with xarray.
@rabernat While I agree that they're (somewhat) confusing files, I think you're missing two things:
1. netCDF doesn't enforce naming on dimensions and variables. Full stop. The only naming netCDF will care about is any conflict with an internal reserved name (I'm not sure that those even exist for anything besides attributes.) IMO that's a good thing, but more importantly it's not the netCDF library's job to enforce any of it.
2. CF is an attribute convention. This also means that the conventions say absolutely nothing about naming of variables and dimensions.
IMO, xarray is being overly pedantic here. XArray states that it adopts the Common Data Model (CDM); netCDF-java and the CDM were the tools used to generate the failing examples above.
@dopplershift - thanks for the clarifications! I agree that it's good for netCDF to be as open-ended as possible.
So I guess my quarrel is with the CDM. This is what it says about variables and dimensions:
A Variable is a container for data. It has a DataType, a set of Dimensions that define its array shape, and optionally a set of Attributes. Any shared Dimension it uses must be in the same Group or a parent Group.
A Dimension is used to define the array shape of a Variable. It may be shared among Variables, which provides a simple yet powerful way of associating Variables. When a Dimension is shared, it has a unique name within the Group. If unlimited, a Dimension's length may increase. If variableLength, then the actual length is data dependent, and can only be found by reading the data. A variableLength Dimension cannot be shared or unlimited.
then later
A Variable can have zero or more Coordinate Systems containing one or more CoordinateAxis. A CoordinateAxis can only be part of a Variable's CoordinateSystem if the CoordinateAxis' set of Dimensions is a subset of the Variable's set of Dimensions. This ensures that every data point in the Variable has a corresponding coordinate value for each of the CoordinateAxis in the CoordinateSystem.
A Coordinate System has one or more CoordinateAxis, and zero or more CoordinateTransforms.
A CoordinateAxis is a subtype of Variable, and is optionally classified according to the types in AxisType.
These are the rules which restrict which Variables can be used as Coordinate Axes:
Shared Dimensions: All dimensions used by a Coordinate Axis must be shared with the data variable. When a variable is part of a Structure, the dimensions used by the parent Structure(s) are considered to be part of the nested Variable.
I have a very hard time understanding what all of this means. Can the same variable be a "Dimension" and a "CoordinateAxis" in CDM?
It seems much simpler to me to use the CF approach to describe the physical coordinates of the data using "auxiliary coordinate variables" and to keep the dimensions as purely 1D "coordinate variables".
IMO, xarray is being overly pedantic here.
What would you like xarray to do with these datasets, given the fact that orthogonality of dimensions is central to its data model?
Perhaps part of the confusion is simply that `y` has different meanings in different contexts. When used as a dimension (e.g. to "define the array shape of a Variable" in CDM terms), it is indeed 1D. When used as a variable (or "CoordinateAxis"), it is 2D. XArray doesn't have a separate namespace for dimensions and variables.
Currently, xarray requires that variables with a name matching a dimension are 1D variables along that dimension, e.g.,
for dim in dataset.dims:
if dim in dataset.variables:
assert dataset.variables[dim].dims == (dim,)
I agree that this unnecessarily complicates our data model. There's no particular advantage to this invariant, besides removing the need to check the dimensions of variables used for indexing lookups. I'm sure there are some cases internally where we currently rely on this assumption, but it should be relatively easy to relax.
It seems like this relaxation is compatible with the refactoring of indexes. Right now, we automatically create 1D indexes for all coordinate variables. The problem with 2D dimensions is that such indexes don't make sense:
data.sel(y=3.14)
But maybe we could turn multi-dimensional coordinate variables into multi-indexes? Or no index at all? In any case, we could still do
data.isel(y=3)
i.e. logical indexing on the dimension axis.
Or no index at all?
This would be my inclination (for the default behavior). It would mean that you could no longer count on always being able to do labeled indexing along each dimension, but in the broader scheme of things I don't think that's a big deal.
That sounds reasonable to me. I don't necessarily expect all of the xarray goodness to work with those files, but I do expect them to open without error.
Just adding that netCDF files produced as output from the GOTM turbulence model cannot be opened by xarray. I believe the reason is self-referential multidimensional coordinates.
@nordam , can you provide an example?
Indeed. An example file (1.1 MB) can be found here:
http://folk.ntnu.no/nordam/entrainment.nc
And the error message I get on trying to open this file is:
ValueError: 'z' has more than 1-dimension and the same name as one of its dimensions ('time', 'z', 'lat', 'lon'). xarray disallows such variables because they conflict with the coordinates used to label dimensions.
We are working on fixing this in #2405. That PR (mine) has most of the basic functionality there, but it still needs more testing. Unfortunately, I don't have bandwidth right now to complete the required work.
If anyone here needs this fixed urgently and actually has time to work on it, I encourage you to pick up that PR and try to finish it off. We will be happy to provide help and support along the way.
Adding another example. While working through the Model Evaluation Tool (MET) tutorial, I created a NetCDF file with the tool, and wasn't able to open the file it created.
MissingDimensionsError: 'lat' has more than 1-dimension and the same name as one of its dimensions ('lat', 'lon'). xarray disallows such variables because they conflict with the coordinates used to label dimensions.
Sounds to me like the same error caused by https://github.com/pydata/xarray/issues/2233
Below is the .nc file contents with ncdump
>>ncdump sample_fcst_24L_2005080800V_12A.nc -h
netcdf sample_fcst_24L_2005080800V_12A {
dimensions:
lat = 129 ;
lon = 185 ;
variables:
float lat(lat, lon) ;
lat:long_name = "latitude" ;
lat:units = "degrees_north" ;
lat:standard_name = "latitude" ;
float lon(lat, lon) ;
lon:long_name = "longitude" ;
lon:units = "degrees_east" ;
lon:standard_name = "longitude" ;
float APCP_12(lat, lon) ;
APCP_12:name = "APCP_12" ;
APCP_12:long_name = "Total precipitation" ;
APCP_12:level = "A12" ;
APCP_12:units = "kg/m^2" ;
APCP_12:_FillValue = -9999.f ;
APCP_12:init_time = "20050807_000000" ;
APCP_12:init_time_ut = "1123372800" ;
APCP_12:valid_time = "20050808_000000" ;
APCP_12:valid_time_ut = "1123459200" ;
APCP_12:accum_time = "120000" ;
APCP_12:accum_time_sec = 43200 ;
// global attributes:
:MET_version = "V8.1.2" ;
:MET_tool = "pcp_combine" ;
:RunCommand = "Sum: 4 files with accumulations of 030000." ;
:Projection = "Lambert Conformal" ;
:hemisphere = "N" ;
:scale_lat_1 = "25.000000" ;
:scale_lat_2 = "25.000000" ;
:lat_pin = "12.190000" ;
:lon_pin = "-133.459000" ;
:x_pin = "0.000000" ;
:y_pin = "0.000000" ;
:lon_orient = "-95.000000" ;
:d_km = "40.635000" ;
:r_km = "6371.200000" ;
:nx = "185" ;
:ny = "129 grid_points" ;
}
Found one! https://www.ncei.noaa.gov/data/oceans/ncei/ocads/data/0191304/ This is the dataset published in Bushinsky et al. (2019), which is basically the Landschützer et al. (2014) climatology plus SOCCOM float-based pCO2 data, updated through 2018. I've only tried the first file in the list (https://www.ncei.noaa.gov/data/oceans/ncei/ocads/data/0191304/MPI-SOM_FFN_SOCCOMv2018.nc), but I suspect the others will have the same issue. Here's the error (it sounds like you all have discussed this before, but I can't see an easy answer):
---------------------------------------------------------------------------
MissingDimensionsError Traceback (most recent call last)
<ipython-input-4-9e0af51f1c05> in <module>
----> 1 SOMFFN = xr.open_dataset('MPI-SOM_FFN_SOCCOMv2018.nc')
~/opt/anaconda3/lib/python3.8/site-packages/xarray/backends/api.py in open_dataset(filename_or_obj, group, decode_cf, mask_and_scale, decode_times, autoclose, concat_characters, decode_coords, engine, chunks, lock, cache, drop_variables, backend_kwargs, use_cftime, decode_timedelta)
573
574 with close_on_error(store):
--> 575 ds = maybe_decode_store(store, chunks)
576
577 # Ensure source filename always stored in dataset object (GH issue #2550)
~/opt/anaconda3/lib/python3.8/site-packages/xarray/backends/api.py in maybe_decode_store(store, chunks)
469
470 def maybe_decode_store(store, chunks):
--> 471 ds = conventions.decode_cf(
472 store,
473 mask_and_scale=mask_and_scale,
~/opt/anaconda3/lib/python3.8/site-packages/xarray/conventions.py in decode_cf(obj, concat_characters, mask_and_scale, decode_times, decode_coords, drop_variables, use_cftime, decode_timedelta)
598 decode_timedelta=decode_timedelta,
599 )
--> 600 ds = Dataset(vars, attrs=attrs)
601 ds = ds.set_coords(coord_names.union(extra_coords).intersection(vars))
602 ds._file_obj = file_obj
~/opt/anaconda3/lib/python3.8/site-packages/xarray/core/dataset.py in __init__(self, data_vars, coords, attrs)
628 coords = coords.variables
629
--> 630 variables, coord_names, dims, indexes, _ = merge_data_and_coords(
631 data_vars, coords, compat="broadcast_equals"
632 )
~/opt/anaconda3/lib/python3.8/site-packages/xarray/core/merge.py in merge_data_and_coords(data, coords, compat, join)
465 explicit_coords = coords.keys()
466 indexes = dict(_extract_indexes_from_coords(coords))
--> 467 return merge_core(
468 objects, compat, join, explicit_coords=explicit_coords, indexes=indexes
469 )
~/opt/anaconda3/lib/python3.8/site-packages/xarray/core/merge.py in merge_core(objects, compat, join, combine_attrs, priority_arg, explicit_coords, indexes, fill_value)
592 coerced, join=join, copy=False, indexes=indexes, fill_value=fill_value
593 )
--> 594 collected = collect_variables_and_indexes(aligned)
595
596 prioritized = _get_priority_vars_and_indexes(aligned, priority_arg, compat=compat)
~/opt/anaconda3/lib/python3.8/site-packages/xarray/core/merge.py in collect_variables_and_indexes(list_of_mappings)
276 append_all(coords, indexes)
277
--> 278 variable = as_variable(variable, name=name)
279 if variable.dims == (name,):
280 variable = variable.to_index_variable()
~/opt/anaconda3/lib/python3.8/site-packages/xarray/core/variable.py in as_variable(obj, name)
152 # convert the Variable into an Index
153 if obj.ndim != 1:
--> 154 raise MissingDimensionsError(
155 "%r has more than 1-dimension and the same name as one of its "
156 "dimensions %r. xarray disallows such variables because they "
MissingDimensionsError: 'date' has more than 1-dimension and the same name as one of its dimensions ('time', 'date'). xarray disallows such variables because they conflict with the coordinates used to label dimensions.
Clearly we can detect this failure, so shall we rename the `date` dimension to `date_` in this example? We can raise a warning saying round-tripping will not work for such datasets.
@dcherian Thanks for your reply. I think I understand the issue. What, specifically, do you suggest to fix this issue in my own code considering this is not a dataset I generated?
Currently, xarray requires that variables with a name matching a dimension are 1D variables along that dimension, e.g.,
for dim in dataset.dims:
if dim in dataset.variables:
assert dataset.variables[dim].dims == (dim,)
I agree that this unnecessarily complicates our data model. There's no particular advantage to this invariant, besides removing the need to check the dimensions of variables used for indexing lookups. I'm sure there are some cases internally where we currently rely on this assumption, but it should be relatively easy to relax.
It seems like this relaxation is compatible with the refactoring of indexes.
@benbovy will the explicit indexes refactor fix this case?
This is mentioned elsewhere (can't find the issue right now) and may be out of scope for this issue but I'm going to say it anyway: opening a NetCDF file with groups was not as easy as I wanted it to be when first starting out with xarray.
@djhoese For anything to do with opening netCDF files with groups see #4118 and the linked issues from there.
If people have example of other weird cases involving groups (like groups within themselves or anything like that) then I would be interested to have those files to test with!
@TomNicholas yes with the explicit index refactor we should be able to relax the 1D coordinate / dimension matching name constraint in the Xarray data model.
I'm sure there are some cases internally where we currently rely on this assumption, but it should be relatively easy to relax.
I also initially thought it would be easy to relax, but I'm not so sure anymore. I don't think it is a hard task, but it might still require a fair amount of work. I've already refactored a bunch of such internal cases in #5692, but there's a good chance that some (not sure how many) cases will still need a fix.
Found another example, from the ICON NWP model. The files open with the netCDF4 library but not with xarray.
import pandas as pd
import xarray as xr
import requests
import os
response = requests.get('https://cloudnet.fmi.fi/api/model-files?site=hyytiala&date=2020-08-25&model=icon-iglo-12-23')
data = response.json()
df = pd.DataFrame(data)
file = df.downloadUrl
for i in file:
wget.download(i, os.getcwd())
ds = xr.open_dataset('20200825_hyytiala_icon-iglo-12-23.nc')
Error:
---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
Cell In [109], line 1
----> 1 ds = xr.open_dataset('20200825_hyytiala_icon-iglo-12-23.nc')
File ~/.virtualenvs/INAR/lib/python3.10/site-packages/xarray/backends/api.py:531, in open_dataset(filename_or_obj, engine, chunks, cache, decode_cf, mask_and_scale, decode_times, decode_timedelta, use_cftime, concat_characters, decode_coords, drop_variables, inline_array, backend_kwargs, **kwargs)
519 decoders = _resolve_decoders_kwargs(
520 decode_cf,
521 open_backend_dataset_parameters=backend.open_dataset_parameters,
(...)
527 decode_coords=decode_coords,
528 )
530 overwrite_encoded_chunks = kwargs.pop("overwrite_encoded_chunks", None)
--> 531 backend_ds = backend.open_dataset(
532 filename_or_obj,
533 drop_variables=drop_variables,
534 **decoders,
535 **kwargs,
536 )
537 ds = _dataset_from_backend_dataset(
538 backend_ds,
539 filename_or_obj,
(...)
547 **kwargs,
548 )
549 return ds
File ~/.virtualenvs/INAR/lib/python3.10/site-packages/xarray/backends/netCDF4_.py:569, in NetCDF4BackendEntrypoint.open_dataset(self, filename_or_obj, mask_and_scale, decode_times, concat_characters, decode_coords, drop_variables, use_cftime, decode_timedelta, group, mode, format, clobber, diskless, persist, lock, autoclose)
567 store_entrypoint = StoreBackendEntrypoint()
568 with close_on_error(store):
--> 569 ds = store_entrypoint.open_dataset(
570 store,
571 mask_and_scale=mask_and_scale,
572 decode_times=decode_times,
573 concat_characters=concat_characters,
574 decode_coords=decode_coords,
575 drop_variables=drop_variables,
576 use_cftime=use_cftime,
577 decode_timedelta=decode_timedelta,
578 )
579 return ds
File ~/.virtualenvs/INAR/lib/python3.10/site-packages/xarray/backends/store.py:29, in StoreBackendEntrypoint.open_dataset(self, store, mask_and_scale, decode_times, concat_characters, decode_coords, drop_variables, use_cftime, decode_timedelta)
26 vars, attrs = store.load()
27 encoding = store.get_encoding()
---> 29 vars, attrs, coord_names = conventions.decode_cf_variables(
30 vars,
31 attrs,
32 mask_and_scale=mask_and_scale,
33 decode_times=decode_times,
34 concat_characters=concat_characters,
35 decode_coords=decode_coords,
36 drop_variables=drop_variables,
37 use_cftime=use_cftime,
38 decode_timedelta=decode_timedelta,
39 )
41 ds = Dataset(vars, attrs=attrs)
42 ds = ds.set_coords(coord_names.intersection(vars))
File ~/.virtualenvs/INAR/lib/python3.10/site-packages/xarray/conventions.py:509, in decode_cf_variables(variables, attributes, concat_characters, mask_and_scale, decode_times, decode_coords, drop_variables, use_cftime, decode_timedelta)
507 # Time bounds coordinates might miss the decoding attributes
508 if decode_times:
--> 509 _update_bounds_attributes(variables)
511 new_vars = {}
512 for k, v in variables.items():
File ~/.virtualenvs/INAR/lib/python3.10/site-packages/xarray/conventions.py:410, in _update_bounds_attributes(variables)
408 for v in variables.values():
409 attrs = v.attrs
--> 410 has_date_units = "units" in attrs and "since" in attrs["units"]
411 if has_date_units and "bounds" in attrs:
412 if attrs["bounds"] in variables:
TypeError: argument of type 'numpy.float32' is not iterable
@maxaragon, I'm curious: what version of xarray/netCDF4 are you using? I'm asking because this appears to be working fine on my end:
In [1]: import xarray as xr
In [2]: ds = xr.open_dataset("20200825_hyytiala_icon-iglo-12-23.nc")
In [3]: ds
Out[3]:
<xarray.Dataset>
Dimensions: (time: 25, level: 90, flux_level: 91,
frequency: 2, soil_level: 9)
Coordinates:
* time (time) datetime64[ns] 2020-08-25 ... 2020-0...
* level (level) float32 90.0 89.0 88.0 ... 3.0 2.0 1.0
* flux_level (flux_level) float32 91.0 90.0 ... 2.0 1.0
* frequency (frequency) float32 34.96 94.0
Dimensions without coordinates: soil_level
Data variables: (12/62)
latitude float32 ...
longitude float32 ...
altitude float32 ...
horizontal_resolution float32 ...
forecast_time (time) timedelta64[ns] ...
height (time, level) float32 ...
... ...
gas_atten (frequency, time, level) float32 ...
specific_gas_atten (frequency, time, level) float32 ...
specific_saturated_gas_atten (frequency, time, level) float32 ...
specific_dry_gas_atten (frequency, time, level) float32 ...
K2 (frequency, time, level) float32 ...
specific_liquid_atten (frequency, time, level) float32 ...
Attributes: (12/13)
institution: Max Planck Institute for Meteorology/Deutscher Wette...
references: see MPIM/DWD publications
source: svn://xceh.dwd.de/for0adm/SVN_icon/tags/icon-2.6.0-n...
Conventions: CF-1.7
location: hyytiala
file_uuid: ace15f8ba477497c8d1dd0833b5ac674
... ...
year: 2020
month: 08
day: 25
history: 2021-01-25 08:24:29 - File content harmonized by the...
title: Model file from Hyytiala
pid: https://hdl.handle.net/21.12132/1.ace15f8ba477497c
Here are the versions I'm using:
In [4]: xr.show_versions()
/Users/andersy005/mambaforge/envs/playground/lib/python3.10/site-packages/_distutils_hack/__init__.py:33: UserWarning: Setuptools is replacing distutils.
warnings.warn("Setuptools is replacing distutils.")
INSTALLED VERSIONS
------------------
commit: None
python: 3.10.6 | packaged by conda-forge | (main, Aug 22 2022, 20:41:22) [Clang 13.0.1 ]
python-bits: 64
OS: Darwin
OS-release: 22.1.0
machine: arm64
processor: arm
byteorder: little
LC_ALL: None
LANG: en_US.UTF-8
LOCALE: ('en_US', 'UTF-8')
libhdf5: 1.12.2
libnetcdf: 4.8.1
xarray: 2022.10.0
pandas: 1.5.1
numpy: 1.23.4
scipy: 1.9.3
netCDF4: 1.6.1
pydap: installed
h5netcdf: 1.0.2
h5py: 3.7.0
Nio: None
zarr: 2.13.3
cftime: 1.6.2
nc_time_axis: None
PseudoNetCDF: None
rasterio: None
cfgrib: None
iris: None
bottleneck: None
dask: 2022.10.2
distributed: 2022.10.2
matplotlib: 3.6.1
cartopy: None
seaborn: 0.12.0
numbagg: None
fsspec: 2022.10.0
cupy: None
pint: 0.20.1
sparse: None
flox: None
numpy_groupies: None
setuptools: 65.5.0
pip: 22.3
conda: None
pytest: None
IPython: 8.6.0
sphinx: None
@andersy005 indeed, I have updated xarray and it works now. The previous version was:
INSTALLED VERSIONS
------------------
commit: None
python: 3.10.6 (main, Aug 30 2022, 04:58:14) [Clang 13.1.6 (clang-1316.0.21.2.5)]
python-bits: 64
OS: Darwin
OS-release: 21.6.0
machine: arm64
processor: i386
byteorder: little
LC_ALL: None
LANG: None
LOCALE: (None, 'UTF-8')
libhdf5: 1.12.2
libnetcdf: 4.9.0
xarray: 2022.6.0
pandas: 1.4.4
numpy: 1.23.2
scipy: 1.9.1
netCDF4: 1.6.0
pydap: None
h5netcdf: None
h5py: None
Nio: None
zarr: None
cftime: 1.6.1
nc_time_axis: None
PseudoNetCDF: None
rasterio: None
cfgrib: None
iris: None
bottleneck: 1.3.5
dask: None
distributed: None
matplotlib: 3.5.3
cartopy: None
seaborn: 0.12.1
numbagg: None
fsspec: None
cupy: None
pint: None
sparse: None
flox: None
numpy_groupies: None
setuptools: 63.4.3
pip: 22.2.2
conda: None
pytest: None
IPython: 8.5.0
sphinx: None
Found this one. The dataset was provided on request, that's why... Anyway, if anybody wants to check, you can find this polar front dataset:
data = xr.open_dataset("C:/Users/admin/Downloads/CTOH_PolarFront_weekly_1993_2019.nc")
MissingDimensionsError Traceback (most recent call last)
~\AppData\Local\Temp\ipykernel_9796\2161474679.py in <module>
----> 1 data = xr.open_dataset("C:/Users/admin/Downloads/CTOH_PolarFront_weekly_1993_2019.nc")
~\anaconda3\lib\site-packages\xarray\backends\api.py in open_dataset(filename_or_obj, engine, chunks, cache, decode_cf, mask_and_scale, decode_times, decode_timedelta, use_cftime, concat_characters, decode_coords, drop_variables, backend_kwargs, *args, **kwargs)
493
494 overwrite_encoded_chunks = kwargs.pop("overwrite_encoded_chunks", None)
--> 495 backend_ds = backend.open_dataset(
496 filename_or_obj,
497 drop_variables=drop_variables,
~\anaconda3\lib\site-packages\xarray\backends\netCDF4_.py in open_dataset(self, filename_or_obj, mask_and_scale, decode_times, concat_characters, decode_coords, drop_variables, use_cftime, decode_timedelta, group, mode, format, clobber, diskless, persist, lock, autoclose)
562 store_entrypoint = StoreBackendEntrypoint()
563 with close_on_error(store):
--> 564 ds = store_entrypoint.open_dataset(
565 store,
566 mask_and_scale=mask_and_scale,
~\anaconda3\lib\site-packages\xarray\backends\store.py in open_dataset(self, store, mask_and_scale, decode_times, concat_characters, decode_coords, drop_variables, use_cftime, decode_timedelta)
37 )
38
---> 39 ds = Dataset(vars, attrs=attrs)
40 ds = ds.set_coords(coord_names.intersection(vars))
41 ds.set_close(store.close)
~\anaconda3\lib\site-packages\xarray\core\dataset.py in __init__(self, data_vars, coords, attrs)
749 coords = coords.variables
750
--> 751 variables, coord_names, dims, indexes, _ = merge_data_and_coords(
752 data_vars, coords, compat="broadcast_equals"
753 )
~\anaconda3\lib\site-packages\xarray\core\merge.py in merge_data_and_coords(data, coords, compat, join)
486 explicit_coords = coords.keys()
487 indexes = dict(_extract_indexes_from_coords(coords))
--> 488 return merge_core(
489 objects, compat, join, explicit_coords=explicit_coords, indexes=indexes
490 )
~\anaconda3\lib\site-packages\xarray\core\merge.py in merge_core(objects, compat, join, combine_attrs, priority_arg, explicit_coords, indexes, fill_value)
635 coerced, join=join, copy=False, indexes=indexes, fill_value=fill_value
636 )
--> 637 collected = collect_variables_and_indexes(aligned)
638
639 prioritized = _get_priority_vars_and_indexes(aligned, priority_arg, compat=compat)
~\anaconda3\lib\site-packages\xarray\core\merge.py in collect_variables_and_indexes(list_of_mappings)
294 append_all(coords, indexes)
295
--> 296 variable = as_variable(variable, name=name)
297
298 if variable.dims == (name,):
~\anaconda3\lib\site-packages\xarray\core\variable.py in as_variable(obj, name)
156 # convert the Variable into an Index
157 if obj.ndim != 1:
--> 158 raise MissingDimensionsError(
159 f"{name!r} has more than 1-dimension and the same name as one of its "
160 f"dimensions {obj.dims!r}. xarray disallows such variables because they "
MissingDimensionsError: 'longitude' has more than 1-dimension and the same name as one of its dimensions ('time', 'longitude'). xarray disallows such variables because they conflict with the coordinates used to label dimensions.
@ronygolderku thanks for your example. Looks like it fails for the same reason as was mentioned for some of the other examples above.
> @ronygolderku thanks for your example. Looks like it fails for the same reason as was mentioned for some of the other examples above.
Is there any solution?
Closing since the most common case in this issue was fixed by https://github.com/pydata/xarray/pull/7989 and https://github.com/pydata/xarray/pull/8126.
The request for supporting groups is handled by https://github.com/xarray-contrib/datatree
At the Pangeo developers meetings, I am hearing lots of reports from folks like @dopplershift and @rsignell-usgs about netCDF datasets that xarray can't open.
My expectation is that xarray doesn't have strong requirements on the contents of datasets. (It doesn't "enforce" CF compatibility, for example; that's optional.) Anything that can be written to netCDF should be readable by xarray.
I would like to collect examples of places where xarray fails. So far, I am only aware of one:
- Multidimensional variables that share their name with one of their own dimensions, such as `siglay(siglay, node)`. Only `siglay(siglay)` would work.

Are there other distinct cases?
Please provide links / sample code of netCDF datasets that xarray can't read. Even better would be short code snippets to create such datasets in python using the netcdf4 interface.