Various satellite products are shipped by the CDS in zip archives containing daily netcdfs.
Those files do not have a time dimension, and therefore can not be concatenated by xarray.
Here I'm showing an example using satellite-aerosol-properties, but other products are also affected (e.g., satellite-methane)
It's possible to concatenate them using xr.open_mfdataset kwargs. Are users supposed to use xarray's kwargs for these products?
Side note: I can figure out xarray's kwargs needed by using the _indexes property. As far as I can tell that's the only way, but it feels I'm doing a hack. Should _indexes be a public property?
What are the steps to reproduce the bug?
import earthkit.data
collection_id = "satellite-aerosol-properties"
request = {
"format": "zip",
"time_aggregation": "daily_average",
"variable": "aerosol_optical_depth",
"sensor_on_satellite": "aatsr_on_envisat",
"algorithm": "adv",
"year": "2002",
"month": "05",
"day": ["20", "21"],
"version": "v3.11",
}
ds = earthkit.data.from_source("cds", collection_id, request)
ds.to_xarray() # ValueError: Could not find any dimension coordinates to use to order the datasets for concatenation
Workaround:
def preprocess(ds):
import pandas as pd
time = pd.to_datetime(ds.attrs["time_coverage_start"][:11])
return ds.expand_dims(time=[time])
ds.to_xarray(xarray_open_mfdataset_kwargs={"preprocess": preprocess})
Version
0.5.1
Platform (OS and architecture)
Darwin MacBook-Pro-3.local 22.6.0 Darwin Kernel Version 22.6.0: Wed Jul 5 22:21:56 PDT 2023; root:xnu-8796.141.3~6/RELEASE_X86_64 x86_64
Relevant log output
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
Cell In[1], line 16
4 request = {
5 "format": "zip",
6 "time_aggregation": "daily_average",
(...)
13 "version": "v3.11",
14 }
15 ds = earthkit.data.from_source("cds", collection_id, request)
---> 16 ds.to_xarray() # ValueError: Could not find any dimension coordinates to use to order the datasets for concatenation
File ~/miniforge3/envs/earthkit/lib/python3.12/site-packages/earthkit/data/readers/netcdf.py:647, in NetCDFMultiFieldList.to_xarray(self, **kwargs)
645 def to_xarray(self, **kwargs):
646 try:
--> 647 return NetCDFFieldList.to_xarray_multi_from_paths(
648 [x.path for x in self._indexes], **kwargs
649 )
650 except AttributeError:
651 # TODO: Implement this, but discussion required
652 # This catches Multi-MaskFieldLists which cannot be openned in xarray
653 self._not_implemented()
File ~/miniforge3/envs/earthkit/lib/python3.12/site-packages/earthkit/data/readers/netcdf.py:604, in NetCDFFieldList.to_xarray_multi_from_paths(cls, paths, **kwargs)
601 options = dict()
602 options.update(kwargs.get("xarray_open_mfdataset_kwargs", {}))
--> 604 return xr.open_mfdataset(
605 paths,
606 **options,
607 )
File ~/miniforge3/envs/earthkit/lib/python3.12/site-packages/xarray/backends/api.py:1053, in open_mfdataset(paths, chunks, concat_dim, compat, preprocess, engine, data_vars, coords, combine, parallel, join, attrs_file, combine_attrs, **kwargs)
1040 combined = _nested_combine(
1041 datasets,
1042 concat_dims=concat_dim,
(...)
1048 combine_attrs=combine_attrs,
1049 )
1050 elif combine == "by_coords":
1051 # Redo ordering from coordinates, ignoring how they were ordered
1052 # previously
-> 1053 combined = combine_by_coords(
1054 datasets,
1055 compat=compat,
1056 data_vars=data_vars,
1057 coords=coords,
1058 join=join,
1059 combine_attrs=combine_attrs,
1060 )
1061 else:
1062 raise ValueError(
1063 f"{combine} is an invalid option for the keyword argument"
1064 " ``combine``"
1065 )
File ~/miniforge3/envs/earthkit/lib/python3.12/site-packages/xarray/core/combine.py:958, in combine_by_coords(data_objects, compat, data_vars, coords, fill_value, join, combine_attrs)
954 grouped_by_vars = itertools.groupby(sorted_datasets, key=vars_as_keys)
956 # Perform the multidimensional combine on each group of data variables
957 # before merging back together
--> 958 concatenated_grouped_by_data_vars = tuple(
959 _combine_single_variable_hypercube(
960 tuple(datasets_with_same_vars),
961 fill_value=fill_value,
962 data_vars=data_vars,
963 coords=coords,
964 compat=compat,
965 join=join,
966 combine_attrs=combine_attrs,
967 )
968 for vars, datasets_with_same_vars in grouped_by_vars
969 )
971 return merge(
972 concatenated_grouped_by_data_vars,
973 compat=compat,
(...)
976 combine_attrs=combine_attrs,
977 )
File ~/miniforge3/envs/earthkit/lib/python3.12/site-packages/xarray/core/combine.py:959, in <genexpr>(.0)
954 grouped_by_vars = itertools.groupby(sorted_datasets, key=vars_as_keys)
956 # Perform the multidimensional combine on each group of data variables
957 # before merging back together
958 concatenated_grouped_by_data_vars = tuple(
--> 959 _combine_single_variable_hypercube(
960 tuple(datasets_with_same_vars),
961 fill_value=fill_value,
962 data_vars=data_vars,
963 coords=coords,
964 compat=compat,
965 join=join,
966 combine_attrs=combine_attrs,
967 )
968 for vars, datasets_with_same_vars in grouped_by_vars
969 )
971 return merge(
972 concatenated_grouped_by_data_vars,
973 compat=compat,
(...)
976 combine_attrs=combine_attrs,
977 )
File ~/miniforge3/envs/earthkit/lib/python3.12/site-packages/xarray/core/combine.py:619, in _combine_single_variable_hypercube(datasets, fill_value, data_vars, coords, compat, join, combine_attrs)
613 if len(datasets) == 0:
614 raise ValueError(
615 "At least one Dataset is required to resolve variable names "
616 "for combined hypercube."
617 )
--> 619 combined_ids, concat_dims = _infer_concat_order_from_coords(list(datasets))
621 if fill_value is None:
622 # check that datasets form complete hypercube
623 _check_shape_tile_ids(combined_ids)
File ~/miniforge3/envs/earthkit/lib/python3.12/site-packages/xarray/core/combine.py:144, in _infer_concat_order_from_coords(datasets)
139 tile_ids = [
140 tile_id + (position,) for tile_id, position in zip(tile_ids, order)
141 ]
143 if len(datasets) > 1 and not concat_dims:
--> 144 raise ValueError(
145 "Could not find any dimension coordinates to use to "
146 "order the datasets for concatenation"
147 )
149 combined_ids = dict(zip(tile_ids, datasets))
151 return combined_ids, concat_dims
ValueError: Could not find any dimension coordinates to use to order the datasets for concatenation
@malmans2, I can confirm that the preferred way to control NetCDF concatenation in to_xarray is to use xarray_open_mfdataset_kwargs. It will be added to the documentation.
What happened?
Various satellite products are shipped by the CDS in zip archives containing daily netcdfs. Those files do not have a time dimension, and therefore can not be concatenated by xarray. Here I'm showing an example using
satellite-aerosol-properties
, but other products are also affected (e.g.,satellite-methane
)It's possible to concatenate them using
xr.open_mfdataset
kwargs. Are users supposed to use xarray's kwargs for these products?Side note: I can figure out xarray's kwargs needed by using the
_indexes
property. As far as I can tell that's the only way, but it feels I'm doing a hack. Should_indexes
be a public property?What are the steps to reproduce the bug?
Workaround:
Version
0.5.1
Platform (OS and architecture)
Darwin MacBook-Pro-3.local 22.6.0 Darwin Kernel Version 22.6.0: Wed Jul 5 22:21:56 PDT 2023; root:xnu-8796.141.3~6/RELEASE_X86_64 x86_64
Relevant log output
Accompanying data
No response
Organisation
B-Open / CADS-EQC