ecmwf / earthkit-data

A format-agnostic Python interface for geospatial data
Apache License 2.0
47 stars 9 forks source link

`to_xarray()` is not able to concatenate daily satellite data #300

Open malmans2 opened 4 months ago

malmans2 commented 4 months ago

What happened?

Various satellite products are shipped by the CDS in zip archives containing daily netcdfs. Those files do not have a time dimension, and therefore can not be concatenated by xarray. Here I'm showing an example using satellite-aerosol-properties, but other products are also affected (e.g., satellite-methane)

It's possible to concatenate them using xr.open_mfdataset kwargs. Are users supposed to use xarray's kwargs for these products?

Side note: I can figure out xarray's kwargs needed by using the _indexes property. As far as I can tell that's the only way, but it feels I'm doing a hack. Should _indexes be a public property?

What are the steps to reproduce the bug?

import earthkit.data

collection_id = "satellite-aerosol-properties"
request = {
    "format": "zip",
    "time_aggregation": "daily_average",
    "variable": "aerosol_optical_depth",
    "sensor_on_satellite": "aatsr_on_envisat",
    "algorithm": "adv",
    "year": "2002",
    "month": "05",
    "day": ["20", "21"],
    "version": "v3.11",
}
ds = earthkit.data.from_source("cds", collection_id, request)
ds.to_xarray()  # ValueError: Could not find any dimension coordinates to use to order the datasets for concatenation

Workaround:

def preprocess(ds):
    import pandas as pd

    time = pd.to_datetime(ds.attrs["time_coverage_start"][:11])
    return ds.expand_dims(time=[time])

ds.to_xarray(xarray_open_mfdataset_kwargs={"preprocess": preprocess})

Version

0.5.1

Platform (OS and architecture)

Darwin MacBook-Pro-3.local 22.6.0 Darwin Kernel Version 22.6.0: Wed Jul 5 22:21:56 PDT 2023; root:xnu-8796.141.3~6/RELEASE_X86_64 x86_64

Relevant log output

---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
Cell In[1], line 16
      4 request = {
      5     "format": "zip",
      6     "time_aggregation": "daily_average",
   (...)
     13     "version": "v3.11",
     14 }
     15 ds = earthkit.data.from_source("cds", collection_id, request)
---> 16 ds.to_xarray()  # ValueError: Could not find any dimension coordinates to use to order the datasets for concatenation

File ~/miniforge3/envs/earthkit/lib/python3.12/site-packages/earthkit/data/readers/netcdf.py:647, in NetCDFMultiFieldList.to_xarray(self, **kwargs)
    645 def to_xarray(self, **kwargs):
    646     try:
--> 647         return NetCDFFieldList.to_xarray_multi_from_paths(
    648             [x.path for x in self._indexes], **kwargs
    649         )
    650     except AttributeError:
    651         # TODO: Implement this, but discussion required
    652         #  This catches Multi-MaskFieldLists which cannot be openned in xarray
    653         self._not_implemented()

File ~/miniforge3/envs/earthkit/lib/python3.12/site-packages/earthkit/data/readers/netcdf.py:604, in NetCDFFieldList.to_xarray_multi_from_paths(cls, paths, **kwargs)
    601 options = dict()
    602 options.update(kwargs.get("xarray_open_mfdataset_kwargs", {}))
--> 604 return xr.open_mfdataset(
    605     paths,
    606     **options,
    607 )

File ~/miniforge3/envs/earthkit/lib/python3.12/site-packages/xarray/backends/api.py:1053, in open_mfdataset(paths, chunks, concat_dim, compat, preprocess, engine, data_vars, coords, combine, parallel, join, attrs_file, combine_attrs, **kwargs)
   1040     combined = _nested_combine(
   1041         datasets,
   1042         concat_dims=concat_dim,
   (...)
   1048         combine_attrs=combine_attrs,
   1049     )
   1050 elif combine == "by_coords":
   1051     # Redo ordering from coordinates, ignoring how they were ordered
   1052     # previously
-> 1053     combined = combine_by_coords(
   1054         datasets,
   1055         compat=compat,
   1056         data_vars=data_vars,
   1057         coords=coords,
   1058         join=join,
   1059         combine_attrs=combine_attrs,
   1060     )
   1061 else:
   1062     raise ValueError(
   1063         f"{combine} is an invalid option for the keyword argument"
   1064         " ``combine``"
   1065     )

File ~/miniforge3/envs/earthkit/lib/python3.12/site-packages/xarray/core/combine.py:958, in combine_by_coords(data_objects, compat, data_vars, coords, fill_value, join, combine_attrs)
    954     grouped_by_vars = itertools.groupby(sorted_datasets, key=vars_as_keys)
    956     # Perform the multidimensional combine on each group of data variables
    957     # before merging back together
--> 958     concatenated_grouped_by_data_vars = tuple(
    959         _combine_single_variable_hypercube(
    960             tuple(datasets_with_same_vars),
    961             fill_value=fill_value,
    962             data_vars=data_vars,
    963             coords=coords,
    964             compat=compat,
    965             join=join,
    966             combine_attrs=combine_attrs,
    967         )
    968         for vars, datasets_with_same_vars in grouped_by_vars
    969     )
    971 return merge(
    972     concatenated_grouped_by_data_vars,
    973     compat=compat,
   (...)
    976     combine_attrs=combine_attrs,
    977 )

File ~/miniforge3/envs/earthkit/lib/python3.12/site-packages/xarray/core/combine.py:959, in <genexpr>(.0)
    954     grouped_by_vars = itertools.groupby(sorted_datasets, key=vars_as_keys)
    956     # Perform the multidimensional combine on each group of data variables
    957     # before merging back together
    958     concatenated_grouped_by_data_vars = tuple(
--> 959         _combine_single_variable_hypercube(
    960             tuple(datasets_with_same_vars),
    961             fill_value=fill_value,
    962             data_vars=data_vars,
    963             coords=coords,
    964             compat=compat,
    965             join=join,
    966             combine_attrs=combine_attrs,
    967         )
    968         for vars, datasets_with_same_vars in grouped_by_vars
    969     )
    971 return merge(
    972     concatenated_grouped_by_data_vars,
    973     compat=compat,
   (...)
    976     combine_attrs=combine_attrs,
    977 )

File ~/miniforge3/envs/earthkit/lib/python3.12/site-packages/xarray/core/combine.py:619, in _combine_single_variable_hypercube(datasets, fill_value, data_vars, coords, compat, join, combine_attrs)
    613 if len(datasets) == 0:
    614     raise ValueError(
    615         "At least one Dataset is required to resolve variable names "
    616         "for combined hypercube."
    617     )
--> 619 combined_ids, concat_dims = _infer_concat_order_from_coords(list(datasets))
    621 if fill_value is None:
    622     # check that datasets form complete hypercube
    623     _check_shape_tile_ids(combined_ids)

File ~/miniforge3/envs/earthkit/lib/python3.12/site-packages/xarray/core/combine.py:144, in _infer_concat_order_from_coords(datasets)
    139             tile_ids = [
    140                 tile_id + (position,) for tile_id, position in zip(tile_ids, order)
    141             ]
    143 if len(datasets) > 1 and not concat_dims:
--> 144     raise ValueError(
    145         "Could not find any dimension coordinates to use to "
    146         "order the datasets for concatenation"
    147     )
    149 combined_ids = dict(zip(tile_ids, datasets))
    151 return combined_ids, concat_dims

ValueError: Could not find any dimension coordinates to use to order the datasets for concatenation

Accompanying data

No response

Organisation

B-Open / CADS-EQC

sandorkertesz commented 2 months ago

@malmans2, I can confirm that the preferred way to control NetCDF concatenation in to_xarray is to use xarray_open_mfdataset_kwargs. It will be added to the documentation.