xarray-contrib / pint-xarray

Interface for using pint with xarray, providing convenience accessors
https://pint-xarray.readthedocs.io/en/latest/
Apache License 2.0
99 stars 12 forks source link

quantify fails if dataset is loaded with `xr.open_mfdataset` #252

Closed aaschwanden closed 2 months ago

aaschwanden commented 2 months ago

`.pint.quantify()` fails on a dataset opened using `xr.open_mfdataset`, but everything works as expected when the two datasets are loaded individually with `xr.open_dataset` and then concatenated. The bug is insensitive to options such as `parallel` and chunking. I am not sure how I could debug this any further.

Libraries

pint_xarray.__version__ =  0.3
cf_xarray.__version__ =  0.9.0

Expected behavior

quantify() should not fail.

Minimal example

import re
from typing import Union

import cf_xarray.units
import cftime
import pint_xarray
import xarray as xr

def preprocess_nc(
    ds: xr.Dataset,
    regexp: str = "id_(.+?)_",
    dim: str = "exp_id",
) -> xr.Dataset:
    """
    Add an experiment id ('exp_id') to the dataset.

    The id is extracted from the file path recorded in
    ``ds.encoding["source"]`` using the provided regular expression,
    converted to ``int`` when possible, and stored on the dataset
    under ``dim``.

    Parameters
    ----------
    ds : xr.Dataset
        The dataset to be preprocessed.
    regexp : str, optional
        The regular expression used to extract the experiment id from the
        source encoding, by default "id_(.+?)_".
    dim : str, optional
        The name under which the id is stored on the dataset, by default
        "exp_id".

    Returns
    -------
    xr.Dataset
        The preprocessed dataset.

    Raises
    ------
    ValueError
        If ``regexp`` does not match the source path.
    """
    source = ds.encoding["source"]
    match = re.search(regexp, source)
    if match is None:
        # Raise instead of `assert`: asserts are stripped under `python -O`.
        raise ValueError(f"could not extract an experiment id from {source!r}")

    # NOTE(review): the original called `ds.expand_dims(dim)` and discarded
    # the result -- xarray methods return new objects, so that call was a
    # no-op and has been removed. Assign the result (`ds = ds.expand_dims(dim)`)
    # if the extra dimension is actually wanted.

    raw_id = match.group(1)
    m_id: Union[str, int]
    try:
        # Prefer a numeric id when the captured text parses as an integer.
        m_id = int(raw_id)
    except ValueError:
        # Catch only ValueError -- a bare `except:` would also swallow
        # e.g. KeyboardInterrupt. Non-numeric ids are kept as strings.
        m_id = raw_id
    ds[dim] = m_id
    return ds

# Synthetic 5x5 CF-compliant dataset in dict form (see xr.Dataset.from_dict).
_THK_EDGE = 499.6684394216143
_THK_CENTER = 499.66844085642595

d = {
    "coords": {
        "lat": {
            "dims": ("y", "x"),
            "attrs": {
                "units": "degree_north",
                "valid_range": [-90.0, 90.0],
                "long_name": "latitude",
                "standard_name": "latitude",
            },
            # Placeholder grid: all latitudes zero.
            "data": [[0.0] * 5 for _ in range(5)],
        },
        "lon": {
            "dims": ("y", "x"),
            "attrs": {
                "units": "degree_east",
                "valid_range": [-180.0, 180.0],
                "long_name": "longitude",
                "standard_name": "longitude",
            },
            # Placeholder grid: all longitudes zero.
            "data": [[0.0] * 5 for _ in range(5)],
        },
        "time": {
            "dims": ("time",),
            "attrs": {"axis": "T", "long_name": "time"},
            # Single no-leap calendar timestamp.
            "data": [
                cftime.DatetimeNoLeap(1001, 1, 1, 0, 0, 0, 0, has_year_zero=True)
            ],
        },
        "x": {
            "dims": ("x",),
            "attrs": {
                "units": "m",
                "axis": "X",
                "long_name": "X-coordinate in Cartesian system",
                "standard_name": "projection_x_coordinate",
                "spacing_meters": 375000.0,
            },
            "data": [-750000.0, -375000.0, 0.0, 375000.0, 750000.0],
        },
        "y": {
            "dims": ("y",),
            "attrs": {
                "units": "m",
                "axis": "Y",
                "long_name": "Y-coordinate in Cartesian system",
                "standard_name": "projection_y_coordinate",
                "spacing_meters": 375000.0,
            },
            "data": [-750000.0, -375000.0, 0.0, 375000.0, 750000.0],
        },
    },
    "attrs": {"Conventions": "CF-1.6"},
    "dims": {"y": 5, "x": 5, "time": 1},
    "data_vars": {
        "thk": {
            "dims": ("time", "y", "x"),
            "attrs": {
                "units": "m",
                "valid_min": 0.0,
                "long_name": "land ice thickness",
                "standard_name": "land_ice_thickness",
            },
            # A small cross of non-zero ice thickness in the grid center.
            "data": [
                [
                    [0.0, 0.0, 0.0, 0.0, 0.0],
                    [0.0, 0.0, _THK_EDGE, 0.0, 0.0],
                    [0.0, _THK_EDGE, _THK_CENTER, _THK_EDGE, 0.0],
                    [0.0, 0.0, _THK_EDGE, 0.0, 0.0],
                    [0.0, 0.0, 0.0, 0.0, 0.0],
                ]
            ],
        }
    },
}

# Serialize the example dataset to two files whose names carry different
# experiment ids ("foo" and "bar") matchable by the default `id_(.+?)_` regexp.
ds = xr.Dataset.from_dict(d)

file1 = "test_id_foo_0_1000.nc"
file2 = "test_id_bar_0_1000.nc"

ds.to_netcdf(file1)
ds.to_netcdf(file2)

# Path 1: open each file separately, preprocess, then concatenate along
# "exp_id". Per the report above, quantify() succeeds on this path.
ds1 = preprocess_nc(xr.open_dataset(file1)).chunk("auto")
ds2 = preprocess_nc(xr.open_dataset(file2)).chunk("auto")
single_ds = xr.concat([ds1, ds2], dim="exp_id")
single_ds.pint.quantify()

# Path 2: open both files at once with open_mfdataset.
# NOTE(review): per the maintainer's reply below, `decode_times=False` is the
# trigger -- it leaves the raw time units string (a "<unit> since <date>"
# attribute) on the time variable, which pint then fails to parse.
mf_ds = xr.open_mfdataset("test_id_*.nc",
                          preprocess=preprocess_nc,
                          concat_dim="exp_id",
                          combine="nested",
                          chunks="auto",
                          parallel=True,
                          decode_times=False)

# Fails with the TypeError shown in the traceback below.
mf_ds.pint.quantify()

---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
Cell In[6], line 117
    107 single_ds.pint.quantify()
    109 mf_ds = xr.open_mfdataset("test_id_*.nc",
    110                           preprocess=preprocess_nc,
    111                           concat_dim="exp_id",
   (...)
    114                           parallel=True,
    115                           decode_times=False)
--> 117 mf_ds.pint.quantify()

File ~/miniforge3/envs/glacier-flow-tools/lib/python3.12/site-packages/pint_xarray/accessors.py:1085, in PintDatasetAccessor.quantify(self, units, unit_registry, **unit_kwargs)
   1083 if unit is not _default or attr not in (None, _default):
   1084     try:
-> 1085         new_units[name] = _decide_units(unit, registry, attr)
   1086     except (ValueError, pint.UndefinedUnitError) as e:
   1087         if unit is not _default:

File ~/miniforge3/envs/glacier-flow-tools/lib/python3.12/site-packages/pint_xarray/accessors.py:138, in _decide_units(units, registry, unit_attribute)
    136         units = unit_attribute
    137     else:
--> 138         units = registry.parse_units(unit_attribute)
    139 else:
    140     units = registry.parse_units(units)

File ~/miniforge3/envs/glacier-flow-tools/lib/python3.12/site-packages/pint/facets/plain/registry.py:1189, in GenericPlainRegistry.parse_units(self, input_string, as_delta, case_sensitive)
   1161 def parse_units(
   1162     self,
   1163     input_string: str,
   1164     as_delta: Optional[bool] = None,
   1165     case_sensitive: Optional[bool] = None,
   1166 ) -> UnitT:
   1167     """Parse a units expression and returns a UnitContainer with
   1168     the canonical names.
   1169 
   (...)
   1185 
   1186     """
   1188     return self.Unit(
-> 1189         self.parse_units_as_container(input_string, as_delta, case_sensitive)
   1190     )

File ~/miniforge3/envs/glacier-flow-tools/lib/python3.12/site-packages/pint/facets/nonmultiplicative/registry.py:70, in GenericNonMultiplicativeRegistry.parse_units_as_container(self, input_string, as_delta, case_sensitive)
     67 if as_delta is None:
     68     as_delta = self.default_as_delta
---> 70 return super().parse_units_as_container(input_string, as_delta, case_sensitive)

File ~/miniforge3/envs/glacier-flow-tools/lib/python3.12/site-packages/pint/facets/plain/registry.py:1204, in GenericPlainRegistry.parse_units_as_container(self, input_string, as_delta, case_sensitive)
   1198 as_delta = (
   1199     as_delta if as_delta is not None else True
   1200 )  # TODO This only exists in nonmultiplicative
   1201 case_sensitive = (
   1202     case_sensitive if case_sensitive is not None else self.case_sensitive
   1203 )
-> 1204 return self._parse_units_as_container(input_string, as_delta, case_sensitive)

File ~/miniforge3/envs/glacier-flow-tools/lib/python3.12/site-packages/pint/facets/plain/registry.py:1232, in GenericPlainRegistry._parse_units_as_container(self, input_string, as_delta, case_sensitive)
   1229 # Sanitize input_string with whitespaces.
   1230 input_string = input_string.strip()
-> 1232 units = ParserHelper.from_string(input_string, self.non_int_type)
   1233 if units.scale != 1:
   1234     raise ValueError("Unit expression cannot have a scaling factor.")

File ~/miniforge3/envs/glacier-flow-tools/lib/python3.12/site-packages/pint/util.py:767, in ParserHelper.from_string(cls, input_string, non_int_type)
    764     reps = False
    766 gen = pint_eval.tokenizer(input_string)
--> 767 ret = build_eval_tree(gen).evaluate(
    768     partial(cls.eval_token, non_int_type=non_int_type)
    769 )
    771 if isinstance(ret, Number):
    772     return cls(ret, non_int_type=non_int_type)

File ~/miniforge3/envs/glacier-flow-tools/lib/python3.12/site-packages/pint/pint_eval.py:384, in EvalTreeNode.evaluate(self, define_op, bin_op, un_op)
    380     if op_text not in bin_op:
    381         raise DefinitionSyntaxError(f"missing binary operator '{op_text}'")
    383     return bin_op[op_text](
--> 384         self.left.evaluate(define_op, bin_op, un_op),
    385         self.right.evaluate(define_op, bin_op, un_op),
    386     )
    387 elif self.operator:
    388     assert isinstance(self.left, EvalTreeNode), "self.left not EvalTreeNode (4)"

File ~/miniforge3/envs/glacier-flow-tools/lib/python3.12/site-packages/pint/pint_eval.py:383, in EvalTreeNode.evaluate(self, define_op, bin_op, un_op)
    380     if op_text not in bin_op:
    381         raise DefinitionSyntaxError(f"missing binary operator '{op_text}'")
--> 383     return bin_op[op_text](
    384         self.left.evaluate(define_op, bin_op, un_op),
    385         self.right.evaluate(define_op, bin_op, un_op),
    386     )
    387 elif self.operator:
    388     assert isinstance(self.left, EvalTreeNode), "self.left not EvalTreeNode (4)"

TypeError: unsupported operand type(s) for -: 'ParserHelper' and 'int'
keewis commented 2 months ago

Thanks for the report. Your issue is caused by `decode_times=False`: time coordinates have units containing a date, typically something like `seconds since 2019-03-02 23:51:11`. pint thinks the date is an arithmetic expression (`2019 - 03 - 02`) and tries to evaluate it, only to fail because it can only parse expressions involving units.

In #241 I simply told pint-xarray to ignore variables with time units. In other words, this is already fixed on main, and I'll take this issue as a reminder to issue a new release.