hmaarrfk / xarray

N-D labeled arrays and datasets in Python
http://xarray.pydata.org
Apache License 2.0
0 stars 0 forks source link

Ideas for micro optimization #29

Open hmaarrfk opened 6 months ago

hmaarrfk commented 6 months ago

What is your issue?

the call to self._parse_dimensions(dims) should be avoidable for internal routines.

However, it was giving me trouble in tests.

```patch diff --git a/xarray/core/variable.py b/xarray/core/variable.py index 6f5826a5..311aac22 100644 --- a/xarray/core/variable.py +++ b/xarray/core/variable.py @@ -396,7 +396,10 @@ class Variable(NamedArray, AbstractArray, VariableArithmetic): unrecognized encoding items. """ super().__init__( - dims=dims, data=as_compatible_data(data, fastpath=fastpath), attrs=attrs + dims=dims, + data=as_compatible_data(data, fastpath=fastpath), + attrs=attrs, + fastpath=fastpath, ) self._encoding = None @@ -2622,6 +2625,9 @@ class IndexVariable(Variable): def __init__(self, dims, data, attrs=None, encoding=None, fastpath=False): super().__init__(dims, data, attrs, encoding, fastpath) + if fastpath: + return + if self.ndim != 1: raise ValueError(f"{type(self).__name__} objects must be 1-dimensional") diff --git a/xarray/namedarray/core.py b/xarray/namedarray/core.py index 135dabc0..adceaa65 100644 --- a/xarray/namedarray/core.py +++ b/xarray/namedarray/core.py @@ -259,9 +259,14 @@ class NamedArray(NamedArrayAggregations, Generic[_ShapeType_co, _DType_co]): dims: _DimsLike, data: duckarray[Any, _DType_co], attrs: _AttrsLike = None, + *, + fastpath=True, ): self._data = data - self._dims = self._parse_dimensions(dims) + if fastpath: + self._dims = dims + else: + self._dims = self._parse_dimensions(dims) self._attrs = dict(attrs) if attrs else None def __init_subclass__(cls, **kwargs: Any) -> None: ```
hmaarrfk commented 6 months ago

_apply_index is rather slow because of pandas multi index compatibility.

I'm not sure how to fix that:

```patch From 1a659bcf8e0eced8e4309621f079a7f0ddf22fac Mon Sep 17 00:00:00 2001 From: Mark Harfouche Date: Sun, 5 May 2024 14:52:25 -0400 Subject: [PATCH] 12k->14.5k --- xarray/core/indexes.py | 69 ++++++++++++++++++++++++++++++++++------- xarray/core/indexing.py | 7 +++-- 2 files changed, 63 insertions(+), 13 deletions(-) diff --git a/xarray/core/indexes.py b/xarray/core/indexes.py index e71c4a6f073..22bdb0777d2 100644 --- a/xarray/core/indexes.py +++ b/xarray/core/indexes.py @@ -161,7 +161,10 @@ def unstack(self) -> tuple[dict[Hashable, Index], pd.MultiIndex]: raise NotImplementedError() def create_variables( - self, variables: Mapping[Any, Variable] | None = None + self, + variables: Mapping[Any, Variable] | None = None, + *, + fastpath=False, ) -> IndexVars: """Maybe create new coordinate variables from this index. @@ -575,13 +578,19 @@ class PandasIndex(Index): __slots__ = ("index", "dim", "coord_dtype") - def __init__(self, array: Any, dim: Hashable, coord_dtype: Any = None): - # make a shallow copy: cheap and because the index name may be updated - # here or in other constructors (cannot use pd.Index.rename as this - # constructor is also called from PandasMultiIndex) - index = safe_cast_to_index(array).copy() + def __init__( + self, array: Any, dim: Hashable, coord_dtype: Any = None, *, fastpath=False + ): + if fastpath: + index = array + else: + index = safe_cast_to_index(array) if index.name is None: + # make a shallow copy: cheap and because the index name may be updated + # here or in other constructors (cannot use pd.Index.rename as this + # constructor is also called from PandasMultiIndex) + index = index.copy() index.name = dim self.index = index @@ -596,7 +605,7 @@ def _replace(self, index, dim=None, coord_dtype=None): dim = self.dim if coord_dtype is None: coord_dtype = self.coord_dtype - return type(self)(index, dim, coord_dtype) + return type(self)(index, dim, coord_dtype, fastpath=True) @classmethod def from_variables( @@ -641,6 +650,8 @@ def from_variables( obj = cls(data, dim, coord_dtype=var.dtype) assert not isinstance(obj.index, pd.MultiIndex) + # Rename safely + obj.index = obj.index.copy() obj.index.name = name return obj @@ -684,7 +695,7 @@ def concat( return cls(new_pd_index, dim=dim, coord_dtype=coord_dtype) def create_variables( - self, variables: Mapping[Any, Variable] | None = None + self, variables: Mapping[Any, Variable] | None = None, *, fastpath=False ) -> IndexVars: from xarray.core.variable import IndexVariable @@ -701,7 +712,9 @@ def create_variables( encoding = None data = PandasIndexingAdapter(self.index, dtype=self.coord_dtype) - var = IndexVariable(self.dim, data, attrs=attrs, encoding=encoding) + var = IndexVariable( + self.dim, data, attrs=attrs, encoding=encoding, fastpath=fastpath + ) return {name: var} def to_pandas_index(self) -> pd.Index: @@ -1122,7 +1135,7 @@ def reorder_levels( return self._replace(index, level_coords_dtype=level_coords_dtype) def create_variables( - self, variables: Mapping[Any, Variable] | None = None + self, variables: Mapping[Any, Variable] | None = None, *, fastpath=False ) -> IndexVars: from xarray.core.variable import IndexVariable @@ -1772,6 +1785,37 @@ def check_variables(): return not not_equal +def _apply_indexes_fast(indexes: Indexes[Index], args: Mapping[Any, Any], func: str): + # This function avoids the call to indexes.group_by_index + # which is really slow when repeatidly iterating through + # an array. However, it fails to return the correct ID for + # multi-index arrays + indexes_fast, coords = indexes._indexes, indexes._variables + + new_indexes: dict[Hashable, Index] = {k: v for k, v in indexes_fast.items()} + new_index_variables: dict[Hashable, Variable] = {} + for name, index in indexes_fast.items(): + coord = coords[name] + if hasattr(coord, "_indexes"): + index_vars = {n: coords[n] for n in coord._indexes} + else: + index_vars = {name: coord} + index_dims = {d for var in index_vars.values() for d in var.dims} + index_args = {k: v for k, v in args.items() if k in index_dims} + + if index_args: + new_index = getattr(index, func)(index_args) + if new_index is not None: + new_indexes.update({k: new_index for k in index_vars}) + new_index_vars = new_index.create_variables(index_vars, fastpath=True) + new_index_variables.update(new_index_vars) + new_index_variables.update(new_index_vars) + else: + for k in index_vars: + new_indexes.pop(k, None) + return new_indexes, new_index_variables + + def _apply_indexes( indexes: Indexes[Index], args: Mapping[Any, Any], @@ -1800,7 +1844,10 @@ def isel_indexes( indexes: Indexes[Index], indexers: Mapping[Any, Any], ) -> tuple[dict[Hashable, Index], dict[Hashable, Variable]]: - return _apply_indexes(indexes, indexers, "isel") + if any(isinstance(v, PandasMultiIndex) for v in indexes._indexes.values()): + return _apply_indexes(indexes, indexers, "isel") + else: + return _apply_indexes_fast(indexes, indexers, "isel") def roll_indexes( diff --git a/xarray/core/indexing.py b/xarray/core/indexing.py index 4909d506d7b..02f1fb262c5 100644 --- a/xarray/core/indexing.py +++ b/xarray/core/indexing.py @@ -1662,10 +1662,13 @@ class PandasIndexingAdapter(ExplicitlyIndexedNDArrayMixin): __slots__ = ("array", "_dtype") - def __init__(self, array: pd.Index, dtype: DTypeLike = None): + def __init__(self, array: pd.Index, dtype: DTypeLike = None, *, fastpath=False): from xarray.core.indexes import safe_cast_to_index - self.array = safe_cast_to_index(array) + if fastpath: + self.array = array + else: + self.array = safe_cast_to_index(array) if dtype is None: self._dtype = get_valid_numpy_dtype(array) ```