pandas-dev / pandas

Flexible and powerful data analysis / manipulation library for Python, providing labeled data structures similar to R data.frame objects, statistical functions, and much more
https://pandas.pydata.org
BSD 3-Clause "New" or "Revised" License
43.27k stars 17.79k forks source link

BUG: 2-sided inplace drop loses freq in DatetimeIndex #58743

Open sam-s opened 4 months ago

sam-s commented 4 months ago

Pandas version checks

Reproducible Example

import datetime
import pandas as pd

index = pd.date_range(start=datetime.date(2024,5,10), end=datetime.date(2024,5,15))
# index.freq == <Day>

df = pd.DataFrame({"a":[1]*len(index)}, index=index)
# df.index.freq == <Day>

df.drop(index=df.index[(df.index > pd.Timestamp("2024-05-13")) |
                       (df.index < pd.Timestamp("2024-05-11"))], inplace=True)
# df.index.freq == None, not <Day>
assert df.index.freq == index.freq

Issue Description

drop(inplace=True) with 2 sided limits loses freq from the DatetimeIndex

Expected Behavior

The df.index.freq should be the same after drop as before

Installed Versions

INSTALLED VERSIONS ------------------ commit : d9cdd2ee5a58015ef6f4d15c7226110c9aab8140 python : 3.12.1.final.0 python-bits : 64 OS : Windows OS-release : 10 Version : 10.0.19045 machine : AMD64 processor : Intel64 Family 6 Model 85 Stepping 7, GenuineIntel byteorder : little LC_ALL : None LANG : en_US.UTF-8 LOCALE : English_United States.1252 pandas : 2.2.2 numpy : 1.26.4 pytz : 2024.1 dateutil : 2.8.2 setuptools : 69.5.1 pip : 24.0 Cython : None pytest : None hypothesis : None sphinx : None blosc : None feather : None xlsxwriter : None lxml.etree : 5.2.2 html5lib : None pymysql : None psycopg2 : None jinja2 : 3.1.4 IPython : 8.24.0 pandas_datareader : None adbc-driver-postgresql: None adbc-driver-sqlite : None bs4 : 4.12.3 bottleneck : None dataframe-api-compat : None fastparquet : None fsspec : 2024.3.1 gcsfs : None matplotlib : 3.8.4 numba : None numexpr : None odfpy : None openpyxl : 3.1.2 pandas_gbq : None pyarrow : 15.0.0 pyreadstat : None python-calamine : None pyxlsb : None s3fs : None scipy : 1.13.0 sqlalchemy : None tables : None tabulate : None xarray : None xlrd : None zstandard : None tzdata : 2024.1 qtpy : 2.4.1 pyqt5 : None
sam-s commented 4 months ago

Possibly related: Preserving DatetimeIndex freq in MultiIndex in Pandas

rhshadrach commented 4 months ago

Thanks for the report, noting that the issue still occurs when inplace=False. Further investigations and PRs to fix are welcome.

mcmrc commented 3 months ago

@sam-s @rhshadrach Hi, all. Can I take this issue? Please tell me if you mind it.

mcmrc commented 3 months ago

TL;DR

Because delete method of class Index uses _constructor, freq is lost.


BUG Situation

Pandas Version Checks

Reproducible Example inplace=False

version=3.0.0.dev0+1023.g3b48b17e52

import datetime
import pandas as pd

index = pd.date_range(start=datetime.date(2024,5,10), end=datetime.date(2024,5,15))
# index.freq == <Day>

df = pd.DataFrame({"a":[1]*len(index)}, index=index)
# df.index.freq == <Day>

# set inplace as false
df = df.drop(index=df.index[(df.index > pd.Timestamp("2024-05-13")) |
                       (df.index < pd.Timestamp("2024-05-11"))], inplace=False)

# df.index.freq == None, not <Day>
assert df.index.freq == index.freq

Investigations

drop method of NDFrame calls _drop_axis method

obj holds index.freq just before _drop_axis called.

def drop(
        self,
        labels: IndexLabel | ListLike = None,
        *,
        axis: Axis = 0,
        index: IndexLabel | ListLike = None,
        columns: IndexLabel | ListLike = None,
        level: Level | None = None,
        inplace: bool = False,
        errors: IgnoreRaise = "raise",
    ) -> Self | None:
        inplace = validate_bool_kwarg(inplace, "inplace")

        if labels is not None:
            if index is not None or columns is not None:
                raise ValueError("Cannot specify both 'labels' and 'index'/'columns'")
            axis_name = self._get_axis_name(axis)
            axes = {axis_name: labels}
        elif index is not None or columns is not None:
            axes = {"index": index}
            if self.ndim == 2:
                axes["columns"] = columns
        else:
            raise ValueError(
                "Need to specify at least one of 'labels', 'index' or 'columns'"
            )

        obj = self

        for axis, labels in axes.items():
            if labels is not None:
                # obj still holds index.freq
                obj = obj._drop_axis(labels, axis, level=level, errors=errors)
                # obj.index.freq is None

        if inplace:
            self._update_inplace(obj)
            return None
        else:
            return obj

axis.drop returns new_axis without freq

    @final
    def _drop_axis(
        self,
        labels,
        axis,
        level=None,
        errors: IgnoreRaise = "raise",
        only_slice: bool = False,
    ) -> Self:
        """
        Drop labels from specified axis. Used in the ``drop`` method
        internally.

        Parameters
        ----------
        labels : single label or list-like
        axis : int or axis name
        level : int or level name, default None
            For MultiIndex
        errors : {'ignore', 'raise'}, default 'raise'
            If 'ignore', suppress error and existing labels are dropped.
        only_slice : bool, default False
            Whether indexing along columns should be view-only.

        """
        axis_num = self._get_axis_number(axis)
        axis = self._get_axis(axis)

        if axis.is_unique:
            if level is not None:
                if not isinstance(axis, MultiIndex):
                    raise AssertionError("axis must be a MultiIndex")
                new_axis = axis.drop(labels, level=level, errors=errors)
            else:
                new_axis = axis.drop(labels, errors=errors)
            indexer = axis.get_indexer(new_axis)
        # new_axis.freq is None

self.delete(indexer) returnes self(Index) with freq=None

    def drop(
        self,
        labels: Index | np.ndarray | Iterable[Hashable],
        errors: IgnoreRaise = "raise",
    ) -> Index:
        """
        Make new Index with passed list of labels deleted.

        Parameters
        ----------
        labels : array-like or scalar
            Array-like object or a scalar value, representing the labels to be removed
            from the Index.
        errors : {'ignore', 'raise'}, default 'raise'
            If 'ignore', suppress error and existing labels are dropped.

        Returns
        -------
        Index
            Will be same type as self, except for RangeIndex.

        Raises
        ------
        KeyError
            If not all of the labels are found in the selected axis

        See Also
        --------
        Index.dropna : Return Index without NA/NaN values.
        Index.drop_duplicates : Return Index with duplicate values removed.

        Examples
        --------
        >>> idx = pd.Index(["a", "b", "c"])
        >>> idx.drop(["a"])
        Index(['b', 'c'], dtype='object')
        """
        if not isinstance(labels, Index):
            # avoid materializing e.g. RangeIndex
            arr_dtype = "object" if self.dtype == "object" else None
            labels = com.index_labels_to_array(labels, dtype=arr_dtype)

        indexer = self.get_indexer_for(labels)
        mask = indexer == -1
        if mask.any():
            if errors != "ignore":
                raise KeyError(f"{labels[mask].tolist()} not found in axis")
            indexer = indexer[~mask]
        # self.delete(indexer) returns Index without freq
        return self.delete(indexer)