pandas-dev / pandas

Flexible and powerful data analysis / manipulation library for Python, providing labeled data structures similar to R data.frame objects, statistical functions, and much more
BSD 3-Clause "New" or "Revised" License
43.27k stars 17.79k forks source link

BUG: 2-sided inplace drop loses freq in DatetimeIndex #58743

Open sam-s opened 4 months ago

sam-s commented 4 months ago

Pandas version checks

Reproducible Example

import datetime
import pandas as pd

index = pd.date_range(datetime.date(2024,5,10),datetime.date(2024,5,15))
# index.freq == <Day>

df = pd.DataFrame({"a":[1]*len(index)}, index=index)
# df.index.freq == <Day>

df.drop(index=df.index[(df.index > pd.Timestamp("2024-05-13")) |
                       (df.index < pd.Timestamp("2024-05-11"))], inplace=True)
# df.index.freq == None, not <Day>
assert df.index.freq == index.freq

Issue Description

drop(inplace=True) with 2 sided limits loses freq from the DatetimeIndex

Expected Behavior

The df.index.freq should be the same after drop as before

Installed Versions

INSTALLED VERSIONS ------------------ commit : d9cdd2ee5a58015ef6f4d15c7226110c9aab8140 python : python-bits : 64 OS : Windows OS-release : 10 Version : 10.0.19045 machine : AMD64 processor : Intel64 Family 6 Model 85 Stepping 7, GenuineIntel byteorder : little LC_ALL : None LANG : en_US.UTF-8 LOCALE : English_United States.1252 pandas : 2.2.2 numpy : 1.26.4 pytz : 2024.1 dateutil : 2.8.2 setuptools : 69.5.1 pip : 24.0 Cython : None pytest : None hypothesis : None sphinx : None blosc : None feather : None xlsxwriter : None lxml.etree : 5.2.2 html5lib : None pymysql : None psycopg2 : None jinja2 : 3.1.4 IPython : 8.24.0 pandas_datareader : None adbc-driver-postgresql: None adbc-driver-sqlite : None bs4 : 4.12.3 bottleneck : None dataframe-api-compat : None fastparquet : None fsspec : 2024.3.1 gcsfs : None matplotlib : 3.8.4 numba : None numexpr : None odfpy : None openpyxl : 3.1.2 pandas_gbq : None pyarrow : 15.0.0 pyreadstat : None python-calamine : None pyxlsb : None s3fs : None scipy : 1.13.0 sqlalchemy : None tables : None tabulate : None xarray : None xlrd : None zstandard : None tzdata : 2024.1 qtpy : 2.4.1 pyqt5 : None
sam-s commented 4 months ago

Possibly related: Preserving DatetimeIndex freq in MultiIndex in Pandas

rhshadrach commented 4 months ago

Thanks for the report, noting that the issue still occurs when inplace=False. Further investigations and PRs to fix are welcome.

mcmrc commented 3 months ago

@sam-s @rhshadrach Hi all. May I take this issue? Please let me know if you have any objection.

mcmrc commented 3 months ago


The freq is lost because the delete method of class Index builds its result with _constructor.

BUG Situation

Pandas Version Checks

Reproducible Example inplace=False


import datetime
import pandas as pd

index = pd.date_range(datetime.date(2024,5,10),datetime.date(2024,5,15))
# index.freq == <Day>

df = pd.DataFrame({"a":[1]*len(index)}, index=index)
# df.index.freq == <Day>

# set inplace as false
df = df.drop(index=df.index[(df.index > pd.Timestamp("2024-05-13")) |
                       (df.index < pd.Timestamp("2024-05-11"))], inplace=False)

# df.index.freq == None, not <Day>
assert df.index.freq == index.freq


drop method of NDFrame calls _drop_axis method

obj still holds index.freq just before _drop_axis is called.

def drop(
        self,
        labels: IndexLabel | ListLike = None,
        *,
        axis: Axis = 0,
        index: IndexLabel | ListLike = None,
        columns: IndexLabel | ListLike = None,
        level: Level | None = None,
        inplace: bool = False,
        errors: IgnoreRaise = "raise",
    ) -> Self | None:
        inplace = validate_bool_kwarg(inplace, "inplace")

        if labels is not None:
            if index is not None or columns is not None:
                raise ValueError("Cannot specify both 'labels' and 'index'/'columns'")
            axis_name = self._get_axis_name(axis)
            axes = {axis_name: labels}
        elif index is not None or columns is not None:
            axes = {"index": index}
            if self.ndim == 2:
                axes["columns"] = columns
        else:
            raise ValueError(
                "Need to specify at least one of 'labels', 'index' or 'columns'"
            )

        obj = self

        for axis, labels in axes.items():
            if labels is not None:
                # obj still holds index.freq
                obj = obj._drop_axis(labels, axis, level=level, errors=errors)
                # obj.index.freq is None

        if inplace:
            self._update_inplace(obj)
            return None
        else:
            return obj

axis.drop returns new_axis without freq

    def _drop_axis(
        self,
        labels,
        axis,
        level=None,
        errors: IgnoreRaise = "raise",
        only_slice: bool = False,
    ) -> Self:
        """
        Drop labels from specified axis. Used in the ``drop`` method
        internally.

        Parameters
        ----------
        labels : single label or list-like
        axis : int or axis name
        level : int or level name, default None
            For MultiIndex
        errors : {'ignore', 'raise'}, default 'raise'
            If 'ignore', suppress error and existing labels are dropped.
        only_slice : bool, default False
            Whether indexing along columns should be view-only.

        """

        axis_num = self._get_axis_number(axis)
        axis = self._get_axis(axis)

        if axis.is_unique:
            if level is not None:
                if not isinstance(axis, MultiIndex):
                    raise AssertionError("axis must be a MultiIndex")
                new_axis = axis.drop(labels, level=level, errors=errors)
            else:
                new_axis = axis.drop(labels, errors=errors)
            indexer = axis.get_indexer(new_axis)
        # new_axis.freq is None

self.delete(indexer) returns a new Index with freq=None

    def drop(
        self,
        labels: Index | np.ndarray | Iterable[Hashable],
        errors: IgnoreRaise = "raise",
    ) -> Index:
        """
        Make new Index with passed list of labels deleted.

        Parameters
        ----------
        labels : array-like or scalar
            Array-like object or a scalar value, representing the labels to be removed
            from the Index.
        errors : {'ignore', 'raise'}, default 'raise'
            If 'ignore', suppress error and existing labels are dropped.

        Returns
        -------
        Index
            Will be same type as self, except for RangeIndex.

        Raises
        ------
        KeyError
            If not all of the labels are found in the selected axis

        See Also
        --------
        Index.dropna : Return Index without NA/NaN values.
        Index.drop_duplicates : Return Index with duplicate values removed.

        Examples
        --------
        >>> idx = pd.Index(["a", "b", "c"])
        >>> idx.drop(["a"])
        Index(['b', 'c'], dtype='object')
        """
        if not isinstance(labels, Index):
            # avoid materializing e.g. RangeIndex
            arr_dtype = "object" if self.dtype == "object" else None
            labels = com.index_labels_to_array(labels, dtype=arr_dtype)

        indexer = self.get_indexer_for(labels)
        mask = indexer == -1
        if mask.any():
            if errors != "ignore":
                raise KeyError(f"{labels[mask].tolist()} not found in axis")
            indexer = indexer[~mask]
        # self.delete(indexer) returns Index without freq
        return self.delete(indexer)