tinkoff-ai / etna

ETNA – Time-Series Library
https://etna.tinkoff.ru
Apache License 2.0
862 stars 80 forks source link

[BUG] ResampleWithDistributionTransform with holidays transform #1304

Open martins0n opened 1 year ago

martins0n commented 1 year ago

🐛 Bug Report

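ResampleWithDistributionTransform doesn't work correctly with the current behaviour of HolidayTransform: the column HolidayTransform produces has `category` dtype, and the resampling step fails when it tries to multiply that column (see the traceback below).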

Expected behavior

Columns should be cast to numerical types before resampling, as is done in _SklearnAdapter.

How To Reproduce


import etna
from etna.commands import mult
from etna.datasets import TSDataset
from etna.datasets.datasets_generation import generate_from_patterns_df
from etna.loggers import tslogger
from etna.metrics import MAE
from etna.metrics import MSE
from etna.metrics import SMAPE
from etna.metrics import MedAE
from etna.metrics import Sign
from etna.pipeline import Pipeline
from hydra_slayer import get_from_params

from omegaconf import OmegaConf

# Number of timestamps to generate for each frequency alias.
periods_x_freq = {
    "D": 300,
    "H": 24 * 300,
    "T": 60 * 24 * 10,
    "MS": 50,
    "W-MON": 100,
    "W-SUN": 100,
    "W": 100,
}

# Frequency used for this reproduction (hourly data).
freq = "H"

# One repeating value pattern per generated series.
segment_patterns = [
    [10, 20, 30, 40, 50, 60, 70, 80, 90, 100],
    [100, 90, 80, 70, 60, 50, 40, 30, 20, 10],
    [20, 40, 40, 50],
]

# Build the synthetic data frame, convert it with TSDataset.to_dataset and
# wrap the result in a TSDataset, naming every intermediate explicitly.
patterns_df = generate_from_patterns_df(
    periods=periods_x_freq[freq],
    start_time="1990-01-01",
    freq=freq,
    patterns=segment_patterns,
)
dataset_df = TSDataset.to_dataset(patterns_df)
ts = TSDataset(dataset_df, freq=freq)

# Pipeline specification written in `_target_` form. Later in this script it is
# wrapped with OmegaConf.create, resolved to a plain container and unpacked
# into get_from_params to build the Pipeline object.
model = {
    "_target_": "etna.pipeline.Pipeline",
    # Referenced by the `${horizon}` interpolation in the LagTransform entry.
    "horizon": 2,
    "model": {
      "_target_": "etna.models.ElasticMultiSegmentModel"
    },
    # Transforms are listed in the order they appear in the pipeline.
    "transforms": [
      {
        "_target_": "etna.transforms.TimeSeriesImputerTransform",
        "in_column": "target",
        "strategy": "constant"
      },
      {
        # Writes its result to the "holiday_regressor" column, which the next
        # transform reads as its in_column.
        "_target_": "etna.transforms.HolidayTransform",
        "out_column": "holiday_regressor"
      },
      {
        # The transform this report is about: the traceback ends in
        # resample.py multiplying df[in_column] by the distribution, which
        # raises TypeError for a `category` dtype column.
        "_target_": "etna.transforms.ResampleWithDistributionTransform",
        "distribution_column": "target",
        "in_column": "holiday_regressor"
      },
      {
        "_target_": "etna.transforms.SegmentEncoderTransform"
      },
      {
        "_target_": "etna.transforms.LagTransform",
        "in_column": "target",
        # OmegaConf interpolation: the custom `shift` resolver receives the
        # `horizon` value and the list 1..168.
        # NOTE(review): `shift` is presumably registered as a side effect of
        # importing etna.commands at the top of the script - confirm.
        "lags": "${shift:${horizon},[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168]}"
      }
    ]
}

# Wrap the raw specification so OmegaConf can evaluate its `${...}`
# interpolations, then convert it back to plain Python containers.
omega_config = OmegaConf.create(model)
config = OmegaConf.to_container(omega_config, resolve=True)

# Metrics computed during the backtest.
metrics = [Sign(), SMAPE(), MAE(), MSE(), MedAE()]

# Build the pipeline object from the resolved `_target_` specification.
pipeline: Pipeline = get_from_params(**config)

# Run the backtest; in this report the call raises the TypeError shown in
# the traceback below.
backtest_result = pipeline.backtest(ts, metrics=metrics)
metrics_df, forecast_df, fold_info_df = backtest_result

Environment

No response

Additional context

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["fold"] = self._get_folds(df)
Traceback (most recent call last):
  File "/workspaces/etna/t.py", line 78, in <module>
    metrics_df, forecast_df, fold_info_df = pipeline.backtest(
  File "/workspaces/etna/etna/pipeline/base.py", line 966, in backtest
    self._folds = self._run_all_folds(
  File "/workspaces/etna/etna/pipeline/base.py", line 831, in _run_all_folds
    pipelines = parallel(
  File "/home/codespace/.cache/pypoetry/virtualenvs/etna-cCDvSR3a-py3.10/lib/python3.10/site-packages/joblib/parallel.py", line 1085, in __call__
    if self.dispatch_one_batch(iterator):
  File "/home/codespace/.cache/pypoetry/virtualenvs/etna-cCDvSR3a-py3.10/lib/python3.10/site-packages/joblib/parallel.py", line 901, in dispatch_one_batch
    self._dispatch(tasks)
  File "/home/codespace/.cache/pypoetry/virtualenvs/etna-cCDvSR3a-py3.10/lib/python3.10/site-packages/joblib/parallel.py", line 819, in _dispatch
    job = self._backend.apply_async(batch, callback=cb)
  File "/home/codespace/.cache/pypoetry/virtualenvs/etna-cCDvSR3a-py3.10/lib/python3.10/site-packages/joblib/_parallel_backends.py", line 208, in apply_async
    result = ImmediateResult(func)
  File "/home/codespace/.cache/pypoetry/virtualenvs/etna-cCDvSR3a-py3.10/lib/python3.10/site-packages/joblib/_parallel_backends.py", line 597, in __init__
    self.results = batch()
  File "/home/codespace/.cache/pypoetry/virtualenvs/etna-cCDvSR3a-py3.10/lib/python3.10/site-packages/joblib/parallel.py", line 288, in __call__
    return [func(*args, **kwargs)
  File "/home/codespace/.cache/pypoetry/virtualenvs/etna-cCDvSR3a-py3.10/lib/python3.10/site-packages/joblib/parallel.py", line 288, in <listcomp>
    return [func(*args, **kwargs)
  File "/workspaces/etna/etna/pipeline/base.py", line 678, in _fit_backtest_pipeline
    pipeline.fit(ts=ts)
  File "/workspaces/etna/etna/pipeline/pipeline.py", line 56, in fit
    self.ts.fit_transform(self.transforms)
  File "/workspaces/etna/etna/datasets/tsdataset.py", line 200, in fit_transform
    transform.fit_transform(self)
  File "/workspaces/etna/etna/transforms/base.py", line 145, in fit_transform
    return self.fit(ts=ts).transform(ts=ts)
  File "/workspaces/etna/etna/transforms/base.py", line 126, in transform
    df_transformed = self._transform(df=df)
  File "/workspaces/etna/etna/transforms/base.py", line 366, in _transform
    seg_df = segment_transform.transform(df[segment])
  File "/workspaces/etna/etna/transforms/missing_values/resample.py", line 101, in transform
    df[self.out_column] = df[self.in_column].ffill() * df["distribution"]
  File "/home/codespace/.cache/pypoetry/virtualenvs/etna-cCDvSR3a-py3.10/lib/python3.10/site-packages/pandas/core/ops/common.py", line 72, in new_method
    return method(self, other)
  File "/home/codespace/.cache/pypoetry/virtualenvs/etna-cCDvSR3a-py3.10/lib/python3.10/site-packages/pandas/core/arraylike.py", line 118, in __mul__
    return self._arith_method(other, operator.mul)
  File "/home/codespace/.cache/pypoetry/virtualenvs/etna-cCDvSR3a-py3.10/lib/python3.10/site-packages/pandas/core/series.py", line 6259, in _arith_method
    return base.IndexOpsMixin._arith_method(self, other, op)
  File "/home/codespace/.cache/pypoetry/virtualenvs/etna-cCDvSR3a-py3.10/lib/python3.10/site-packages/pandas/core/base.py", line 1325, in _arith_method
    result = ops.arithmetic_op(lvalues, rvalues, op)
  File "/home/codespace/.cache/pypoetry/virtualenvs/etna-cCDvSR3a-py3.10/lib/python3.10/site-packages/pandas/core/ops/array_ops.py", line 218, in arithmetic_op
    res_values = op(left, right)
  File "/home/codespace/.cache/pypoetry/virtualenvs/etna-cCDvSR3a-py3.10/lib/python3.10/site-packages/pandas/core/arrays/categorical.py", line 1639, in __array_ufunc__
    raise TypeError(
TypeError: Object with dtype category cannot perform the numpy op multiply

Checklist