tinkoff-ai / etna

ETNA – Time-Series Library
https://etna.tinkoff.ru
Apache License 2.0
862 stars 80 forks source link

Fix performance of `DeepARModel` and `TFTModel` #1322

Closed Mr-Geekman closed 1 year ago

Mr-Geekman commented 1 year ago

Before submitting (must do checklist)

Proposed Changes

Optimize creation of the `time_idx` feature.

Closing issues

Mr-Geekman commented 1 year ago

Script for old version (1.15.1):

import time
import random

import torch
import pandas as pd
import numpy as np
from loguru import logger

from etna.datasets.tsdataset import TSDataset
from etna.datasets import generate_ar_df
from etna.pipeline import Pipeline
from etna.metrics import SMAPE, MAPE, MAE
from etna.transforms import DateFlagsTransform
from etna.transforms import PytorchForecastingTransform

from etna.models.nn import TFTModel

# Number of future steps to forecast: used as the pipeline horizon and as
# ``max_prediction_length`` of the PytorchForecastingTransform.
HORIZON = 7

def generate_tsdataset(dataset_config) -> TSDataset:
    """Build a synthetic daily ``TSDataset`` described by ``dataset_config``.

    Parameters
    ----------
    dataset_config:
        Mapping with the keys ``"periods"`` (number of periods of the target),
        ``"n_segments"``, ``"exogs"`` and ``"regressors"`` (flags selecting
        which additional columns are generated) and ``"horizon"`` (number of
        extra periods generated for the additional columns).

    Returns
    -------
    TSDataset
        Dataset with the generated target and, when requested, an ``exog``
        and/or a ``regressor`` column. ``regressor`` is declared as known
        future whenever it is requested.

    Notes
    -----
    The ``regressor`` column is generated independently of the ``exogs`` flag,
    so ``known_future=["regressor"]`` never refers to a column that was not
    created.
    """
    periods = dataset_config["periods"]
    n_segments = dataset_config["n_segments"]
    regressors = dataset_config["regressors"]
    exogs = dataset_config["exogs"]
    horizon = dataset_config["horizon"]

    # Every generated frame shares the start date, the segments and the frequency.
    common = dict(start_time="2021-06-01", n_segments=n_segments, freq="D")

    df = generate_ar_df(periods=periods, **common)

    # Additional columns are generated for ``horizon`` more periods than the target.
    exog_frames = []
    if exogs:
        df_exog_column = generate_ar_df(periods=periods + horizon, **common)
        exog_frames.append(df_exog_column.rename(columns={"target": "exog"}))
    if regressors:
        df_regressors = generate_ar_df(periods=periods + horizon, **common)
        df_regressors = df_regressors.rename(columns={"target": "regressor"})
        # The first frame keeps all of its columns; a later one contributes only
        # its value column so the remaining columns are not duplicated by concat.
        exog_frames.append(df_regressors[["regressor"]] if exog_frames else df_regressors)

    df_exog = None
    if exog_frames:
        df_exog = TSDataset.to_dataset(pd.concat(exog_frames, axis=1))

    df = TSDataset.to_dataset(df)
    ts = TSDataset(
        df=df,
        freq="D",
        df_exog=df_exog,
        known_future=["regressor"] if regressors else (),
    )
    return ts

def set_seed(seed: int = 42):
    """Seed the Python, NumPy and PyTorch random generators with ``seed``.

    Parameters
    ----------
    seed:
        Value passed to ``random.seed``, ``np.random.seed``,
        ``torch.manual_seed`` and ``torch.cuda.manual_seed_all``, in that order.
    """
    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed_all,
    )
    for apply_seed in seeders:
        apply_seed(seed)

def main():
    """Time a 3-fold ``TFTModel`` backtest on a generated dataset.

    The dataset comes from ``generate_tsdataset``; the pipeline applies a
    ``DateFlagsTransform`` followed by a ``PytorchForecastingTransform``.
    The elapsed time of ``Pipeline.backtest`` and the mean of the ``MAE``
    metric column are written to the log.
    """
    set_seed()

    # An alternative input is examples/data/example_dataset.csv converted with
    # TSDataset.to_dataset and wrapped into TSDataset(df, freq="D").
    ts = generate_tsdataset(
        {
            "n_segments": 100,
            "periods": 500,
            "exogs": True,
            "regressors": True,
            "horizon": 14,
        }
    )

    # Seed again so that model construction and training start from the same
    # random state regardless of what the data generation consumed.
    set_seed()

    dft = DateFlagsTransform(
        day_number_in_week=True,
        day_number_in_month=False,
        out_column="regressor_dateflag",
    )
    pft = PytorchForecastingTransform(
        max_encoder_length=21,
        min_encoder_length=21,
        max_prediction_length=HORIZON,
        time_varying_known_reals=["time_idx"],
        time_varying_known_categoricals=["regressor_dateflag_day_number_in_week"],
        time_varying_unknown_reals=["target"],
        static_categoricals=["segment"],
        target_normalizer=None,
    )
    model_tft = TFTModel(trainer_kwargs={"max_epochs": 1})

    pipeline_tft = Pipeline(model=model_tft, transforms=[dft, pft], horizon=HORIZON)

    # Only the backtest call itself is timed.
    started_at = time.perf_counter()
    backtest_result = pipeline_tft.backtest(
        ts,
        metrics=[SMAPE(), MAPE(), MAE()],
        n_folds=3,
        n_jobs=1,
    )
    run_time = time.perf_counter() - started_at

    # The forecast and the fold info returned by backtest are not used here.
    metrics_tft, _forecast_tft, _fold_info_tft = backtest_result

    logger.info(f"Run time: {run_time:.3f}")
    logger.info(f"Metrics: {metrics_tft['MAE'].mean():.3f}")

# Run the benchmark only when the file is executed as a script.
if __name__ == "__main__":
    main()

Results:

Mr-Geekman commented 1 year ago

Script for new version:

import time
import random

import torch
import pandas as pd
import numpy as np
from loguru import logger

from etna.datasets.tsdataset import TSDataset
from etna.datasets import generate_ar_df
from etna.pipeline import Pipeline
from etna.metrics import SMAPE, MAPE, MAE
from etna.transforms import DateFlagsTransform
from etna.models.nn.utils import PytorchForecastingDatasetBuilder

from etna.models.nn import TFTModel

# Number of future steps to forecast: used as the pipeline horizon and as
# ``max_prediction_length`` of the PytorchForecastingDatasetBuilder.
HORIZON = 7

def generate_tsdataset(dataset_config) -> TSDataset:
    """Build a synthetic daily ``TSDataset`` described by ``dataset_config``.

    Parameters
    ----------
    dataset_config:
        Mapping with the keys ``"periods"`` (number of periods of the target),
        ``"n_segments"``, ``"exogs"`` and ``"regressors"`` (flags selecting
        which additional columns are generated) and ``"horizon"`` (number of
        extra periods generated for the additional columns).

    Returns
    -------
    TSDataset
        Dataset with the generated target and, when requested, an ``exog``
        and/or a ``regressor`` column. ``regressor`` is declared as known
        future whenever it is requested.

    Notes
    -----
    The ``regressor`` column is generated independently of the ``exogs`` flag,
    so ``known_future=["regressor"]`` never refers to a column that was not
    created.
    """
    periods = dataset_config["periods"]
    n_segments = dataset_config["n_segments"]
    regressors = dataset_config["regressors"]
    exogs = dataset_config["exogs"]
    horizon = dataset_config["horizon"]

    # Every generated frame shares the start date, the segments and the frequency.
    common = dict(start_time="2021-06-01", n_segments=n_segments, freq="D")

    df = generate_ar_df(periods=periods, **common)

    # Additional columns are generated for ``horizon`` more periods than the target.
    exog_frames = []
    if exogs:
        df_exog_column = generate_ar_df(periods=periods + horizon, **common)
        exog_frames.append(df_exog_column.rename(columns={"target": "exog"}))
    if regressors:
        df_regressors = generate_ar_df(periods=periods + horizon, **common)
        df_regressors = df_regressors.rename(columns={"target": "regressor"})
        # The first frame keeps all of its columns; a later one contributes only
        # its value column so the remaining columns are not duplicated by concat.
        exog_frames.append(df_regressors[["regressor"]] if exog_frames else df_regressors)

    df_exog = None
    if exog_frames:
        df_exog = TSDataset.to_dataset(pd.concat(exog_frames, axis=1))

    df = TSDataset.to_dataset(df)
    ts = TSDataset(
        df=df,
        freq="D",
        df_exog=df_exog,
        known_future=["regressor"] if regressors else (),
    )
    return ts

def set_seed(seed: int = 42):
    """Seed the Python, NumPy and PyTorch random generators with ``seed``.

    Parameters
    ----------
    seed:
        Value passed to ``random.seed``, ``np.random.seed``,
        ``torch.manual_seed`` and ``torch.cuda.manual_seed_all``, in that order.
    """
    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed_all,
    )
    for apply_seed in seeders:
        apply_seed(seed)

def main():
    """Time a 3-fold ``TFTModel`` backtest on a generated dataset.

    The dataset comes from ``generate_tsdataset``; the model receives a
    ``PytorchForecastingDatasetBuilder`` and the pipeline applies a single
    ``DateFlagsTransform``. The elapsed time of ``Pipeline.backtest`` and the
    mean of the ``MAE`` metric column are written to the log.
    """
    set_seed()

    # An alternative input is examples/data/example_dataset.csv converted with
    # TSDataset.to_dataset and wrapped into TSDataset(df, freq="D").
    ts = generate_tsdataset(
        {
            "n_segments": 100,
            "periods": 500,
            "exogs": True,
            "regressors": True,
            "horizon": 14,
        }
    )

    # Seed again so that model construction and training start from the same
    # random state regardless of what the data generation consumed.
    set_seed()

    dft = DateFlagsTransform(
        day_number_in_week=True,
        day_number_in_month=False,
        out_column="regressor_dateflag",
    )
    dataset_builder = PytorchForecastingDatasetBuilder(
        max_encoder_length=21,
        min_encoder_length=21,
        max_prediction_length=HORIZON,
        time_varying_known_reals=["time_idx"],
        time_varying_known_categoricals=["regressor_dateflag_day_number_in_week"],
        time_varying_unknown_reals=["target"],
        static_categoricals=["segment"],
        target_normalizer=None,
    )
    model_tft = TFTModel(
        dataset_builder=dataset_builder,
        trainer_params={"max_epochs": 1},
    )

    pipeline_tft = Pipeline(model=model_tft, transforms=[dft], horizon=HORIZON)

    # Only the backtest call itself is timed.
    started_at = time.perf_counter()
    backtest_result = pipeline_tft.backtest(
        ts,
        metrics=[SMAPE(), MAPE(), MAE()],
        n_folds=3,
        n_jobs=1,
    )
    run_time = time.perf_counter() - started_at

    # The forecast and the fold info returned by backtest are not used here.
    metrics_tft, _forecast_tft, _fold_info_tft = backtest_result

    logger.info(f"Run time: {run_time:.3f}")
    logger.info(f"Metrics: {metrics_tft['MAE'].mean():.3f}")

# Run the benchmark only when the file is executed as a script.
if __name__ == "__main__":
    main()

Results:

codecov-commenter commented 1 year ago

Codecov Report

Merging #1322 (0f994a0) into master (75e8fc1) will increase coverage by 0.14%. The diff coverage is 100.00%.

:exclamation: Your organization is not using the GitHub App Integration. As a result you may experience degraded service beginning May 15th. Please install the Github App Integration for your organization. Read more.

@@            Coverage Diff             @@
##           master    #1322      +/-   ##
==========================================
+ Coverage   88.95%   89.09%   +0.14%     
==========================================
  Files         193      204      +11     
  Lines       12319    12638     +319     
==========================================
+ Hits        10958    11260     +302     
- Misses       1361     1378      +17     
Impacted Files Coverage Δ
etna/models/nn/utils.py 85.61% <100.00%> (+0.40%) :arrow_up:

... and 11 files with indirect coverage changes

:mega: We’re building smart automated test selection to slash your CI/CD build times. Learn more

github-actions[bot] commented 1 year ago

🚀 Deployed on https://deploy-preview-1322--etna-docs.netlify.app