tinkoff-ai / etna

ETNA – Time-Series Library
https://etna.tinkoff.ru
Apache License 2.0
862 stars 80 forks source link

Optimize `TSDataset.describe` and `TSDataset.info` by vectorization #1344

Closed Mr-Geekman closed 1 year ago

Mr-Geekman commented 1 year ago

Before submitting (must do checklist)

Proposed Changes

Look at #1341.

Closing issues

Closes #1341.

Mr-Geekman commented 1 year ago

Script for benchmarking `TSDataset.describe`:

import time
import json

import numpy as np
import pandas as pd
from loguru import logger

from etna.models import NaiveModel
from etna.datasets import TSDataset, generate_ar_df
from etna.metrics import MAE
from etna.pipeline import Pipeline

# NOTE(review): HORIZON is not referenced by any function in this script.
HORIZON = 14

def make_df(num_segments: int, num_features: int, num_periods: int, random_state: int = 0) -> pd.DataFrame:
    """Build a synthetic dataframe with extra random integer columns.

    The base frame comes from ``generate_ar_df`` (starting at "2020-01-01");
    ``num_features`` additional columns named ``new_int_<i>`` are then appended,
    each filled with integers drawn from ``[-100, 100)``.

    Parameters
    ----------
    num_segments:
        Number of segments requested from ``generate_ar_df``.
    num_features:
        Number of extra integer columns to append.
    num_periods:
        Number of periods requested from ``generate_ar_df``.
    random_state:
        Seed for the generator that fills the extra columns.

    Returns
    -------
    pd.DataFrame
        The generated frame with the extra columns added.
    """
    generator = np.random.default_rng(random_state)
    frame = generate_ar_df(periods=num_periods, start_time="2020-01-01", n_segments=num_segments)

    # Adding columns never changes the row count, so read it once.
    n_rows = frame.shape[0]
    for feature_idx in range(num_features):
        column_name = f"new_int_{feature_idx}"
        frame[column_name] = generator.integers(low=-100, high=100, size=n_rows)

    return frame

def check_time(num_segments: int, num_features: int, num_periods: int = 365):
    """Measure how long one ``TSDataset.describe`` call takes.

    A frame is generated with ``make_df``, converted with
    ``TSDataset.to_dataset`` and wrapped in a daily-frequency ``TSDataset``;
    only the ``describe`` call itself is timed.

    Parameters
    ----------
    num_segments:
        Number of segments in the generated data.
    num_features:
        Number of extra integer columns in the generated data.
    num_periods:
        Number of periods per segment.

    Returns
    -------
    Elapsed time in seconds, as measured by ``time.perf_counter``.
    """
    long_df = make_df(num_segments=num_segments, num_features=num_features, num_periods=num_periods)
    dataset = TSDataset(df=TSDataset.to_dataset(long_df), freq="D")

    # Dataset construction is deliberately kept outside the timed region.
    started_at = time.perf_counter()
    _ = dataset.describe()
    finished_at = time.perf_counter()

    return finished_at - started_at

def main():
    """Run the benchmark over a grid of sizes and save the timings.

    For every combination of segment count and feature count, ``check_time``
    is called once; each record is logged as JSON and the full list is written
    to ``records.json`` in the current working directory.
    """
    num_segments = [10, 100, 1000, 10_000, 100_000]
    num_features = [0, 3, 10]

    results = []
    for cur_num_segments in num_segments:
        for cur_num_features in num_features:
            time_result = check_time(num_segments=cur_num_segments, num_features=cur_num_features)
            record = {"num_segments": cur_num_segments, "num_features": cur_num_features, "time": time_result}
            results.append(record)
            logger.info(json.dumps(record))

    # Use a context manager so the file is flushed and closed deterministically
    # instead of relying on garbage collection of an anonymous file handle.
    with open("records.json", "w") as output_file:
        json.dump(results, output_file, indent=2)

# Run the benchmark only when the file is executed directly as a script.
if __name__ == "__main__":
    main()

Results before optimization:

[
  {
    "num_segments": 10,
    "num_features": 0,
    "time": 0.007862442000000414
  },
  {
    "num_segments": 10,
    "num_features": 3,
    "time": 0.00775050900000096
  },
  {
    "num_segments": 10,
    "num_features": 10,
    "time": 0.00862645999999856
  },
  {
    "num_segments": 100,
    "num_features": 0,
    "time": 0.06804819900000147
  },
  {
    "num_segments": 100,
    "num_features": 3,
    "time": 0.05528060099999976
  },
  {
    "num_segments": 100,
    "num_features": 10,
    "time": 0.05490351599999954
  },
  {
    "num_segments": 1000,
    "num_features": 0,
    "time": 0.511956906
  },
  {
    "num_segments": 1000,
    "num_features": 3,
    "time": 0.5077033259999997
  },
  {
    "num_segments": 1000,
    "num_features": 10,
    "time": 0.49680727800000035
  },
  {
    "num_segments": 10000,
    "num_features": 0,
    "time": 5.198245515000002
  },
  {
    "num_segments": 10000,
    "num_features": 3,
    "time": 5.023976880999999
  },
  {
    "num_segments": 10000,
    "num_features": 10,
    "time": 5.116792693999997
  },
  {
    "num_segments": 100000,
    "num_features": 0,
    "time": 50.777624478999996
  },
  {
    "num_segments": 100000,
    "num_features": 3,
    "time": 51.87359783100001
  },
  {
    "num_segments": 100000,
    "num_features": 10,
    "time": 62.32446584499996
  }
]

Results after optimization:

[
  {
    "num_segments": 10,
    "num_features": 0,
    "time": 0.006445242999999934
  },
  {
    "num_segments": 10,
    "num_features": 3,
    "time": 0.005044411000000082
  },
  {
    "num_segments": 10,
    "num_features": 10,
    "time": 0.00412322800000009
  },
  {
    "num_segments": 100,
    "num_features": 0,
    "time": 0.007066482000000818
  },
  {
    "num_segments": 100,
    "num_features": 3,
    "time": 0.006966671999999008
  },
  {
    "num_segments": 100,
    "num_features": 10,
    "time": 0.006875658000000229
  },
  {
    "num_segments": 1000,
    "num_features": 0,
    "time": 0.015869262000000717
  },
  {
    "num_segments": 1000,
    "num_features": 3,
    "time": 0.018922749999999766
  },
  {
    "num_segments": 1000,
    "num_features": 10,
    "time": 0.019158535000000754
  },
  {
    "num_segments": 10000,
    "num_features": 0,
    "time": 0.05820048300000025
  },
  {
    "num_segments": 10000,
    "num_features": 3,
    "time": 0.07253477299999922
  },
  {
    "num_segments": 10000,
    "num_features": 10,
    "time": 0.0792398090000006
  },
  {
    "num_segments": 100000,
    "num_features": 0,
    "time": 0.47934153599999973
  },
  {
    "num_segments": 100000,
    "num_features": 3,
    "time": 0.6095439070000026
  },
  {
    "num_segments": 100000,
    "num_features": 10,
    "time": 0.9615795119999859
  }
]
Mr-Geekman commented 1 year ago

Results for script from #1338:

[
  {
    "num_segments": 10,
    "num_features": 0,
    "time": 0.18644669199999964
  },
  {
    "num_segments": 10,
    "num_features": 3,
    "time": 0.44299768199999967
  },
  {
    "num_segments": 10,
    "num_features": 10,
    "time": 0.3184416309999998
  },
  {
    "num_segments": 100,
    "num_features": 0,
    "time": 0.40837533100000023
  },
  {
    "num_segments": 100,
    "num_features": 3,
    "time": 0.49695419899999926
  },
  {
    "num_segments": 100,
    "num_features": 10,
    "time": 0.6302544880000003
  },
  {
    "num_segments": 1000,
    "num_features": 0,
    "time": 2.3735116309999995
  },
  {
    "num_segments": 1000,
    "num_features": 3,
    "time": 2.3557946890000014
  },
  {
    "num_segments": 1000,
    "num_features": 10,
    "time": 3.484642255999999
  },
  {
    "num_segments": 10000,
    "num_features": 0,
    "time": 18.414344812
  },
  {
    "num_segments": 10000,
    "num_features": 3,
    "time": 23.947836302000006
  },
  {
    "num_segments": 10000,
    "num_features": 10,
    "time": 37.481222474999996
  }
]
github-actions[bot] commented 1 year ago

🚀 Deployed on https://deploy-preview-1344--etna-docs.netlify.app

codecov-commenter commented 1 year ago

Codecov Report

Merging #1344 (d7bdcf5) into master (ddc1711) will increase coverage by 0.30%. The diff coverage is 100.00%.

:exclamation: Your organization is not using the GitHub App Integration. As a result you may experience degraded service beginning May 15th. Please install the Github App Integration for your organization. Read more.

@@            Coverage Diff             @@
##           master    #1344      +/-   ##
==========================================
+ Coverage   88.84%   89.15%   +0.30%     
==========================================
  Files         204      204              
  Lines       12665    12675      +10     
==========================================
+ Hits        11252    11300      +48     
+ Misses       1413     1375      -38     
Files Changed Coverage Δ
etna/datasets/tsdataset.py 93.09% <100.00%> (+0.13%) :arrow_up:

... and 4 files with indirect coverage changes

:mega: We’re building smart automated test selection to slash your CI/CD build times. Learn more