unit8co / darts

A Python library for user-friendly forecasting and anomaly detection on time series.
https://unit8co.github.io/darts/
Apache License 2.0

ValueError: all input arrays must have the same shape #1729

Closed · daihaozxn closed 9 months ago

daihaozxn commented 1 year ago

I use the NBEATS model and the predict_from_dataset function to make predictions on the test dataset. During prediction, the following error occurs: ValueError: all input arrays must have the same shape (see the attached screenshot).

It seems to be caused by inconsistent sample shapes in the last batch: when the remaining sequence is shorter than input_chunk_length (i.e. 360), samples are still constructed with stride=1, producing samples of shape (359, 25), (358, 25), ..., (2, 25), (1, 25). How can I solve this problem? Thanks.
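To make the failure concrete, here is a minimal NumPy sketch (the array sizes are illustrative, not darts internals): when start indices run over the full series length with stride=1, the last seq_len - 1 windows come out short, and stacking them raises exactly this ValueError.

import numpy as np

series = np.zeros((1000, 25))  # e.g. 1000 time steps, 25 components
seq_len = 360

# start indices over the full length produce ragged trailing windows
shapes = {series[i : i + seq_len].shape for i in range(len(series))}
print(sorted(shapes)[:3])  # [(1, 25), (2, 25), (3, 25)], alongside (360, 25)

# np.stack requires one common shape, hence the ValueError; limiting the
# start indices to len(series) - seq_len + 1 keeps every window full
windows = [series[i : i + seq_len] for i in range(len(series) - seq_len + 1)]
print(np.stack(windows).shape)  # (641, 360, 25)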

alexcolpitts96 commented 1 year ago

Could you post the code? Your screenshot is too small to read.

daihaozxn commented 1 year ago

> Could you post the code? Your screenshot is too small to read.

Thank you again for the quick reply. The data file (sst_tw.csv, attached) and the code are below:


from darts import TimeSeries
from darts.models import NBEATSModel
from darts.utils.data import PastCovariatesTrainingDataset, PastCovariatesInferenceDataset

import torch
import torch.nn as nn
import torch.optim
import pandas as pd
import os
from sklearn.preprocessing import StandardScaler

class MyDataset_Train(PastCovariatesTrainingDataset):
    def __init__(self, data, border1s, border2s, seq_len, pred_len, flag='train'):
        super(MyDataset_Train, self).__init__()

        # window sizes and which split (train/val/test) this dataset serves
        self.seq_len = seq_len
        self.pred_len = pred_len
        assert flag in ['train', 'val', 'test']
        type_map = {'train': 0, 'val': 1, 'test': 2}
        self.set_type = type_map[flag]

        border1 = border1s[self.set_type]
        border2 = border2s[self.set_type]

        self.data_x = data[border1:border2]
        self.data_y = data[border1:border2]

    def __getitem__(self, index):
        s_begin = index
        s_end = s_begin + self.seq_len
        r_begin = s_end
        r_end = r_begin + self.pred_len

        seq_x = self.data_x[s_begin:s_end]
        seq_y = self.data_y[r_begin:r_end]

        # layout expected by PastCovariatesTrainingDataset:
        # (past_target, past_covariates, static_covariates, future_target)
        return (seq_x, None, None, seq_y)

    def __len__(self):
        # number of start indices that leave room for a full seq_len input
        # window followed by a pred_len target window
        return len(self.data_x) - self.seq_len - self.pred_len + 1

class MyDataset_Test(PastCovariatesInferenceDataset):
    def __init__(self, target_series, seq_len):
        # super(MyDataset_Test, self).__init__()
        self.target_series = target_series
        self.seq_len = seq_len

    def __getitem__(self, index):
        target_series = self.target_series[index : (index+self.seq_len)]
        past_target = target_series.values(copy=False, sample=0)
        # tuple layout expected by PastCovariatesInferenceDataset
        return (past_target, None, None, None, target_series)

    def __len__(self):
        # BUG: start indices within seq_len of the series end are allowed,
        # so the trailing windows come out shorter than seq_len
        return len(self.target_series)

root_path = './dataset/sst/'
data_path = 'sst_tw.csv'
features = 'M'
target = 'OT'
scale = True
seq_len = 360
pred_len = 30
df_raw = pd.read_csv(os.path.join(root_path, data_path))
num_train = int(len(df_raw) * 0.7)
num_test = int(len(df_raw) * 0.2)
num_vali = len(df_raw) - num_train - num_test
border1s = [0, num_train - seq_len, len(df_raw) - num_test - seq_len]
border2s = [num_train, num_train + num_vali, len(df_raw)]
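# border1s/border2s hold the [start, end) indices of the train/val/test
# splits; the val and test starts are shifted back by seq_len so that each
# split can form a full input window from its first prediction point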
if features == 'M':
    cols_data = df_raw.columns[1:]
    df_data = df_raw[cols_data]
elif features == 'S':
    df_data = df_raw[[target]]

scaler = StandardScaler()
if scale:
    # fit the scaler on the training split only, then transform everything
    train_data = df_data[border1s[0]:border2s[0]]
    scaler.fit(train_data.values)
    data = scaler.transform(df_data.values)
else:
    data = df_data.values

train_data = MyDataset_Train(data=data, border1s=border1s, border2s=border2s, seq_len=seq_len, pred_len=pred_len, flag='train')
val_data = MyDataset_Train(data=data, border1s=border1s, border2s=border2s, seq_len=seq_len, pred_len=pred_len, flag='val')

model_NBEATS = NBEATSModel(
    input_chunk_length=seq_len,
    output_chunk_length=pred_len,
    generic_architecture=True,
    num_stacks=2,
    num_blocks=1,
    num_layers=2,
    layer_widths=64,
    expansion_coefficient_dim=2,
    trend_polynomial_degree=2,
    dropout=0.0,
    batch_size=1024,
    activation='ReLU',
    nr_epochs_val_period=1,
    model_name="nbeats_run",
    n_epochs=10,
    loss_fn=nn.MSELoss(),
    save_checkpoints=False,
    optimizer_cls=torch.optim.Adam,
    optimizer_kwargs={'lr': 0.001},
    random_state=None,
    pl_trainer_kwargs={
        'accelerator': 'gpu',
        'devices': [0]
    }
)
model_NBEATS.fit_from_dataset(train_dataset=train_data, val_dataset=val_data, verbose=True)

target_series = TimeSeries.from_values(data[-num_test:], columns=None, fillna_value=None, static_covariates=None, hierarchy=None)

test_data = MyDataset_Test(target_series=target_series, seq_len=seq_len)

forecast_NBEATS = model_NBEATS.predict_from_dataset(n=pred_len,
                                                    input_series_dataset=test_data,
                                                    trainer=None,
                                                    batch_size=None,
                                                    verbose=True,
                                                    n_jobs=1,
                                                    roll_size=None,
                                                    num_samples=1,
                                                    num_loader_workers=0,
                                                    mc_dropout=False)
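# predict_from_dataset fails here with:
#   ValueError: all input arrays must have the same shape
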
daihaozxn commented 1 year ago

I found out what the problem is. When I modify the class MyDataset_Test to:

class MyDataset_Test(PastCovariatesInferenceDataset):
    def __init__(self, target_series, seq_len, pred_len):
        # super(MyDataset_Test, self).__init__()
        self.target_series = target_series
        self.seq_len = seq_len
        self.pred_len = pred_len
    def __getitem__(self, index):
        target_series = self.target_series[index : (index+self.seq_len)]
        past_target = target_series.values(copy=False, sample=0)
        return (past_target, None, None, None, target_series)
    def __len__(self):
        # only start indices that leave room for a full seq_len window
        # (plus a pred_len margin, mirroring the training split)
        return len(self.target_series) - self.seq_len - self.pred_len + 1

and modify the two corresponding lines of code as follows:

target_series = TimeSeries.from_values(data[border1s[2]:border2s[2]], columns=None, fillna_value=None, static_covariates=None, hierarchy=None)

test_data = MyDataset_Test(target_series=target_series, seq_len=seq_len, pred_len=pred_len)

It works.
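For reference, here is a minimal sketch of an alternative that avoids the custom inference dataset entirely (assuming the model and variables defined above): passing a TimeSeries directly to the model's built-in predict() lets darts construct the inference dataset itself. Note that this yields a single pred_len-step forecast after the end of the series, rather than one forecast per sliding window as the custom dataset does.

# Hypothetical alternative (not from the original post): let darts build
# the inference dataset internally via the built-in predict()
test_series = TimeSeries.from_values(data[border1s[2]:border2s[2]])
forecast_alt = model_NBEATS.predict(n=pred_len, series=test_series)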

madtoinou commented 9 months ago

Hi @daihaozxn,

As you found a solution to your problem, I am going to close this issue but feel free to reopen it if something remains unclear.