@pnmartinez sorry for the late reply. When you train, do you find exactly the same results between runs? Or is it between 2 evaluations of the same model?
If I run this (long) example, I get different values every time I train it:
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import torch
from torch import optim
from torch.nn import functional as F

from nbeats_pytorch.model import NBeatsNet
from trainer_pytorch import save

warnings.filterwarnings(action='ignore', message='Setting attributes')


# plot utils.
def plot_scatter(*args, **kwargs):
    plt.plot(*args, **kwargs)
    plt.scatter(*args, **kwargs)


# simple batcher.
def data_generator(x, y, size):
    assert len(x) == len(y)
    batches = []
    for ii in range(0, len(x), size):
        batches.append((x[ii:ii + size], y[ii:ii + size]))
    for batch in batches:
        yield batch


def main():
    forecast_length = 5
    backcast_length = 3 * forecast_length
    batch_size = 10  # greater than 4 for viz

    milk = pd.read_csv('data/milk.csv', index_col=0, parse_dates=True)
    print(milk.head())
    milk = milk.values.flatten()  # just keep np array here for simplicity.

    # data backcast/forecast generation.
    x, y = [], []
    for epoch in range(backcast_length, len(milk) - forecast_length):
        x.append(milk[epoch - backcast_length:epoch])
        y.append(milk[epoch:epoch + forecast_length])
    x = np.array(x)
    y = np.array(y)

    # split train/test.
    c = int(len(x) * 0.8)
    x_train, y_train = x[:c], y[:c]
    x_test, y_test = x[c:], y[c:]

    # normalization.
    norm_constant = np.max(x_train)
    x_train, y_train = x_train / norm_constant, y_train / norm_constant
    x_test, y_test = x_test / norm_constant, y_test / norm_constant

    # model
    net = NBeatsNet(
        stack_types=(NBeatsNet.GENERIC_BLOCK, NBeatsNet.GENERIC_BLOCK),
        forecast_length=forecast_length,
        backcast_length=backcast_length,
        hidden_layer_units=128,
    )
    optimiser = optim.Adam(lr=1e-4, params=net.parameters())

    grad_step = 0
    for epoch in range(1000):
        # train.
        net.train()
        train_loss = []
        for x_train_batch, y_train_batch in data_generator(x_train, y_train, batch_size):
            grad_step += 1
            optimiser.zero_grad()
            _, forecast = net(torch.tensor(x_train_batch, dtype=torch.float).to(net.device))
            loss = F.mse_loss(forecast, torch.tensor(y_train_batch, dtype=torch.float).to(net.device))
            train_loss.append(loss.item())
            loss.backward()
            optimiser.step()
        train_loss = np.mean(train_loss)

        # test.
        net.eval()
        _, forecast = net(torch.tensor(x_test, dtype=torch.float))
        test_loss = F.mse_loss(forecast, torch.tensor(y_test, dtype=torch.float)).item()
        p = forecast.detach().numpy()
        if epoch % 100 == 0:
            subplots = [221, 222, 223, 224]
            plt.figure(1)
            for plot_id, i in enumerate(np.random.choice(range(len(x_test)), size=4, replace=False)):
                ff, xx, yy = p[i] * norm_constant, x_test[i] * norm_constant, y_test[i] * norm_constant
                plt.subplot(subplots[plot_id])
                plt.grid()
                plot_scatter(range(0, backcast_length), xx, color='b')
                plot_scatter(range(backcast_length, backcast_length + forecast_length), yy, color='g')
                plot_scatter(range(backcast_length, backcast_length + forecast_length), ff, color='r')
            plt.show()

            with torch.no_grad():
                save(net, optimiser, grad_step)
                print(f'epoch = {str(epoch).zfill(4)}, '
                      f'grad_step = {str(grad_step).zfill(6)}, '
                      f'tr_loss (epoch) = {1000 * train_loss:.3f}, '
                      f'te_loss (epoch) = {1000 * test_loss:.3f}')


if __name__ == '__main__':
    main()
Output 1
epoch = 0000, grad_step = 000012, tr_loss (epoch) = 540.268, te_loss (epoch) = 645.095
[...]
Output 2
epoch = 0000, grad_step = 000012, tr_loss (epoch) = 463.423, te_loss (epoch) = 580.784
[...]
Could it be that you always load a saved model before doing anything? That would be a possible explanation: instead of re-initialising the weights from scratch, you keep reading from an old checkpoint saved at epoch 0.
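As an illustration only (the checkpoint path and dictionary keys here are placeholders, not the ones used by trainer_pytorch.save), the pattern I have in mind looks like this:

import os

import torch

from nbeats_pytorch.model import NBeatsNet

CHECKPOINT = 'nbeats_checkpoint.th'  # placeholder path

net = NBeatsNet(
    stack_types=(NBeatsNet.GENERIC_BLOCK, NBeatsNet.GENERIC_BLOCK),
    forecast_length=5,
    backcast_length=15,
    hidden_layer_units=128,
)  # fresh random weights here -> different runs should give different results

if os.path.exists(CHECKPOINT):
    # if this branch always executes, every "new" run starts from the same
    # stored weights, so all runs produce identical predictions.
    net.load_state_dict(torch.load(CHECKPOINT)['model_state_dict'])  # placeholder key

If something like that load-if-exists branch runs unconditionally in your script, deleting the old checkpoint (or skipping the load) should bring back run-to-run variation.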
Hi @philipperemy,
I will test this in a new environment as soon as I have the time, so I can also check whether the updated pip package works well.
In the meantime, thanks in advance!
@pnmartinez cool let me know.
I'll close this issue for now. Let me know whether the latest version fixes it.
Hi! First, let me thank you for the nice and readable implementation!
I am experimenting with the backcast length parameter (the paper suggests using 3 to 7 times the forecast length, and then building a 180-run ensemble). However, all my runs with the same backcast length produce exactly the same predictions, which is odd given that I am not enforcing determinism in any way.
I've already tried to set a different seed on every new run with the torch methods below, but it changes nothing. I am kind of a newbie to PyTorch, so I'd appreciate any suggestion on how to prevent this deterministic behaviour (the usual request on StackOverflow is how to enforce it, not how to avoid it).
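Roughly, the per-run seeding I tried looks like this (a minimal sketch; the exact calls and the way I derive the seed may differ from what I actually ran):

import random
import time

import numpy as np
import torch

# derive a fresh seed from the clock so every run should start differently
seed = int(time.time())
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(seed)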