ngruver / llmtime

https://arxiv.org/abs/2310.07820
MIT License

Reproducing the CSV files used in Figure 4 #31

Closed srinathdama closed 3 months ago

srinathdama commented 3 months ago

Hi @ngruver,

Thank you for making the source code available publicly!

I'm having trouble replicating the MAE values shown in Figure 4 for the Darts datasets. Could you please clarify how the MAE values are calculated? These values are listed in "/precomputed_outputs/deterministic_csvs/darts_results_agg.csv" and appear to be derived from the pkl files in "/precomputed_outputs/darts". When I compute the metrics myself, using both the prediction samples and the median of the predictions, the NMAE and NMSE values I obtain are significantly higher than those reported in "/precomputed_outputs/deterministic_csvs/darts_results_agg.csv". Below is the code snippet I've been using to compute these metrics from the pkl files:

import pickle

from data.metrics import Evaluator
from data.small_context import get_datasets

# Load the test split and the precomputed outputs for one dataset
train, test = get_datasets()['AirPassengersDataset']
with open('precomputed_outputs/darts/AirPassengersDataset.pkl', 'rb') as f:
    out_dict = pickle.load(f)

gp_results = out_dict['gp']
# Computing metrics using median predictions
median_results = Evaluator().evaluate(test.values.reshape(1, -1), gp_results['median'].reshape(1, 1, -1))
# Computing metrics using samples of predictions
sample_results = Evaluator().evaluate(test.values.reshape(1, -1), gp_results['samples'].reshape(1, 100, -1))
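
For reference, the reported numbers can be inspected directly from the aggregated csv. This is a minimal sketch of mine, not repo code; it only assumes the file is a plain csv readable with pandas, since I don't know its exact column names:

import pandas as pd

# Print the schema rather than guessing column names
df = pd.read_csv('precomputed_outputs/deterministic_csvs/darts_results_agg.csv')
print(df.columns.tolist())
print(df.head())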

Best, Srinath

srinathdama commented 3 months ago

Below are the bar plots (NMAE, NMSE) reproduced from the pkl files in "/precomputed_outputs/darts". There is a significant difference between these reproduced plots and Figure 4 in the paper.

Reproduced plot: [image]

Figure 4 presented in the paper: [image]

The following code generates the reproduced plot above. Please let me know if I'm doing something wrong when computing NMAE and NMSE.

# %%
import pickle
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import os

from matplotlib.ticker import FormatStrFormatter

sns.set(style="whitegrid", font_scale=1)

import sys

root_base_path = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
sys.path.append(root_base_path)

from data.metrics import Evaluator
from data.small_context import get_datasets

saved_files_path = os.path.join(root_base_path, "precomputed_outputs/darts")
model_names      = ['text-davinci-003', 'gp', 'arima', 'N-HiTS', 'TCN', 'N-BEATS']
# model_names      = ['gp', 'arima', 'N-HiTS', 'TCN', 'N-BEATS']

datasets = get_datasets()

# %%
output_save_dir  = os.path.join(root_base_path, 'figures')
os.makedirs(output_save_dir, exist_ok=True)  # to_csv/savefig below need this directory
exp_results      = {'Dataset': [],
                    'NLL/D':[],
                    'Type': [],
                    'MSE_median': [],
                    'MAE_median': [],
                    'NMSE_median': [],
                    'NMAE_median': [],
                    'MSE_samples': [],
                    'MAE_samples': [],
                    'NMSE_samples': [],
                    'NMAE_samples': []
                    }

# %%
for i, model in enumerate(model_names):
    print(f'{i}: Loaded results from model - {model}')
    print("***************************************")
    for j, (dsname,data) in enumerate(datasets.items()):
        print("-----------------------------------------")
        print(f'{j}: Loaded results from dataset - {dsname}')
        train, test = data
        if os.path.exists(f'{saved_files_path}/{dsname}.pkl'):
            with open(f'{saved_files_path}/{dsname}.pkl','rb') as f:
                out_dict = pickle.load(f)
            print(f"Loaded {saved_files_path}/{dsname}.pkl")
        else:
            print(f'missing data file - {saved_files_path}/{dsname}.pkl')
            continue  # skip this dataset; out_dict from a previous iteration would be reused otherwise

        # print(out_dict.keys())
        try:
            model_results = out_dict[model]
            print('median forecasts shape:', model_results['median'].shape, 'type:', type(model_results['median']) )
            print('forecasted samples shape:', model_results['samples'].shape, 'type:', type(model_results['samples']))
            steps_len      = model_results['median'].shape[0]
            no_of_samples  = model_results['samples'].shape[0]
            # # Computing metrics using median predictions
            # # Computing metrics using samples of predictions
            if model == 'gp':
                median_results = Evaluator().evaluate(test.values.reshape(1,-1), model_results['median'].reshape(1, 1, -1))
                sample_results = Evaluator().evaluate(test.values.reshape(1,-1), model_results['samples'].reshape(1, no_of_samples, -1))
            else:
                median_results = Evaluator().evaluate(test.values.reshape(1,-1), model_results['median'].to_numpy().reshape(1, 1, -1))
                sample_results = Evaluator().evaluate(test.values.reshape(1,-1), model_results['samples'].to_numpy().reshape(1, no_of_samples, -1))

            # write the results
            exp_results['Dataset'].append(dsname)
            exp_results['Type'].append(model)
            exp_results['NLL/D'].append(model_results['NLL/D'])
            exp_results['NMAE_median'].append(median_results['nmae'])
            exp_results['NMSE_median'].append(median_results['nmse'])
            exp_results['MAE_median'].append(median_results['mae'])
            exp_results['MSE_median'].append(median_results['mse'])
            exp_results['NMAE_samples'].append(sample_results['nmae'])
            exp_results['NMSE_samples'].append(sample_results['nmse'])
            exp_results['MAE_samples'].append(sample_results['mae'])
            exp_results['MSE_samples'].append(sample_results['mse'])
        except Exception as e:
            print(f"Failed computing metrics using data from model - {model} \n")
            print(e)
            continue
    print("-----------------------------------------")

    print(f"Finished {dsname}")
    print("***************************************")

# %%
exp_results = pd.DataFrame(exp_results)
exp_results.to_csv(os.path.join(output_save_dir, 'llmtime_darts_results.csv'))

# %% [markdown]
# # Bar plots (MAE) using the reproduced csv file 
# 
# Normalised MAE values obtained using median of forecasts 

# %%
gpt_color = sns.color_palette('Dark2',3)[2]

name_order = ['text-davinci-003', 'gp', 'arima', 'N-HiTS', 'TCN', 'N-BEATS']
def color_map(x):
    # 'text-davinci-003' is the GPT-3 run in name_order, so match it too;
    # otherwise the GPT-3 highlight color is never applied
    if x in ('GPT-3', 'text-davinci-003'):
        return gpt_color
    elif x == 'LLaMA-2':
        return '#a60355'
    else:
        return 'grey'

palette = [color_map(x) for x in name_order]

csv_fn = os.path.join(output_save_dir, 'llmtime_darts_results.csv')
df = pd.read_csv(csv_fn)
df['Type'] = df['Type'].apply(lambda x: x.replace(" 70B", ""))

fig, ax = plt.subplots(1)

c = sns.barplot(
    data=df,
    order=name_order,
    x='Type',
    y='NMAE_median',
    ax=ax, 
    color="grey",
    palette=palette,
    alpha=0.3,
    edgecolor='black',
    errwidth=1.,
    errorbar=('ci', 68)
)
for bar, name in zip(c.containers[0], name_order):
    if name not in ('GPT-3', 'text-davinci-003', 'LLaMA-2'):
        continue
    bar.set_alpha(1.0)
    bar.set_edgecolor('grey')

ax.set_ylabel('MAE')
ax.yaxis.set_major_formatter(FormatStrFormatter('%.1f'))
ax.tick_params(axis='y', which='major', pad=0, labelsize=9)
ax.set_xticklabels(ax.get_xticklabels(), rotation=50, horizontalalignment='right')
ax.set_xlabel('')  # remove the x-axis label
ax.set_title("Darts")

plt.tight_layout()
plt.savefig(os.path.join(output_save_dir, 'reproduced_mae_aggregated.pdf'), bbox_inches='tight')
plt.show()

srinathdama commented 3 months ago

It looks like the evaluate function computes a standardized MAE (errors divided by the standard deviation of the ground truth) rather than a normalized MAE (errors divided by the mean absolute value of the ground truth). After modifying the evaluate function as shown below, I'm able to reproduce the bar plots in Figure 4.

    def evaluate(self, gt, pred):
        ''' 
        gt: (batch_size, steps)
        pred: (batch_size, num_samples, steps)
        '''
        assert gt.shape == (pred.shape[0], pred.shape[2]), f"wrong shapes: gt.shape: {gt.shape}, pred.shape: {pred.shape}"
        diff = (gt[:, None, :] - pred) # (batch_size, num_samples, steps)
        mse = np.mean(diff**2)
        mae = np.mean(np.abs(diff))
        # original implementation: standardize the errors by the std of the ground truth
        # std = np.std(gt, axis=1) + 1e-8 # (batch_size,)
        # normalized_diff = diff / std[:, None, None] # (batch_size, num_samples, steps)
        # nmse = np.mean(normalized_diff**2)
        # nmae = np.mean(np.abs(normalized_diff))
        # normalize by the scale of the ground truth instead
        nmse = mse / np.mean(gt**2)
        nmae = mae / np.mean(np.abs(gt))

        return {
            "nmse": nmse,
            "nmae": nmae,
            "mse": mse,
            "mae": mae,
        }
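
For intuition, here is a minimal sketch of mine (not repo code) showing why the two definitions diverge so much on series like AirPassengers, which have a large mean relative to their variability: a fixed error of 5 looks large when divided by the std but small when divided by the mean absolute value.

import numpy as np

gt = np.array([[400., 410., 420., 430.]])   # (batch_size=1, steps=4): large mean, small spread
pred = gt[:, None, :] + 5.0                 # (1, num_samples=1, steps=4): constant error of 5
diff = gt[:, None, :] - pred

mae = np.mean(np.abs(diff))                           # 5.0
standardized = mae / (np.std(gt, axis=1) + 1e-8)[0]   # 5 / 11.18 ~= 0.45  (old behavior)
normalized = mae / np.mean(np.abs(gt))                # 5 / 415   ~= 0.012 (new behavior)
print(mae, standardized, normalized)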