Closed srinathdama closed 3 months ago
Below are the bar plots (NMAE, NMSE) that I reproduced from the pkl files located in "/precomputed_outputs/darts". They differ significantly from Figure 4 in the paper.
The following code generates the reproduced plots shown above. Please let me know if I'm doing something wrong when computing NMAE and NMSE.
# %%
import pickle
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import os
from matplotlib.ticker import FormatStrFormatter
import matplotlib.gridspec as gridspec
sns.set(style="whitegrid", font_scale=1)
# Repository root: the parent of the directory that contains this script.
root_base_path = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
import sys
# The project imports below resolve relative to the repository root, so it has
# to be on sys.path first. The membership check keeps the entry from being
# appended again every time this cell is re-run.
if root_base_path not in sys.path:
    sys.path.append(root_base_path)
from data.metrics import Evaluator
from data.small_context import get_datasets
# Directory holding one pickle of stored forecasts per dataset.
saved_files_path = os.path.join(root_base_path, "precomputed_outputs/darts")
# Keys looked up inside each pickle; one entry per forecasting model.
model_names = ['text-davinci-003', 'gp', 'arima', 'N-HiTS', 'TCN', 'N-BEATS']
# Alternative selection without 'text-davinci-003':
# model_names = ['gp', 'arima', 'N-HiTS', 'TCN', 'N-BEATS']
# Iterated later as a mapping of dataset name -> (train, test).
datasets = get_datasets()
# %%
# Directory that receives the results CSV written further down.
output_save_dir = os.path.join(root_base_path, 'figures')
# Create it up front so the CSV write cannot fail on a fresh checkout;
# exist_ok=True keeps re-runs from raising once the directory exists.
os.makedirs(output_save_dir, exist_ok=True)
# Column-oriented accumulator: every key is a future DataFrame column and all
# lists must stay the same length (one entry per evaluated model/dataset pair).
exp_results = {
    'Dataset': [],
    'NLL/D': [],
    'Type': [],
    'MSE_median': [],
    'MAE_median': [],
    'NMSE_median': [],
    'NMAE_median': [],
    'MSE_samples': [],
    'MAE_samples': [],
    'NMSE_samples': [],
    'NMAE_samples': [],
}
# %%
# Score every (model, dataset) pair against the held-out test series and
# collect the metrics in exp_results.
for i, model in enumerate(model_names):
    print(f'{i}: Loaded results from model - {model}')
    print("***************************************")
    for j, (dsname, data) in enumerate(datasets.items()):
        print("-----------------------------------------")
        print(f'{j}: Loaded results from dataset - {dsname}')
        _, test = data  # only the test split is scored
        pkl_path = f'{saved_files_path}/{dsname}.pkl'
        if not os.path.exists(pkl_path):
            print(f'missing data file -{pkl_path}')
            # Skip this dataset: falling through would silently reuse the
            # out_dict loaded for the previous dataset.
            continue
        # NOTE: pickle.load executes arbitrary code; only use it on the
        # trusted files that ship with the repository.
        with open(pkl_path, 'rb') as f:
            out_dict = pickle.load(f)
        print(f"Loaded {pkl_path}")
        try:
            model_results = out_dict[model]
            # np.asarray accepts both plain ndarrays and pandas objects, so no
            # per-model special case is needed.
            median = np.asarray(model_results['median'])
            samples = np.asarray(model_results['samples'])
            print('median forecasts shape:', median.shape, 'type:', type(model_results['median']))
            print('forecasted samples shape:', samples.shape, 'type:', type(model_results['samples']))
            no_of_samples = samples.shape[0]
            # Evaluator.evaluate is called with gt as (batch, steps) and
            # predictions as (batch, num_samples, steps); batch is 1 here.
            truth = test.values.reshape(1, -1)
            # Metrics of the median forecast (treated as a single sample).
            median_results = Evaluator().evaluate(truth, median.reshape(1, 1, -1))
            # Metrics averaged over all forecast samples.
            sample_results = Evaluator().evaluate(truth, samples.reshape(1, no_of_samples, -1))
            # Build the complete row before touching exp_results, so a failure
            # part-way through cannot leave columns of unequal length.
            record = {
                'Dataset': dsname,
                'Type': model,
                'NLL/D': model_results['NLL/D'],
                'NMAE_median': median_results['nmae'],
                'NMSE_median': median_results['nmse'],
                'MAE_median': median_results['mae'],
                'MSE_median': median_results['mse'],
                'NMAE_samples': sample_results['nmae'],
                'NMSE_samples': sample_results['nmse'],
                'MAE_samples': sample_results['mae'],
                'MSE_samples': sample_results['mse'],
            }
        except Exception as e:
            # Best effort: report the failing pair and move on to the next one.
            print(f"Failed computing metrics using data from model - {model} \n")
            print(e)
            continue
        # write the results
        for column, value in record.items():
            exp_results[column].append(value)
        print("-----------------------------------------")
        print(f"Finished {dsname}")
    print("***************************************")
# %%
# Turn the accumulated columns into a table and persist it as CSV (the
# DataFrame index is written as well, as the first column).
exp_results = pd.DataFrame(data=exp_results)
exp_results.to_csv(
    path_or_buf=os.path.join(output_save_dir, 'llmitme_darts_results.csv'),
)
# %% [markdown]
# # Bar plots (MAE) using the reproduced csv file
#
# Normalised MAE values obtained using median of forecasts
# %%
gpt_color = sns.color_palette('Dark2', 3)[2]
# Left-to-right order of the bars; entries must match the 'Type' column.
name_order = ['text-davinci-003', 'gp', 'arima', 'N-HiTS', 'TCN', 'N-BEATS']
# Labels drawn in colour and fully opaque. 'text-davinci-003' is included
# because that is the label present in name_order (presumably the GPT-3
# results - confirm); with only 'GPT-3' / 'LLaMA-2' nothing was highlighted.
highlighted_names = ('GPT-3', 'text-davinci-003', 'LLaMA-2')


def color_map(x):
    """Return the bar colour for model label *x* (grey for non-highlighted models)."""
    if x in ('GPT-3', 'text-davinci-003'):
        return gpt_color
    if x == 'LLaMA-2':
        return '#a60355'
    return 'grey'


palette = [color_map(x) for x in name_order]
csv_fn = os.path.join(output_save_dir, 'llmitme_darts_results.csv')
df = pd.read_csv(csv_fn)
# Strip the " 70B" suffix from labels that carry it.
df['Type'] = df['Type'].apply(lambda x: x.replace(" 70B", ""))
fig, ax = plt.subplots(1)
# One bar per model: the mean of NMAE_median over datasets with a 68% CI.
c = sns.barplot(
    data=df,
    order=name_order,
    x='Type',
    y='NMAE_median',
    ax=ax,
    color="grey",
    palette=palette,
    alpha=0.3,
    edgecolor='black',
    errwidth=1.,
    errorbar=('ci', 68)
)
# Baselines stay translucent; highlighted models are drawn fully opaque.
for bar, name in zip(c.containers[0], name_order):
    if name not in highlighted_names:
        continue
    bar.set_alpha(1.0)
    bar.set_edgecolor('grey')
# NOTE(review): the plotted column is the normalised MAE; the axis label is
# left as 'MAE' unchanged.
ax.set_ylabel('MAE')
ax.yaxis.set_major_formatter(FormatStrFormatter('%.1f'))
ax.tick_params(axis='y', which='major', pad=0, labelsize=9)
ax.set_xticklabels(ax.get_xticklabels(), rotation=50, horizontalalignment='right')
ax.set_xlabel('')  # Remove x-axis label
ax.set_title("Darts")
plt.tight_layout()
fig_path = 'outputs/reproduced_mae_aggregated.pdf'
# savefig does not create missing directories.
os.makedirs(os.path.dirname(fig_path), exist_ok=True)
plt.savefig(fig_path, bbox_inches='tight')
plt.show()
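It looks like the original `evaluate` function computes a standardized MAE (errors divided by the standard deviation of the ground truth) rather than a normalized MAE (MAE divided by the mean absolute value of the ground truth). After modifying `evaluate` as shown below, I am able to reproduce the bar plots in Figure 4.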
def evaluate(self, gt, pred):
    '''
    Compute raw and normalised forecast errors, averaged over all
    batches, samples and steps.

    gt: (batch_size, steps) ground truth, any array-like
    pred: (batch_size, num_samples, steps) forecasts, any array-like

    Returns a dict with keys "nmse", "nmae", "mse" and "mae".
    "nmse" is mse / mean(gt**2) and "nmae" is mae / mean(|gt|); if gt is
    all zeros these divisions yield inf/nan instead of raising.
    '''
    # Accept lists / pandas objects as well as ndarrays; the float cast also
    # keeps integer inputs from being squared in integer dtype.
    gt = np.asarray(gt, dtype=float)
    pred = np.asarray(pred, dtype=float)
    assert gt.shape == (pred.shape[0], pred.shape[2]), f"wrong shapes: gt.shape: {gt.shape}, pred.shape: {pred.shape}"
    diff = (gt[:, None, :] - pred)  # (batch_size, num_samples, steps)
    mse = np.mean(diff**2)
    mae = np.mean(np.abs(diff))
    # Normalise by the average magnitude of the ground truth. This replaces
    # the earlier variant that divided diff by std(gt, axis=1) + 1e-8 before
    # averaging.
    nmse = mse / np.mean(gt**2)
    nmae = mae / np.mean(np.abs(gt))
    return {
        "nmse": nmse,
        "nmae": nmae,
        "mse": mse,
        "mae": mae,
    }
Hi @ngruver,
Thank you for making the source code available publicly!
I'm currently encountering issues when trying to replicate the MAE values shown in Figure-4 for the Darts dataset. Could you please clarify how the MAE values are calculated? These values are listed in "/precomputed_outputs/deterministic_csvs/darts_results_agg.csv" and seem to be derived from the pkl files located in "/precomputed_outputs/darts". When attempting to calculate the metrics, both using prediction samples and the median of predictions, the NMAE and NMSE metrics I obtain are significantly higher than those reported in "/precomputed_outputs/deterministic_csvs/darts_results_agg.csv". Below is the code snippet I've been using to compute these metrics with the pkl files for reference:
Best, Srinath