Hello everyone, I'm trying to use the DeepAR algorithm implemented in pytorch-forecasting to forecast the sales of some products. I'm using the following dataset:
DATA5.csv
The code below performs all the pre-processing needed to put the dataset into a workable shape.
Actual behavior
However, when constructing the training set something goes wrong and the following error is raised:
"Unknown category '122' encountered. Set add_nan=True to allow unknown categories"
From what I understand, the categorical encoders are not picked up correctly in the definition of the training set. How can I fix this?
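The error message itself points at add_nan=True. A minimal sketch of what that change would look like on the article encoder (I'm not sure whether this is the right fix or just a workaround, which is why I'm asking):

# Sketch of the error message's suggestion: add_nan=True makes NaNLabelEncoder
# map unseen categories to NaN instead of raising an error
categorical_encoders={"article": NaNLabelEncoder(add_nan=True).fit(DD.article)}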
Code to reproduce the problem
import os
import warnings
import numpy as np
import collections
warnings.filterwarnings("ignore")
os.chdir("../../..")
from datetime import datetime, date, timedelta
import matplotlib.pyplot as plt
import pandas as pd
import pytorch_lightning as pl
from pytorch_lightning.callbacks import EarlyStopping
import torch
from pytorch_forecasting import Baseline, DeepAR, TimeSeriesDataSet
from pytorch_forecasting.data import NaNLabelEncoder
from pytorch_forecasting.data.examples import generate_ar_data
from pytorch_forecasting.metrics import SMAPE
df = pd.read_csv("C:/Users/alessio.rimoldi/Desktop/SPAP/DATA5mod.csv")
# Day offsets covering 2017-01-01 through 2021-12-31
delta = date(2021, 12, 31) - date(2017, 1, 1)
delta = delta.days
delta = np.arange(delta + 1)
# Unique article codes, in order of first appearance
ART = []
for art in df.ARTICOLO:
    if art not in ART:
        ART.append(art)
del art
# One sub-DataFrame per article
D = {}
for art in ART:
    D[art] = pd.DataFrame()
for art in D:
    D[art] = df.iloc[df.index[df['ARTICOLO'] == art]]
del art
# Scaffold with one row per (date, article) pair over the whole horizon
dates = pd.Timestamp('2017-01-01') + pd.to_timedelta(delta, "D")
dg = pd.Series(list(dates) * len(ART))
article = pd.Series([''] * len(dg))
for i in np.arange(len(dg)):
    j = int(i / len(dates))
    article[i] = ART[j]
time_idx = pd.Series(list(np.arange(len(list(dates)))) * len(ART))
DG = pd.concat([dg, article, time_idx], axis=1)
DG.columns = ['dates', 'article', 'time_idx']
for df in D:
    D[df].index = pd.to_datetime(D[df].DATA, dayfirst=True)
    D[df] = D[df].sort_index()
del df
"""Raduno i dati mensilmente"""
D_QVEN = {}
D_PRZACQ = {}
D_PRZVEN = {}
D_VALVEN = {}
DM_QVEN = {}
DM_PRZACQ = {}
DM_PRZVEN = {}
DM_VALVEN = {}
DM_QVEN_sum = {}
DM_PRZACQ_mean = {}
DM_PRZVEN_mean = {}
DM_VALVEN_sum = {}
DM = {}
DD = pd.DataFrame()
for df in D:
    D_QVEN[df] = D[df].Q_VEN
    D_PRZACQ[df] = D[df].PREZZO_ACQ
    D_PRZVEN[df] = D[df].PREZZO_VEN
    D_VALVEN[df] = D[df].VAL_VEN
    # Group each series by calendar day
    DM_QVEN[df] = D_QVEN[df].groupby(pd.Grouper(freq='D'))
    DM_PRZACQ[df] = D_PRZACQ[df].groupby(pd.Grouper(freq='D'))
    DM_PRZVEN[df] = D_PRZVEN[df].groupby(pd.Grouper(freq='D'))
    DM_VALVEN[df] = D_VALVEN[df].groupby(pd.Grouper(freq='D'))
    DM_QVEN_sum[df] = DM_QVEN[df].sum()
    DM_PRZACQ_mean[df] = DM_PRZACQ[df].mean()
    DM_PRZVEN_mean[df] = DM_PRZVEN[df].mean()
    DM_VALVEN_sum[df] = DM_VALVEN[df].sum()
    DM[df] = pd.concat([DM_QVEN_sum[df], DM_PRZACQ_mean[df], DM_PRZVEN_mean[df], DM_VALVEN_sum[df]], axis=1)
    ARTICOLO = [df] * len(DM[df])
    DM[df]['article'] = ARTICOLO
    DM[df] = DM[df].drop(columns='VAL_VEN')
    DD = pd.concat([DD, DM[df]])  # DataFrame.append was removed in pandas 2.0
del df, D_QVEN, D_PRZACQ, D_PRZVEN, D_VALVEN, DM_QVEN, DM_PRZACQ, DM_PRZVEN, DM_VALVEN
del DM_QVEN_sum, DM_PRZACQ_mean, DM_PRZVEN_mean, DM_VALVEN_sum
# Day index relative to 2017-01-01
time_idx = pd.Series([0] * len(DD.index), index=DD.index)
for d in DD.index:
    de = date(d.year, d.month, d.day) - date(2017, 1, 1)
    de = de.days
    time_idx[d] = de
DD['time_idx'] = time_idx
DD = DD.fillna(0)
DD['DATA'] = DD.index
DD.index = np.arange(len(DD))
# Left-join the daily aggregates onto the full (date, article) scaffold
merged = DG.merge(DD, how='left',
                  left_on=['time_idx', 'article'],
                  right_on=['time_idx', 'article'])
# merged.Q_VEN = merged.Q_VEN.fillna(0)
# merged.PREZZO_ACQ = merged.PREZZO_ACQ.fillna(0)
# merged.PREZZO_VEN = merged.PREZZO_VEN.fillna(0)
# for i in np.arange(len(DG)):
#     j = 0
#     while j < len(DD):
#         if DG.iloc[i].article == DD.iloc[j].article and DG.iloc[i].time_idx == DD.iloc[j].time_idx and DD.iloc[j].time_idx != 0:
#             DG.iloc[i].qnt = DD.iloc[j].Q_VEN
#             DG.iloc[i].prz_ven = DD.iloc[j].PREZZO_ACQ
#             DG.iloc[i].prz_acq = DD.iloc[j].PREZZO_VEN
#         j += 1
"""DEEP AR"""
"""create dataset and dataloaders"""
max_encoder_length = 540
max_prediction_length = 180
training_cutoff = DD["time_idx"].max() - max_prediction_length
context_length = max_encoder_length
prediction_length = max_prediction_length
"""TRAINING AND VALIDATION SET"""
training = TimeSeriesDataSet(
    DD[lambda x: x.time_idx <= training_cutoff],
    time_idx="time_idx",
    target="Q_VEN",
    # The encoder is fitted on the full DD.article column, so every article should be known
    categorical_encoders={"article": NaNLabelEncoder().fit(DD.article)},
    group_ids=["article"],
    time_varying_unknown_reals=["Q_VEN"],
    time_varying_known_reals=["PREZZO_ACQ", "PREZZO_VEN"],
    max_encoder_length=context_length,
    max_prediction_length=prediction_length,
)
validation = TimeSeriesDataSet.from_dataset(training, DD, min_prediction_idx=training_cutoff + 1)
batch_size = 128
train_dataloader = training.to_dataloader(train=True, batch_size=batch_size, num_workers=0)
val_dataloader = validation.to_dataloader(train=False, batch_size=batch_size, num_workers=0)
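For what it's worth, here is a quick way to check which categories the fitted encoder actually knows (this assumes classes_, which pytorch-forecasting's NaNLabelEncoder exposes after fit as a mapping from category to integer code):

# Debugging sketch: list the categories the fitted encoder knows about
enc = NaNLabelEncoder().fit(DD.article)
print(enc.classes_)  # mapping from article code to integer index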
Expected behavior
The training and validation TimeSeriesDataSet objects and their dataloaders are created without raising the unknown-category error.