How do you a prediction for an arbitrary timestamp?

jeznag commented 3 years ago

Sorry this is a really noob question. I've gone through tutorial 4 on time series regression and have successfully trained a learner. What I can't figure out is making a prediction on a future date. Here's what I tried:

from datetime import datetime
epoch = datetime.utcfromtimestamp(0)

def unix_time_seconds(dt):
    return (dt - epoch).total_seconds()

df_json = {"unix_timestamp":{"6":1451606400.0,"1558":1454284800.0,"3110":1456790400.0,"4662":1459468800.0,"6214":1462060800.0,"7766":1464739200.0,"9318":1467331200.0,"10870":1470009600.0,"12422":1472688000.0,"13974":1475280000.0,"15526":1477958400.0,"17078":1480550400.0,"18630":1483228800.0,"20182":1485907200.0,"21734":1488326400.0,"23286":1491004800.0,"24838":1493596800.0,"26390":1496275200.0,"27942":1498867200.0,"29494":1501545600.0,"31046":1504224000.0,"32598":1506816000.0,"34150":1509494400.0,"35702":1512086400.0,"37254":1514764800.0,"38806":1517443200.0,"40358":1519862400.0,"41910":1522540800.0,"43462":1525132800.0,"45014":1527811200.0,"46566":1530403200.0,"48118":1533081600.0,"49670":1535760000.0,"51222":1538352000.0,"52774":1541030400.0,"54326":1543622400.0,"55878":1546300800.0,"57430":1548979200.0,"58982":1551398400.0,"60534":1554076800.0,"62086":1556668800.0,"63638":1559347200.0,"65190":1561939200.0,"66742":1564617600.0,"68294":1567296000.0,"69846":1569888000.0,"71398":1572566400.0,"72950":1575158400.0,"74502":1577836800.0,"76054":1580515200.0,"77606":1583020800.0},"Usage_MWh":{"6":5.34858,"1558":3.78055,"3110":3.4831,"4662":3.74901,"6214":3.02347,"7766":7.63334,"9318":5.62975,"10870":5.51058,"12422":4.36067,"13974":3.29915,"15526":2.76066,"17078":2.94552,"18630":2.7777,"20182":2.76716,"21734":5.78573,"23286":4.8537129444,"24838":3.1271232778,"26390":2.8168842646,"27942":2.8774968882,"29494":2.8774968882,"31046":2.7846744079,"32598":2.8774968882,"34150":2.7846744079,"35702":2.8774968882,"37254":2.8774968882,"38806":2.5990294474,"40358":2.8774968882,"41910":2.5288257571,"43462":2.5724465738,"45014":3.3510199005,"46566":3.060722109,"48118":2.6989527056,"49670":2.5984900474,"51222":3.4421489889,"52774":3.4093083871,"54326":3.5249084516,"55878":3.1468401143,"57430":3.0175142015,"58982":3.3731491579,"60534":3.0313829708,"62086":3.1152347778,"63638":3.2681218106,"65190":3.4173398852,"66742":2.897951582,"68294":3.3056545,"69846":3.1457436,"71398":2.646408469,"72950":2.5245141129,"74502":2.7281552182,"76054":6.5999127071,"77606":7.7869288929}}
df = pd.DataFrame.from_dict(df_json)

window_length = 5
stride = None
horizon=1
X, y = SlidingWindow(5, get_x=['unix_timestamp'], get_y='Usage_MWh')(regression_df)
itemify(X, y)

splits = RandomSplitter()(X) 
tfms  = [None, [ToFloat(), ToNumpyTensor()]]
dsets = TSDatasets(X, y, tfms=tfms, splits=splits)
dls   = TSDataLoaders.from_dsets(dsets.train, dsets.valid)

model = InceptionTime(dls.vars, 1)
learn = Learner(dls, model, loss_func=MSELossFlat())

try:
  learn.fit_one_cycle(5)
  # learn.recorder.plot_metrics()

  valid_preds, valid_targets = learn.get_preds(ds_idx=1)
  valid_preds.flatten(), valid_targets.data

  y_pred = valid_targets.data.tolist()

  y_true = dsets.valid.items[1]

  future_date = datetime.strptime('2020-12-01', '%Y-%m-%d')
  seconds = unix_time_seconds(future_date)
  learn.predict(tensor([seconds]))

except Exception:
  print("ouch")
  traceback.print_exc()

I get

AssertionError: Expected an input of type in 
  - <class 'numpy.ndarray'>
 but got <class 'torch.Tensor'>

Sorry again for the really dumb question. It's probably a lack of fastai knowledge. I'll do a pull request afterwards and add the answer to the tutorial notebook.

oguiza commented 3 years ago

Hi @jeznag ,

No need to apologize for being a noobie. We all are in one area or another :)

There are a few issues with your code. I've made some changes to make it work:

from datetime import datetime
epoch = datetime.utcfromtimestamp(0)

def unix_time_seconds(dt):
    return (dt - epoch).total_seconds()

df_json = {"unix_timestamp":{"6":1451606400.0,"1558":1454284800.0,"3110":1456790400.0,"4662":1459468800.0,"6214":1462060800.0,"7766":1464739200.0,"9318":1467331200.0,"10870":1470009600.0,"12422":1472688000.0,"13974":1475280000.0,"15526":1477958400.0,"17078":1480550400.0,"18630":1483228800.0,"20182":1485907200.0,"21734":1488326400.0,"23286":1491004800.0,"24838":1493596800.0,"26390":1496275200.0,"27942":1498867200.0,"29494":1501545600.0,"31046":1504224000.0,"32598":1506816000.0,"34150":1509494400.0,"35702":1512086400.0,"37254":1514764800.0,"38806":1517443200.0,"40358":1519862400.0,"41910":1522540800.0,"43462":1525132800.0,"45014":1527811200.0,"46566":1530403200.0,"48118":1533081600.0,"49670":1535760000.0,"51222":1538352000.0,"52774":1541030400.0,"54326":1543622400.0,"55878":1546300800.0,"57430":1548979200.0,"58982":1551398400.0,"60534":1554076800.0,"62086":1556668800.0,"63638":1559347200.0,"65190":1561939200.0,"66742":1564617600.0,"68294":1567296000.0,"69846":1569888000.0,"71398":1572566400.0,"72950":1575158400.0,"74502":1577836800.0,"76054":1580515200.0,"77606":1583020800.0},"Usage_MWh":{"6":5.34858,"1558":3.78055,"3110":3.4831,"4662":3.74901,"6214":3.02347,"7766":7.63334,"9318":5.62975,"10870":5.51058,"12422":4.36067,"13974":3.29915,"15526":2.76066,"17078":2.94552,"18630":2.7777,"20182":2.76716,"21734":5.78573,"23286":4.8537129444,"24838":3.1271232778,"26390":2.8168842646,"27942":2.8774968882,"29494":2.8774968882,"31046":2.7846744079,"32598":2.8774968882,"34150":2.7846744079,"35702":2.8774968882,"37254":2.8774968882,"38806":2.5990294474,"40358":2.8774968882,"41910":2.5288257571,"43462":2.5724465738,"45014":3.3510199005,"46566":3.060722109,"48118":2.6989527056,"49670":2.5984900474,"51222":3.4421489889,"52774":3.4093083871,"54326":3.5249084516,"55878":3.1468401143,"57430":3.0175142015,"58982":3.3731491579,"60534":3.0313829708,"62086":3.1152347778,"63638":3.2681218106,"65190":3.4173398852,"66742":2.897951582,"68294":3.3056545,"69846":3.1457436,"71398":2.646408469,"72950":2.5245141129,"74502":2.7281552182,"76054":6.5999127071,"77606":7.7869288929}}
df = pd.DataFrame.from_dict(df_json)

window_length = 5
stride = 1
horizon = 1
X, y = SlidingWindow(window_length=window_length, stride=stride, horizon=horizon, get_x=['unix_timestamp'], get_y='Usage_MWh')(df)

splits = TimeSplitter()(X)
tfms = [None, [ToFloat(), ToNumpyTensor()]]
dsets = TSDatasets(X, y, tfms=tfms, splits=splits)
dls = TSDataLoaders.from_dsets(dsets.train, dsets.valid)

model = InceptionTime(dls.vars, 1)
learn = Learner(dls, model, loss_func=MSELossFlat())
learn.fit_one_cycle(100, 1e-3)
learn.plot_metrics()

valid_preds, valid_targets = learn.get_preds(ds_idx=1)

plt.plot(df.index[-len(splits[1]):].values, valid_preds, label='preds')
plt.plot(df.index[-len(splits[1]):].values, valid_targets, label='targets')
plt.title('Valid Preds vs Targets')
plt.legend(loc='best')
plt.show()

With this you should get an output like this:

(tensor([3.3068, 3.4284, 3.5788, 3.7468, 3.9250, 4.1177, 4.3224, 4.5454, 4.7916]),
 tensor([3.4173, 2.8980, 3.3057, 3.1457, 2.6464, 2.5245, 2.7282, 6.5999, 7.7869]))

The main issues you were facing are:

You have trained a model to take the last 5 steps and predict the next. But then you were only passing one value to predict the next. It doesn't work this way. The input shape in the future needs to be the same as the input during training.
The input was a numpy array, but then you were passing a tensor (hence your error message). You need to pass numpy arrays that you want to have predicted.
It's easier to have a smaller mse error if you have lots of data. Since you have very few datapoints here, it'd be better to set stride to 1 so that your training set is a bit larger. Otherwise you have too few training samples.

Please, let me know if this resolves your issues.

jeznag commented 3 years ago

Thanks for those tips!

It's almost got me there but not quite. I want to essentially do predictions on a test set out into the future rather than the validation/dev set used to iterate on the model.

I've tried below (making sure I predict the same number of items as I trained the model on) and get


def get_future_prediction_dates(num_months_to_predict):
  future_dates = list()
  for num_months_since_start in range(0, num_months_to_predict):
      date_for_iteration = add_months_to_date(start_date, num_months_since_start)

      seconds = unix_time_seconds(date_for_iteration)
      future_dates.append(tensor([seconds]))

  return np.array(future_dates)

def forecast_future_consumption(meter_df, learner, num_months_to_predict):
  results = dict()

  future_dates = get_future_prediction_dates(num_months_to_predict)

  print(f"predicting {future_dates}")
  predicted_consumption = learner.predict(future_dates)
  print(predicted_consumption)

  # Add a new entry for the timestamp if it doesn't exist
  results[date_for_iteration.isoformat()] = { "actual": actual_consumption}

  model_type = "tsai"
  # Add the prediction
  results[data_identifier][date_for_iteration.isoformat()][model_type] = predicted_consumption
  print(results)
  return results

from datetime import datetime
epoch = datetime.utcfromtimestamp(0)

def unix_time_seconds(dt):
    return (dt - epoch).total_seconds()

df_json = {"unix_timestamp":{"6":1451606400.0,"1558":1454284800.0,"3110":1456790400.0,"4662":1459468800.0,"6214":1462060800.0,"7766":1464739200.0,"9318":1467331200.0,"10870":1470009600.0,"12422":1472688000.0,"13974":1475280000.0,"15526":1477958400.0,"17078":1480550400.0,"18630":1483228800.0,"20182":1485907200.0,"21734":1488326400.0,"23286":1491004800.0,"24838":1493596800.0,"26390":1496275200.0,"27942":1498867200.0,"29494":1501545600.0,"31046":1504224000.0,"32598":1506816000.0,"34150":1509494400.0,"35702":1512086400.0,"37254":1514764800.0,"38806":1517443200.0,"40358":1519862400.0,"41910":1522540800.0,"43462":1525132800.0,"45014":1527811200.0,"46566":1530403200.0,"48118":1533081600.0,"49670":1535760000.0,"51222":1538352000.0,"52774":1541030400.0,"54326":1543622400.0,"55878":1546300800.0,"57430":1548979200.0,"58982":1551398400.0,"60534":1554076800.0,"62086":1556668800.0,"63638":1559347200.0,"65190":1561939200.0,"66742":1564617600.0,"68294":1567296000.0,"69846":1569888000.0,"71398":1572566400.0,"72950":1575158400.0,"74502":1577836800.0,"76054":1580515200.0,"77606":1583020800.0},"Usage_MWh":{"6":5.34858,"1558":3.78055,"3110":3.4831,"4662":3.74901,"6214":3.02347,"7766":7.63334,"9318":5.62975,"10870":5.51058,"12422":4.36067,"13974":3.29915,"15526":2.76066,"17078":2.94552,"18630":2.7777,"20182":2.76716,"21734":5.78573,"23286":4.8537129444,"24838":3.1271232778,"26390":2.8168842646,"27942":2.8774968882,"29494":2.8774968882,"31046":2.7846744079,"32598":2.8774968882,"34150":2.7846744079,"35702":2.8774968882,"37254":2.8774968882,"38806":2.5990294474,"40358":2.8774968882,"41910":2.5288257571,"43462":2.5724465738,"45014":3.3510199005,"46566":3.060722109,"48118":2.6989527056,"49670":2.5984900474,"51222":3.4421489889,"52774":3.4093083871,"54326":3.5249084516,"55878":3.1468401143,"57430":3.0175142015,"58982":3.3731491579,"60534":3.0313829708,"62086":3.1152347778,"63638":3.2681218106,"65190":3.4173398852,"66742":2.897951582,"68294":3.3056545,"69846":3.1457436,"71398":2.646408469,"72950":2.5245141129,"74502":2.7281552182,"76054":6.5999127071,"77606":7.7869288929}}
df = pd.DataFrame.from_dict(df_json)

window_length = 5
stride = 1
horizon = 1
X, y = SlidingWindow(window_length=window_length, stride=stride, horizon=horizon, get_x=['unix_timestamp'], get_y='Usage_MWh')(df)

splits = TimeSplitter()(X)
tfms = [None, [ToFloat(), ToNumpyTensor()]]
dsets = TSDatasets(X, y, tfms=tfms, splits=splits)
dls = TSDataLoaders.from_dsets(dsets.train, dsets.valid)

model = InceptionTime(dls.vars, 1)
learn = Learner(dls, model, loss_func=MSELossFlat())
learn.fit_one_cycle(100, 1e-3)

forecast_future_consumption(df, learn, len(dsets.train))

oguiza commented 3 years ago

Ok, I see. In the case you want to use the model on additional data, you need to create a new dataset and dataloader as shown in tutorial nb #1. The code would be something like this:

from datetime import datetime
epoch = datetime.utcfromtimestamp(0)

def unix_time_seconds(dt):
    return (dt - epoch).total_seconds()

df_json = {"unix_timestamp":{"6":1451606400.0,"1558":1454284800.0,"3110":1456790400.0,"4662":1459468800.0,"6214":1462060800.0,"7766":1464739200.0,"9318":1467331200.0,"10870":1470009600.0,"12422":1472688000.0,"13974":1475280000.0,"15526":1477958400.0,"17078":1480550400.0,"18630":1483228800.0,"20182":1485907200.0,"21734":1488326400.0,"23286":1491004800.0,"24838":1493596800.0,"26390":1496275200.0,"27942":1498867200.0,"29494":1501545600.0,"31046":1504224000.0,"32598":1506816000.0,"34150":1509494400.0,"35702":1512086400.0,"37254":1514764800.0,"38806":1517443200.0,"40358":1519862400.0,"41910":1522540800.0,"43462":1525132800.0,"45014":1527811200.0,"46566":1530403200.0,"48118":1533081600.0,"49670":1535760000.0,"51222":1538352000.0,"52774":1541030400.0,"54326":1543622400.0,"55878":1546300800.0,"57430":1548979200.0,"58982":1551398400.0,"60534":1554076800.0,"62086":1556668800.0,"63638":1559347200.0,"65190":1561939200.0,"66742":1564617600.0,"68294":1567296000.0,"69846":1569888000.0,"71398":1572566400.0,"72950":1575158400.0,"74502":1577836800.0,"76054":1580515200.0,"77606":1583020800.0},"Usage_MWh":{"6":5.34858,"1558":3.78055,"3110":3.4831,"4662":3.74901,"6214":3.02347,"7766":7.63334,"9318":5.62975,"10870":5.51058,"12422":4.36067,"13974":3.29915,"15526":2.76066,"17078":2.94552,"18630":2.7777,"20182":2.76716,"21734":5.78573,"23286":4.8537129444,"24838":3.1271232778,"26390":2.8168842646,"27942":2.8774968882,"29494":2.8774968882,"31046":2.7846744079,"32598":2.8774968882,"34150":2.7846744079,"35702":2.8774968882,"37254":2.8774968882,"38806":2.5990294474,"40358":2.8774968882,"41910":2.5288257571,"43462":2.5724465738,"45014":3.3510199005,"46566":3.060722109,"48118":2.6989527056,"49670":2.5984900474,"51222":3.4421489889,"52774":3.4093083871,"54326":3.5249084516,"55878":3.1468401143,"57430":3.0175142015,"58982":3.3731491579,"60534":3.0313829708,"62086":3.1152347778,"63638":3.2681218106,"65190":3.4173398852,"66742":2.897951582,"68294":3.3056545,"69846":3.1457436,"71398":2.646408469,"72950":2.5245141129,"74502":2.7281552182,"76054":6.5999127071,"77606":7.7869288929}}
df = pd.DataFrame.from_dict(df_json)

window_length = 5
stride = 1
horizon = 1
X, y = SlidingWindow(window_length=window_length, stride=stride, horizon=horizon, get_x=['unix_timestamp'], get_y='Usage_MWh')(df)

splits = get_splits(y, valid_size=.2, test_size=0.2, shuffle=False)

tfms = [None, [ToFloat(), ToNumpyTensor()]]
dsets = TSDatasets(X, y, tfms=tfms, splits=splits)
dls = TSDataLoaders.from_dsets(dsets.train, dsets.valid)

model = InceptionTime(dls.vars, 1)
learn = Learner(dls, model, loss_func=MSELossFlat())
learn.fit_one_cycle(100, 1e-3)
learn.plot_metrics()

valid_preds, valid_targets = learn.get_preds(ds_idx=1)

plt.plot(df.index[-len(splits[1]):].values, valid_preds, label='preds')
plt.plot(df.index[-len(splits[1]):].values, valid_targets, label='targets')
plt.title('Valid Preds vs Targets')
plt.legend(loc='best')
plt.show()

X_test = X[splits[2]] # splits[2] == test split
test_ds = dsets.valid.add_test(X_test)
test_dl = dls.valid.new(test_ds)

test_preds, *_ = learn.get_preds(dl=test_dl, with_decoded=True, save_preds=None, save_targs=None)

plt.plot(df.index[-len(splits[2]):].values, test_preds)
plt.title('Test Preds')
plt.show()

Please, let me know if this works.

jeznag commented 3 years ago

Thanks for your help! I'll write a blog post explaining the process.

oguiza commented 3 years ago

That'd be great! Thank you.

timeseriesAI / tsai

How do you a prediction for an arbitrary timestamp? #21