timeseriesAI / tsai

Time series Timeseries Deep Learning Machine Learning Python Pytorch fastai | State-of-the-art Deep Learning library for Time Series and Sequences in Pytorch / fastai
https://timeseriesai.github.io/tsai/
Apache License 2.0
5.19k stars 649 forks source link

How do you a prediction for an arbitrary timestamp? #21

Closed jeznag closed 3 years ago

jeznag commented 3 years ago

Sorry this is a really noob question. I've gone through tutorial 4 on time series regression and have successfully trained a learner. What I can't figure out is making a prediction on a future date. Here's what I tried:

from datetime import datetime
epoch = datetime.utcfromtimestamp(0)

def unix_time_seconds(dt):
    return (dt - epoch).total_seconds()

df_json = {"unix_timestamp":{"6":1451606400.0,"1558":1454284800.0,"3110":1456790400.0,"4662":1459468800.0,"6214":1462060800.0,"7766":1464739200.0,"9318":1467331200.0,"10870":1470009600.0,"12422":1472688000.0,"13974":1475280000.0,"15526":1477958400.0,"17078":1480550400.0,"18630":1483228800.0,"20182":1485907200.0,"21734":1488326400.0,"23286":1491004800.0,"24838":1493596800.0,"26390":1496275200.0,"27942":1498867200.0,"29494":1501545600.0,"31046":1504224000.0,"32598":1506816000.0,"34150":1509494400.0,"35702":1512086400.0,"37254":1514764800.0,"38806":1517443200.0,"40358":1519862400.0,"41910":1522540800.0,"43462":1525132800.0,"45014":1527811200.0,"46566":1530403200.0,"48118":1533081600.0,"49670":1535760000.0,"51222":1538352000.0,"52774":1541030400.0,"54326":1543622400.0,"55878":1546300800.0,"57430":1548979200.0,"58982":1551398400.0,"60534":1554076800.0,"62086":1556668800.0,"63638":1559347200.0,"65190":1561939200.0,"66742":1564617600.0,"68294":1567296000.0,"69846":1569888000.0,"71398":1572566400.0,"72950":1575158400.0,"74502":1577836800.0,"76054":1580515200.0,"77606":1583020800.0},"Usage_MWh":{"6":5.34858,"1558":3.78055,"3110":3.4831,"4662":3.74901,"6214":3.02347,"7766":7.63334,"9318":5.62975,"10870":5.51058,"12422":4.36067,"13974":3.29915,"15526":2.76066,"17078":2.94552,"18630":2.7777,"20182":2.76716,"21734":5.78573,"23286":4.8537129444,"24838":3.1271232778,"26390":2.8168842646,"27942":2.8774968882,"29494":2.8774968882,"31046":2.7846744079,"32598":2.8774968882,"34150":2.7846744079,"35702":2.8774968882,"37254":2.8774968882,"38806":2.5990294474,"40358":2.8774968882,"41910":2.5288257571,"43462":2.5724465738,"45014":3.3510199005,"46566":3.060722109,"48118":2.6989527056,"49670":2.5984900474,"51222":3.4421489889,"52774":3.4093083871,"54326":3.5249084516,"55878":3.1468401143,"57430":3.0175142015,"58982":3.3731491579,"60534":3.0313829708,"62086":3.1152347778,"63638":3.2681218106,"65190":3.4173398852,"66742":2.897951582,"68294":3.3056545,"69846":3.1457436,"71398":2.646408469,"72950":2.5245141129,"74502":2.7281552182,"76054":6.5999127071,"77606":7.7869288929}}
df = pd.DataFrame.from_dict(df_json)

window_length = 5
stride = None
horizon=1
X, y = SlidingWindow(5, get_x=['unix_timestamp'], get_y='Usage_MWh')(regression_df)
itemify(X, y)

splits = RandomSplitter()(X) 
tfms  = [None, [ToFloat(), ToNumpyTensor()]]
dsets = TSDatasets(X, y, tfms=tfms, splits=splits)
dls   = TSDataLoaders.from_dsets(dsets.train, dsets.valid)

model = InceptionTime(dls.vars, 1)
learn = Learner(dls, model, loss_func=MSELossFlat())

try:
  learn.fit_one_cycle(5)
  # learn.recorder.plot_metrics()

  valid_preds, valid_targets = learn.get_preds(ds_idx=1)
  valid_preds.flatten(), valid_targets.data

  y_pred = valid_targets.data.tolist()

  y_true = dsets.valid.items[1]

  future_date = datetime.strptime('2020-12-01', '%Y-%m-%d')
  seconds = unix_time_seconds(future_date)
  learn.predict(tensor([seconds]))

except Exception:
  print("ouch")
  traceback.print_exc()

I get

AssertionError: Expected an input of type in 
  - <class 'numpy.ndarray'>
 but got <class 'torch.Tensor'>

Sorry again for the really dumb question. It's probably a lack of fastai knowledge. I'll do a pull request afterwards and add the answer to the tutorial notebook.

oguiza commented 3 years ago

Hi @jeznag ,

No need to apologize for being a noobie. We all are in one area or another :)

There are a few issues with your code. I've made some changes to make it work:

from datetime import datetime
epoch = datetime.utcfromtimestamp(0)

def unix_time_seconds(dt):
    return (dt - epoch).total_seconds()

df_json = {"unix_timestamp":{"6":1451606400.0,"1558":1454284800.0,"3110":1456790400.0,"4662":1459468800.0,"6214":1462060800.0,"7766":1464739200.0,"9318":1467331200.0,"10870":1470009600.0,"12422":1472688000.0,"13974":1475280000.0,"15526":1477958400.0,"17078":1480550400.0,"18630":1483228800.0,"20182":1485907200.0,"21734":1488326400.0,"23286":1491004800.0,"24838":1493596800.0,"26390":1496275200.0,"27942":1498867200.0,"29494":1501545600.0,"31046":1504224000.0,"32598":1506816000.0,"34150":1509494400.0,"35702":1512086400.0,"37254":1514764800.0,"38806":1517443200.0,"40358":1519862400.0,"41910":1522540800.0,"43462":1525132800.0,"45014":1527811200.0,"46566":1530403200.0,"48118":1533081600.0,"49670":1535760000.0,"51222":1538352000.0,"52774":1541030400.0,"54326":1543622400.0,"55878":1546300800.0,"57430":1548979200.0,"58982":1551398400.0,"60534":1554076800.0,"62086":1556668800.0,"63638":1559347200.0,"65190":1561939200.0,"66742":1564617600.0,"68294":1567296000.0,"69846":1569888000.0,"71398":1572566400.0,"72950":1575158400.0,"74502":1577836800.0,"76054":1580515200.0,"77606":1583020800.0},"Usage_MWh":{"6":5.34858,"1558":3.78055,"3110":3.4831,"4662":3.74901,"6214":3.02347,"7766":7.63334,"9318":5.62975,"10870":5.51058,"12422":4.36067,"13974":3.29915,"15526":2.76066,"17078":2.94552,"18630":2.7777,"20182":2.76716,"21734":5.78573,"23286":4.8537129444,"24838":3.1271232778,"26390":2.8168842646,"27942":2.8774968882,"29494":2.8774968882,"31046":2.7846744079,"32598":2.8774968882,"34150":2.7846744079,"35702":2.8774968882,"37254":2.8774968882,"38806":2.5990294474,"40358":2.8774968882,"41910":2.5288257571,"43462":2.5724465738,"45014":3.3510199005,"46566":3.060722109,"48118":2.6989527056,"49670":2.5984900474,"51222":3.4421489889,"52774":3.4093083871,"54326":3.5249084516,"55878":3.1468401143,"57430":3.0175142015,"58982":3.3731491579,"60534":3.0313829708,"62086":3.1152347778,"63638":3.2681218106,"65190":3.4173398852,"66742":2.897951582,"68294":3.3056545,"69846":3.1457436,"71398":2.646408469,"72950":2.5245141129,"74502":2.7281552182,"76054":6.5999127071,"77606":7.7869288929}}
df = pd.DataFrame.from_dict(df_json)

window_length = 5
stride = 1
horizon = 1
X, y = SlidingWindow(window_length=window_length, stride=stride, horizon=horizon, get_x=['unix_timestamp'], get_y='Usage_MWh')(df)

splits = TimeSplitter()(X)
tfms = [None, [ToFloat(), ToNumpyTensor()]]
dsets = TSDatasets(X, y, tfms=tfms, splits=splits)
dls = TSDataLoaders.from_dsets(dsets.train, dsets.valid)

model = InceptionTime(dls.vars, 1)
learn = Learner(dls, model, loss_func=MSELossFlat())
learn.fit_one_cycle(100, 1e-3)
learn.plot_metrics()

valid_preds, valid_targets = learn.get_preds(ds_idx=1)

plt.plot(df.index[-len(splits[1]):].values, valid_preds, label='preds')
plt.plot(df.index[-len(splits[1]):].values, valid_targets, label='targets')
plt.title('Valid Preds vs Targets')
plt.legend(loc='best')
plt.show()

With this you should get an output like this:

(tensor([3.3068, 3.4284, 3.5788, 3.7468, 3.9250, 4.1177, 4.3224, 4.5454, 4.7916]),
 tensor([3.4173, 2.8980, 3.3057, 3.1457, 2.6464, 2.5245, 2.7282, 6.5999, 7.7869]))

image image The main issues you were facing are:

Please, let me know if this resolves your issues.

jeznag commented 3 years ago

Thanks for those tips!

It's almost got me there but not quite. I want to essentially do predictions on a test set out into the future rather than the validation/dev set used to iterate on the model.

I've tried below (making sure I predict the same number of items as I trained the model on) and get image


def get_future_prediction_dates(num_months_to_predict):
  future_dates = list()
  for num_months_since_start in range(0, num_months_to_predict):
      date_for_iteration = add_months_to_date(start_date, num_months_since_start)

      seconds = unix_time_seconds(date_for_iteration)
      future_dates.append(tensor([seconds]))

  return np.array(future_dates)

def forecast_future_consumption(meter_df, learner, num_months_to_predict):
  results = dict()

  future_dates = get_future_prediction_dates(num_months_to_predict)

  print(f"predicting {future_dates}")
  predicted_consumption = learner.predict(future_dates)
  print(predicted_consumption)

  # Add a new entry for the timestamp if it doesn't exist
  results[date_for_iteration.isoformat()] = { "actual": actual_consumption}

  model_type = "tsai"
  # Add the prediction
  results[data_identifier][date_for_iteration.isoformat()][model_type] = predicted_consumption
  print(results)
  return results

from datetime import datetime
epoch = datetime.utcfromtimestamp(0)

def unix_time_seconds(dt):
    return (dt - epoch).total_seconds()

df_json = {"unix_timestamp":{"6":1451606400.0,"1558":1454284800.0,"3110":1456790400.0,"4662":1459468800.0,"6214":1462060800.0,"7766":1464739200.0,"9318":1467331200.0,"10870":1470009600.0,"12422":1472688000.0,"13974":1475280000.0,"15526":1477958400.0,"17078":1480550400.0,"18630":1483228800.0,"20182":1485907200.0,"21734":1488326400.0,"23286":1491004800.0,"24838":1493596800.0,"26390":1496275200.0,"27942":1498867200.0,"29494":1501545600.0,"31046":1504224000.0,"32598":1506816000.0,"34150":1509494400.0,"35702":1512086400.0,"37254":1514764800.0,"38806":1517443200.0,"40358":1519862400.0,"41910":1522540800.0,"43462":1525132800.0,"45014":1527811200.0,"46566":1530403200.0,"48118":1533081600.0,"49670":1535760000.0,"51222":1538352000.0,"52774":1541030400.0,"54326":1543622400.0,"55878":1546300800.0,"57430":1548979200.0,"58982":1551398400.0,"60534":1554076800.0,"62086":1556668800.0,"63638":1559347200.0,"65190":1561939200.0,"66742":1564617600.0,"68294":1567296000.0,"69846":1569888000.0,"71398":1572566400.0,"72950":1575158400.0,"74502":1577836800.0,"76054":1580515200.0,"77606":1583020800.0},"Usage_MWh":{"6":5.34858,"1558":3.78055,"3110":3.4831,"4662":3.74901,"6214":3.02347,"7766":7.63334,"9318":5.62975,"10870":5.51058,"12422":4.36067,"13974":3.29915,"15526":2.76066,"17078":2.94552,"18630":2.7777,"20182":2.76716,"21734":5.78573,"23286":4.8537129444,"24838":3.1271232778,"26390":2.8168842646,"27942":2.8774968882,"29494":2.8774968882,"31046":2.7846744079,"32598":2.8774968882,"34150":2.7846744079,"35702":2.8774968882,"37254":2.8774968882,"38806":2.5990294474,"40358":2.8774968882,"41910":2.5288257571,"43462":2.5724465738,"45014":3.3510199005,"46566":3.060722109,"48118":2.6989527056,"49670":2.5984900474,"51222":3.4421489889,"52774":3.4093083871,"54326":3.5249084516,"55878":3.1468401143,"57430":3.0175142015,"58982":3.3731491579,"60534":3.0313829708,"62086":3.1152347778,"63638":3.2681218106,"65190":3.4173398852,"66742":2.897951582,"68294":3.3056545,"69846":3.1457436,"71398":2.646408469,"72950":2.5245141129,"74502":2.7281552182,"76054":6.5999127071,"77606":7.7869288929}}
df = pd.DataFrame.from_dict(df_json)

window_length = 5
stride = 1
horizon = 1
X, y = SlidingWindow(window_length=window_length, stride=stride, horizon=horizon, get_x=['unix_timestamp'], get_y='Usage_MWh')(df)

splits = TimeSplitter()(X)
tfms = [None, [ToFloat(), ToNumpyTensor()]]
dsets = TSDatasets(X, y, tfms=tfms, splits=splits)
dls = TSDataLoaders.from_dsets(dsets.train, dsets.valid)

model = InceptionTime(dls.vars, 1)
learn = Learner(dls, model, loss_func=MSELossFlat())
learn.fit_one_cycle(100, 1e-3)

forecast_future_consumption(df, learn, len(dsets.train))
oguiza commented 3 years ago

Ok, I see. In the case you want to use the model on additional data, you need to create a new dataset and dataloader as shown in tutorial nb #1. The code would be something like this:

from datetime import datetime
epoch = datetime.utcfromtimestamp(0)

def unix_time_seconds(dt):
    return (dt - epoch).total_seconds()

df_json = {"unix_timestamp":{"6":1451606400.0,"1558":1454284800.0,"3110":1456790400.0,"4662":1459468800.0,"6214":1462060800.0,"7766":1464739200.0,"9318":1467331200.0,"10870":1470009600.0,"12422":1472688000.0,"13974":1475280000.0,"15526":1477958400.0,"17078":1480550400.0,"18630":1483228800.0,"20182":1485907200.0,"21734":1488326400.0,"23286":1491004800.0,"24838":1493596800.0,"26390":1496275200.0,"27942":1498867200.0,"29494":1501545600.0,"31046":1504224000.0,"32598":1506816000.0,"34150":1509494400.0,"35702":1512086400.0,"37254":1514764800.0,"38806":1517443200.0,"40358":1519862400.0,"41910":1522540800.0,"43462":1525132800.0,"45014":1527811200.0,"46566":1530403200.0,"48118":1533081600.0,"49670":1535760000.0,"51222":1538352000.0,"52774":1541030400.0,"54326":1543622400.0,"55878":1546300800.0,"57430":1548979200.0,"58982":1551398400.0,"60534":1554076800.0,"62086":1556668800.0,"63638":1559347200.0,"65190":1561939200.0,"66742":1564617600.0,"68294":1567296000.0,"69846":1569888000.0,"71398":1572566400.0,"72950":1575158400.0,"74502":1577836800.0,"76054":1580515200.0,"77606":1583020800.0},"Usage_MWh":{"6":5.34858,"1558":3.78055,"3110":3.4831,"4662":3.74901,"6214":3.02347,"7766":7.63334,"9318":5.62975,"10870":5.51058,"12422":4.36067,"13974":3.29915,"15526":2.76066,"17078":2.94552,"18630":2.7777,"20182":2.76716,"21734":5.78573,"23286":4.8537129444,"24838":3.1271232778,"26390":2.8168842646,"27942":2.8774968882,"29494":2.8774968882,"31046":2.7846744079,"32598":2.8774968882,"34150":2.7846744079,"35702":2.8774968882,"37254":2.8774968882,"38806":2.5990294474,"40358":2.8774968882,"41910":2.5288257571,"43462":2.5724465738,"45014":3.3510199005,"46566":3.060722109,"48118":2.6989527056,"49670":2.5984900474,"51222":3.4421489889,"52774":3.4093083871,"54326":3.5249084516,"55878":3.1468401143,"57430":3.0175142015,"58982":3.3731491579,"60534":3.0313829708,"62086":3.1152347778,"63638":3.2681218106,"65190":3.4173398852,"66742":2.897951582,"68294":3.3056545,"69846":3.1457436,"71398":2.646408469,"72950":2.5245141129,"74502":2.7281552182,"76054":6.5999127071,"77606":7.7869288929}}
df = pd.DataFrame.from_dict(df_json)

window_length = 5
stride = 1
horizon = 1
X, y = SlidingWindow(window_length=window_length, stride=stride, horizon=horizon, get_x=['unix_timestamp'], get_y='Usage_MWh')(df)

splits = get_splits(y, valid_size=.2, test_size=0.2, shuffle=False)

tfms = [None, [ToFloat(), ToNumpyTensor()]]
dsets = TSDatasets(X, y, tfms=tfms, splits=splits)
dls = TSDataLoaders.from_dsets(dsets.train, dsets.valid)

model = InceptionTime(dls.vars, 1)
learn = Learner(dls, model, loss_func=MSELossFlat())
learn.fit_one_cycle(100, 1e-3)
learn.plot_metrics()

valid_preds, valid_targets = learn.get_preds(ds_idx=1)

plt.plot(df.index[-len(splits[1]):].values, valid_preds, label='preds')
plt.plot(df.index[-len(splits[1]):].values, valid_targets, label='targets')
plt.title('Valid Preds vs Targets')
plt.legend(loc='best')
plt.show()

X_test = X[splits[2]] # splits[2] == test split
test_ds = dsets.valid.add_test(X_test)
test_dl = dls.valid.new(test_ds)

test_preds, *_ = learn.get_preds(dl=test_dl, with_decoded=True, save_preds=None, save_targs=None)

plt.plot(df.index[-len(splits[2]):].values, test_preds)
plt.title('Test Preds')
plt.show()

Please, let me know if this works.

jeznag commented 3 years ago

Thanks for your help! I'll write a blog post explaining the process.

oguiza commented 3 years ago

That'd be great! Thank you.