steve-the-bayesian / BOOM

A C++ library for Bayesian modeling, mainly through Markov chain Monte Carlo, but with a few other methods supported. BOOM = "Bayesian Object Oriented Modeling". It is also the sound your computer makes when it crashes.
GNU Lesser General Public License v2.1
35 stars 14 forks source link

Possible Problem with regression component #41

Closed alexiosg closed 4 years ago

alexiosg commented 4 years ago

Hi Steve,

Not sure whether this is a bug or some misspecification on my part, but when adding a regression component in a seasonal model the predictions seem to be off by 1 time step. I have created a "minimally" reproducible example below with comments:

library(bsts)
## simulation of data
# just a simple weekly model which allocates to fixed day of week proportions
N = 7*52
dateseq <- seq(as.Date("2000-01-02"), length.out = N, by = "days")
proportions <- c(0.005, 0.14, 0.17, 0.23, 0.265, 0.18, 0.01)
pMatrix <- matrix(0, ncol = 1, nrow = N)
for (i in 1:7) {
  pMatrix[which(weekdays(dateseq, FALSE) == weekday.names[i]), 1] = proportions[i]
}
set.seed(102)
weekly.sim <- arima.sim(list(ma=-0.1), n = 52, sd = 0.01)
weekly.sim <- exp(diffinv(weekly.sim, differences = 1, xi = log(200)))[-1]
plot(weekly.sim, type = "l")
seqindx <- c(seq(1, N, by =  7), N + 1)
daily.sim.allocate <- do.call(rbind, lapply(1:(length(seqindx) - 1), function(i){
  cbind(pMatrix[seqindx[i]:(seqindx[i + 1] - 1)] * weekly.sim[i])
}))
# add a holiday effect
independence <- which(dateseq == "2000-07-04")
daily.sim.allocate[independence] <- daily.sim.allocate[independence] * 0.1
xreg <- rep(0, N)
xreg[independence] <- 1
colnames(xreg) <- "independence"
dat <- data.frame(y = as.numeric(daily.sim.allocate), independence = xreg, days = weekdays(dateseq), stringsAsFactors = FALSE)
rownames(dat) <- dateseq
plot(dateseq, dat[,1], type = "l")
tail(dat[1:(independence+24),])

## estimation part

spec <- AddLocalLevel(list(), y = as.numeric(dat[1:(independence+24),1]))
spec <- AddSeasonal(spec, y = as.numeric(dat[1:(independence+24),1]), nseasons = 7)
# estimate with regressor and intercept
mod1 <- bsts(y~independence, data = dat[1:(independence + 24), 1:2], state.specification = spec, niter = 1000)
# estimate with regressor and no intercept
mod2 <- bsts(y~independence-1, data = dat[1:(independence + 24), 1:2], state.specification = spec, niter = 1000)
# estimate with no regressor and no intercept
mod3 <- bsts(as.numeric(dat[1:(independence + 24), 1]), state.specification = spec, niter = 1000)

## forecast part
horizon <- length((independence + 25):N)
# the newdata has only zeros.
newdata <- dat[(independence + 25):N, 2, drop = FALSE]

p1 <- predict(mod1, newdata = newdata)
p2 <- predict(mod2, newdata = newdata)
p3 <- predict(mod3, horizon = horizon)

# comparison of output
comparison <- data.frame(day = weekdays(as.Date(rownames(dat[(independence + 25):N, 2, drop = FALSE]))),
                         mean1 = colMeans(p1$distribution),
                         mean2 = colMeans(p2$distribution),
                         mean3 = colMeans(p3$distribution),
                         actual = dat[(independence + 25):N, 1])

head(comparison)

# so I can see that the model without a regressor (mean3) is predicting as expected, but the other models with the a regressor appear to produce a forecast which is off by 1 time step. I have manually re-created the posterior predictive distribution by sampling from the final states  and the distribution of the parameters (variances and coefficients) and have found that this problem does not occur in this case, which leads me to think that this may be a bug.

day               mean1         mean2         mean3          actual
Saturday      38.744172  38.722979   2.221122      2.153567
Sunday        2.987476    2.998383    1.281937       1.074628
Monday       1.942250    1.934938     29.627750    30.089592
Tuesday      30.622240  30.638868  34.298719    36.537362
Wednesday 36.790501  36.776222   48.544461   49.432902
Thursday     49.416352  49.436155   55.644073   56.955300

Thanks! Alexios

steve-the-bayesian commented 4 years ago

Thanks for reporting this, and especially for the example.

On Tue, Oct 15, 2019 at 9:12 PM Alexios Galanos notifications@github.com wrote:

Hi Steve,

Not sure whether this is a bug or some misspecification on my part, but when adding a regression component in a seasonal model the predictions seem to be off by 1 time step. I have created a "minimally" reproducible example below with comments:

library(bsts)## simulation of data# just a simple weekly model which allocates to fixed day of week proportionsN = 752dateseq <- seq(as.Date("2000-01-02"), length.out = N, by = "days")proportions <- c(0.005, 0.14, 0.17, 0.23, 0.265, 0.18, 0.01)pMatrix <- matrix(0, ncol = 1, nrow = N)for (i in 1:7) { pMatrix[which(weekdays(dateseq, FALSE) == weekday.names[i]), 1] = proportions[i] } set.seed(102)weekly.sim <- arima.sim(list(ma=-0.1), n = 52, sd = 0.01)weekly.sim <- exp(diffinv(weekly.sim, differences = 1, xi = log(200)))[-1] plot(weekly.sim, type = "l")seqindx <- c(seq(1, N, by = 7), N + 1)daily.sim.allocate <- do.call(rbind, lapply(1:(length(seqindx) - 1), function(i){ cbind(pMatrix[seqindx[i]:(seqindx[i + 1] - 1)] weekly.sim[i]) }))# add a holiday effectindependence <- which(dateseq == "2000-07-04")daily.sim.allocate[independence] <- daily.sim.allocate[independence] * 0.1xreg <- rep(0, N)xreg[independence] <- 1 colnames(xreg) <- "independence"dat <- data.frame(y = as.numeric(daily.sim.allocate), independence = xreg, days = weekdays(dateseq), stringsAsFactors = FALSE) rownames(dat) <- dateseq plot(dateseq, dat[,1], type = "l") tail(dat[1:(independence+24),])

estimation part

spec <- AddLocalLevel(list(), y = as.numeric(dat[1:(independence+24),1]))spec <- AddSeasonal(spec, y = as.numeric(dat[1:(independence+24),1]), nseasons = 7)# estimate with regressor and interceptmod1 <- bsts(y~independence, data = dat[1:(independence + 24), 1:2], state.specification = spec, niter = 1000)# estimate with regressor and no interceptmod2 <- bsts(y~independence-1, data = dat[1:(independence + 24), 1:2], state.specification = spec, niter = 1000)# estimate with no regressor and no interceptmod3 <- bsts(as.numeric(dat[1:(independence + 24), 1]), state.specification = spec, niter = 1000)

forecast parthorizon <- length((independence + 25):N)# the newdata has only zeros.newdata <- dat[(independence + 25):N, 2, drop = FALSE]

p1 <- predict(mod1, newdata = newdata)p2 <- predict(mod2, newdata = newdata)p3 <- predict(mod3, horizon = horizon)

comparison of outputcomparison <- data.frame(day = weekdays(as.Date(rownames(dat[(independence + 25):N, 2, drop = FALSE]))),

                     mean1 = colMeans(p1$distribution),
                     mean2 = colMeans(p2$distribution),
                     mean3 = colMeans(p3$distribution),
                     actual = dat[(independence + 25):N, 1])

head(comparison)

so I can see that the model without a regressor (mean3) is predicting as expected, but the other models with the a regressor appear to produce a forecast which is off by 1 time step. I have manually re-created the posterior predictive distribution by sampling from the final states and the distribution of the parameters (variances and coefficients) and have found that this problem does not occur in this case, which leads me to think that this may be a bug.

day mean1 mean2 mean3 actualSaturday 38.744172 38.722979 2.221122 2.153567Sunday 2.987476 2.998383 1.281937 1.074628Monday 1.942250 1.934938 29.627750 30.089592Tuesday 30.622240 30.638868 34.298719 36.537362Wednesday 36.790501 36.776222 48.544461 49.432902Thursday 49.416352 49.436155 55.644073 56.955300

Thanks! Alexios

— You are receiving this because you are subscribed to this thread. Reply to this email directly, view it on GitHub https://github.com/steve-the-bayesian/BOOM/issues/41?email_source=notifications&email_token=ABMVDVNZX5JIGREH7KWHTZLQO2IEBA5CNFSM4JBF4R2KYY3PNVWWK3TUL52HS4DFUVEXG43VMWVGG33NNVSW45C7NFSM4HSBMODQ, or unsubscribe https://github.com/notifications/unsubscribe-auth/ABMVDVI7XSCIFLUS63DW7ATQO2IEBANCNFSM4JBF4R2A .

steve-the-bayesian commented 4 years ago

Also, Dear Steve,

First of all – thanks for your work on the BSTS package and the great documentation! I have a question that I could not resolve in two days and I thought you might know the answer instantly.

I have a bsts model and I add holidays as well as other additional regressors. However, when adding additional regressors, the forecast shifts by one day. Even when I only include a dummy for a high value at the end of the month, this happens. In the graph below you see the blue line, which is a bsts forecast without additional regressors and the orange line a forecast with a dummy (xts_eom_high). While the blue line matches data and the weekly cycle, the orange line is delayed by one day.

I attached the relevant part of the R script. I have tried a lot of other things, and I’d be grateful for any input as to why this happens and how I can find out when this happens.

Also, does adding holidays the way I do it work? Because I holidays like Easter which are different across years show little effect. Steve Output sample.xlsx

Thank you very much for your help and best regards

Johannes

Add libraries

......

####################### SETUP ####################### ds_train_start <- as.Date('2014-06-01') ds_train_end <- as.Date('2018-12-31')

ds_prediction_start <- as.Date('2019-01-01') ds_prediction_end <- as.Date('2019-12-31') USE_END_OF_MONTH_HIGH_FOR_BSTS <- TRUE

BSTS_NITER = 500 #####################################################

xts_y = as.xts(zooreg(train_data$y, start=ds_train_start)) xts_eom_high = as.xts(zooreg(train_data$end_of_month_high, start=ds_train_start))

seas <- AddSemilocalLinearTrend(list(), xts_y) seas <- AddSeasonal(seas, xts_y, nseasons = 7) seas <- AddSeasonal(seas, xts_y, nseasons = 365)

Adding Holidays

The holidays dataframe has a columns with names of the holidays called "holiday" and a column with all the dates "ds"

holiday_names = unique(holidays$holiday)

for(hnr in 1:length(holiday_names)) { holiday_tmp_name = paste0("bstshol", sub(" ", "_", holiday_names[hnr])) holiday_tmp_dates = holidays$ds[holidays$holiday == holiday_names[hnr]] assign(holiday_tmp_name, DateRangeHoliday(holiday_names[hnr], start = holiday_tmp_dates, end = holiday_tmp_dates))

seas <- AddRegressionHoliday(seas, xts_y, time0 = ds_train_start, holiday.list=list(get(holiday_tmp_name))) }

if (USE_END_OF_MONTH_HIGH_FOR_BSTS) { model_bsts <- bsts(xts_y ~ ., # ~ xts_eom_high, state.specification = seas, data = xts_eom_high, niter = BSTS_NITER) } else { model_bsts <- bsts(xts_y, # ~ ., state.specification = seas, niter = BSTS_NITER) }

plot(model_bsts)

Get a suggested number of burn-ins

burn <- SuggestBurn(0.2, model_bsts)

Predict

The error from before: 'newdata' hat 365 Zeilen , aber die gefundenen Variablen haben 1674 Zeilen -> how do I add the end_of_month_high for the prediction

if (USE_END_OF_MONTH_HIGH_FOR_BSTS) { future_bsts_data <- subset(sc_fin, date >= ds_prediction_start & date <= ds_prediction_end, c("date","end_of_month_high")) xts_eom_high_future = as.xts(zooreg(future_bsts_data$end_of_month_high, start=ds_prediction_start)) forecast_bsts <- predict(model_bsts, newdata=xts_eom_high_future, horizon = 365, burn = burn) } else { forecast_bsts <- predict(model_bsts, horizon = 365, burn = burn) }

pred_data_bsts <- data.frame(ds = prediction_dates, v_bsts = as.numeric(forecast_bsts$mean))

steve-the-bayesian commented 4 years ago

Fixed and tested. Closed pending release.