atsa-es / atsar

Applied time series analysis in R with Stan. Allows fast Bayesian fitting of multivariate time-series models.
https://atsa-es.github.io/atsar/

atsar dlm is producing poor fits #1

Open. eeholmes opened this issue 4 years ago.

eeholmes commented 4 years ago

Example

library(MARSS)

# load the data
data(SalmonSurvCUI, package="MARSS")
# get time indices
years <- SalmonSurvCUI[,1]
# number of years of data
TT <- length(years)
# get response variable: logit(survival)
dat <- matrix(SalmonSurvCUI[,2],nrow=1)

# get predictor variable
CUI <- SalmonSurvCUI[,3]
## z-score the CUI
CUI.z <- matrix((CUI - mean(CUI))/sqrt(var(CUI)), nrow=1)
# number of regr params (slope + intercept)
m <- dim(CUI.z)[1] + 1

## plot data
par(mfrow=c(m,1), mar=c(4,4,0.1,0), oma=c(0,0,2,0.5))
plot(years, dat, xlab="", ylab="Logit(s)", bty="n", xaxt="n", pch=16, col="darkgreen", type="b")
plot(years, CUI.z, xlab="", ylab="CUI", bty="n", xaxt="n", pch=16, col="blue", type="b")
axis(1,at=seq(1965,2005,5))
mtext("Year of ocean entry", 1, line=3)

## univariate DLM -------------
# for process eqn
B <- diag(m)                     ## 2x2; Identity
U <- matrix(0,nrow=m,ncol=1)     ## 2x1; both elements = 0
Q <- matrix(list(0),m,m)         ## 2x2; all 0 for now
diag(Q) <- c("q.alpha","q.beta") ## 2x2; diag = (q1,q2)

# for observation eqn
Z <- array(NA, c(1,m,TT))   ## NxMxT; empty for now
Z[1,1,] <- rep(1,TT)        ## Nx1; 1's for intercept
Z[1,2,] <- CUI.z            ## Nx1; predictor variable
A <- matrix(0)              ## 1x1; scalar = 0
R <- matrix("r")            ## 1x1; scalar = r

# only need starting values for regr parameters
inits.list <- list(x0=matrix(c(0, 0), nrow=m))
# list of model matrices & vectors
mod.list <- list(B=B, U=U, Q=Q, Z=Z, A=A, R=R)

# fit univariate DLM
dlm1 <- MARSS(dat, inits=inits.list, model=mod.list)

# fit with atsar
library(atsar)
library(rstan)
# lmmod is not defined in the original post; an assumed linear model that
# supplies the intercept + CUI design matrix
lmmod <- lm(SalmonSurvCUI$logit.s ~ CUI.z[1,])
mod2 = fit_stan(y = SalmonSurvCUI$logit.s,
                x = model.matrix(lmmod),
                model_name="dlm")
pars = extract(mod2)
fc2 <- apply(pars$pred, 2, mean)
fc_lb2 <- apply(pars$pred, 2, quantile, 0.025)
fc_ub2 <- rev(apply(pars$pred, 2, quantile, 0.975))
xx <- c(years, rev(years))

Plot the differences:

layout(matrix(1:2))
# y-axis limits spanning the data and the atsar credible interval
# (fore.mean / fore.var were undefined in the snippet above)
ylims <- range(c(dat, fc_lb2, fc_ub2))
plot(years, t(dat), type="p", pch=16, ylim=ylims,
     col="blue", xlab="", ylab="Logit(s)", xaxt="n")
tmp <- broom::augment(dlm1, interval="confidence")
polygon(x = xx, y = c(tmp$.conf.low, rev(tmp$.conf.up)), col = scales::alpha('gray', .5), border = NA)
lines(years, tmp$.fitted, type="l", xaxt="n", ylab="")
title("MARSS")
#
pars = extract(mod2)
fc <- apply(pars$pred, 2, mean)
fc_lb <- apply(pars$pred, 2, quantile, 0.025)
fc_ub <- rev(apply(pars$pred, 2, quantile, 0.975))
plot(x = years, y = t(dat), pch = 16, col="blue", ylim = ylims)
polygon(x = xx, y = c(fc_lb, fc_ub), col = scales::alpha('gray', .5), border = NA)
lines(x = years, y = fc, type="l")
title("atsar")

[figure: marss-atsar, the MARSS and atsar fits compared]

eeholmes commented 4 years ago

The following produces better fits:

mod0 <- rstan::stan( here::here("dlm-vec.stan"), data = data_list,
                     pars = c("F_Theta", "Theta", "Theta0", "tau", "L_Omega", "R", "L", "Q"),
                     control = list(adapt_delta = .995, max_treedepth = 15),
                     cores = 4L,
                     chains = mcmc_list$n_chain,
                     warmup = mcmc_list$n_warmup,
                     iter = mcmc_list$n_iter,
                     thin = mcmc_list$n_thin)

pars = extract(mod0)
fc <- apply(pars$F_Theta, 2, mean)
fc_lb <- apply(pars$F_Theta, 2, quantile, 0.01)
fc_ub <- rev(apply(pars$F_Theta, 2, quantile, 0.99))
xx <- c(years, rev(years))
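
Here data_list and mcmc_list are not defined in the snippet; a plausible construction from the earlier example (the element names are assumptions chosen to match the Stan data block below):

# assumed inputs, built from the DLM example above
data_list <- list(N = TT,                         # length of the time series
                  K = m,                          # number of regression params
                  y = as.vector(dat),             # logit(survival)
                  F = cbind(1, as.vector(CUI.z))) # N x K design matrix: intercept + CUI
mcmc_list <- list(n_chain = 3, n_warmup = 500,    # illustrative sampler settings
                  n_iter = 1000, n_thin = 1)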

where dlm-vec.stan is

data {
  int<lower=0> N; // length of dependent variable
  int<lower=0> K; // number of indep vars
  vector[N] y;
  row_vector[K] F[N]; // N row_vectors of size K (array[N,K])
}

transformed data {
  for (n in 1:N) {
      print("F[", n, "] = ", F[n]);
  }
}

parameters {
  vector[K] Theta0; // init Theta
  vector[K] Theta[N]; // state space parameter
  real<lower=0> R; // model error
  cholesky_factor_corr[K] L_Omega; //prior correlation
  vector<lower=0>[K] tau; // prior scale
}

transformed parameters {
  matrix[K, K] L;
  vector[N] F_Theta;

  L = diag_pre_multiply(tau, L_Omega);

  for (n in 1:N)
    F_Theta[n] = F[n]*Theta[n];
}

model {
  R ~ exponential(1);
  Theta0 ~ normal(0, 5);
  L_Omega ~ lkj_corr_cholesky(1);
  tau ~ exponential(1);
  Theta[1] ~ multi_normal_cholesky(Theta0, L);
  for (n in 2:N)
    Theta[n] ~ multi_normal_cholesky(Theta[n-1], L);
  y ~ normal(F_Theta, R);
}

generated quantities {
  matrix[K, K] Q;
  Q = L * L';
}

The output in this case is:

[figure: marss-stan, the MARSS and Stan fits compared]

eeholmes commented 4 years ago

After more poking into dlm.stan and comparing it to ss_rw.stan in atsar, I think the problem is more basic and probably has to do with the 2-D matrices. Here I show a random walk observed with error, coded both as a DLM and as a random walk with error. The model fits fine with the exec/ss_rw.stan code but not with the exec/dlm.stan code.

library(atsar)
library(rstan)

n <- 30
y = cumsum(rnorm(n,0,.1))
FF <- rep(1,n)
mod1 = fit_stan(y = y,
                x = FF,
                model_name="dlm",
                mcmc_list = list(n_mcmc = 1000, n_burn = 500, 
                                 n_chain = 3, n_thin = 1))

mod2 = fit_stan(y = y,
                model_name="ss_rw",
                mcmc_list = list(n_mcmc = 1000, n_burn = 500, 
                                 n_chain = 3, n_thin = 1))

layout(matrix(1:2))
pars = extract(mod1)
fc <- apply(pars$pred, 2, mean)
plot(x = 1:n, y = y, pch = 16, col="blue")
lines(x = 1:n, y = fc, type="l")
title("dlm")

pars = extract(mod2)
fc <- apply(pars$pred, 2, mean)
plot(x = 1:n, y = y, pch = 16, col="blue")
lines(x = 1:n, y = fc, type="l")
title("ss_rw")

[figure: dlm vs ss_rw fits]

eeholmes commented 4 years ago

Note that

beta0[k] ~ normal(0,1);

in dlm.stan is definitely a problem, since beta might be well outside that range. ss_rw.stan uses normal(0, 10). But even when I changed that, it didn't fix the problem.
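
A quick check, using the smoothed state estimates from the MARSS fit above, shows the range beta actually occupies:

# smoothed states (intercept and slope paths) from the earlier MARSS fit
range(dlm1$states)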

dantonnoriega commented 4 years ago

Whatever version I merged still had my work-in-progress scripts. Apologies for that.

I find that dlm-vec2.stan, a non-centered version of the model, samples faster with fewer divergences. Note that I set adapt_delta = 0.99 and max_treedepth = 12; this proves to be critical. I plan to see what happens if I run the original atsar::fit_stan dlm model but add the option to set adapt_delta and max_treedepth (a direct rstan call with these settings is sketched after the model code below).

data {
  int<lower=0> N; // length of dependent variable
  int<lower=0> K; // number of indep vars
  vector[N] y; // univariate time series
  row_vector[K] F[N]; // N vectors of size K (array[N,K])
}

transformed data {
  for (n in 1:N) {
      print("F[", n, "] = ", F[n]);
  }
}

parameters {
  vector[K] Theta0; // init Theta
  real<lower=0> R; // model error
  cholesky_factor_corr[K] L_Omega; // cholesky factor of correlation matrix Omega
  vector<lower=0>[K] tau; // scale values for Thetas
  vector[K] z[N]; // std normal for each iteration of Theta
}

transformed parameters {
  matrix[K, K] L;
  vector[K] Theta[N]; // state space parameter
  vector[N] F_Theta;

  L = diag_pre_multiply(tau, L_Omega);

  // non-centered multivariate normal reparameterization
  // learned from http://modernstatisticalworkflow.blogspot.com/2017/07/a-few-simple-reparameterizations.html
  Theta[1] = Theta0 + L * z[1];
  for (n in 2:N)
    Theta[n] = Theta[n-1] + L * z[n];

  for (n in 1:N)
    F_Theta[n] = F[n]*Theta[n];
}

model {
  R  ~ exponential(2);
  Theta0 ~ normal(0, 5);
  L_Omega ~ lkj_corr_cholesky(3);
  tau ~ exponential(2);

  for (n in 1:N)
    z[n] ~ normal(0, 1);
  y ~ normal(F_Theta, R);
}

generated quantities {
  matrix[K, K] Q;
  Q = L * L';
}
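
For reference, a sketch of fitting this non-centered model directly with rstan and the sampler settings above (the file name and data_list are assumed, as in the earlier comments):

# assumes dlm-vec2.stan is saved locally and data_list is built as sketched earlier
mod_nc <- rstan::stan("dlm-vec2.stan", data = data_list,
                      pars = c("F_Theta", "Theta", "tau", "L_Omega", "R", "Q"),
                      control = list(adapt_delta = 0.99, max_treedepth = 12),
                      chains = 3, warmup = 500, iter = 1000, cores = 4L)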

dantonnoriega commented 4 years ago

Better still, modeling the differences between successive states greatly improves sampling speed and effective sample size. Note the option to pass a theta0_init_prior (a usage sketch follows the model code below).

I learned this from a comment on the Gelman blog post https://statmodeling.stat.columbia.edu/2019/04/15/state-space-models-in-stan/:

That's a perennial problem with spatio-temporal models as you're setting up the correlations in the model. A helpful reparameterization can be to model the differences. For example, if you have a latent time series alpha[1], ..., alpha[T], the direct model alpha[t + 1] ~ normal(alpha[t], sigma), where alpha is a parameter, leads to dependence between alpha[t] and alpha[t + 1]. Alternatively, you can take alpha[1] and differences delta[t + 1] = alpha[t + 1] - alpha[t], taking delta[t] ~ normal(0, sigma).

data {
  int<lower=0> N; // length of dependent variable
  int<lower=0> K; // number of indep vars
  vector[N] y; // univariate time series
  row_vector[K] F[N]; // N vectors of size K (array[N,K])
  vector[2] theta0_init_prior; // mean, sd of theta0
}

transformed data {
  for (n in 1:N) {
      print("F[", n, "] = ", F[n]);
  }
}

parameters {
  vector[K] Theta0; // init Theta
  vector[K] Theta[N]; // state space parameter
  real<lower=0> R; // model error
  cholesky_factor_corr[K] L_Omega; //prior correlation
  vector<lower=0>[K] tau; // prior scale
}

transformed parameters {
  matrix[K, K] L;
  vector[N] F_Theta;

  L = diag_pre_multiply(tau, L_Omega);

  for (n in 1:N)
    F_Theta[n] = F[n]*Theta[n];
}

model {
  vector[K] Delta[N]; // model the differences
  // estimating Theta through differences greatly improves sampling speed
  // and effective sample size
  Delta[1] = Theta[1] - Theta0;
  Delta[1] ~ multi_normal_cholesky(rep_vector(0, K), L); // the first difference needs a prior too
  for (n in 2:N) {
    Delta[n] = Theta[n] - Theta[n-1];
    Delta[n] ~ multi_normal_cholesky(rep_vector(0,K), L);
  }
  R ~ exponential(2);
  tau ~ exponential(2);
  L_Omega ~ lkj_corr_cholesky(1);
  Theta0 ~ normal(theta0_init_prior[1], theta0_init_prior[2]);
  y ~ normal(F_Theta, R);
}

generated quantities {
  matrix[K, K] Q;
  Q = L * L';
}
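
A hypothetical usage sketch; the file name dlm-diff.stan and the prior values are illustrative, not from the original:

# add the assumed Theta0 prior (mean, sd) to the data list and refit
data_list$theta0_init_prior <- c(0, 5)
mod_diff <- rstan::stan("dlm-diff.stan", data = data_list,
                        control = list(adapt_delta = 0.99, max_treedepth = 12),
                        chains = 3, warmup = 500, iter = 1000, cores = 4L)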