kingaa / pomp

R package for statistical inference using partially observed Markov processes
https://kingaa.github.io/pomp
GNU General Public License v3.0

NaN output in mif2 algorithm #25

Closed theresasophia closed 8 years ago

theresasophia commented 8 years ago

The problem I am facing is that, after running a local search with the mif2 algorithm, the object mifs_local contains several entries in which all fitted parameters are NaN (e.g. mifs_local[[2]] in the code below), whereas for others I get sensible output. Surprisingly, this does not produce an error (why?). However, when I then try to evaluate the likelihood with pfilter, I get an error saying that dmeasure returns a non-finite value.

Another big issue is that, even with a very low number of particles and few mif2 iterations, I am seeing extremely long running times, although the code is parallelized and I have access to a cluster. I am wondering whether this is normal, or whether I can change something to speed up my calculations.

sessionInfo()
R version 3.3.1 (2016-06-21)
Platform: x86_64-apple-darwin13.4.0 (64-bit)
Running under: OS X 10.11.6 (El Capitan)

locale:
[1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8

attached base packages:
[1] parallel  grid      stats     graphics  grDevices utils     datasets  methods   base     

other attached packages:
[1] doParallel_1.0.10 magrittr_1.5      pomp_1.7          doMC_1.3.4        iterators_1.0.8   foreach_1.4.3     reshape2_1.4.1    plyr_1.8.4        ggplot2_2.1.0    

loaded via a namespace (and not attached):
 [1] Rcpp_0.12.6      knitr_1.13       munsell_0.4.3    colorspace_1.2-6 lattice_0.20-33  subplex_1.1-6    stringr_1.0.0    tools_3.3.1      gtable_0.2.0    
[10] coda_0.18-1      digest_0.6.9     nloptr_1.0.4     codetools_0.2-14 deSolve_1.13     stringi_1.1.1    compiler_3.3.1   scales_0.4.0     mvtnorm_1.0-5   

Here is my model:

#observational model, observations are poisson
rmeas <- Csnippet("cases1 = rpois(H1);
                   cases2 = rpois(H2);
                   cases3 = rpois(H3);
                  ")
#allow for the possibility of NAs in the data
dmeas <- Csnippet("
  if (ISNA(cases1)) {
    lik = (give_log) ? 0 : 1;
  } else {
    lik = dpois(cases1, H1, 1) +
          dpois(cases2, H2, 1) +
          dpois(cases3, H3, 1);
    lik = (give_log) ? lik : exp(lik);
  }")
# process model is Markovian SIRS with 3 age classes
sir.step <- Csnippet("
double rate[19];
double dN[19];
double Beta1;
double Beta2;
double Beta3;
double I;
I= I1+I2+I3;
Beta1 = beta1*(1 + beta11 * cos(M_2PI/52*time + phi)); 
Beta2 = beta2*(1 + beta11 * cos(M_2PI/52*time + phi)); 
Beta3 = beta3*(1 + beta11 * cos(M_2PI/52*time + phi)); 
rate[0] = alpha*N;
rate[1] = Beta1*I/N;
rate[2] = delta1;
rate[3] = Beta2*I/N;
rate[4] = delta2;
rate[5] = Beta3*I/N;
rate[6] = mu;
rate[7] = gamma;
rate[8] = delta1;
rate[9] = gamma;
rate[10] = delta2;
rate[11] = gamma;
rate[12] = mu;
rate[13] = delta1;
rate[14] = omega;  
rate[15] = delta2;  
rate[16] = omega;  
rate[17] = mu;  
rate[18] = omega;  
dN[0] = rpois(rate[0]*dt); // births are Poisson
reulermultinom(2, S1, &rate[1], dt, &dN[1]);
reulermultinom(2, S2, &rate[3], dt, &dN[3]);
reulermultinom(2, S3, &rate[5], dt, &dN[5]);
reulermultinom(2, I1, &rate[7], dt, &dN[7]);
reulermultinom(2, I2, &rate[9], dt, &dN[9]);
reulermultinom(2, I3, &rate[11], dt, &dN[11]);
reulermultinom(2, R1, &rate[13], dt, &dN[13]);
reulermultinom(2, R2, &rate[15], dt, &dN[15]);
reulermultinom(2, R3, &rate[17], dt, &dN[17]);
S1 += dN[0] - dN[1] - dN[2] + dN[14];
S2 += dN[2] - dN[3] - dN[4]  + dN[16];
S3 += dN[4] - dN[5] - dN[6] + dN[18];
I1 += dN[1]          - dN[7] - dN[8];
I2 += dN[3] + dN[8]  - dN[9] - dN[10];
I3 += dN[5] + dN[10] - dN[11] - dN[12];
R1 += dN[7]           - dN[13] - dN[14];
R2 += dN[9]  + dN[13] - dN[15] - dN[16];
R3 += dN[11] + dN[15] - dN[17] - dN[18];
H1 += dN[1];
H2 += dN[3];
H3 += dN[5];
")
# to and from estimation scale (phi should be in [-2*pi,0])
toEst <- Csnippet("Tbeta1  = log(beta1);
                   Tbeta2  = log(beta2);
                   Tbeta3  = log(beta3);
                   Tbeta11 = logit(beta11);
                   Tphi    = logit(-phi/M_2PI);
                  ")
fromEst <- Csnippet("Tbeta1  = exp(beta1);
                     Tbeta2  = exp(beta2);
                     Tbeta3  = exp(beta3);
                     Tbeta11 = expit(beta11);
                     Tphi    = -M_2PI*expit(phi);
                    ")
# initializer
sir_initializer <- Csnippet("S1= 4872721;   I1= 2871; R1= 124408; H1=0;
                             S2= 54942298;  I2= 639;  R2= 57063;  H2=0;
                             S3= 19990218;  I3= 174;  R3= 9608;  H3=0;
                            ")
# read in the observations
obs <- read.table("data.txt")
# define the covariate time
tbasis <- seq(-6*52+1,nrow(obs),by=1)
basis <- as.matrix(data.frame(time=tbasis))
# define parameters
params <- c(beta1=1.287395e+01, beta2=2.409750e-01, beta3=4.034036e-01, 
                      beta11=1.509189e-01, phi= -1.706477e-03, gamma=1, delta1=1/(5*52),   
                      delta2=1/(55*52), alpha=1/(80*52), mu=1/(20*52), N=80000000, omega=1/(1*52))
# add at t=0 a row of NAs to not have problems with the accumulator variable since
# t0 is much less than t[1]
dat <- arrange(rbind(data.frame(time=0,cases1=NA,cases2=NA,cases3=NA),obs),time)
# define the pomp object
pomp(data = dat,
     times="time",
     t0=tbasis[1],
     dmeasure = dmeas,
     rmeasure = rmeas,
     rprocess = euler.sim(step.fun = sir.step, delta.t = 1/80),
     statenames = c("S1", "I1", "R1", "H1", "S2", "I2", "R2", "H2","S3","I3", "R3", "H3"),
     obsnames = c("cases1", "cases2","cases3"),
     paramnames = names(params),
     zeronames=c("H1", "H2", "H3"),
     initializer=sir_initializer,
     toEstimationScale=toEst,
     fromEstimationScale=fromEst,
     covar=basis,
     tcovar=tbasis,
     params = params,
     PACKAGE="pomp"
) ->sir
# iterated filtering with only local search
run_level <- 1
switch(run_level,
       {sir_Np=100; sir_Nmif=10; sir_Neval=10; sir_Nglobal=10; sir_Nlocal=10}
)
require(doParallel)
cores <- 4
registerDoParallel(cores)
mcopts <- list(set.seed=TRUE)
set.seed(396658101,kind="L'Ecuyer")
sir_rw.sd <- 0.02
sir_cooling.fraction.50 <- 0.5
sir.start <- coef(sir)
# check if particle filter works
stew(file=sprintf("pf-%d.rda",run_level),{
  t_pf <- system.time(
    pf <- foreach(i=1:10,.packages='pomp',
                  .options.multicore=mcopts) %dopar% try(
                    pfilter(sir,params=sir.start,Np=sir_Np)
                  )
  )
},seed=1320290398,kind="L'Ecuyer")
(L_pf <- logmeanexp(sapply(pf,logLik),se=TRUE))

The output of the particle filter is

                   se 
-16108.933     10.918 

Now running the mif2 algorithm

stew(file=sprintf("local_search_seas-%d.rda",run_level),{
  t_local <- system.time({
    mifs_local <- foreach(i=1:sir_Nlocal,.packages='pomp', .combine=c, 
                          .options.multicore=mcopts) %dopar%  {
      mif2(
        sir,
        start=sir.start,
        Np=sir_Np,
        Nmif=sir_Nmif,
        cooling.type="geometric",
        cooling.fraction.50=sir_cooling.fraction.50,
        transform=TRUE,
        rw.sd=rw.sd(
          beta1=sir_rw.sd,
          beta2=sir_rw.sd,
          beta3=sir_rw.sd,
          beta11=sir_rw.sd,
          phi=sir_rw.sd
        )
      )
    }
  })
},seed=900242057,kind="L'Ecuyer")

Here I obtain, for example:

coef(mifs_local[[1]])
        beta1         beta2         beta3        beta11           phi         gamma        delta1 
 1.635396e+01  3.652980e-01  6.534982e-01  1.771040e-01 -9.856913e-04  1.000000e+00  3.846154e-03 
       delta2         alpha            mu             N         omega 
 3.496503e-04  2.403846e-04  9.615385e-04  8.000000e+07  1.923077e-02 

BUT

coef(mifs_local[[2]])
 beta1  beta2  beta3 beta11    phi  gamma delta1 delta2  alpha     mu      N  omega 
   NaN    NaN    NaN    NaN    NaN    NaN    NaN    NaN    NaN    NaN    NaN    NaN 

which gives the following error when I evaluate the likelihood with pfilter:

stew(file=sprintf("lik_local_seas-%d.rda",run_level),{
  t_local_eval <- system.time({
    liks_local <- foreach(i=1:sir_Nlocal,.packages='pomp',.combine=rbind) %dopar% {
      evals <- replicate(sir_Neval, logLik(pfilter(sir,params=coef(mifs_local[[i]]),Np=sir_Np)))
      logmeanexp(evals, se=TRUE)
    }
  })
},seed=900242057,kind="L'Ecuyer")
Error in { : 
  task 2 failed - "in ‘pfilter’: ‘dmeasure’ returns non-finite value."
kingaa commented 8 years ago

@theresasophia, I believe I see what is happening. You ask two questions. In the following, I address the second one first.

Computational complexity of particle filter and iterated filtering computations

Why does execution take longer than you would like? Notice that the latent state process evolves over the interval [-6 yr, 8 yr] and that you are simulating it using an Euler method with step size 1/80/52 yr, which means you are taking roughly 4000 x 14 = 56000 steps per simulation. Each step involves drawing 18 binomials and a Poisson. The computational complexity of the particle filter scales linearly with the number of particles. With 100 particles, you are doing order 10^8 binomial draws in your particle-filter calculation. Since the bulk of the effort in particle filtering (and hence iterated filtering) is in the simulation of the latent state process, you now have a pretty good idea how long it takes to do this much computation on your machine.
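
To make that arithmetic concrete, here is a back-of-envelope sketch (assuming times are in weeks with 52 wk/yr, as in your code):

steps_per_sim <- 14*52*80                   # 14 yr of simulation, 80 Euler steps per week
draws_per_pfilter <- 100*steps_per_sim*18   # particles x steps x euler-multinomial draws
draws_per_pfilter                           # ~1e8 random draws per particle-filter run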

Perhaps the best way of speeding things up is to increase the Euler step size. The price you pay is decreased fidelity to your model equations. However, from a slightly different point of view, it makes sense to view the simulator itself as the model, and the equations as an approximation. Be that as it may, one can always explore the consequences, for inference, of changing the step size.
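
For instance, coarsening the Euler step is a one-line change when rebuilding the pomp object (a sketch; sir and sir.step are as defined above, and delta.t = 1/10 is the value I use below):

sir <- pomp(sir, rprocess = euler.sim(step.fun = sir.step, delta.t = 1/10))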

On the importance of allowing for overdispersion.

Why is mif2 giving you non-finite numbers as parameter estimates? What is happening is that the model fits the data very poorly. This leads to many filtering failures---situations in which none of the particles are consistent with the data (as determined by an adjustable threshold with default value 1e-17, i.e., the tol argument in pfilter). What does mif2 do in such situations? The random mutations are added, but there is no information from the data to guide selection. The result is random drift. Evidently, your particles were drifting off to very large or small values.
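
You can see the failures directly by running pfilter once and examining the per-time-point diagnostics (a sketch using the pomp 1.x accessors; tol and max.fail are the relevant pfilter arguments):

pf1 <- pfilter(sir, Np = 100, tol = 1e-17, max.fail = Inf)
plot(cond.logLik(pf1), type = "l")      # conditional log likelihood at each observation
plot(eff.sample.size(pf1), type = "l")  # ESS near zero flags a filtering failure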

The NaNs result from the fact that the point estimate from mif2 is the weighted mean of the particles. For this, it uses the R function weighted.mean from package stats, which returns NaN when all the weights are zero.

[Note that this behavior is changed as of version 1.8.9.1, which, in these circumstances, returns an unweighted mean of the particles, thus avoiding the NaNs.]
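
The mechanism is easy to reproduce in isolation:

weighted.mean(c(1, 2, 3), w = c(0, 0, 0))  # all weights zero => 0/0
## [1] NaN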

So the question becomes, why does the model fit the data so poorly? A few simulations make it clear. We'll first implement your model in pomp.

library(pomp)
library(magrittr)
library(plyr)
library(reshape2)
library(ggplot2)
library(scales)
library(foreach)
library(doParallel)
registerDoParallel()
stopifnot(packageVersion("pomp") >= "1.7")

#observational model, observations are poisson
rmeas <- Csnippet("
  cases1 = rpois(H1);
  cases2 = rpois(H2);
  cases3 = rpois(H3);
                  ")
#allow for the possibility of NAs in the data
dmeas <-Csnippet("
  if (ISNA(cases1)) {
    lik = (give_log) ? 0 : 1;
  } else {
    lik =  dpois(cases1, H1, 1) +
            dpois(cases2, H2, 1) +
            dpois(cases3, H3, 1);
    lik = (give_log) ? lik : exp(lik);
  }")
# process model is Markovian SIRS with 3 age classes
sir.step <- Csnippet("
                     double rate[19];
                     double dN[19];
                     double Beta1;
                     double Beta2;
                     double Beta3;
                     double I;
                     I= I1+I2+I3;
                     Beta1 = beta1*(1 + beta11 * cos(M_2PI/52*t + phi));
                     Beta2 = beta2*(1 + beta11 * cos(M_2PI/52*t + phi));
                     Beta3 = beta3*(1 + beta11 * cos(M_2PI/52*t + phi));
                     rate[0] = alpha*N;
                     rate[1] = Beta1*I/N;
                     rate[2] = delta1;
                     rate[3] = Beta2*I/N;
                     rate[4] = delta2;
                     rate[5] = Beta3*I/N;
                     rate[6] = mu;
                     rate[7] = gamma;
                     rate[8] = delta1;
                     rate[9] = gamma;
                     rate[10] = delta2;
                     rate[11] = gamma;
                     rate[12] = mu;
                     rate[13] = delta1;
                     rate[14] = omega;
                     rate[15] = delta2;
                     rate[16] = omega;
                     rate[17] = mu;
                     rate[18] = omega;
                     dN[0] = rpois(rate[0]*dt); // births are Poisson
                     reulermultinom(2, S1, &rate[1], dt, &dN[1]);
                     reulermultinom(2, S2, &rate[3], dt, &dN[3]);
                     reulermultinom(2, S3, &rate[5], dt, &dN[5]);
                     reulermultinom(2, I1, &rate[7], dt, &dN[7]);
                     reulermultinom(2, I2, &rate[9], dt, &dN[9]);
                     reulermultinom(2, I3, &rate[11], dt, &dN[11]);
                     reulermultinom(2, R1, &rate[13], dt, &dN[13]);
                     reulermultinom(2, R2, &rate[15], dt, &dN[15]);
                     reulermultinom(2, R3, &rate[17], dt, &dN[17]);
                     S1 += dN[0] - dN[1] - dN[2] + dN[14];
                     S2 += dN[2] - dN[3] - dN[4]  + dN[16];
                     S3 += dN[4] - dN[5] - dN[6] + dN[18];
                     I1 += dN[1]          - dN[7] - dN[8];
                     I2 += dN[3] + dN[8]  - dN[9] - dN[10];
                     I3 += dN[5] + dN[10] - dN[11] - dN[12];
                     R1 += dN[7]           - dN[13] - dN[14];
                     R2 += dN[9]  + dN[13] - dN[15] - dN[16];
                     R3 += dN[11] + dN[15] - dN[17] - dN[18];
                     H1 += dN[1];
                     H2 += dN[3];
                     H3 += dN[5];
                     ")

# to and from estimation scale
toEst <- Csnippet("Tbeta1  = log(beta1);
                  Tbeta2  = log(beta2);
                  Tbeta3  = log(beta3);
                  Tbeta11 = logit(beta11);
                  ")
fromEst <- Csnippet("Tbeta1  = exp(beta1);
                    Tbeta2  = exp(beta2);
                    Tbeta3  = exp(beta3);
                    Tbeta11 = expit(beta11);
                    ")

# initializer
sir_initializer <- Csnippet("S1= 4872721;   I1= 2871; R1= 124408; H1=0;
                            S2= 54942298;  I2= 639;  R2= 57063;  H2=0;
                            S3= 19990218;  I3= 174;  R3= 9608;  H3=0;
                            ")

# define parameters
params <- c(beta1=1.287395e+01, beta2=2.409750e-01, beta3=4.034036e-01,
            beta11=1.509189e-01, phi= -1.706477e-03, gamma=1, delta1=1/(5*52),
            delta2=1/(55*52), alpha=1/(80*52), mu=1/(20*52), N=80000000, omega=1/(1*52))

# read in the data
# add at t=0 a row of NAs to not have problems with the accumulator variable since
# t0 is much less than t[1]
read.table("data.txt") %>%
  rbind(data.frame(time=0,cases1=NA,cases2=NA,cases3=NA)) %>%
  arrange(time) -> dat

# define the pomp object
pomp(data = dat,
     times="time",
     t0=1-6*52,
     dmeasure = dmeas,
     rmeasure = rmeas,
     rprocess = euler.sim(step.fun = sir.step, delta.t = 1/10),
     statenames = c("S1", "I1", "R1", "H1", "S2", "I2", "R2", "H2","S3","I3", "R3", "H3"),
     paramnames = names(params),
     zeronames=c("H1", "H2", "H3"),
     initializer=sir_initializer,
     toEstimationScale=toEst,
     fromEstimationScale=fromEst,
    params = params
) -> sir

The following plots a few simulations along with the data:

simulate(sir,nsim=5,as.data.frame=TRUE,include.data=TRUE) %>%
  subset(time>0,select=c(time,sim,cases1,cases2,cases3)) %>%
  melt(id=c("time","sim")) %>%
  ggplot(aes(x=time,y=value,color=ifelse(sim=="data","data","simulation"),
             alpha=(sim=="data"),
             group=sim))+
  scale_alpha_manual(values=c(`TRUE`=1,`FALSE`=0.8))+
  labs(color="")+
  guides(alpha=FALSE)+
  geom_line()+
  facet_wrap(~variable,ncol=1,scales="free_y")+theme_bw()

[Figure: simulated case counts (Poisson measurement model) overlaid on the data, one panel per age class]

The Poisson variability in the measurement model is not sufficient to explain the data at these parameter values.
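
The problem is quantitative: a Poisson with mean mu has standard deviation sqrt(mu), so for weekly counts in the thousands the measurement noise is only a few percent of the mean:

mu <- 1000
sqrt(mu)/mu   # relative sd ~ 3%: far less scatter than the data show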

One way to add overdispersion is to replace the Poisson measurement model with a negative binomial model:

pomp(sir,
     dmeasure = Csnippet("
  if (ISNA(cases1)) {
    lik = (give_log) ? 0 : 1;
  } else {
    lik =  dnbinom_mu(cases1, 1/od, H1, 1) +
            dnbinom_mu(cases2, 1/od, H2, 1) +
            dnbinom_mu(cases3, 1/od, H3, 1);
    lik = (give_log) ? lik : exp(lik);}"),
    rmeasure = Csnippet("
  cases1 = rnbinom_mu(1/od,H1);
  cases2 = rnbinom_mu(1/od,H2);
  cases3 = rnbinom_mu(1/od,H3);"),
    toEstimationScale=Csnippet("
  Tbeta1  = log(beta1);
  Tbeta2  = log(beta2);
  Tbeta3  = log(beta3);
  Tbeta11 = logit(beta11);
  Tod = log(od);"),
    fromEstimationScale=Csnippet("
  Tbeta1  = exp(beta1);
  Tbeta2  = exp(beta2);
  Tbeta3  = exp(beta3);
  Tbeta11 = expit(beta11);
  Tod = exp(od);"),
    statenames = c("H1", "H2","H3"),
    paramnames = c(names(params),"od")
  ) -> sir

coef(sir,"od") <- 1
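
As a quick sanity check on this parameterization (with size = 1/od, the negative binomial has variance mu + od*mu^2, so od -> 0 recovers the Poisson):

od <- 1; mu <- 50
x <- rnbinom(1e5, size = 1/od, mu = mu)
c(empirical = var(x), theoretical = mu + od*mu^2)  # should agree closely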

simulate(sir,nsim=5,as.data.frame=TRUE,include.data=TRUE) %>%
  subset(time>0,select=c(time,sim,cases1,cases2,cases3)) %>%
  melt(id=c("time","sim")) %>%
  ggplot(aes(x=time,y=value,color=ifelse(sim=="data","data","simulation"),
             alpha=(sim=="data"),
             group=sim))+
  scale_alpha_manual(values=c(`TRUE`=1,`FALSE`=0.8))+
  labs(color="")+
  guides(alpha=FALSE)+
  geom_line()+
  facet_wrap(~variable,ncol=1,scales="free_y")+theme_bw()

[Figure: simulated case counts (negative-binomial measurement model, od = 1) overlaid on the data, one panel per age class]

Note that we've added a lot of overdispersion. But the model is now capable of explaining the data. We verify this with a particle-filter calculation.

stew(file="pfilter.rda",seed=1320290398,kind="L'Ecuyer",{
  system.time(
    foreach (i=1:10,.packages='pomp',.options.multicore=list(set.seed=TRUE)) %dopar%
      try(pfilter(sir,Np=1000)) -> pf
  ) -> etime
})

logmeanexp(sapply(pf,logLik),se=TRUE)

##                          se 
## -1.100925e+04  5.838841e-01

etime

##    user  system elapsed 
## 260.208   1.876  29.593

Already we see a substantial improvement in the log likelihood relative to the Poisson model. More critically, the effective sample size is now respectable:

pf %>% sapply(eff.sample.size) %>% melt() %>%
  ggplot(aes(x=Var1,group=Var2,y=value))+geom_line()+
  scale_y_continuous(trans=log1p_trans())+
  labs(x="time",y="ESS")+
  theme_bw()

[Figure: effective sample size over time for each of the 10 particle-filter runs, log scale]

[The same plot, applied to the Poisson model, would show an effective sample size of zero for most time points, i.e., filtering failure.]

Now let's move on to trying to maximize the likelihood. I started by following your example, the only difference being the new overdispersion parameter, od, which is to be estimated along with the others. With random-walk intensities of 2% per observation, I was not seeing any trend in the log likelihood of the kind visible below. I soon realized that, because your time series are so long (416 observations), this was far too much variability per mif2 iteration. In the following, I have reduced the perturbation intensities by a factor of 10, except that, because od needs to move quite a way from where I started it, I've kept its intensity at 1%.
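
To see why 2% per observation is so much here: the perturbations accumulate as a random walk across the 416 data points, so within a single mif2 iteration each perturbed parameter wanders considerably on the estimation (log) scale:

0.02*sqrt(416)   # ~0.41 on the log scale, i.e. swings of roughly +/- 50%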

stew(file="local_search.rda",seed=900242057,kind="L'Ecuyer",{
  system.time({
    foreach (i=1:10,.packages='pomp',.combine=c,.options.multicore=list(set.seed=TRUE)) %dopar% {
      mif2(
        sir,
        Np=1000,
        Nmif=20,
        cooling.type="geometric",
        cooling.fraction.50=0.5,
        transform=TRUE,
        rw.sd=rw.sd(beta1=0.002,beta2=0.002,beta3=0.002,beta11=0.002,phi=0.002,od=0.01)
      )
    } -> mifs
  }) -> etime
})

etime

##     user   system  elapsed 
## 5185.296    4.104  575.688

sapply(mifs,coef)

##                [,1]         [,2]         [,3]         [,4]         [,5]
## beta1  1.206207e+01 1.202399e+01 1.153665e+01 1.164324e+01 1.179013e+01
## beta2  2.570209e-01 2.717509e-01 2.862369e-01 2.770161e-01 2.740267e-01
## beta3  4.041913e-01 3.865958e-01 4.173082e-01 4.256647e-01 4.062243e-01
## beta11 1.389955e-01 1.466958e-01 1.462300e-01 1.434030e-01 1.408440e-01
## phi    1.437375e-02 7.642879e-02 1.006428e-01 1.977255e-02 6.100431e-02
## gamma  1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00
## delta1 3.846154e-03 3.846154e-03 3.846154e-03 3.846154e-03 3.846154e-03
## delta2 3.496503e-04 3.496503e-04 3.496503e-04 3.496503e-04 3.496503e-04
## alpha  2.403846e-04 2.403846e-04 2.403846e-04 2.403846e-04 2.403846e-04
## mu     9.615385e-04 9.615385e-04 9.615385e-04 9.615385e-04 9.615385e-04
## N      8.000000e+07 8.000000e+07 8.000000e+07 8.000000e+07 8.000000e+07
## omega  1.923077e-02 1.923077e-02 1.923077e-02 1.923077e-02 1.923077e-02
## od     1.390146e-01 1.427957e-01 1.328448e-01 1.246410e-01 1.447729e-01
##                [,6]         [,7]         [,8]         [,9]        [,10]
## beta1  1.127621e+01 1.198252e+01 1.154654e+01 1.111633e+01 1.140463e+01
## beta2  2.918447e-01 2.694642e-01 2.749054e-01 3.032488e-01 2.950158e-01
## beta3  4.495577e-01 3.888155e-01 4.356756e-01 4.503078e-01 4.223831e-01
## beta11 1.401555e-01 1.411478e-01 1.325803e-01 1.374305e-01 1.421792e-01
## phi    5.232921e-02 6.402672e-02 6.531577e-02 8.128937e-02 2.992768e-02
## gamma  1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00
## delta1 3.846154e-03 3.846154e-03 3.846154e-03 3.846154e-03 3.846154e-03
## delta2 3.496503e-04 3.496503e-04 3.496503e-04 3.496503e-04 3.496503e-04
## alpha  2.403846e-04 2.403846e-04 2.403846e-04 2.403846e-04 2.403846e-04
## mu     9.615385e-04 9.615385e-04 9.615385e-04 9.615385e-04 9.615385e-04
## N      8.000000e+07 8.000000e+07 8.000000e+07 8.000000e+07 8.000000e+07
## omega  1.923077e-02 1.923077e-02 1.923077e-02 1.923077e-02 1.923077e-02
## od     1.314755e-01 1.727626e-01 1.561911e-01 1.362144e-01 1.457950e-01

Note that the parameter estimates are all now finite. Let's look at some diagnostic convergence plots:

mifs %>%
  conv.rec(c("loglik","nfail","beta1","beta2","beta3","beta11","phi","od")) %>%
  melt() %>%
  ggplot(aes(x=iteration,y=value,color=variable,group=L1))+
  geom_line()+
  guides(color=FALSE)+
  labs(x="MIF2 Iteration",y="")+
  facet_wrap(~variable,scales="free_y",ncol=2)+
  theme_bw()

[Figure: mif2 convergence diagnostics: loglik, nfail, and the estimated parameters vs. iteration]

The likelihood climbs quite rapidly and, encouragingly, all filtering failures have disappeared by the last mif2 iteration. We see considerable movement of the parameters. In particular, some of the transmission rates have changed substantially and the overdispersion parameter has dropped to a much lower value.

What are the likelihoods at the end of these mif2 iterations?

stew(file="loglik_local.rda",seed=900242057,kind="L'Ecuyer",{
  foreach (mf=mifs,.packages='pomp',.combine=rbind) %dopar% {
    evals <- replicate(10,logLik(pfilter(mf,Np=1000)))
    logmeanexp(evals,se=TRUE)
  }  -> liks
})

liks %>% as.data.frame() %>% rename(c(V1="loglik")) %>% arrange(-loglik)

##       loglik       se
## 1  -10304.25 6.642149
## 2  -10320.71 8.179374
## 3  -10335.08 7.467220
## 4  -10338.77 4.218007
## 5  -10352.40 4.029680
## 6  -10354.17 1.912765
## 7  -10372.13 1.800901
## 8  -10388.87 3.757840
## 9  -10408.12 2.601812
## 10 -10436.88 5.944798

Although we haven't used sufficiently many particles here to get precise estimates, it is clear that the likelihood has improved by several hundred log units. What do simulations look like at this point?

best <- which.max(liks[,1])
simulate(mifs[[best]],nsim=5,as.data.frame=TRUE,include.data=TRUE) %>%
  subset(time>0,select=c(time,sim,cases1,cases2,cases3)) %>%
  melt(id=c("time","sim")) %>%
  ggplot(aes(x=time,y=value,color=ifelse(sim=="data","data","simulation"),
             alpha=(sim=="data"),
             group=sim))+
  scale_alpha_manual(values=c(`TRUE`=1,`FALSE`=0.8))+
  labs(color="")+
  guides(alpha=FALSE)+
  geom_line()+
  facet_wrap(~variable,ncol=1,scales="free_y")+theme_bw()

[Figure: simulated case counts from the highest-likelihood fit overlaid on the data]

Although the data look far more plausible as a realization of this model, there is clearly still room for improvement.


theresasophia commented 8 years ago

Thank you very much for this great and detailed answer! This was very helpful for me.

I was surprised by how much the likelihood improved after changing the random-walk intensities; however, as I understand it, there are no precise rules for how to choose them. Since I am planning to include other parameters in the estimation, I wonder if you could share your experience of how to find the 'right' intensities? Does this depend on how unsure I am about my initial guesses of the parameters?

kingaa commented 8 years ago

You are right that I know of no precise rules for choosing the intensity of mif2 perturbations. Indeed, I doubt that there can be such rules: the optimal choice is bound to depend on the model, the data, and the state of the investigator's uncertainty. In this case, as you suggest, I can describe the thought process that went into the choices made above. This is not to say that these are optimal choices! I spent only a short time with the problem: you will doubtless come to understand the system much more deeply than I will.

My first step was to diagnose the need for overdispersion in your model. I could have put additional sources of noise into the latent state process; in my experience, this is almost always worthwhile. Once we recognize that the hypothesized process is surely too simple to reflect reality in detail, and that one role for stochasticity in scientific models is to account for model mis-specification, adding such noise seems almost always a good idea. We treat this issue in the Discussion of our 2010 paper [1]. For the same reasons, one is usually wise to allow for the possibility of overdispersion in the measurement process as well. In the above, I allow only for the latter.
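
For what it's worth, a common way to add overdispersion to the process model (a sketch only; I have not done this above, and sigmaSE would be a new parameter) is to multiply the force of infection by gamma white noise, for which pomp provides rgammawn in both its R and C interfaces:

library(pomp)
## increments have mean dt and variance sigmaSE^2*dt, so dW/dt is a
## mean-one multiplicative noise suitable for perturbing a rate
dW <- rgammawn(n = 5, sigma = 0.1, dt = 1/10)
dW/(1/10)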

Having implemented the negative-binomial model as above, I ran some mif2 iterations, using what has de facto become my default choice of random-walk perturbation intensities of 2% for each parameter.

bake(file="mif1.rds",seed=900242057,kind="L'Ecuyer",{
  foreach (i=1:10,.packages='pomp',.combine=c,.options.multicore=list(set.seed=TRUE)) %dopar% {
    mif2(
      sir,
      Np=1000,
      Nmif=20,
      cooling.type="geometric",
      cooling.fraction.50=0.5,
      transform=TRUE,
      rw.sd=rw.sd(beta1=0.02,beta2=0.02,beta3=0.02,beta11=0.02,phi=0.02,od=0.02)
    )
  }
}) -> mifs

Looking at the convergence plots, as above, we see the following.

Note that the log likelihood is not trending upward noticeably, and we are seeing filtering failures, in some of the parallel mif2 searches, right through the 20 iterations. It doesn't look like we've been able to make much progress with this computation, despite the fact that the parameters (beta1, beta2, beta3, and od) have moved quite a lot. That last fact is the tip-off: all of these parameters make one giant leap in the first iteration, and subsequent iterations lead to no such marked improvement. That this is possible suggests that the perturbations are large relative to the scale over which the log likelihood changes meaningfully.

By this logic, if we reduce the perturbation sizes, we will slow the approach to the MLE to some extent, because we force the particles to move less at each step. However, because we will also smooth the log likelihood surface less aggressively, we avoid erasing the peaks in that surface that are the object of our search.

These observations suggest the next steps: continue to apply mif2 until you can no longer increase the likelihood; then reduce the perturbation intensities and repeat (a sketch follows below). Longer mif2 runs will be useful too, since with geometric cooling and cooling.fraction.50=0.5, you are only dropping the perturbations to 0.5^(20/50) ≈ 0.76 of their original magnitude after 20 iterations.
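
In code, that schedule might look like the following (a sketch: mf stands for one of the mif2d.pomp objects from above, and the halved intensities are purely illustrative):

mf %>%
  continue(Nmif = 50) %>%        # more iterations at the current settings
  mif2(Nmif = 50,                # then restart with smaller perturbations
       rw.sd = rw.sd(beta1 = 0.001, beta2 = 0.001, beta3 = 0.001,
                     beta11 = 0.001, phi = 0.001, od = 0.005)) -> mf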

References

  1. He D, Ionides EL, King AA (2010) Plug-and-play inference for disease dynamics: Measles in large and small populations as a case study. Journal of The Royal Society Interface 7: 271–283. doi:10.1098/rsif.2009.0151.
theresasophia commented 8 years ago

Thank you very much for this really helpful answer. I very much appreciate the detail you provide, and I am highly motivated to continue working on this.

kingaa commented 8 years ago

I'm closing this issue now, but feel free to re-open if more conversation on the topic is warranted.

kingaa commented 8 years ago

On this topic, as of version 1.8.9.1, mif2 no longer gives NaN in this case. When all particles are lost, it now uses an unweighted mean to arrive at the point estimate.

theresasophia commented 7 years ago

Sorry to re-open this issue, but I am facing another problem with the model above. I did as you suggested and performed a longer mif2 search (Nmif=200), also using more particles (Np=4000). Moreover, I proceeded to carry out a global search, drawing 20 different parameter constellations as starting values uniformly at random from the following box. (Everything else as above.)

sir_box <- rbind(
  beta1=c(10,13),
  beta2=c(0.2,0.6),
  beta3=c(0.2,0.6),
  beta11=c(0.11,0.15),
  phi=c(0,0.05),
  od=c(0.001,1)
)

What I observe now confuses me: for the first few iterations the likelihood increases quite a lot, but it then stabilizes at values below the region of higher log likelihood that was found initially. To make the phenomenon visible, the plot below omits the first 2 iterations (the log likelihood starts somewhere around -10500 in the first iteration).

[Figure: mif2 convergence diagnostics for the global search, iterations 3 and onward]

I tried different constellations of random-walk intensities, higher and lower values of the cooling fraction, and different numbers of particles and iterations, but in all cases I sooner or later observe that the log likelihood stabilizes at a value that does not look optimal, given the higher likelihoods seen earlier. Do you have any explanation for this, or am I doing something wrong? Thanks a lot.

PS: This is the code I use for this:

sir_fixed_params <- c(gamma=1, delta1=1/(5*52), delta2=1/(55*52), alpha=1/(80*52), 
                      omega=1/(1*52), mu=1/(20*52), N=80000000, y1=dat$cases1[2], 
                      y2=dat$cases2[2], y3=dat$cases3[2])

stew(file="bi_box_eval_negbin-%d.rda",{

  t_global <- system.time({
    mifs_global <- foreach(i=1:20,.packages='pomp', .combine=c, .options.multicore=mcopts) %dopar% {
      mif2(
        sir,
        start=c(apply(sir_box,1,function(x)runif(1,min=x[1],max=x[2])),sir_fixed_params),
        Np=4000,
        Nmif=200,
        cooling.type="geometric",
        cooling.fraction.50=0.5,
        transform=TRUE,
        rw.sd=rw.sd(beta1=0.002,beta2=0.002,beta3=0.002,beta11=0.002,phi=0.002,od=0.01)
      )
    }
  })
},seed=1270401374,kind="L'Ecuyer")

mifs_global %>%
  conv.rec(c("loglik","nfail","beta1","beta2","beta3","beta11","phi","od")) %>%
  melt() %>%subset(iteration>2)%>%
  ggplot(aes(x=iteration,y=value,color=variable,group=L1))+
  geom_line()+
  guides(color=FALSE)+
  labs(x="MIF2 Iteration",y="")+
  facet_wrap(~variable,scales="free_y",ncol=2)+
  theme_bw()
kingaa commented 7 years ago

This is an interesting phenomenon that comes up fairly frequently and is worth thinking about. Can you move it to a new thread so it can be more readily seen?

theresasophia commented 7 years ago

Sure. Thanks!