ASKurz / Doing-Bayesian-Data-Analysis-in-brms-and-the-tidyverse

The bookdown version lives here: https://bookdown.org/content/3686
GNU General Public License v3.0

Metropolis Algorithm with Multiple Chains (Fig. 7.10 and 7.11) #29

Closed OmidGhasemi21 closed 3 years ago

OmidGhasemi21 commented 3 years ago

Hi,

I slightly changed your code for the single-chain Metropolis algorithm and reproduced several of the panels in Figures 7.10 and 7.11 (pp. 179-180). In the bookdown version you use brms to fit the models and check for representativeness, so I am not sure whether the hand-coded Metropolis algorithm is worth adding. Anyway, here it is:

library(tidyverse)
library(patchwork)
library(RColorBrewer)
library(coda)

# specify the data, to be used in the likelihood function.
my_data <- c(rep(0, 15), rep(1, 35))

# define the Bernoulli likelihood function, p(D|theta).
likelihood <- function(theta, data) {
  z <- sum(data)
  n <- length(data)
  p_data_given_theta <- theta^z * (1 - theta)^(n - z)
  p_data_given_theta[theta > 1 | theta < 0] <- 0
  return(p_data_given_theta)
}

# define the prior density function. 
prior_d <- function(theta) {
  p_theta <- dbeta(theta, 1, 1)
  p_theta[theta > 1 | theta < 0] = 0
  return(p_theta)
}

# Define the relative probability of the target distribution. For our application, 
# this target distribution is the unnormalized posterior distribution.
target_rel_prob <- function(theta, data) {
  target_rel_prob <- likelihood(theta, data) * prior_d(theta)
  return(target_rel_prob)
}
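
# quick sanity check (optional, not part of the original workflow): the unnormalized
# posterior can be evaluated directly at a few values of theta; anything outside
# [0, 1] should come back as 0
target_rel_prob(c(.25, .50, .70, 1.5), my_data)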

# specify the length of the trajectory, i.e., the number of jumps to try:
traj_length <- 50000

# initialize the vector that will store the results
trajectory1 <- rep(0, traj_length)
trajectory2 <- rep(0, traj_length)
trajectory3 <- rep(0, traj_length)

# specify where to start the trajectory:
trajectory1[1] <- 0.01 # an arbitrary starting value
trajectory2[1] <- 0.5  # another arbitrary starting value
trajectory3[1] <- 0.99 # another arbitrary starting value

# specify the burn-in period
burn_in <- ceiling(0.0 * traj_length) # arbitrary number, less than `traj_length`

# initialize accepted, rejected counters, just to monitor performance:
n_accepted1 <- 0
n_rejected1 <- 0
n_accepted2 <- 0
n_rejected2 <- 0
n_accepted3 <- 0
n_rejected3 <- 0

proposal_sd <- .02

# now generate the random walk. the 't' index is time or trial in the walk.
# set.seed(47405)  # uncomment for reproducible results

for (t in 1:(traj_length - 1)) {
  current_position1 <- trajectory1[t]
  current_position2 <- trajectory2[t]
  current_position3 <- trajectory3[t]
  # use the proposal distribution to generate a proposed jump
  proposed_jump1 <- rnorm(1, mean = 0, sd = proposal_sd)
  proposed_jump2 <- rnorm(1, mean = 0, sd = proposal_sd)
  proposed_jump3 <- rnorm(1, mean = 0, sd = proposal_sd)
  # compute the probability of accepting the proposed jump
  prob_accept1 <- min(1,
                     target_rel_prob(current_position1 + proposed_jump1, my_data)
                     / target_rel_prob(current_position1, my_data))
  prob_accept2 <- min(1,
                      target_rel_prob(current_position2 + proposed_jump2, my_data)
                      / target_rel_prob(current_position2, my_data))
  prob_accept3 <- min(1,
                      target_rel_prob(current_position3 + proposed_jump3, my_data)
                      / target_rel_prob(current_position3, my_data))
  # generate a random uniform value from the interval [0, 1] to
  # decide whether or not to accept the proposed jump
  # Chain 1......
  if (runif(1) < prob_accept1) {
    # accept the proposed jump
    trajectory1[t + 1] <- current_position1 + proposed_jump1
    # increment the accepted counter, just to monitor performance
    if (t > burn_in) {n_accepted1 <- n_accepted1 + 1}
  } else {
    # reject the proposed jump, stay at current position
    trajectory1[t + 1] <- current_position1
    # increment the rejected counter, just to monitor performance
    if (t > burn_in) {n_rejected1 <- n_rejected1 + 1}
  }
  # Chain 2........
  if (runif(1) < prob_accept2) {
    # accept the proposed jump
    trajectory2[t + 1] <- current_position2 + proposed_jump2
    # increment the accepted counter, just to monitor performance
    if (t > burn_in) {n_accepted2 <- n_accepted2 + 1}
  } else {
    # reject the proposed jump, stay at current position
    trajectory2[t + 1] <- current_position2
    # increment the rejected counter, just to monitor performance
    if (t > burn_in) {n_rejected2 <- n_rejected2 + 1}
  }
  # Chain 3........
  if (runif(1) < prob_accept3) {
    # accept the proposed jump
    trajectory3[t + 1] <- current_position3 + proposed_jump3
    # increment the accepted counter, just to monitor performance
    if (t > burn_in) {n_accepted3 <- n_accepted3 + 1}
  } else {
    # reject the proposed jump, stay at current position
    trajectory3[t + 1] <- current_position3
    # increment the rejected counter, just to monitor performance
    if (t > burn_in) {n_rejected3 <- n_rejected3 + 1}
  }
  # end of Metropolis algorithm
}

# extract the post-burn_in portion of the trajectory
accepted_traj1 <- trajectory1[(burn_in + 1) : length(trajectory1)]
accepted_traj2 <- trajectory2[(burn_in + 1) : length(trajectory2)]
accepted_traj3 <- trajectory3[(burn_in + 1) : length(trajectory3)]

# Convert all the chains into tibbles and then join all of them together
chain1 <- tibble(
  chain         = 1,
  accepted_traj = accepted_traj1,
  n_accepted    = n_accepted1, 
  n_rejected    = n_rejected1,
  iter          = 1:length(accepted_traj1))

chain2 <- tibble(
  chain         = 2,
  accepted_traj = accepted_traj2,
  n_accepted    = n_accepted2, 
  n_rejected    = n_rejected2,
  iter          = 1:length(accepted_traj2))

chain3 <- tibble(
  chain         = 3,
  accepted_traj = accepted_traj3,
  n_accepted    = n_accepted3, 
  n_rejected    = n_rejected3,
  iter          = 1:length(accepted_traj3))

all_chains <- bind_rows(chain1, chain2, chain3) %>%
  mutate(chain = factor(chain))
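
# aside: the three near-identical chain blocks above could be collapsed into a single
# helper that takes the starting value and is mapped over the three starts. this is
# only a sketch (the name `run_metropolis_chain()` is made up here and the
# accept/reject counters are skipped); it assumes the `target_rel_prob()` defined above
run_metropolis_chain <- function(start, data, proposal_sd = .02, traj_length = 50000) {
  trajectory    <- rep(0, traj_length)
  trajectory[1] <- start
  for (t in 1:(traj_length - 1)) {
    current  <- trajectory[t]
    proposal <- current + rnorm(1, mean = 0, sd = proposal_sd)
    prob_accept <- min(1, target_rel_prob(proposal, data) / target_rel_prob(current, data))
    trajectory[t + 1] <- if (runif(1) < prob_accept) proposal else current
  }
  tibble(iter = 1:traj_length, accepted_traj = trajectory)
}

# map the helper over the three starting values to rebuild something like `all_chains`;
# the rest of the workflow (ESS, MCSE, plots) would stay the same
all_chains_alt <- tibble(chain = factor(1:3), start = c(0.01, 0.5, 0.99)) %>%
  mutate(draws = map(start, run_metropolis_chain, data = my_data)) %>%
  unnest(draws)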

# calculate the effective sample size and MCSE; in the book, ESS = 605.8 and MCSE = .00254
require(coda)
all_chains %>%
  filter(iter %in% 500:10000) %>%
  summarise(eff_size = effectiveSize(accepted_traj),
            mcse     = sd(accepted_traj) / sqrt(eff_size))

# Now, let's do some plots
density_earlier_chain <- all_chains %>%
  filter(iter < 500) %>%
  ggplot(aes(x = accepted_traj)) +
  geom_density(aes(color = chain)) +
  tidybayes::stat_pointinterval(aes(y = 0, color = chain), 
                                point_interval = tidybayes::mode_hdi, .width = .95) +
  scale_x_continuous(breaks = seq(from = 0, to = 1, length.out = 6)) +
  scale_y_continuous(NULL, breaks = NULL) +
  labs(x = expression(theta), title = "Earlier Steps") +
  theme_classic() +
  theme(panel.grid = element_blank()) +
  scale_color_brewer(palette = "PuBu")

density_later_chain <- all_chains %>%
  filter(iter %in% 500:10000) %>%
  ggplot(aes(x = accepted_traj)) +
  geom_density(aes(color = chain)) +
  tidybayes::stat_pointinterval(aes(y = 0, color = chain), 
                                point_interval = tidybayes::mode_hdi, .width = .95) +
  scale_x_continuous(breaks = seq(from = 0, to = 1, length.out = 6)) +
  scale_y_continuous(NULL, breaks = NULL) +
  labs(x = expression(theta), title = "Later Steps") +
  theme_classic() +
  theme(panel.grid = element_blank()) +
  scale_color_brewer(palette = "PuBu")

traceplot_earlier_chain <- all_chains %>% 
  ggplot(aes(x = accepted_traj, y = iter)) +
  geom_path(size = 1/4, aes(color = chain)) +
  geom_point(size = .8, alpha = 1/2, aes(color = chain)) +
  coord_flip(xlim = c(0, 1), ylim = c(1, 500)) +
  scale_x_continuous(expression(theta), breaks = seq(from = 0, to = 1, length.out = 6)) +
  labs(title = "Earlier Steps", y = "Iterations") +
  theme_classic() +
  theme(panel.grid = element_blank()) +
  scale_color_brewer(palette = "PuBu")

traceplot_later_chain <- all_chains %>% 
  ggplot(aes(x = accepted_traj, y = iter)) +
  geom_path(size = 1/4, aes(color = chain)) +
  geom_point(size = .8, alpha = 1/2, aes(color = chain)) +
  coord_flip(xlim = c(0, 1), ylim = c(500, 10000)) +
  scale_x_continuous(expression(theta), breaks = seq(from = 0, to = 1, length.out = 6)) +
  labs(title = "Later Steps", y = "Iterations") +
  theme_classic() +
  theme(panel.grid = element_blank()) +
  scale_color_brewer(palette = "PuBu")

combined <- (traceplot_earlier_chain + traceplot_later_chain) / (density_earlier_chain + density_later_chain) & 
  theme(legend.position = "none")

combined

To draw a Gelman plot and check the shrink factors, we can use the coda package. However, the resulting graph is not as polished as the previous ones.

# Shrinkage plot for the earlier steps
chain1_mcmc <- chain1 %>%
  filter(iter < 500) %>%
  select(accepted_traj) %>%
  mcmc()

chain2_mcmc <- chain2 %>%
  filter(iter < 500) %>%
  select(accepted_traj) %>%
  mcmc()

chain3_mcmc <- chain3 %>%
  filter(iter < 500) %>%
  select(accepted_traj) %>%
  mcmc()

combined_chains <- mcmc.list(chain1_mcmc, chain2_mcmc, chain3_mcmc)

# gelman.plot() draws the plot as a side effect
gelman.plot(combined_chains)
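
If you also want the numeric shrink factor (the potential scale reduction factor) rather than just the plot, coda's gelman.diag() should work on the same mcmc.list:

# numeric potential scale reduction factor for the same three (early-step) chains
gelman.diag(combined_chains)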

ASKurz commented 3 years ago

Nice! Thanks for sharing your code. If you ever adapt your method for the work underlying Figure 7.6, I’d love to see it.

OmidGhasemi21 commented 3 years ago

Hi Solomon,

Thanks for your feedback.

In the book, you asked readers to share their code for reproducing Kruschke's ESS and MCSE values. I edited my previous post and added the following lines, which reproduce those measures:

# calculate the effective sample size and MCSE; in the book, ESS = 605.8 and MCSE = .00254
require(coda)
all_chains %>%
  filter(iter %in% 500:10000) %>%
  summarise(eff_size = effectiveSize(accepted_traj),
            mcse     = sd(accepted_traj) / sqrt(eff_size))

Moreover, I tried to reproduce Figure 7.6 of the book.

Two important notes:

  1. Kruschke used a bivariate normal proposal distribution. I instead used independent normal jumps for the two parameters, similar to the approach in this book: https://rpruim.github.io/Kruschke-Notes/markov-chain-monte-carlo-mcmc.html#two-coins (see the sketch after this list).

  2. This example is really beyond my knowledge of R and Bayesian statistics, so I am just posting my code here to get some feedback; it is possible I misunderstood the 2D Metropolis algorithm. I checked the effective sample size and the proportion of accepted steps, and both were similar to Kruschke's results, but the figure looks a little odd.
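
For reference, a genuinely bivariate normal proposal could be drawn with MASS::mvrnorm(). This is only a sketch with a diagonal covariance matrix, which reduces to the two independent jumps used below; a nonzero off-diagonal term would give correlated proposals closer to Kruschke's setup:

# sketch: one bivariate normal jump for (theta1, theta2),
# using MASS::mvrnorm() without attaching MASS (attaching it would mask dplyr::select())
proposal_sd <- .2
sigma <- diag(proposal_sd^2, nrow = 2) # diagonal => independent jumps; off-diagonals => correlated jumps

proposed_jump <- MASS::mvrnorm(n = 1, mu = c(0, 0), Sigma = sigma)
proposed_jump[1] # proposed jump for theta1
proposed_jump[2] # proposed jump for theta2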

Here is my code with proposal SD = .2:

# 2 parameters Bayes' Rule with Metropolis algorithm
# Our theta sequences and data
theta_sequence <- seq(from = 0, to = 1, by = .01)
theta_1_data   <- rep(0:1, times = c(8 - 6, 6))
theta_2_data   <- rep(0:1, times = c(7 - 2, 2))

likelihood <- function(theta1, data1, theta2, data2) {
  z1 <- sum(data1)
  n1 <- length(data1)
  z2 <- sum(data2)
  n2 <- length(data2)
  p_data_given_theta <- (theta1^z1 * (1 - theta1)^(n1 - z1)) * (theta2^z2 * (1 - theta2)^(n2 - z2))
  p_data_given_theta[theta1 > 1 | theta1 < 0] <- 0
  p_data_given_theta[theta2 > 1 | theta2 < 0] <- 0
  return(p_data_given_theta)
}

# define the prior density function. 
prior_d <- function(theta1, theta2) {
  p_theta <- dbeta(theta1, 1, 1) * dbeta(theta2, 1, 1)
  p_theta[theta1 > 1 | theta1 < 0] = 0
  p_theta[theta2 > 1 | theta2 < 0] = 0
  return(p_theta)
}

# Define the relative probability of the target distribution. For our application, 
# this target distribution is the unnormalized posterior distribution.
target_rel_prob <- function(theta1, data1, theta2, data2) {
  target_rel_prob <- (likelihood(theta1, data1, theta2, data2) * prior_d(theta1, theta2))
  return(target_rel_prob)
}

# specify the length of the trajectory, i.e., the number of jumps to try:
traj_length <- 50000

# initialize the vector that will store the results
trajectory1 <- rep(0, traj_length)
trajectory2 <- rep(0, traj_length)

# specify where to start the trajectory:
trajectory1[1] <- 0.5 # an arbitrary starting value
trajectory2[1] <- 0.5 # another arbitrary starting value

# specify the burn-in period
burn_in <- ceiling(0.0 * traj_length) # arbitrary number, less than `traj_length`

# initialize accepted, rejected counters, just to monitor performance:
n_accepted <- 0
n_rejected <- 0

proposal_sd <- .2

for (t in 1:(traj_length - 1)) {
  current_position1 <- trajectory1[t]
  current_position2 <- trajectory2[t]
  # use the proposal distribution to generate a proposed jump
  proposed_jump1 <- rnorm(1, mean = 0, sd = proposal_sd)
  proposed_jump2 <- rnorm(1, mean = 0, sd = proposal_sd)
  # compute the probability of accepting the proposed jump

  prob_accept <- min(1,
                     target_rel_prob(current_position1 + proposed_jump1, theta_1_data,
                                     current_position2 + proposed_jump2, theta_2_data)
                     / target_rel_prob(current_position1, theta_1_data, current_position2, theta_2_data))

  # generate a random uniform value from the interval [0, 1] to
  # decide whether or not to accept the proposed jump
  if (runif(1) < prob_accept) {
    # accept the proposed jump
    trajectory1[t + 1] <- current_position1  + proposed_jump1
    trajectory2[t + 1] <- current_position2  + proposed_jump2
    # increment the accepted counter, just to monitor performance
    if (t > burn_in) {n_accepted <- n_accepted + 1}
  } else {
    # reject the proposed jump, stay at current position
    trajectory1[t + 1] <- current_position1
    trajectory2[t + 1] <- current_position2
    # increment the rejected counter, just to monitor performance
    if (t > burn_in) {n_rejected <- n_rejected + 1}
  }
}

# extract the post-burn_in portion of the trajectory
accepted_traj1 <- trajectory1[(burn_in + 1) : length(trajectory1)]
accepted_traj2 <- trajectory2[(burn_in + 1) : length(trajectory2)]

metrop_2d_data <- tibble(accepted_traj1 = accepted_traj1,
                         accepted_traj2 = accepted_traj2,
                         n_accepted     = n_accepted, 
                         n_rejected     = n_rejected,
                         iter           = 1:traj_length)

# calculate the proportion of accepted proposals out of all proposals
metrop_2d_data %>%
  slice(1) %>%
  summarise(acceptance_rate = n_accepted / (n_accepted + n_rejected))

# Calculate effectiveSize
require(coda)
metrop_2d_data %>%
  summarise(eff_size1 = effectiveSize(accepted_traj1),
            eff_size2 = effectiveSize(accepted_traj2))

metrop_2d_traceplot <- metrop_2d_data %>% 
  filter(iter < 1000) %>%
  ggplot(aes(x = accepted_traj1, y = accepted_traj2)) +
  geom_path(size = 1/4, color = "skyblue") +
  geom_point(size = 1, alpha = 1/2, color = "skyblue", shape = 1) +
  coord_cartesian(xlim = c(0,1), ylim = c(0,1)) +
  labs(title = "", x= expression(theta[1]), y= expression(theta[2])) +
  theme_bw()

metrop_2d_traceplot

ASKurz commented 3 years ago

Hey @OmidGhasemi21, I'm finally getting back to this. As you mentioned above, this is stretching the limits of my programming skills. But based on my understanding of the task, your workflow gets the job done, and I'm going to add this, with attribution to you, to the next revision of the book in a slightly revised form.

My first step will be to define a few of your custom functions:

# 2 parameters Bayes' Rule with Metropolis algorithm
# Our theta sequences and data
theta_sequence <- seq(from = 0, to = 1, by = .01)
theta_1_data   <- rep(0:1, times = c(8 - 6, 6))
theta_2_data   <- rep(0:1, times = c(7 - 2, 2))

# define the bivariate Bernoulli likelihood
bivariate_bernoulli_likelihood <- function(theta1, data1, theta2, data2) {

  z1 <- sum(data1)
  n1 <- length(data1)
  z2 <- sum(data2)
  n2 <- length(data2)
  p_data_given_theta <- (theta1^z1 * (1 - theta1)^(n1 - z1)) * (theta2^z2 * (1 - theta2)^(n2 - z2))
  p_data_given_theta[theta1 > 1 | theta1 < 0] <- 0
  p_data_given_theta[theta2 > 1 | theta2 < 0] <- 0

  return(p_data_given_theta)

}

# define the prior density function
prior_d <- function(theta1, theta2) {

  p_theta <- dbeta(theta1, 1, 1) * dbeta(theta2, 1, 1)
  p_theta[theta1 > 1 | theta1 < 0] = 0
  p_theta[theta2 > 1 | theta2 < 0] = 0

  return(p_theta)

}

# Define the relative probability of the target distribution. For our application, 
# this target distribution is the unnormalized posterior distribution.
target_rel_prob <- function(theta1, data1, theta2, data2) {
  target_rel_prob <- (bivariate_bernoulli_likelihood(theta1, data1, theta2, data2) * prior_d(theta1, theta2))

  return(target_rel_prob)
}

Second, I'll wrap the bulk of the remaining code into a single function called my_bivariate_metropolis().

my_bivariate_metropolis <- function(proposal_sd = .02,
                                    # specify the length of the trajectory (i.e., the number of jumps to try)
                                    traj_length = 50000) {

  # initialize the vector that will store the results
  trajectory1 <- rep(0, traj_length)
  trajectory2 <- rep(0, traj_length)

  # specify where to start the trajectory:
  trajectory1[1] <- 0.5 # an arbitrary starting value
  trajectory2[1] <- 0.5 # another arbitrary starting value

  # specify the burn-in period
  burn_in <- ceiling(0.0 * traj_length) # arbitrary number, less than `traj_length`

  # initialize accepted, rejected counters, just to monitor performance:
  n_accepted <- 0
  n_rejected <- 0

  for (t in 1:(traj_length - 1)) {
    current_position1 <- trajectory1[t]
    current_position2 <- trajectory2[t]

    # use the proposal distribution to generate a proposed jump
    proposed_jump1 <- rnorm(1, mean = 0, sd = proposal_sd)
    proposed_jump2 <- rnorm(1, mean = 0, sd = proposal_sd)

    # compute the probability of accepting the proposed jump
    prob_accept <- min(1,
                       target_rel_prob(current_position1 + proposed_jump1, theta_1_data,
                                       current_position2 + proposed_jump2, theta_2_data)
                       / target_rel_prob(current_position1, theta_1_data, current_position2, theta_2_data))

    # generate a random uniform value from the interval [0, 1] to
    # decide whether or not to accept the proposed jump
    if (runif(1) < prob_accept) {
      # accept the proposed jump
      trajectory1[t + 1] <- current_position1  + proposed_jump1
      trajectory2[t + 1] <- current_position2  + proposed_jump2
      # increment the accepted counter, just to monitor performance
      if (t > burn_in) {n_accepted <- n_accepted + 1}
    } else {
      # reject the proposed jump, stay at current position
      trajectory1[t + 1] <- current_position1
      trajectory2[t + 1] <- current_position2
      # increment the rejected counter, just to monitor performance
      if (t > burn_in) {n_rejected <- n_rejected + 1}
    }
  }

  # extract the post-burn_in portion of the trajectory
  accepted_traj1 <- trajectory1[(burn_in + 1) : length(trajectory1)]
  accepted_traj2 <- trajectory2[(burn_in + 1) : length(trajectory2)]

  # collect the results
  metrop_2d_data <- 
    tibble(iter           = 1:traj_length,
           accepted_traj1 = accepted_traj1,
           accepted_traj2 = accepted_traj2,
           n_accepted     = n_accepted, 
           n_rejected     = n_rejected)

  return(metrop_2d_data)

}

Third, I'll run the simulation within a nested tibble.

mh <-
  tibble(proposal_sd = c(0.02, 0.2)) %>% 
  mutate(mh = map(proposal_sd, my_bivariate_metropolis)) %>% 
  unnest(mh)

Fourth, I'll plot like this:

# theme_cowplot() and panel_border() come from the cowplot package
library(cowplot)

mh %>% 
  filter(iter < 1000) %>%

  ggplot(aes(x = accepted_traj1, y = accepted_traj2)) +
  geom_path(size = 1/10, alpha = 1/2, color = "steelblue") +
  geom_point(alpha = 1/4, color = "steelblue") +
  scale_x_continuous(expression(theta[1]), breaks = 0:5 / 5, expand = c(0, 0), limits = c(0, 1)) +
  scale_y_continuous(expression(theta[2]), breaks = 0:5 / 5, expand = c(0, 0), limits = c(0, 1)) +
  coord_equal() +
  theme_cowplot() +
  panel_border() +
  theme(panel.spacing = unit(0.75, "cm")) +
  facet_wrap(~ proposal_sd, labeller = label_both)
[screenshot: trajectory plots of theta[1] versus theta[2], faceted by proposal_sd]

Fifth, I'll summarize the results like this:

# calculate the proportion of accepted proposal compared to all proposals
mh %>% 
  group_by(proposal_sd) %>% 
  slice(1) %>% 
  summarise(acceptance_rate = n_accepted / (n_accepted + n_rejected))

# calculate ess
mh %>% 
  group_by(proposal_sd) %>% 
  summarise(ess_theta_1 = effectiveSize(accepted_traj1),
            ess_theta_2 = effectiveSize(accepted_traj2))

The output:

  proposal_sd acceptance_rate
        <dbl>           <dbl>
1        0.02           0.930
2        0.2            0.425

  proposal_sd ess_theta_1 ess_theta_2
        <dbl>       <dbl>       <dbl>
1        0.02        243.        212.
2        0.2        6551.       5809.

When I thank you in the book, should I refer to you as Omid Ghasemi?

OmidGhasemi21 commented 3 years ago

Hi @ASKurz, thank you so much for checking the code and improving it significantly. The plot looks really nice now. Yes, "Omid Ghasemi" is fine.

Finally, thank you for this amazing job. Learning Kruschke's book is now much easier and more fun with the help of your fantastic book. Well done!