ocbe-uio / BayesMallows

R-package for Bayesian preference learning with the Mallows rank model.
https://ocbe-uio.github.io/BayesMallows/
GNU General Public License v3.0
21 stars 9 forks source link

Probability is missing? #407

Closed osorensen closed 5 months ago

osorensen commented 5 months ago

Leap-and-shift for latent rankings based on pairwise preference data is symmetric, but here it is treated as uniform on the constraint set. Is that correct? This does not matter for MCMC, but it does matter for the particle weights in SMC.

https://github.com/ocbe-uio/BayesMallows/blob/6f6b85d81b2c5249d155a12f35306810722359a9/src/rank_proposal.cpp#L142-L159

osorensen commented 5 months ago

I ran the following code lines:

library(BayesMallows)
set.seed(3)
par(mfrow = c(3, 1))
for(k in 1:3) {
  dat <- subset(beach_preferences, assessor <= 2)
  mod_init <- compute_mallows(
    data = setup_rank_data(
      preferences = dat
    ),
    compute_options = set_compute_options(nmc = 3000, burnin = 1000)
  )

  # Next we provide assessors 21 to 60 one at a time.
  alpha <- numeric()
  mod <- mod_init
  for (i in 3:60) {
    print(i)
    mod <- update_mallows(
      model = mod,
      new_data = setup_rank_data(
        preferences = subset(beach_preferences, assessor == i),
        user_ids = i
      ),
      smc_options = set_smc_options(
        n_particles = 2000, latent_sampling_lag = 1)
    )
    alpha <- c(alpha, mean(mod$alpha_samples))
  }

  plot(alpha)

}

Results with version 2.1.1

image

Results with the version I'm currently developing

image

The alpha parameter certainly gets higher, and hence closer to the MCMC results.

osorensen commented 5 months ago

Some confirmation that my way of computing probabilities is correct:

# Validate pairwise leap-and-shift proposal
devtools::load_all()
library(tidyverse)
library(furrr)

dat <- subset(beach_preferences, assessor == 3 & bottom_item < 7 & top_item < 12)
n_items <- 15
dd <- setup_rank_data(preferences = dat, n_items = n_items)
constraints <- dd$constraints[[1]]

current_rank <- dd$rankings

plan("multisession")
rankings <- future_map_dfr(1:1000000, function(i) {
  u <- sample(n_items, 1)
  if(u %in% constraints$constrained_items) {
    ia <- constraints$items_above[[u]]
    ib <- constraints$items_below[[u]]
    if(length(ia) > 0) {
      l <- max(current_rank[ia]) + 1
    } else {
      l <- 1
    }
    if(length(ib) > 0) {
      r <- min(current_rank[ib]) - 1
    } else {
      r <- n_items
    }
  } else {
    l <- 1
    r <- n_items
  }
  support <- seq(from = l, to = r, by = 1)
  r_prime <- current_rank
  r_prime[[u]] <- sample(support, 1)

  for(j in seq_len(n_items)) {
    if(current_rank[[u]] < current_rank[[j]] && current_rank[[j]] <= r_prime[[u]]) {
      r_prime[[j]] <- current_rank[[j]] - 1
    } else if(current_rank[[u]] > current_rank[[j]] && current_rank[[j]] >= r_prime[[u]]) {
      r_prime[[j]] <- current_rank[[j]] + 1
    }
  }
  stopifnot(BayesMallows:::validate_permutation(r_prime))
  tibble(
    iteration = i,
    probability = 1 / length(support) / n_items,
    ranking = list(as.numeric(r_prime))
  )
}, .options = furrr_options(seed = 1L))

empirical_probs <- rankings %>%
  mutate(ranking = map_chr(ranking, paste, collapse = "")) %>%
  group_by(ranking) %>%
  summarise(n = n(), .groups = "drop") %>%
  mutate(proportion = n / sum(n))

calculated_probs <- rankings %>%
  mutate(ranking = map_chr(ranking, paste, collapse = "")) %>%
  distinct(ranking, probability) %>%
  group_by(ranking) %>%
  summarise(
    probability = sum(probability), .groups = "drop"
  )

empirical_probs %>%
  full_join(calculated_probs, by = "ranking") %>%
  ggplot(aes(x = proportion, y = probability)) +
  geom_point() +
  geom_abline()

First with this row:

dat <- subset(beach_preferences, assessor == 3 & bottom_item < 12 & top_item < 12)

image

Next with this:

dat <- subset(beach_preferences, assessor == 3 & bottom_item < 7 & top_item < 12)

image