Scalability of lgpr - Githubissues

YuHsiangLo commented 3 years ago

Is your feature request related to a problem? Please describe. First of all, thank you very much for writing this package and making it publicly available. I was trying to use it to analyze time-series linguistic data. One problem with lgpr, it seems, is that it can't currently handle datasets with more than 300 observations. A typical dataset in linguistics that involves 20 individuals with repeated measurements in a couple of conditions can easily exceed 10,000 entries. My computer crashed when I was attempting to use lgpr to analyze the data.

Describe the solution you'd like Is there a way to scale up the analytical capability of lgpr, so that it can handle bigger datasets efficiently?

Describe alternatives you've considered I'm also trying to use other generalized additive mixed effects models to analyze the data.

Additional context I'd like to be able to fit the following model:

fit <- lgp(VALUE ~ TIME + TIME|VOICING + TIME|TONE + HEIGHT + PLACE + GENDER + PERSON,
           data = d,
           iter = 1000,
           chains = 4,
           cores = 4,
           refresh = 500)

with the following simulated dataset:

begin_f0 <- function(gender) {
  if (gender == "M") {
    return(rnorm(n = 1, mean = 150, sd = 3))
  } else {
    return(rnorm(n = 1, mean = 250, sd = 3))
  }
}

end_f0 <- function(gender) {
  if (gender == "M") {
    return(rnorm(n = 1, mean = 120, sd = 3))
  } else {
    return(rnorm(n = 1, mean = 220, sd = 3))
  }
}

a_coeff <- function(f1, f2) {
  return((f1 - f2)/81)
}

b_coeff <- function(f1, f2) {
  return(20 * (f2 - f1) / 81)
}

c_coeff <- function(f1, f2) {
  return(f1 - (f1 - f2)/81 - 20 * (f2 - f1) / 81)
}

get_f0 <- function(f1, f2) {
  a_ <- a_coeff(f1, f2)
  b_ <- b_coeff(f1, f2)
  c_ <- c_coeff(f1, f2)
  t <- 1:10
  return(a_ * t^2 + b_ * t + c_)
}

set.seed(2021)
n_indiv <- 6
participants <- sample(c("M", "F"), size = n_indiv, replace = TRUE)

f_p_i_1 <- vector("numeric")
f_p_i_4 <- vector("numeric")
f_p_ai_1 <- vector("numeric")
f_p_ai_4 <- vector("numeric")

f_b_i_1 <- vector("numeric")
f_b_i_4 <- vector("numeric")
f_b_ai_1 <- vector("numeric")
f_b_ai_4 <- vector("numeric")

f_t_i_1 <- vector("numeric")
f_t_i_4 <- vector("numeric")
f_t_ai_1 <- vector("numeric")
f_t_ai_4 <- vector("numeric")

f_d_i_1 <- vector("numeric")
f_d_i_4 <- vector("numeric")
f_d_ai_1 <- vector("numeric")
f_d_ai_4 <- vector("numeric")

f_k_ai_1 <- vector("numeric")
f_k_ai_4 <- vector("numeric")
f_g_ai_1 <- vector("numeric")
f_g_ai_4 <- vector("numeric")

f_m_i_1 <- vector("numeric")
f_m_i_4 <- vector("numeric")
f_m_ai_4 <- vector("numeric")

f_n_i_4 <- vector("numeric")
f_n_ai_4 <- vector("numeric")

f0 <- vector("numeric")

for (p in participants) {
  perturbation_effect <- rnorm(1, -10, 5)
  f_e <- end_f0(p)
  f_b_voiceless <- begin_f0(p)
  f_b_voiced <- f_b_voiceless + perturbation_effect
  f_b_sonorant <- f_e + (f_b_voiceless - f_e) * 0.1

  voicelss_f0 <- get_f0(f_b_voiceless, f_e)
  voiced_f0 <- get_f0(f_b_voiced, f_e)
  sonorant_f0 <- get_f0(f_b_sonorant, f_e)

  height_diff <- runif(1, 0, 2)

  f_p_i_1 <- rep(voicelss_f0, 3) + rep(rnorm(3, mean = 0, sd = 1), each = 10)
  f_p_i_4 <- rep(voicelss_f0, 3) + rep(rnorm(3, mean = 0, sd = 1), each = 10)
  f_p_ai_1 <- rep(voicelss_f0 - height_diff, 3) + rep(rnorm(3, mean = 0, sd = 1), each = 10)
  f_p_ai_4 <- rep(voicelss_f0 - height_diff, 3) + rep(rnorm(3, mean = 0, sd = 1), each = 10)

  f_b_i_1 <- rep(voiced_f0, 3) + rep(rnorm(3, mean = 0, sd = 1), each = 10)
  f_b_i_4 <- rep(voiced_f0, 3) + rep(rnorm(3, mean = 0, sd = 1), each = 10)
  f_b_ai_1 <- rep(voiced_f0 - height_diff, 3) + rep(rnorm(3, mean = 0, sd = 1), each = 10)
  f_b_ai_4 <- rep(voiced_f0 - height_diff, 3) + rep(rnorm(3, mean = 0, sd = 1), each = 10)

  f_t_i_1 <- rep(voicelss_f0, 3) + rep(rnorm(3, mean = 0, sd = 1), each = 10)
  f_t_i_4 <- rep(voicelss_f0, 3) + rep(rnorm(3, mean = 0, sd = 1), each = 10)
  f_t_ai_1 <- rep(voicelss_f0 - height_diff, 3) + rep(rnorm(3, mean = 0, sd = 1), each = 10)
  f_t_ai_4 <- rep(voicelss_f0 - height_diff, 3) + rep(rnorm(3, mean = 0, sd = 1), each = 10)

  f_d_i_1 <- rep(voiced_f0, 3) + rep(rnorm(3, mean = 0, sd = 1), each = 10)
  f_d_i_4 <- rep(voiced_f0, 3) + rep(rnorm(3, mean = 0, sd = 1), each = 10)
  f_d_ai_1 <- rep(voiced_f0 - height_diff, 3) + rep(rnorm(3, mean = 0, sd = 1), each = 10)
  f_d_ai_4 <- rep(voiced_f0 - height_diff, 3) + rep(rnorm(3, mean = 0, sd = 1), each = 10)

  f_k_ai_1 <- rep(voicelss_f0 - height_diff, 3) + rep(rnorm(3, mean = 0, sd = 1), each = 10)
  f_k_ai_4 <- rep(voicelss_f0 - height_diff, 3) + rep(rnorm(3, mean = 0, sd = 1), each = 10)
  f_g_ai_1 <- rep(voiced_f0 - height_diff, 3) + rep(rnorm(3, mean = 0, sd = 1), each = 10)
  f_g_ai_4 <- rep(voiced_f0 - height_diff, 3) + rep(rnorm(3, mean = 0, sd = 1), each = 10)

  f_m_i_1 <- rep(sonorant_f0, 3) + rep(rnorm(3, mean = 0, sd = 1), each = 10)
  f_m_i_4 <- rep(sonorant_f0, 3) + rep(rnorm(3, mean = 0, sd = 1), each = 10)
  f_m_ai_4 <- rep(sonorant_f0 - height_diff, 3) + rep(rnorm(3, mean = 0, sd = 1), each = 10)

  f_n_i_4 <- rep(sonorant_f0, 3) + rep(rnorm(3, mean = 0, sd = 1), each = 10)
  f_n_ai_4 <- rep(sonorant_f0 - height_diff, 3) + rep(rnorm(3, mean = 0, sd = 1), each = 10)

  f0 <- c(f0, f_p_i_1, f_p_i_4, f_p_ai_1, f_p_ai_4,
          f_b_i_1, f_b_i_4, f_b_ai_1, f_b_ai_4,
          f_t_i_1, f_t_i_4, f_t_ai_1, f_t_ai_4,
          f_d_i_1, f_d_i_4, f_d_ai_1, f_d_ai_4,
          f_k_ai_1, f_k_ai_4,
          f_g_ai_1, f_g_ai_4,
          f_m_i_1, f_m_i_4, f_m_ai_4,
          f_n_i_4, f_n_ai_4)
}

f0 <- matrix(f0, ncol = 10, byrow = TRUE)

tone <- rep(rep(c("Tone1", "Tone4", "Tone1", "Tone4",
                  "Tone1", "Tone4", "Tone1", "Tone4",
                  "Tone1", "Tone4", "Tone1", "Tone4",
                  "Tone1", "Tone4", "Tone1", "Tone4",
                  "Tone1", "Tone4",
                  "Tone1", "Tone4",
                  "Tone1", "Tone4", "Tone4",
                  "Tone4", "Tone4"), each = 3), n_indiv)

cons <- rep(rep(c("p", "p", "p", "p",
                  "b", "b", "b", "b",
                  "t", "t", "t", "t",
                  "d", "d", "d", "d",
                  "k", "k",
                  "g", "g",
                  "m", "m", "m",
                  "n", "n"), each = 3), n_indiv)

voicing <- rep(rep(c("voiceless", "voiceless", "voiceless", "voiceless",
                     "voiced", "voiced", "voiced", "voiced",
                     "voiceless", "voiceless", "voiceless", "voiceless",
                     "voiced", "voiced", "voiced", "voiced",
                     "voiceless", "voiceless",
                     "voiced", "voiced",
                     "nasal", "nasal", "nasal",
                     "nasal", "nasal"), each = 3), n_indiv)

height <- rep(rep(c("high", "high", "low", "low",
                    "high", "high", "low", "low",
                    "high", "high", "low", "low",
                    "high", "high", "low", "low",
                    "low", "low",
                    "low", "low",
                    "high", "high", "low",
                    "high", "low"), each = 3), n_indiv)

place <- rep(rep(c("bilabial", "bilabial", "bilabial", "bilabial",
                   "bilabial", "bilabial", "bilabial", "bilabial",
                   "alveolar", "alveolar", "alveolar", "alveolar",
                   "alveolar", "alveolar", "alveolar", "alveolar",
                   "velar", "velar",
                   "velar", "velar",
                   "bilabial", "bilabial", "bilabial",
                   "alveolar", "alveolar"), each = 3), n_indiv)

gender <- rep(participants, each = 75)
person <- rep(1:n_indiv, each = 75)

df <- cbind(person, gender, cons, tone, voicing, height, place, as.data.frame(f0))
colnames(df) <- c("PERSON", "GENDER", "CONS", "TONE", "VOICING", "HEIGHT", "PLACE", "1", "2", "3", "4", "5", "6", "7", "8", "9", "10")

df %>%
  pivot_longer(cols = !c("PERSON", "GENDER", "CONS", "TONE", "VOICING", "HEIGHT", "PLACE"), names_to = "TIME", values_to = "F0") %>%
  mutate(TIME = as.integer(TIME),
         TRIAL = rep(1:(75*n_indiv), each = 10)) %>%
  ggplot(aes(x = TIME, y = F0, color = CONS, group = TRIAL)) +
  geom_line() +
  facet_wrap(~PERSON)

d <- df %>%
  pivot_longer(!c("PERSON", "GENDER", "CONS", "TONE", "VOICING", "HEIGHT", "PLACE"), names_to = "TIME", values_to = "VALUE") %>%
  mutate(PERSON = as.factor(PERSON),
         GENDER = as.factor(GENDER),
         CONS = as.factor(CONS),
         TONE = as.factor(TONE),
         VOICING = as.factor(VOICING),
         HEIGHT = as.factor(HEIGHT),
         PLACE = as.factor(PLACE),
         TIME = as.numeric(TIME))

d <- as.data.frame(d)

Again, appreciated!

jtimonen commented 3 years ago

I have been able to run lgpr with N = 600 data points in under a day for a fairly simple model. I have put the warning for N >= 300 because after that sampling starts to take really long. So currently this doesn't scale well because the complexity is N^3, and with for example that data of yours with N about 5000 sampling isn't really possible at least on a normal computer. But yours is an interesting use case, and I have been implementing a basis function approximation which should scale much better, so I will comment on this again when I am able to try it on your data.

YuHsiangLo commented 3 years ago

That'd be great. Much appreciated!

jtimonen / lgpr

Scalability of lgpr #21