fhui28 / CBFM

Spatio-temporal joint species distribution modeling using community-level basis functions
13 stars 2 forks source link

Example of how to use G_control$structure for a hierarchical GAM #32

Open fhui28 opened 2 years ago

fhui28 commented 2 years ago

This example is mainly to show how to "trick" CBFM into fitting a very limited form of a hierarchical GAM [similar but not exactly the same as the S model in https://peerj.com/articles/6876/]. Obviously if you really need to fit a hierarchical, then use mgcv as it is far superior and more flexible. But this was motivated to help solve a problem @chrishaak had.

Some of the info in the example below is also superfluous [apologies], but left in there for motivating further exploration.

library(mgcv)
library(mvtnorm)
library(tidyverse)
library(CBFM)

## A local pseudo-inverse function -- straight from summary.gam in mgcv package. Full credit goes to Simon Wood for this!
.pinv <- function(V, M, rank.tol = 1e-6) {
    if(missing(M))
        M <- ncol(V)
    D <- eigen(V, symmetric = TRUE)
    M1 <- length(D$values[D$values > rank.tol * D$values[1]])
    if(M > M1)
        M<-M1 # avoid problems with zero eigen-values
    if(M+1 <= length(D$values))
        D$values[(M+1):length(D$values)]<-1
    D$values<- 1/D$values
    if(M+1 <= length(D$values))
        D$values[(M+1):length(D$values)]<-0
    res <- D$vectors %*% diag(x = D$values, nrow = length(D$values)) %*% t(D$vectors)

    return(res)
}

set.seed(082022)
num_sites <- 1000 
num_spp <- 20 # Number of species. Need to keep this lower as factor smooths do not scale very well with the number of factor levels
num_X <- 4 # Only one of the covariates ends up being used for illustrative purposes

xy <- data.frame(x = runif(num_sites, 0, 5), y = runif(num_sites, 0, 5))
X <- rmvnorm(num_sites, mean = rep(0,4))
colnames(X) <- c("temp", "depth", "chla", "O2")
dat <- data.frame(xy, X)
f1 <- function(x, a, b) a * sin(pi * b * x)

spp_slopes <- cbind(a = rnorm(num_spp)+2, b = rnorm(num_spp, sd = 0.5))
spp_intercepts <- rnorm(num_spp, 0, sd = 0.5)

# Simulate spatial multivariate abundance data
eta <- matrix(spp_intercepts, nrow = num_sites, ncol = num_spp, byrow = TRUE) + sapply(1:num_spp, function(j) f1(dat$temp, a = spp_slopes[j,1], b = spp_slopes[j,2]))
simy <- matrix(rbinom(num_sites * num_spp, size = 1, prob = plogis(eta)), num_sites, num_spp)
colnames(simy) <- paste0("spp", 1:num_spp)

# Form training and test sets -- The split is not really needed here in this example
dat_train <- dat[1:500,]
dat_test <- dat[501:1000,]
simy_train <- simy[1:500,]
simy_test <- simy[501:1000,]
rm(X, xy, simy, dat)

# Fit a hierarchical GAM involving only temp, for illustrative purposes
dat_long <- data.frame(simy_train, dat_train) %>%
    pivot_longer(spp1:spp20, names_to = "species") %>%
    mutate(species = fct_inorder(species))

hgam <- gam(value ~ s(temp, by = species, bs = "gp", id = 1), data = dat_long, family = "binomial", method = "REML") # Null space penalties not applied
#hgam_select <- gam(value ~ s(temp, by = species, id = 1), data = dat_long, select = TRUE, family = "binomial", method = "REML") # Null space penalties applied. This should get very close to factor.smooth.interactions but is not quite the same [have not investigated why as yet]
#hgam_fs <- gam(value ~ s(temp, species, bs = "fs"), data = dat_long, family = "binomial", method = "REML") 

# Fit CBFMs involving only temp, for illustrative purposes
sm_train_useincbfm <- smoothCon(s(temp, bs = "gp"), data = dat_train, knots = NULL, absorb.cons = TRUE)[[1]] # Note scale.penalty = TRUE by default. Also absorbing the identifiability constraints into the model and penalty matrix

mm_train_useincbfm <- sm_train_useincbfm$X
penmat_useincbfm <- sm_train_useincbfm$S[[1]]
mm_test_useincbfm <- PredictMat(sm_train_useincbfm, data = dat_test) # Not needed here in the example, but in case you want predict later on...
rm(sm_train_useincbfm)

fitcbfm <- CBFM(y = simy_train,
                formula = ~ 1,
                data = dat_train,
                B_space = mm_train_useincbfm,
                family = binomial(),
                control = list(trace = 1, optim_lower = -1000, optim_upper = 1000, final_maxit = Inf), # Note the optim_lower/upper is set to large numbers here. This is only really because of the nature of this example. final_maxit although should not need to be set in future version of CBFM
                Sigma_control = list(rank = "full", custom_space = .pinv(penmat_useincbfm)),
                G_control = list(rank = "full", structure = "identity"))

# Compare the hierarchical GAM and CBFM fits in terms of linear predictors for temperature. They should be somewhat similar, although hierarchical GAM probably captures the curvature more effectively? 
data.frame(temp = dat_train$temp, cbfm = fitcbfm$linear_predictors, hgam = matrix(hgam$linear.predictors, ncol = num_spp, byrow = TRUE)) %>%
    pivot_longer(-temp) %>%
    mutate(species = rep(rep(paste0("spp",1:num_spp),2),500) %>% fct_inorder) %>%
    mutate(model = rep(rep(c("cbfm","gam"),each=num_spp),500)) %>%
    ggplot(., aes(x = temp, y = value, color = model)) +
    geom_line() +
    facet_wrap(. ~ species, nrow = 5) +
    theme_bw()