florianhartig / DHARMa

Diagnostics for HierArchical Regession Models
http://florianhartig.github.io/DHARMa/
201 stars 21 forks source link

Add STAN example to documentation #328

Open florianhartig opened 2 years ago

florianhartig commented 2 years ago

Example from Amael Le Squin for residual checks with STAN - to be integrated with the help

#### Example from https://cran.r-project.org/web/packages/DHARMa/vignettes/DHARMaForBayesians.html ran with Stan

library(data.table)
library(cmdstanr)
library(DHARMa)

#### Create data
set.seed(123)

dat = DHARMa::createData(200, overdispersion = 0.2)

Data = as.list(dat)
Data$nobs = nrow(dat)
Data$nGroups = length(unique(dat$group))

#### Fit data
## Common variables
n_chains = 3
maxIter = 3000

## Compile model
model = cmdstan_model("./florian_eg.stan")

## Run model
results = model$sample(data = Data, parallel_chains = n_chains, refresh = 50, chains = n_chains,
    iter_warmup = round(maxIter/2), iter_sampling = round(maxIter/2), save_warmup = FALSE)

n_sampling = results$metadata()$iter_sampling

#### Compute residuals
## Get parameters
intercept_array = as.vector(results$draws("intercept")) # dimensions: iter_sampling * n_chains * 1
env_array = as.vector(results$draws("env")) # dimensions: iter_sampling * n_chains * 1

## Posterior distributions
# Posterior lambda 
posteriorPredDistr = matrix(data = 0, nrow = n_sampling*n_chains, ncol = nrow(dat))
for (i in 1:nrow(dat))
    posteriorPredDistr[, i] = exp(intercept_array + env_array*Data$Environment1[i])

# Posterior response variable
posteriorPredSim = results$draws("observedResponseSim") # dimension: iter_sampling * n_chains * nobs

# Reshape posteriorPredSim as a matrix nobs x n_repetition
posteriorPredSim_matrix = matrix(data = 0, nrow = nrow(dat), ncol = n_sampling*n_chains)
for (obs in 1:nrow(posteriorPredSim_matrix))
{
    col_start = 1
    for (chain in 1:n_chains)
    {
        col_end = chain*n_sampling
        posteriorPredSim_matrix[obs, col_start:col_end] = posteriorPredSim[, chain, obs]
        col_start = col_end + 1
    }
}

## Create dharma data for the residuals
sim = createDHARMa(simulatedResponse = posteriorPredSim_matrix,
    observedResponse = dat$observedResponse,
    fittedPredictedResponse = apply(posteriorPredDistr, 2, median),
    integerResponse = TRUE)

## Plot residuals
plot(sim)

########################################################
###############    RANDOM EFFECT PART    ###############
########################################################
#### Add a random effect.
## Explanations:
# By default (see ?createData), the intercept is 0, the fixed effect is 1, the number of groups is 10, the random effect is on
# the intercept and its variance is 1. We create the expected result below and then run a stan model accounting for the random effect.

expected_results = c(intercept_mean = 0, env = 1, randEff_var = 1)

#### Create stan data
## Create indexing for groups (used in Stan to compute likelihood)
# Convert data to data table and order it by group
setDT(dat)
setorder(dat, group)

# Index
ls_groups = as.integer(dat[, unique(group)])
indices = integer(Data$nGroups + 1)

for (i in 1:Data$nGroups)
    indices[i] = min(which(dat[, group] == ls_groups[i]))
indices[Data$nGroups + 1] = Data$nobs + 1

## Data list for stan
Data = as.list(dat)
Data$nobs = nrow(dat)
Data$nGroups = length(unique(dat$group))
Data$indices = indices

#### Fit data
## Common variables
n_chains = 3
maxIter = 3000

## Compile model
model = cmdstan_model("./florian_eg_randEff.stan")

## Run model
results = model$sample(data = Data, parallel_chains = n_chains, refresh = 50, chains = n_chains,
    iter_warmup = round(maxIter/2), iter_sampling = round(maxIter/2), save_warmup = FALSE)

results$cmdstan_diagnose()

results$print(names(expected_results))
print(expected_results)

n_sampling = results$metadata()$iter_sampling

#### Compute residuals
## Posterior distributions
# Posterior lambda 
posteriorPredDistr = results$draws("lambda") # dimension: iter_sampling * n_chains * nobs
posteriorPredDistr_matrix = matrix(data = 0, nrow = n_sampling*n_chains, ncol = nrow(dat)) # One obs per column

for (i in 1:nrow(dat))
    posteriorPredDistr_matrix[, i] = as.vector(posteriorPredDistr[,,i])

# Posterior response variable
posteriorPredSim = results$draws("observedResponseSim") # dimension: iter_sampling * n_chains * nobs

# Reshape posteriorPredSim as a matrix nobs x n_repetition
posteriorPredSim_matrix = matrix(data = 0, nrow = nrow(dat), ncol = n_sampling*n_chains)
for (obs in 1:nrow(posteriorPredSim_matrix))
{
    col_start = 1
    for (chain in 1:n_chains)
    {
        col_end = chain*n_sampling
        posteriorPredSim_matrix[obs, col_start:col_end] = posteriorPredSim[, chain, obs]
        col_start = col_end + 1
    }
}

## Create dharma data for the residuals
sim = createDHARMa(simulatedResponse = posteriorPredSim_matrix,
    observedResponse = dat$observedResponse,
    fittedPredictedResponse = apply(posteriorPredDistr_matrix, 2, median),
    integerResponse = TRUE)

## Plot residuals
plot(sim)

florian_eg.stan

data {
    // Number of data
    int<lower = 0> nobs;

    // Observations
    int observedResponse [nobs];

    // Explanatory variables
    vector [nobs] Environment1;
}

parameters {
    real intercept;
    real env;
}

model {
    // Declare variables
    vector [nobs] lambda = exp(intercept + env*Environment1);

    // Priors
    target += normal_lpdf(intercept | 0, 1e4);
    target += normal_lpdf(env | 0, 1e4);

    // Likelihood
    target += poisson_lpmf(observedResponse | lambda);
}

generated quantities {
    int<lower = 0> observedResponseSim [nobs] = poisson_rng(exp(intercept + env*Environment1));
}

florian_eg_randEff.stan

data {
    // Number of data
    int<lower = 1> nobs;
    int<lower = 1> nGroups;

    // Indexing (index of the beginning of each group)
    array [nGroups + 1] int indices; // This syntax is compatible only from Stan 2.29!

    // Observations
    array [nobs] int observedResponse;

    // Explanatory variables
    vector [nobs] Environment1;
}

parameters {
    real intercept_mean;
    array [nGroups] real intercepts; // This syntax is compatible only from Stan 2.29!

    real env;

    // Random effect
    real<lower = 0> randEff_var;
}

model {
    // Declare variables
    vector [nobs] lambda;
    for (i in 1:nGroups)
        lambda[indices[i]:(indices[i + 1] - 1)] = exp(intercepts[i] + env*Environment1[indices[i]:(indices[i + 1] - 1)]);

    // Priors
    target += normal_lpdf(intercept_mean | 0, 1e4);
    target += normal_lpdf(intercepts | intercept_mean, randEff_var);
    target += normal_lpdf(env | 0, 1e4);

    target += gamma_lpdf(randEff_var | 1.0/1000, 1.0/1000); // Give an average of 1, and a variance of 1000

    // Likelihood
    target += poisson_lpmf(observedResponse | lambda);
}

generated quantities {
    array [nobs] int<lower = 0> observedResponseSim;
    vector<lower = 0> [nobs] lambda;

    for (i in 1:nGroups)
    {
        observedResponseSim[indices[i]:(indices[i + 1] - 1)] = poisson_rng(exp(intercepts[i] + env*Environment1[indices[i]:(indices[i + 1] - 1)]));
        lambda[indices[i]:(indices[i + 1] - 1)] = exp(intercepts[i] + env*Environment1[indices[i]:(indices[i + 1] - 1)]);
    }
}