xfim / ggmcmc

Graphical tools for analyzing Markov Chain Monte Carlo simulations from Bayesian inference

How was the binary dataset created? #57

Closed: george-githinji closed this issue 5 years ago

george-githinji commented 7 years ago

The documentation says

in order to illustrate the functions suitable for binary outcomes a new dataset must be simulated, and samples of parameters from a simple logistic regression model must be obtained

But it does not explain how this dataset was derived or created; an actual example would be useful. I have a logistic regression model and would like to plot an ROC curve, so I would like to understand how to simulate and create a dataset suitable for that.

xfim commented 5 years ago

Here is the full file used to create the fake datasets.

library(rjags)
set.seed(14718)

# ----------- Linear Model

#Prepare the fake dataset:
#<<>>=
N <- 50
beta.0 <- 3
beta.1 <- 5
sigma <- 2
x <- rnorm(N, 0, 3)
y <- beta.0 + beta.1 * x + rnorm(N, mean=0, sd=sigma)
X <- matrix(x, ncol=1)
#@

#Define the model in \proglang{JAGS} by writing it to a file from R and then reading it back from that file when sampling:
#<<model_definition, tidy=FALSE>>=
write("
  model {
    for (i in 1:N) {
      y[i] ~ dnorm(mu[i], tau)
      y.rep[i] ~ dnorm(mu[i], tau)
      mu[i] <- beta[1] + beta[2] * X[i,1]
    }
    tau <- pow(sigma, -2)
    sigma ~ dunif(0, 100)
    beta[1] ~ dnorm(0, 0.001)
    beta[2] ~ dnorm(0, 0.001)
}", "m-simple_linear_regression.bug")
#@

#Parameters to pass to \proglang{JAGS}, such as data, number of simulations, nodes to monitor, and initial values (the first for $\tau$ and the last two for $\beta$).
#<<>>=
n.simu <- 2000
n.burnin <- n.simu/2
inits <- c(1, 5, -5)
D <- list(y=y, X=X, N=N, y.rep=rep(NA, N))
par <- c("sigma", "beta")
#@

#Sample from the posterior distribution using \proglang{JAGS}.
#<<sample, echo=TRUE>>=
m.jags <- jags.model("m-simple_linear_regression.bug", data=D,
  n.adapt=n.burnin, quiet=TRUE, n.chains=2)
s <- coda.samples(m.jags, par, n.iter=n.simu-n.burnin, quiet=TRUE)
s.mu <- coda.samples(m.jags, "mu", n.iter=200, quiet=TRUE)
s.y.rep <- coda.samples(m.jags, "y.rep", n.iter=200, quiet=TRUE)
linear <- list(s=s, s.mu=s.mu, y=y)
save(s, s.y.rep, y, file="linear.rda")

# s needed for many of the functions
# s.y.rep needed for ggs_ppmean() and ggs_ppsd()
# y needed for comparison with the residual standard deviation in the examples of the paper.
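
# Illustrative addition, not part of the original script: assuming the current
# arguments of ggs_ppmean() and ggs_ppsd() (a ggs object of posterior
# predictive draws plus the observed outcome), the saved objects would feed
# the posterior predictive checks roughly like this:
library(ggmcmc)
ggs_ppmean(ggs(s.y.rep), outcome=y)   # posterior predictive check of the mean
ggs_ppsd(ggs(s.y.rep), outcome=y)     # posterior predictive check of the sd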

# ----------- Binary Model
N <- 100
X <- rnorm(N)
z <- -1 + 1 * X
pr <- 1/(1+exp(-z))
y <- rbinom(N, 1, pr)
D <- list(N=N, X=X, y=y)

md <- '
model {
  for (n in 1:N) {
    y[n] ~ dbern(mu[n])
    logit(mu[n]) <- theta[1] + theta[2] * X[n]
  }
  for (t in 1:2) {
    theta[t] ~ dnorm(0, 0.001)
  }
}'
write(md, file="model-logit.bug")

m.jags <- jags.model("model-logit.bug", data=D,
  n.adapt=n.burnin, quiet=TRUE, n.chains=2)
par <- c("theta", "mu")
s.binary <- coda.samples(m.jags, par, n.iter=1E3, quiet=TRUE, thin=10)
y.binary <- y
binary <- list(s.binary=s.binary, y.binary=y.binary)
save(s.binary, y.binary, file="binary.rda")

# s.binary needed for ggs_rocplot() and ggs_separation()
# y.binary needed for ggs_rocplot() and ggs_separation()
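
For completeness, a minimal sketch of how these saved objects could then produce the ROC curve mentioned in the original question. The exact calls, in particular subsetting the mu family with ggs(), are an assumption based on the documented arguments of ggs_rocplot() and ggs_separation(), not something taken from the original script:

library(ggmcmc)
load("binary.rda")                             # provides s.binary and y.binary
S.binary <- ggs(s.binary, family="mu")         # keep only the fitted probabilities mu[n]
ggs_rocplot(S.binary, outcome=y.binary)        # ROC curve
ggs_separation(S.binary, outcome=y.binary)     # separation plot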