osofr / simcausal

Simulating Longitudinal and Network Data with Causal Inference Applications
64 stars 11 forks source link

Gaussian var affecting categorical var #19

Open torkar opened 1 year ago

torkar commented 1 year ago

Hi,

I want to have a Gaussian variable affecting a categorical variable. I tried this which seems to work

C <- rnorm(3,0,2) # or even rnorm(1,0,2)
rcat.b1(250, softmax(C * c(0.5, 0.25, 0.25)))

But running the below only gets me category 4, while there are only three categories?

logsumexp <- function (x) {
  y = max(x)
  y + log(sum(exp(x - y)))
}

softmax <- function (x) {
  exp(x - logsumexp(x))
}

D <- DAG.empty()
N <- 250

D <- D +
  # Confounder with Gaussian distribution
  node("C",
      distr = "rnorm",
      mean = 0,
      sd = 2) +

  # Set BA to categorical (c=3).
  # Make sure it starts at 1, to ensure sane priors later.
  # Let C be a causal effect on BA.
  node("BA", 
    distr = "rcat.b1",
    probs = softmax(C * c(0.5, 0.25, 0.25)))

Dset <- set.DAG(D, vecfun = c("softmax"))
d <- sim(Dset, n = N)
r$> head(d)
  ID          C BA
1  1 -1.3004380  4
2  2  3.4434901  4
3  3 -1.7159391  4
4  4  3.7295937  4
5  5 -1.7967269  4
6  6 -0.8985413  4

Have I misunderstood something? :)

torkar commented 1 year ago

I now use softmax() from {rethinking}, and it seems to work? Here's an example, I'd be happy to see you pick it apart :)

rsz_unknown

Where $C$ is Gaussian, $BA$ and $H$ are categorical, and $P$ is Beta distributed.

D <- D +
  # Confounder with Gaussian distribution
  node("C",
      distr = "rnorm",
      mean = 0,
      sd = 2) +

  # Set BA to categorical (c=3).
  # Make sure it starts at 1, to ensure sane priors later.
  # Let C be a causal effect on BA.
  node("BA", 
    distr = "rcat.b1",
    probs = softmax(C * c(0.5, 0.25, 0.25))) + 

  # Set H to categorical (c=3).
  # Let BA be a causal effect on H.
  node("H",
    distr = "rcat.b1",
    probs = (BA == 1) * c(0.7, 0.1, 0.2) + (BA == 2) * c(0.2, 0.1, 0.7) + (BA == 3) * c(0.2, 0.1, 0.7)) + 

  # Generate Beta(mu, phi) with H, BA, and C.
  # Note: We use rbeta_mean() which is a mean parameterization of Beta.
  node("P",
    distr = "rbeta_mean",
    phi = 4,
    mu = inv_logit(b_H * H + b_BA * BA + b_C * C))

# Need to use inv_logit() since mu in Beta needs it when modelling
Dset <- set.DAG(D, vecfun  = c("inv_logit", "softmax"))

# Gen N samples
d <- simcausal::sim(Dset, n = N)
fkgruber commented 1 year ago

where do you get rbeta_mean? Can you include the R package or definition. Also can you include the values of b_H, b_BA, and b_C?

fkgruber commented 1 year ago

The problem with your softmax implementation is that it does not handle matrix input correctly. Compare the results between rethinking softmax and your softmax:

softmax=rethinking::softmax

logsumexp <- function (x) {
  y = max(x)
  y + log(sum(exp(x - y)))
}

softmax2 <- function (x) {
  exp(x - logsumexp(x))
}

set.seed(123)
test=matrix(rnorm(21,0,1),ncol=3)
test 
V1 V2 V3
-0.560475646552213 -1.26506123460653 -0.555841134754075
-0.23017748948328 -0.686852851893526 1.78691313680308
1.55870831414912 -0.445661970099958 0.497850478229239
0.070508391424576 1.22408179743946 -1.96661715662964
0.129287735160946 0.359813827057364 0.701355901563686
1.71506498688328 0.400771450594052 -0.472791407727934
0.460916205989202 0.11068271594512 -1.06782370598685
softmax(test)  
V1 V2 V3
0.400166685600284 0.197807747397097 0.40202556700262
0.109291875828968 0.0692239361479215 0.821484188023111
0.675263283427652 0.090988432231035 0.233748284341313
0.232563460750776 0.737109563985281 0.0303269752639425
0.248065693511943 0.312379781567467 0.43955452492059
0.724206846887273 0.194568350039628 0.0812248030730993
0.520472184035555 0.366684921845007 0.112842894119439
softmax2(test) 
V1 V2 V3
0.0160129482251862 0.00791541457994917 0.0160873326572829
0.0222801426157648 0.0141119287971036 0.167467021010719
0.133297253055616 0.0179611543732534 0.0461420085673863
0.0300956817200632 0.0953882211715476 0.00392456747990148
0.0319177104020375 0.0401927701584504 0.0565558817654287
0.155856902166214 0.0418731477437802 0.0174804287482863
0.0444688752369654 0.031329370792442 0.00964123873262096

and I think the problem is coming from logsumexp which combines the matrix input into 1 number

 logsumexp(test)
x
3.57388197339362
torkar commented 1 year ago

where do you get rbeta_mean? Can you include the R package or definition. Also can you include the values of b_H, b_BA, and b_C?

https://github.com/sims1253/bayesim is where rbeta_mean comes from, and I now use the {rethinking} implementation of softmax:

N <- 250 # Sample size

# Set beta effects
b_H <- 0.3
b_BA <- 0.2
b_C <- 0.5

D <- DAG.empty()

D <- D +
  # Confounder with Gaussian distribution
  node("C",
      distr = "rnorm",
      mean = 0,
      sd = 2) +

  # Set BA to categorical (c=3).
  # Make sure it starts at 1, to ensure sane priors later.
  # Let C be a causal effect on BA.
  node("BA", 
    distr = "rcat.b1",
    probs = softmax(C * c(0.5, 0.25, 0.25))) + 

  # Set H to categorical (c=3).
  # Let BA be a causal effect on H.
  node("H",
    distr = "rcat.b1",
    probs = (BA == 1) * c(0.7, 0.1, 0.2) + (BA == 2) * c(0.2, 0.1, 0.7) + (BA == 3) * c(0.2, 0.1, 0.7)) + 

  # Generate Beta(mu, phi) with H, BA, and C.
  # Note: We use rbeta_mean() which is a mean parameterization of Beta.
  node("P",
    distr = "rbeta_mean",
    phi = 4,
    mu = inv_logit(b_H * H + b_BA * BA + b_C * C))

# Need to use inv_logit() since mu in Beta needs it when modelling
Dset <- set.DAG(D, vecfun  = c("inv_logit", "softmax"))

# Gen N samples
d <- simcausal::sim(Dset, n = N)
d$P[d$P == 1] <- 0.99999999 # nudge iff 1