joshuaschwab / ltmle

Longitudinal Targeted Maximum Likelihood Estimation package
http://joshuaschwab.github.io/ltmle/
23 stars 16 forks source link

Mapping cum.g to original dataset #23

Open gah-bo opened 11 months ago

gah-bo commented 11 months ago

We are using LTMLE for an analysis with 4 periods (quarters). We're trying to peep at the inner workings - from my understanding, LTMLE provides propensity scores, allowing us to calculate the joint probabilities for sequences of treatments, such as P(1,1,1,1) and P(0,0,0,0). Our intent is to compare always treated vs never treated with and without balancing with cumulative P(treatment).

We believe cum.g to be the object we are after.

However I don't know how to 'map' this object back to the original dataset. I assume the Nth row corresponds to the Nth row in the original dataset. I also assume that, given that

abar = list(treatment = c(1, 0, 1, 0), control = c(0, 0, 0, 0) )

then

treatment_cum.g <- LTMLE[["cum.g"]][,,1]

and

control_cum.g <- LTMLE[["cum.g"]][,,2]

I am looking for confirmation on those. Lastly, the columns -- both of the above slices of cum.g have 8 columns, 1 for each A and C node, as per the documentation. But they're labeled V1-V8, and I don't know to which they each correspond.

LTMLE calls:

LTMLE_mod_1010x <- ltmle(Surv_No, Anodes = Anodes, Cnodes = Cnodes, Lnodes = Lnodes, Ynodes = Ynodes, survivalOutcome = FALSE, abar = list(treatment = c(1, 0, 1, 0), control = c(0, 0, 0, 0) ), SL.library = list(Q = NULL, g = SL.libx), estimate.time = FALSE)

and so on for all combinations of 1's and 0's      

gah-bo commented 11 months ago

Is this the proper way to replicate the cum.b matrices?

library(ltmle)
library(SuperLearner)
library(randomForest)

set.seed(1234) 
n = 1000
periods=3

# Generate variables for L, A, and C nodes
# code for generating data is from LTMLE manual, last pages
rexpit <- function(x) rbinom(n = length(x), size = 1, prob = plogis(x))
W <- rnorm(n)

A1 <- rexpit(W + .5)
A2 <- rexpit(W + 2 * A1)
A3 <- rexpit(2 * A1 - A2)

C1 <- rbinom(n, 1, 0.1)
C2 <- pmax(C1, rbinom(n, 1, 0.3))
C3 <- pmax(C2, rbinom(n, 1, 0.9))

Y <- rexpit(W - A1 + 0.5 * A2 + 2 * A3)

data <- data.frame(W, 
                   A1, A2, A3, 
                   Y)

# write.csv(data, "Y:\\personalStaffFolders\\Gabriel\\data.csv")

Anodes <- paste0("A", 1:periods)
Cnodes <- paste0("C", 1:periods)
Lnodes <- c("W")   
Ynodes <- c("Y") 

# Control plan: set A1-A3 to 0
control <- matrix(0, nrow=n, ncol=periods)
# DYNAMIC Treatment plan (from LTMLE manual):           set A1 to 1 if W > 0, set A2 to 1 if W > 1.5, always set A3 to 1 
treatment_dynamic <- cbind(W > 0, W > 1.5, 1)
# FIXED   Treatment plan ( P(A1|W) replicable with logit): set A1-A3 to 1
treatment_fixed <- matrix(1, nrow=n, ncol=periods)

ltmleMODEL <- ltmle(data = data, 
                Anodes = Anodes,
                Lnodes = Lnodes,
                Ynodes = Ynodes,
                survivalOutcome = FALSE, 
                abar = list(treatment_fixed,control) # if swap to treatment_dynamic, phat1 will NOT equal cum_g_treatment V1 column
                )

# Take cum.g matrices as dataframes
cum_g_treatment <- as.data.frame(ltmleMODEL$cum.g[,,1])
cum_g_control <- as.data.frame(ltmleMODEL$cum.g[,,2])

# Estimate P(A1 = 1 | W)
# data$A11 <- ifelse(data$W > 0, 1, 0)
cum_g_treatment$phat1 <- predict(glm(A1 ~ W, family = binomial(link="logit"), data = data), type = "response")

# Estimate P(A1 = 1 & A2 = 1 |W)
data$A12 <- ifelse(data$A1 == 1 & data$A2 == 1, 1, 0)
# data$A12 <- ifelse(data$A11 == 1 & data$W > 1.5, 1, 0)
cum_g_treatment$phat2 <- predict(glm(A12 ~ W, family = binomial(link="logit"), data = data), type = "response")

# Estimate P(A1 = 1 & A2 = 1 & A3 = 1 |W)
data$A13 <- ifelse(data$A12 == 1 & data$A3 == 1, 1, 0)
cum_g_treatment$phat3 <- predict(glm(A13 ~ W, family = binomial(link="logit"), data = data), type = "response")

# Should be 0 or close
cum_g_treatment$diff1 <- round((cum_g_treatment$V1 - cum_g_treatment$phat1), digits=3)
cum_g_treatment$diff2 <- round((cum_g_treatment$V2 - cum_g_treatment$phat2), digits=3)
cum_g_treatment$diff3 <- round((cum_g_treatment$V3 - cum_g_treatment$phat3), digits=3)

cum_g_treatment$W <- abs(data$W)

cum_g_treatment <- cum_g_treatment[, c("W",
                                       "V1", "phat1", "diff1",
                                       "V2", "phat2", "diff2",
                                       "V3", "phat3", "diff3")]

##### STOP
# 1. Open cum_g_treatment
# 2. note that V1=phat1, v2 is approx phat 2, and v3 is approx phat3...
# 3. BUT sort diff3, desc -- note it can be up to -23 percentage points
# ... also, sorting by W, desc is super similar than sorting by diff3, at least for the first dozens of rows
# ... 

# these equalities do not hold if we use treatment_dynamic 
# 4. Moreover, note that when using treatment_fixed, this sums to 1's...
# (cum_g_treatment$V1 + cum_g_control$V1)
# ... but this does NOT
# round((cum_g_treatment$V2 + cum_g_control$V2), digits=1)

colMeans(cum_g_treatment)
# This extends on cum.g test, NO censoring.rmd; **which must be run first to populate global env!!!**
data_CENSORING <- data.frame(W, 
                             C1, C2, C3,
                             A1, A2, A3, 
                             Y)

ltmleMODEL_CENSORING <- ltmle(data = data_CENSORING, 
                              Anodes = Anodes,
                              Cnodes = Cnodes,
                              Lnodes = Lnodes,
                              Ynodes = Ynodes,
                              survivalOutcome = FALSE,
                              abar = list(treatment_fixed,control) # if swap to treatment_dynamic, phat1 will NOT equal cum_g_treatment_CENSORING V1 column
                )

# Take cum.g matrices as dataframes
cum_g_treatment_CENSORING <- as.data.frame(ltmleMODEL_CENSORING$cum.g[,,1])
cum_g_control_CENSORING <- as.data.frame(ltmleMODEL_CENSORING$cum.g[,,2])

# Estimating C's with same logic as A's in rmd with NO censoring...
# Estimate P(C1 = 1 | W)
cum_g_treatment_CENSORING$phat1 <- predict(glm(C1 ~ W, family = binomial(link="logit"), data = data_CENSORING), type = "response")

# Estimate P(C1 = 1 & C2 = 1 |W)
data_CENSORING$C12 <- ifelse(data_CENSORING$C1 == 1 & data_CENSORING$C2 == 1, 1, 0)
cum_g_treatment_CENSORING$phat2 <- predict(glm(C12 ~ W, family = binomial(link="logit"), data = data_CENSORING), type = "response")

# Estimate P(C1 = 1 & C2 = 1 & C3 = 1 |W)
data_CENSORING$C13 <- ifelse(data_CENSORING$C12 == 1 & data_CENSORING$C3 == 1, 1, 0)
cum_g_treatment_CENSORING$phat3 <- predict(glm(C13 ~ W, family = binomial(link="logit"), data = data_CENSORING), type = "response")

# Estimate P(A1 = 1 | W)
data_CENSORING$A11 <- ifelse(data_CENSORING$A1 == 1 & data_CENSORING$C1 == 1, 1, 0)
# data_CENSORING$A11 <- ifelse(data_CENSORING$W > 0, 1, 0)
cum_g_treatment_CENSORING$phat4 <- predict(glm(A11 ~ W, family = binomial(link="logit"), data = data_CENSORING), type = "response")

# Estimate P(A1 = 1 & A2 = 1 |W)
data_CENSORING$A12 <- ifelse(data_CENSORING$A11 == 1 & data_CENSORING$A2 == 1 & data_CENSORING$C2 == 1, 1, 0)
# data_CENSORING$A12 <- ifelse(data_CENSORING$A11 == 1 & data_CENSORING$W > 1.5, 1, 0)
cum_g_treatment_CENSORING$phat5 <- predict(glm(A12 ~ W, family = binomial(link="logit"), data = data_CENSORING), type = "response")

# Estimate P(A1 = 1 & A2 = 1 & A3 = 1 |W)
data_CENSORING$A13 <- ifelse(data_CENSORING$A12 == 1 & data_CENSORING$A3 == 1 & data_CENSORING$C3 == 1, 1, 0)
cum_g_treatment_CENSORING$phat6 <- predict(glm(A13 ~ W, family = binomial(link="logit"), data = data_CENSORING), type = "response")

cum_g_treatment_CENSORING$diff1 <- round((cum_g_treatment_CENSORING$V1 - cum_g_treatment_CENSORING$phat1), digits=3)
cum_g_treatment_CENSORING$diff2 <- round((cum_g_treatment_CENSORING$V2 - cum_g_treatment_CENSORING$phat2), digits=3)
cum_g_treatment_CENSORING$diff3 <- round((cum_g_treatment_CENSORING$V3 - cum_g_treatment_CENSORING$phat3), digits=3)
cum_g_treatment_CENSORING$diff4 <- round((cum_g_treatment_CENSORING$V4 - cum_g_treatment_CENSORING$phat4), digits=3)
cum_g_treatment_CENSORING$diff5 <- round((cum_g_treatment_CENSORING$V5 - cum_g_treatment_CENSORING$phat5), digits=3)
cum_g_treatment_CENSORING$diff6 <- round((cum_g_treatment_CENSORING$V6 - cum_g_treatment_CENSORING$phat6), digits=3)

cum_g_treatment_CENSORING$W <- abs(data_CENSORING$W)

cum_g_treatment_CENSORING <- cum_g_treatment_CENSORING[, c("W",
                                                           "V1", "phat1", "diff1",
                                                           "V2", "phat2", "diff2",
                                                           "V3", "phat3", "diff3",
                                                           "V4", "phat4", "diff4",
                                                           "V5", "phat5", "diff5",
                                                           "V6", "phat6", "diff6")]

##### STOP
# 1. Open cum_g_treatment_CENSORING
# 2. note that for t=(1,2,3) (ie: Censoring nodes) the following hold:...
# ...V(t)=phat(t)...
# ...V1=V2=V3=phat1=phat2=phat3...
# ...row values in cum_g_treatment_CENSORING = cum_g_control_CENSORING
# 3. As for t=(4,5,6) V(t) approx phat(t), but we have to include Censoring in defining A11, A12, and A13

colMeans(cum_g_treatment_CENSORING)