Mapping cum.g to original dataset

Is this the proper way to replicate the cum.b matrices?

library(ltmle)
library(SuperLearner)
library(randomForest)

set.seed(1234) 
n = 1000
periods=3

# Generate variables for L, A, and C nodes
# code for generating data is from LTMLE manual, last pages
rexpit <- function(x) rbinom(n = length(x), size = 1, prob = plogis(x))
W <- rnorm(n)

A1 <- rexpit(W + .5)
A2 <- rexpit(W + 2 * A1)
A3 <- rexpit(2 * A1 - A2)

C1 <- rbinom(n, 1, 0.1)
C2 <- pmax(C1, rbinom(n, 1, 0.3))
C3 <- pmax(C2, rbinom(n, 1, 0.9))

Y <- rexpit(W - A1 + 0.5 * A2 + 2 * A3)

data <- data.frame(W, 
                   A1, A2, A3, 
                   Y)

# write.csv(data, "Y:\\personalStaffFolders\\Gabriel\\data.csv")

Anodes <- paste0("A", 1:periods)
Cnodes <- paste0("C", 1:periods)
Lnodes <- c("W")   
Ynodes <- c("Y") 

# Control plan: set A1-A3 to 0
control <- matrix(0, nrow=n, ncol=periods)
# DYNAMIC Treatment plan (from LTMLE manual):           set A1 to 1 if W > 0, set A2 to 1 if W > 1.5, always set A3 to 1 
treatment_dynamic <- cbind(W > 0, W > 1.5, 1)
# FIXED   Treatment plan ( P(A1|W) replicable with logit): set A1-A3 to 1
treatment_fixed <- matrix(1, nrow=n, ncol=periods)

ltmleMODEL <- ltmle(data = data, 
                Anodes = Anodes,
                Lnodes = Lnodes,
                Ynodes = Ynodes,
                survivalOutcome = FALSE, 
                abar = list(treatment_fixed,control) # if swap to treatment_dynamic, phat1 will NOT equal cum_g_treatment V1 column
                )

# Take cum.g matrices as dataframes
cum_g_treatment <- as.data.frame(ltmleMODEL$cum.g[,,1])
cum_g_control <- as.data.frame(ltmleMODEL$cum.g[,,2])

# Estimate P(A1 = 1 | W)
# data$A11 <- ifelse(data$W > 0, 1, 0)
cum_g_treatment$phat1 <- predict(glm(A1 ~ W, family = binomial(link="logit"), data = data), type = "response")

# Estimate P(A1 = 1 & A2 = 1 |W)
data$A12 <- ifelse(data$A1 == 1 & data$A2 == 1, 1, 0)
# data$A12 <- ifelse(data$A11 == 1 & data$W > 1.5, 1, 0)
cum_g_treatment$phat2 <- predict(glm(A12 ~ W, family = binomial(link="logit"), data = data), type = "response")

# Estimate P(A1 = 1 & A2 = 1 & A3 = 1 |W)
data$A13 <- ifelse(data$A12 == 1 & data$A3 == 1, 1, 0)
cum_g_treatment$phat3 <- predict(glm(A13 ~ W, family = binomial(link="logit"), data = data), type = "response")

# Should be 0 or close
cum_g_treatment$diff1 <- round((cum_g_treatment$V1 - cum_g_treatment$phat1), digits=3)
cum_g_treatment$diff2 <- round((cum_g_treatment$V2 - cum_g_treatment$phat2), digits=3)
cum_g_treatment$diff3 <- round((cum_g_treatment$V3 - cum_g_treatment$phat3), digits=3)

cum_g_treatment$W <- abs(data$W)

cum_g_treatment <- cum_g_treatment[, c("W",
                                       "V1", "phat1", "diff1",
                                       "V2", "phat2", "diff2",
                                       "V3", "phat3", "diff3")]

##### STOP
# 1. Open cum_g_treatment
# 2. note that V1=phat1, v2 is approx phat 2, and v3 is approx phat3...
# 3. BUT sort diff3, desc -- note it can be up to -23 percentage points
# ... also, sorting by W, desc is super similar than sorting by diff3, at least for the first dozens of rows
# ... 

# these equalities do not hold if we use treatment_dynamic 
# 4. Moreover, note that when using treatment_fixed, this sums to 1's...
# (cum_g_treatment$V1 + cum_g_control$V1)
# ... but this does NOT
# round((cum_g_treatment$V2 + cum_g_control$V2), digits=1)

colMeans(cum_g_treatment)

# This extends on cum.g test, NO censoring.rmd; **which must be run first to populate global env!!!**
data_CENSORING <- data.frame(W, 
                             C1, C2, C3,
                             A1, A2, A3, 
                             Y)

ltmleMODEL_CENSORING <- ltmle(data = data_CENSORING, 
                              Anodes = Anodes,
                              Cnodes = Cnodes,
                              Lnodes = Lnodes,
                              Ynodes = Ynodes,
                              survivalOutcome = FALSE,
                              abar = list(treatment_fixed,control) # if swap to treatment_dynamic, phat1 will NOT equal cum_g_treatment_CENSORING V1 column
                )

# Take cum.g matrices as dataframes
cum_g_treatment_CENSORING <- as.data.frame(ltmleMODEL_CENSORING$cum.g[,,1])
cum_g_control_CENSORING <- as.data.frame(ltmleMODEL_CENSORING$cum.g[,,2])

# Estimating C's with same logic as A's in rmd with NO censoring...
# Estimate P(C1 = 1 | W)
cum_g_treatment_CENSORING$phat1 <- predict(glm(C1 ~ W, family = binomial(link="logit"), data = data_CENSORING), type = "response")

# Estimate P(C1 = 1 & C2 = 1 |W)
data_CENSORING$C12 <- ifelse(data_CENSORING$C1 == 1 & data_CENSORING$C2 == 1, 1, 0)
cum_g_treatment_CENSORING$phat2 <- predict(glm(C12 ~ W, family = binomial(link="logit"), data = data_CENSORING), type = "response")

# Estimate P(C1 = 1 & C2 = 1 & C3 = 1 |W)
data_CENSORING$C13 <- ifelse(data_CENSORING$C12 == 1 & data_CENSORING$C3 == 1, 1, 0)
cum_g_treatment_CENSORING$phat3 <- predict(glm(C13 ~ W, family = binomial(link="logit"), data = data_CENSORING), type = "response")

# Estimate P(A1 = 1 | W)
data_CENSORING$A11 <- ifelse(data_CENSORING$A1 == 1 & data_CENSORING$C1 == 1, 1, 0)
# data_CENSORING$A11 <- ifelse(data_CENSORING$W > 0, 1, 0)
cum_g_treatment_CENSORING$phat4 <- predict(glm(A11 ~ W, family = binomial(link="logit"), data = data_CENSORING), type = "response")

# Estimate P(A1 = 1 & A2 = 1 |W)
data_CENSORING$A12 <- ifelse(data_CENSORING$A11 == 1 & data_CENSORING$A2 == 1 & data_CENSORING$C2 == 1, 1, 0)
# data_CENSORING$A12 <- ifelse(data_CENSORING$A11 == 1 & data_CENSORING$W > 1.5, 1, 0)
cum_g_treatment_CENSORING$phat5 <- predict(glm(A12 ~ W, family = binomial(link="logit"), data = data_CENSORING), type = "response")

# Estimate P(A1 = 1 & A2 = 1 & A3 = 1 |W)
data_CENSORING$A13 <- ifelse(data_CENSORING$A12 == 1 & data_CENSORING$A3 == 1 & data_CENSORING$C3 == 1, 1, 0)
cum_g_treatment_CENSORING$phat6 <- predict(glm(A13 ~ W, family = binomial(link="logit"), data = data_CENSORING), type = "response")

cum_g_treatment_CENSORING$diff1 <- round((cum_g_treatment_CENSORING$V1 - cum_g_treatment_CENSORING$phat1), digits=3)
cum_g_treatment_CENSORING$diff2 <- round((cum_g_treatment_CENSORING$V2 - cum_g_treatment_CENSORING$phat2), digits=3)
cum_g_treatment_CENSORING$diff3 <- round((cum_g_treatment_CENSORING$V3 - cum_g_treatment_CENSORING$phat3), digits=3)
cum_g_treatment_CENSORING$diff4 <- round((cum_g_treatment_CENSORING$V4 - cum_g_treatment_CENSORING$phat4), digits=3)
cum_g_treatment_CENSORING$diff5 <- round((cum_g_treatment_CENSORING$V5 - cum_g_treatment_CENSORING$phat5), digits=3)
cum_g_treatment_CENSORING$diff6 <- round((cum_g_treatment_CENSORING$V6 - cum_g_treatment_CENSORING$phat6), digits=3)

cum_g_treatment_CENSORING$W <- abs(data_CENSORING$W)

cum_g_treatment_CENSORING <- cum_g_treatment_CENSORING[, c("W",
                                                           "V1", "phat1", "diff1",
                                                           "V2", "phat2", "diff2",
                                                           "V3", "phat3", "diff3",
                                                           "V4", "phat4", "diff4",
                                                           "V5", "phat5", "diff5",
                                                           "V6", "phat6", "diff6")]

##### STOP
# 1. Open cum_g_treatment_CENSORING
# 2. note that for t=(1,2,3) (ie: Censoring nodes) the following hold:...
# ...V(t)=phat(t)...
# ...V1=V2=V3=phat1=phat2=phat3...
# ...row values in cum_g_treatment_CENSORING = cum_g_control_CENSORING
# 3. As for t=(4,5,6) V(t) approx phat(t), but we have to include Censoring in defining A11, A12, and A13

colMeans(cum_g_treatment_CENSORING)

joshuaschwab / ltmle

Mapping cum.g to original dataset #23