mshafieek / ADS-Missing-data-social-network

ADS master thesis
MIT License
0 stars 1 forks source link

Results error - Interpolation of 'time' and mismatch between risk set and 'replacement' dataframe #21

Closed dominic-comerford closed 5 months ago

dominic-comerford commented 5 months ago
##### Load packages
packages_to_install <- c("purrr", "furrr", "magrittr", "dplyr", 
                         "tibble", "data.table", "survival", "tidyverse", 
                         "devtools", "reprex", "styler", "igraph", "RColorBrewer",
                         "visNetwork", "relevent", "sna", "zoo", "imputeTS")

for (pkg in packages_to_install) {
  if (!require(pkg, character.only = TRUE)) {
    # If not, install the package
    install.packages(pkg)
  }
}

library(purrr, warn.conflicts = FALSE)    # for functional programming
library(furrr, warn.conflicts = FALSE)    # for functional futures
library(magrittr, warn.conflicts = FALSE) # for pipes
library(dplyr, warn.conflicts = FALSE)    # for data manipulation
library(tibble, warn.conflicts = FALSE)     # for tibbles
library(data.table, warn.conflicts = FALSE)   
library(survival, warn.conflicts = FALSE) # for REM analysis
library(tidyverse, warn.conflicts = FALSE)  
library(tidyr, warn.conflicts = FALSE)
library(reprex, warn.conflicts = FALSE)   # reproducible code
library(styler, warn.conflicts = FALSE)
library(igraph, warn.conflicts = FALSE)
library(RColorBrewer, warn.conflicts = FALSE)  
library(visNetwork, warn.conflicts = FALSE)
library(relevent, warn.conflicts = FALSE)  
library(sna, warn.conflicts = FALSE)
library(zoo, warn.conflicts = FALSE)
library(imputeTS, warn.conflicts = FALSE)

devtools::install_github("TilburgNetworkGroup/remify")
devtools::install_github("gerkovink/mice@match_conditional_current") 
devtools::install_github("TilburgNetworkGroup/remstats")

library(mice, warn.conflicts = FALSE)     # for imputation and amputation
library(remstats, warn.conflicts = FALSE) # for REM statistics
library(remify, warn.conflicts = FALSE)   # for converting  

##### Load data
con <- url("https://github.com/mshafieek/ADS-Missing-data-social-network/raw/main/literature_%20REM/Tutorial_REM_REH_DATA/UUsummerschool.Rdata")
load(con)
apollo <- PartOfApollo_13 %>% 
  rename(
    actor1 = sender,
    actor2 = receiver
  )
rm(Class, PartOfApollo_13, Twitter_data_rem3, WTCPoliceCalls, ClassIntercept, 
   ClassIsFemale, ClassIsTeacher, WTCPoliceIsICR, con, pkg)

# Missingness simulations
set.seed(123) # fix seed to realize a sufficient set
apollo <- as.tibble(apollo)
indic <- sample(1:nrow(apollo), 1500)

remify(apollo[indic, ], model = "tie") %>% dim()
#> events actors  dyads 
#>   1500     16    240
#### Combine the sufficient set and the incomplete set
make_missing <- function(x, indic){
  sufficient <- x[indic, ]
  miss <- x[-c(indic), ] |>
    # prop is set to .4 to get close to .2 missingness since almost half of the dataset is used for the sufficient set
    ampute(prop = .4, 
           mech = "MCAR",
           patterns = matrix(c(1,1,0, # missing in actor 2
                               1,0,1, # missing in actor 1
                               0,1,1, # missing in time
                               0,0,0, # missing in all
                               1,0,0, # missing in actor 1 + 2
                               0,1,0, # missing in time + actor 2
                               0,0,1  # missing in time + actor 1
                               ),
                             nrow=7,  # change to number of missingness patterns above
                             byrow=TRUE)) %>% 
    .$amp
  combined <- rbind(sufficient, miss)
  return(combined[order(combined$time), ]) # sort it all like apollo
}
whichcol <- c("", "actor2", "actor1") # Ensure that actor 1 != actor 2 in imputations
names(whichcol) <- colnames(apollo)

## predictor matrix
pred <- make.predictorMatrix(apollo)
#pred[ ,"time"] <- 0 # using time for imputation creates loops
## use the pmm.conditional method
method <- make.method(apollo)
method[1] <- "norm" # imputation of time correct? Something Like Last Obs Carried Forward?
method[c(2,3)] <- "pmm.conditional"
## post squeeze time between min and max time
#post <- make.post(apollo)
#min(apollo$time)
#max(apollo$time)
#post["time"] <- "imp[[j]][, i] <- squeeze(imp[[j]][, i], c(11849.2, 50014.8))"

## Interpolate 'time' before MICE
set.seed(123)
mbased_finite_apollo <-
  furrr::future_map(1:10, ~ {   # start of with 10 simulations for testing
    make_missing(apollo, indic) }, .options = furrr_options(seed = 123))

# Function to randomly create missingness in 'time' column as does not seem to remove in make_missing function?
create_miss_time <- function(df) {
  # Randomly select indices to replace with NA
  na_indices <- sample(1:nrow(df), size = round(0.2 * nrow(df)))  # Adjust percentage
  # Replace values at selected indices with NA
  df$time[na_indices] <- NA
  return(df)
}

# Iterate over the tibbles in mbased_finite_apollo
for (i in 1:length(mbased_finite_apollo)) {
  # Randomly create missingness in the 'time' column
  mbased_finite_apollo[[i]] <- create_miss_time(mbased_finite_apollo[[i]])
}

##Impute time with interpolation (single imputation)
for (i in 1:length(mbased_finite_apollo)) {
  # Impute missing values in the 'time' column using na_interpolation
  mbased_finite_apollo[[i]]$time <- na_interpolation(mbased_finite_apollo[[i]]$time)
}

##Impute sender and receiver through MICE
##### Model-based finite populations
# Function for a single tibble
apply_mice_to_tibble <- function(tibble) {
  # Apply mice to the tibble
  imp <- mice(tibble,
                       m = 5,
                       maxit = 5,
                       method = method,
                       whichcolumn = whichcol,
                       predictorMatrix = pred,
                       print = FALSE)
  # Return the imputed tibble
  return(imp)
}
# Apply mice to each tibble in mbased_finite_apollo
mbased_finite_apollo <- lapply(mbased_finite_apollo, apply_mice_to_tibble)

##### Defining effects for the relational event model
effects <- ~ -1 + reciprocity(scaling = ("std")) + indegreeSender() + outdegreeReceiver()

##### Function to get the statistics of the previously defined effects.
stats_function <- function(data) {
  # remify the data
  reh <- remify::remify(edgelist = data, model = "tie")
  # calculate effect statistics
  statsObject_imp <- remstats(reh = reh, tie_effects = effects)
  # Return the statistics
  return(statsObject_imp)
}

##### Function for making the data compatible with coxph()
prepare_coxph_data <- function(statsObject, apollo) {
  risk_sets <- attr(statsObject, "riskset")
  risk_sets <- risk_sets %>% select(-'id')

  # Get the times
  time <- apollo$time

  # merge riskset with each timepoint
  combined <- merge(risk_sets, time, by = NULL)

  combined <- combined %>% rename("time" = "y")
  combined <- lapply(combined, as.numeric)
  combined <- as.data.frame(combined)

  # Create matrices for subtraction to make a status column for coxph
  combined_matrix <- data.matrix(combined)
  matrix_rows <- nrow(combined)

  repeated_df <- apollo[rep(seq_len(nrow(apollo)), each = 240), ] 
  repeated_df <- repeated_df[, c(2,3,1)]
  apollo_matrix <- data.matrix(repeated_df)

  status_matrix <- apollo_matrix - combined_matrix

  # create a status column
  status <- as.integer(rowSums(status_matrix == 0) == ncol(status_matrix))
  status <- as.data.frame(status)

  # Add status to the combined set
  combined <- cbind(combined, status)

  # Extract statistics and add them to the dataframe
  reciprocity <- statsObject[,,1]
  indegreeSender <- statsObject[,,2]
  outdegreeReceiver <- statsObject[,,3]

  combined$reciprocity <- c(reciprocity)
  combined$indegreeSender <- c(indegreeSender)
  combined$outdegreeReceiver <- c(outdegreeReceiver)

  return(combined)
}

###### TRUE ANALYSIS
true.reh <- remify(edgelist = apollo, 
                   model = "tie")
# calculate stats
stats <- remstats(tie_effects = effects, 
                  reh = true.reh)
# use the function to create the correct format of the dataframe
true.cox.set <- prepare_coxph_data(stats, apollo)
# fit cox model 
true.cox.fit <- coxph(Surv(time, status) ~ reciprocity + indegreeSender + 
                        outdegreeReceiver, 
                      data=true.cox.set)
true <- coefficients(true.cox.fit)

true.cox.fit
#> Call:
#> coxph(formula = Surv(time, status) ~ reciprocity + indegreeSender + 
#>     outdegreeReceiver, data = true.cox.set)
#> 
#>                         coef  exp(coef)   se(coef)      z        p
#> reciprocity        2.357e-02  1.024e+00  1.858e-02  1.269    0.204
#> indegreeSender     4.314e-04  1.000e+00  7.400e-05  5.830 5.55e-09
#> outdegreeReceiver -9.115e-05  9.999e-01  7.437e-05 -1.226    0.220
#> 
#> Likelihood ratio test=41.29  on 3 df, p=5.673e-09
#> n= 931680, number of events= 3882

###### Running the REM on  simulations
Results1 <- 
  mbased_finite_apollo[1:2] %>% # just test first 2 elements
  map(~.x %>% # for every simulation
        complete("all") %>% 
        map(~.x %>% # for every imputation
              stats_function() %>% # do stats function
              prepare_coxph_data(apollo = apollo) %$% # prepare cox ph
              coxph(Surv(time, status) ~ 
                      reciprocity + 
                      indegreeSender + 
                      outdegreeReceiver)) %>%
        pool(custom.t = ".data$b + .data$b / .data$m") %>% # pool coefficients
        .$pooled %>% # extract table of pooled coefficients
          mutate(true = true, # add true
                 df = m-1,  # correct df
                 riv = Inf, # correct riv
                 std.error = sqrt(t), # standard error
                 statistic = estimate / std.error, # test statistic
                 p.value = 2 * (pt(abs(statistic), 
                                   pmax(df, 0.001), 
                                   lower.tail = FALSE)), # correct p.value
                 `2.5 %` = estimate - qt(.975, df) * std.error, # lower bound CI
                 `97.5 %` = estimate + qt(.975, df) * std.error, # upper bound CI
                 cov = `2.5 %` < true & true < `97.5 %`, # coverage
                 bias = estimate - true) %>% # bias
          select(term, m, true, estimate, std.error, statistic, p.value, 
                 riv, `2.5 %`, `97.5 %`, cov, bias) %>% 
          column_to_rownames("term") # `term` as rownames
    )
#> Error in `map()`:
#> ℹ In index: 1.
#> Caused by error in `map()`:
#> ℹ In index: 1.
#> ℹ With name: 1.
#> Caused by error in `$<-.data.frame`:
#> ! replacement has 799920 rows, data has 931680

Created on 2024-05-20 with reprex v2.1.0

dominic-comerford commented 5 months ago

1) When amputing ‘time’ in the make_missing function – by adding extra patterns (to bring the total to 7 patterns) - it does not seem to ampute time at all based on the resulting list.

2) I then add missingness in time using the create_miss_time function and subsequently interpolate the imputations for time in a next step. I use MICE to multiply impute the sender and receiver columns. Resulting in a list with MIDS objects, where time is of course not multiply imputed in (as this was done with interpolation).

3) When running the model for the results I get the following error. It seems that the risk set with 931680 rows, which is (the number of timepoints x N x N-1) does not match the 799920 rows in the ‘replacement’ dataframe. I don’t understand this error, and tried excluding patterns (mentioned at 1) that have time missing, as time gets excluded later anyway. But still the problem persists, albeit it with a different mismatch than it has now.