mshafieek / ADS-Missing-data-social-network

ADS master thesis
MIT License
0 stars 1 forks source link

Not enough risk sets for the MAR data #10

Closed JesseJvdw closed 1 year ago

JesseJvdw commented 1 year ago

Not sure if it was the intention to do both MCAR and MAR but with MAR I've found an issue where when it's calculating the remstats for the MAR imputation it only gives me 210 risk sets instead of 240. So I can't combine the statistics with the coxph data because the rows are not equal. I've checked the completed MAR set and the contain the 19 actors so it's weird that it only gives 210 risk sets.

The code regarding this is all the way at the bottom of my rmd.

EDIT : I see that when I do multiple simulations it also happens for MCAR. So for some imputations it creates only 210 risk sets instead of 240.

gerkovink commented 1 year ago

Can you give me small reprex here that illustrates the problem for MCAR? Then I customise a solution to your exact code.

JesseJvdw commented 1 year ago

So much for "short" but here is the smallest runnable code.

library(mice)       # for imputation and amputation
#> 
#> Attaching package: 'mice'
#> The following object is masked from 'package:stats':
#> 
#>     filter
#> The following objects are masked from 'package:base':
#> 
#>     cbind, rbind
library(purrr)      # for functional programming
#> Warning: package 'purrr' was built under R version 4.1.3
library(furrr)      # for functional futures
#> Warning: package 'furrr' was built under R version 4.1.3
#> Loading required package: future
#> Warning: package 'future' was built under R version 4.1.3
library(magrittr)   # for pipes
#> Warning: package 'magrittr' was built under R version 4.1.3
#> 
#> Attaching package: 'magrittr'
#> The following object is masked from 'package:purrr':
#> 
#>     set_names
library(dplyr)      # for data manipulation
#> Warning: package 'dplyr' was built under R version 4.1.3
#> 
#> Attaching package: 'dplyr'
#> The following objects are masked from 'package:stats':
#> 
#>     filter, lag
#> The following objects are masked from 'package:base':
#> 
#>     intersect, setdiff, setequal, union
library(tibble)     # for tibbles
#> Warning: package 'tibble' was built under R version 4.1.3
library(remstats)   # for REM statistics
library(remify)     # for converting  
library(data.table) 
#> Warning: package 'data.table' was built under R version 4.1.3
#> 
#> Attaching package: 'data.table'
#> The following objects are masked from 'package:dplyr':
#> 
#>     between, first, last
#> The following object is masked from 'package:purrr':
#> 
#>     transpose
library(survival)   # for REM analysis
#> Warning: package 'survival' was built under R version 4.1.3
#> 
#> Attaching package: 'survival'
#> The following object is masked from 'package:future':
#> 
#>     cluster
library(tidyverse)
#> Warning: package 'tidyverse' was built under R version 4.1.3
#> Warning: package 'ggplot2' was built under R version 4.1.3
#> Warning: package 'tidyr' was built under R version 4.1.3
#> Warning: package 'readr' was built under R version 4.1.3
#> Warning: package 'stringr' was built under R version 4.1.3
#> Warning: package 'forcats' was built under R version 4.1.3
library(datapasta)
#> Warning: package 'datapasta' was built under R version 4.1.3

set.seed(123)

##### Load data

load("UUsummerschool.rdata")

apollo <- as_tibble(PartOfApollo_13)

##### renaming columns to work with remify

setnames(apollo, old = c('sender','receiver'), 
         new = c('actor1','actor2'))

whichcol <- c("", "actor2", "actor1")
names(whichcol) <- colnames(apollo)

# use the custom pmm method
method <- make.method(apollo)
method[c(2,3)] <- "pmm.conditional"

##### Creating missing data

##### Missing pattern
pattern <- matrix(c(1,0,1,1,1,0), nrow=2, byrow=TRUE)

##### predictor matrix
predictormatrix <- matrix(c(0,0,0,0,0,1,0,1,0), nrow=3, byrow=TRUE)

##### Model-based finite populations
mbased_finite_apollo <- list( 
  MCAR = furrr::future_map(1:5, ~ {
    apollo %>% 
      ampute(prop = .5, 
             mech = "MCAR",
             patterns = pattern) %>% .$amp %>%
      mice(m = 5, 
           maxit = 5,
           method = method,
           whichcolumn = whichcol,
           predictorMatrix = predictormatrix,
           print = FALSE)
  }, .options = furrr_options(seed = 123)))

#### 
effects <- ~ -1 + outdegreeReceiver() + indegreeSender() + reciprocity()
rehObject <- remify(apollo, model = "tie")

statsObject <- remstats(rehObject, tie_effects = effects)

### get the risk sets
risk_sets <- attr(statsObject, "riskset")
risk_sets <- risk_sets %>% select(-'id')

### Get the times
time <- apollo$time

### merge riskset with each timepoint
combined <- merge(risk_sets, time, by = NULL)

combined <- combined %>% rename("time" = "y")

combined <- lapply(combined, as.numeric)

combined <- as.data.frame(combined)

### Create matrices for subtraction to make a status column for coxph
combined_matrix <- data.matrix(combined)

matrix_rows <- nrow(combined)

repeated_df <- apollo[rep(seq_len(nrow(apollo)), each = 240), ] %>% rename("sender" = "actor1", "receiver" = "actor2")

repeated_df <- repeated_df[, c(2,3,1)]

apollo_matrix <- data.matrix(repeated_df)

status_matrix <- apollo_matrix - combined_matrix

### create a status column

status <- as.integer(rowSums(status_matrix == 0) == ncol(status_matrix))

status <- as.data.frame(status)

### Add status to the combined set
combined <- cbind(combined, status) 

###### REM analysis on completed data sets of MCAR and MAR

run_coxph <- function(df) {
  rehObject <- remify(as.data.frame(df), model = "tie")
  statsObject <- remstats(rehObject, tie_effects = effects)

  reciprocity <- as.vector(statsObject[,,3])
  outdegreeReceiver <- as.vector(statsObject[,,1])
  indegreeSender <- as.vector(statsObject[,,2])

  data <- cbind(combined, reciprocity, outdegreeReceiver, indegreeSender)
}

###### running the function on all completed datasets

Results <- mbased_finite_apollo %>% 
  map(~.x %>% # for every missingness mechanism
    map(~.x %>% # for every simulated multiple imputation.... # create a list of completed data sets
          complete("all") %>%
        map(~.x %$% # for every completed data set....
              with(.x, run_coxph(.x))
            ) 
  )
)
#> Error in `map()`:
#> i In index: 1.
#> i With name: MCAR.
#> Caused by error in `map()`:
#> i In index: 3.
#> Caused by error in `map()`:
#> i In index: 1.
#> i With name: 1.
#> Caused by error in `data.frame()`:
#> ! arguments imply differing number of rows: 931680, 815220
#> Backtrace:
#>      x
#>   1. +-mbased_finite_apollo %>% ...
#>   2. +-purrr::map(...)
#>   3. | \-purrr:::map_("list", .x, .f, ..., .progress = .progress)
#>   4. |   +-purrr:::with_indexed_errors(...)
#>   5. |   | \-base::withCallingHandlers(...)
#>   6. |   +-purrr:::call_with_cleanup(...)
#>   7. |   \-.f(.x[[i]], ...)
#>   8. |     \-.x %>% ...
#>   9. +-purrr::map(...)
#>  10. | \-purrr:::map_("list", .x, .f, ..., .progress = .progress)
#>  11. |   +-purrr:::with_indexed_errors(...)
#>  12. |   | \-base::withCallingHandlers(...)
#>  13. |   +-purrr:::call_with_cleanup(...)
#>  14. |   \-.f(.x[[i]], ...)
#>  15. |     \-.x %>% complete("all") %>% ...
#>  16. +-purrr::map(., ~.x %$% with(.x, run_coxph(.x)))
#>  17. | \-purrr:::map_("list", .x, .f, ..., .progress = .progress)
#>  18. |   +-purrr:::with_indexed_errors(...)
#>  19. |   | \-base::withCallingHandlers(...)
#>  20. |   +-purrr:::call_with_cleanup(...)
#>  21. |   \-.f(.x[[i]], ...)
#>  22. |     \-.x %$% with(.x, run_coxph(.x))
#>  23. +-base::with(., with(.x, run_coxph(.x)))
#>  24. +-base::with.default(., with(.x, run_coxph(.x)))
#>  25. | \-base::eval(substitute(expr), data, enclos = parent.frame())
#>  26. |   \-base::eval(substitute(expr), data, enclos = parent.frame())
#>  27. |     +-base::with(.x, run_coxph(.x))
#>  28. |     \-base::with.default(.x, run_coxph(.x))
#>  29. |       \-base::eval(substitute(expr), data, enclos = parent.frame())
#>  30. |         \-base::eval(substitute(expr), data, enclos = parent.frame())
#>  31. |           \-global run_coxph(.x)
#>  32. |             \-mice::cbind(combined, reciprocity, outdegreeReceiver, indegreeSender)
#>  33. |               \-base::cbind(...)
#>  34. |                 \-base::cbind(deparse.level, ...)
#>  35. |                   \-base::data.frame(..., check.names = FALSE)
#>  36. |                     \-base::stop(...)
#>  37. \-base::.handleSimpleError(...)
#>  38.   \-purrr (local) h(simpleError(msg, call))
#>  39.     \-cli::cli_abort(...)
#>  40.       \-rlang::abort(...)

Created on 2023-06-01 with reprex v2.0.2

gerkovink commented 1 year ago

;-) Give me some time.

gerkovink commented 1 year ago

I believe you have used the wrong pipe operator. Changing the exposition pipe %$% to the regular pipe %>% or |> in the line where you call run_coxph(.x) should do the trick:

library(mice, warn.conflicts = FALSE)     # for imputation and amputation
library(purrr, warn.conflicts = FALSE)    # for functional programming
library(furrr, warn.conflicts = FALSE)    # for functional futures
#> Loading required package: future
library(magrittr, warn.conflicts = FALSE) # for pipes
library(dplyr, warn.conflicts = FALSE)    # for data manipulation
library(tibble, warn.conflicts = FALSE)     # for tibbles
library(remstats, warn.conflicts = FALSE) # for REM statistics
library(remify, warn.conflicts = FALSE)   # for converting  
library(data.table, warn.conflicts = FALSE)   
library(survival, warn.conflicts = FALSE) # for REM analysis
library(tidyverse, warn.conflicts = FALSE)  
library(datapasta, warn.conflicts = FALSE)  
set.seed(123)

##### Load data
con <- url("https://github.com/mshafieek/ADS-Missing-data-social-network/raw/main/literature_%20REM/Tutorial_REM_REH_DATA/UUsummerschool.Rdata")
load(con)
apollo <- as_tibble(PartOfApollo_13)

##### renaming columns to work with remify
setnames(apollo, old = c('sender','receiver'), 
         new = c('actor1','actor2'))

whichcol <- c("", "actor2", "actor1")
names(whichcol) <- colnames(apollo)

# use the custom pmm method
method <- make.method(apollo)
method[c(2,3)] <- "pmm.conditional"

##### Creating missing data
##### Missing pattern
pattern <- matrix(c(1,0,1,1,1,0), nrow=2, byrow=TRUE)

##### predictor matrix
predictormatrix <- matrix(c(0,0,0,0,0,1,0,1,0), nrow=3, byrow=TRUE)

##### Model-based finite populations
mbased_finite_apollo <- list( 
  MCAR = furrr::future_map(1:5, ~ {
    apollo %>% 
      ampute(prop = .5, 
             mech = "MCAR",
             patterns = pattern) %>% .$amp %>%
      mice(m = 5, 
           maxit = 5,
           method = method,
           whichcolumn = whichcol,
           predictorMatrix = predictormatrix,
           print = FALSE)
  }, .options = furrr_options(seed = 123)))

#### 
effects <- ~ -1 + outdegreeReceiver() + indegreeSender() + reciprocity()
rehObject <- remify(apollo, model = "tie")

statsObject <- remstats(rehObject, tie_effects = effects)

### get the risk sets
risk_sets <- attr(statsObject, "riskset")
risk_sets <- risk_sets %>% select(-'id')

### Get the times
time <- apollo$time

### merge riskset with each timepoint
combined <- merge(risk_sets, time, by = NULL)
combined <- combined %>% rename("time" = "y")
combined <- lapply(combined, as.numeric)
combined <- as.data.frame(combined)

### Create matrices for subtraction to make a status column for coxph
combined_matrix <- data.matrix(combined)
matrix_rows <- nrow(combined)
repeated_df <- apollo[rep(seq_len(nrow(apollo)), each = 240), ] %>% rename("sender" = "actor1", "receiver" = "actor2")
repeated_df <- repeated_df[, c(2,3,1)]
apollo_matrix <- data.matrix(repeated_df)
status_matrix <- apollo_matrix - combined_matrix

### create a status column
status <- as.integer(rowSums(status_matrix == 0) == ncol(status_matrix))
status <- as.data.frame(status)

### Add status to the combined set
combined <- cbind(combined, status) 

###### REM analysis on completed data sets of MCAR and MAR
run_coxph <- function(df) {
  rehObject <- remify(as.data.frame(df), model = "tie")
  statsObject <- remstats(rehObject, tie_effects = effects)

  reciprocity <- as.vector(statsObject[,,3])
  outdegreeReceiver <- as.vector(statsObject[,,1])
  indegreeSender <- as.vector(statsObject[,,2])

  data <- cbind(combined, reciprocity, outdegreeReceiver, indegreeSender)
}

###### running the function on all completed datasets
Results <- mbased_finite_apollo %>% 
  map(~.x %>% # for every missingness mechanism
        map(~.x %>% # for every simulated multiple imputation.... # create a list of completed data sets
              complete("all") %>%
              map(~.x %>% # for every completed data set....
                    with(.x, run_coxph(.x))
              ) 
        )
  )

str(Results)
#> List of 1
#>  $ MCAR:List of 5
#>   ..$ :List of 5
#>   .. ..$ 1:'data.frame': 3882 obs. of  3 variables:
#>   .. .. ..$ time  : num [1:3882] 11849 11854 11885 11890 12232 ...
#>   .. .. ..$ actor1: num [1:3882] 18 2 18 2 2 17 19 17 2 17 ...
#>   .. .. ..$ actor2: num [1:3882] 2 18 2 18 17 2 17 19 17 2 ...
#>   .. ..$ 2:'data.frame': 3882 obs. of  3 variables:
#>   .. .. ..$ time  : num [1:3882] 11849 11854 11885 11890 12232 ...
#>   .. .. ..$ actor1: num [1:3882] 18 2 18 2 2 17 19 2 2 17 ...
#>   .. .. ..$ actor2: num [1:3882] 2 18 2 18 17 2 17 19 17 2 ...
#>   .. ..$ 3:'data.frame': 3882 obs. of  3 variables:
#>   .. .. ..$ time  : num [1:3882] 11849 11854 11885 11890 12232 ...
#>   .. .. ..$ actor1: num [1:3882] 18 2 18 2 2 17 19 2 2 17 ...
#>   .. .. ..$ actor2: num [1:3882] 2 18 2 18 17 18 17 19 17 18 ...
#>   .. ..$ 4:'data.frame': 3882 obs. of  3 variables:
#>   .. .. ..$ time  : num [1:3882] 11849 11854 11885 11890 12232 ...
#>   .. .. ..$ actor1: num [1:3882] 18 2 18 2 19 17 19 18 2 17 ...
#>   .. .. ..$ actor2: num [1:3882] 2 18 2 18 17 2 17 19 17 2 ...
#>   .. ..$ 5:'data.frame': 3882 obs. of  3 variables:
#>   .. .. ..$ time  : num [1:3882] 11849 11854 11885 11890 12232 ...
#>   .. .. ..$ actor1: num [1:3882] 18 2 18 2 2 17 19 2 2 17 ...
#>   .. .. ..$ actor2: num [1:3882] 2 18 2 18 17 2 17 19 17 2 ...
#>   ..$ :List of 5
#>   .. ..$ 1:'data.frame': 3882 obs. of  3 variables:
#>   .. .. ..$ time  : num [1:3882] 11849 11854 11885 11890 12232 ...
#>   .. .. ..$ actor1: num [1:3882] 18 2 18 2 2 18 19 17 19 17 ...
#>   .. .. ..$ actor2: num [1:3882] 2 18 2 18 17 2 17 19 17 19 ...
#>   .. ..$ 2:'data.frame': 3882 obs. of  3 variables:
#>   .. .. ..$ time  : num [1:3882] 11849 11854 11885 11890 12232 ...
#>   .. .. ..$ actor1: num [1:3882] 18 19 17 2 2 17 19 17 19 17 ...
#>   .. .. ..$ actor2: num [1:3882] 17 18 2 18 17 2 17 19 17 19 ...
#>   .. ..$ 3:'data.frame': 3882 obs. of  3 variables:
#>   .. .. ..$ time  : num [1:3882] 11849 11854 11885 11890 12232 ...
#>   .. .. ..$ actor1: num [1:3882] 18 2 17 19 2 19 19 17 19 17 ...
#>   .. .. ..$ actor2: num [1:3882] 2 18 2 18 17 2 17 19 17 19 ...
#>   .. ..$ 4:'data.frame': 3882 obs. of  3 variables:
#>   .. .. ..$ time  : num [1:3882] 11849 11854 11885 11890 12232 ...
#>   .. .. ..$ actor1: num [1:3882] 18 2 17 17 2 17 19 17 19 17 ...
#>   .. .. ..$ actor2: num [1:3882] 2 18 2 18 17 2 17 19 17 19 ...
#>   .. ..$ 5:'data.frame': 3882 obs. of  3 variables:
#>   .. .. ..$ time  : num [1:3882] 11849 11854 11885 11890 12232 ...
#>   .. .. ..$ actor1: num [1:3882] 18 2 17 2 2 17 19 17 19 17 ...
#>   .. .. ..$ actor2: num [1:3882] 7 18 2 18 17 2 17 19 17 19 ...
#>   ..$ :List of 5
#>   .. ..$ 1:'data.frame': 3882 obs. of  3 variables:
#>   .. .. ..$ time  : num [1:3882] 11849 11854 11885 11890 12232 ...
#>   .. .. ..$ actor1: num [1:3882] 17 2 18 2 2 17 19 17 2 2 ...
#>   .. .. ..$ actor2: num [1:3882] 2 19 2 18 17 2 17 19 17 19 ...
#>   .. ..$ 2:'data.frame': 3882 obs. of  3 variables:
#>   .. .. ..$ time  : num [1:3882] 11849 11854 11885 11890 12232 ...
#>   .. .. ..$ actor1: num [1:3882] 19 2 18 17 2 17 2 17 2 2 ...
#>   .. .. ..$ actor2: num [1:3882] 2 17 2 18 17 2 17 19 17 19 ...
#>   .. ..$ 3:'data.frame': 3882 obs. of  3 variables:
#>   .. .. ..$ time  : num [1:3882] 11849 11854 11885 11890 12232 ...
#>   .. .. ..$ actor1: num [1:3882] 18 2 18 2 2 17 2 17 2 2 ...
#>   .. .. ..$ actor2: num [1:3882] 2 18 2 18 17 2 17 19 17 19 ...
#>   .. ..$ 4:'data.frame': 3882 obs. of  3 variables:
#>   .. .. ..$ time  : num [1:3882] 11849 11854 11885 11890 12232 ...
#>   .. .. ..$ actor1: num [1:3882] 19 2 18 2 2 17 2 17 2 2 ...
#>   .. .. ..$ actor2: num [1:3882] 2 19 2 18 17 2 17 19 17 19 ...
#>   .. ..$ 5:'data.frame': 3882 obs. of  3 variables:
#>   .. .. ..$ time  : num [1:3882] 11849 11854 11885 11890 12232 ...
#>   .. .. ..$ actor1: num [1:3882] 18 2 18 2 2 17 18 17 2 2 ...
#>   .. .. ..$ actor2: num [1:3882] 2 17 2 18 17 2 17 19 17 19 ...
#>   ..$ :List of 5
#>   .. ..$ 1:'data.frame': 3882 obs. of  3 variables:
#>   .. .. ..$ time  : num [1:3882] 11849 11854 11885 11890 12232 ...
#>   .. .. ..$ actor1: num [1:3882] 18 2 18 2 2 17 2 17 19 2 ...
#>   .. .. ..$ actor2: num [1:3882] 2 18 2 18 17 2 17 18 17 19 ...
#>   .. ..$ 2:'data.frame': 3882 obs. of  3 variables:
#>   .. .. ..$ time  : num [1:3882] 11849 11854 11885 11890 12232 ...
#>   .. .. ..$ actor1: num [1:3882] 18 2 18 2 2 17 2 17 19 2 ...
#>   .. .. ..$ actor2: num [1:3882] 2 18 2 18 17 2 17 2 17 19 ...
#>   .. ..$ 3:'data.frame': 3882 obs. of  3 variables:
#>   .. .. ..$ time  : num [1:3882] 11849 11854 11885 11890 12232 ...
#>   .. .. ..$ actor1: num [1:3882] 18 2 18 2 2 17 2 17 19 2 ...
#>   .. .. ..$ actor2: num [1:3882] 2 18 2 18 17 2 17 2 17 19 ...
#>   .. ..$ 4:'data.frame': 3882 obs. of  3 variables:
#>   .. .. ..$ time  : num [1:3882] 11849 11854 11885 11890 12232 ...
#>   .. .. ..$ actor1: num [1:3882] 18 2 18 2 2 17 2 17 19 2 ...
#>   .. .. ..$ actor2: num [1:3882] 2 18 2 18 17 19 17 2 17 19 ...
#>   .. ..$ 5:'data.frame': 3882 obs. of  3 variables:
#>   .. .. ..$ time  : num [1:3882] 11849 11854 11885 11890 12232 ...
#>   .. .. ..$ actor1: num [1:3882] 18 2 18 2 2 17 2 17 19 2 ...
#>   .. .. ..$ actor2: num [1:3882] 2 18 2 18 17 2 17 2 17 19 ...
#>   ..$ :List of 5
#>   .. ..$ 1:'data.frame': 3882 obs. of  3 variables:
#>   .. .. ..$ time  : num [1:3882] 11849 11854 11885 11890 12232 ...
#>   .. .. ..$ actor1: num [1:3882] 18 2 18 2 2 17 19 17 2 17 ...
#>   .. .. ..$ actor2: num [1:3882] 2 18 2 18 17 2 2 2 17 19 ...
#>   .. ..$ 2:'data.frame': 3882 obs. of  3 variables:
#>   .. .. ..$ time  : num [1:3882] 11849 11854 11885 11890 12232 ...
#>   .. .. ..$ actor1: num [1:3882] 19 2 18 2 2 17 19 17 2 17 ...
#>   .. .. ..$ actor2: num [1:3882] 2 18 2 18 17 2 2 2 17 19 ...
#>   .. ..$ 3:'data.frame': 3882 obs. of  3 variables:
#>   .. .. ..$ time  : num [1:3882] 11849 11854 11885 11890 12232 ...
#>   .. .. ..$ actor1: num [1:3882] 17 2 18 2 2 17 19 17 2 17 ...
#>   .. .. ..$ actor2: num [1:3882] 2 18 2 18 17 2 2 2 17 19 ...
#>   .. ..$ 4:'data.frame': 3882 obs. of  3 variables:
#>   .. .. ..$ time  : num [1:3882] 11849 11854 11885 11890 12232 ...
#>   .. .. ..$ actor1: num [1:3882] 18 2 18 2 2 17 19 17 2 17 ...
#>   .. .. ..$ actor2: num [1:3882] 2 18 2 18 17 2 2 2 17 19 ...
#>   .. ..$ 5:'data.frame': 3882 obs. of  3 variables:
#>   .. .. ..$ time  : num [1:3882] 11849 11854 11885 11890 12232 ...
#>   .. .. ..$ actor1: num [1:3882] 18 2 18 2 2 17 19 17 2 17 ...
#>   .. .. ..$ actor2: num [1:3882] 2 18 2 18 17 2 2 18 17 19 ...

sessionInfo()
#> R version 4.3.0 (2023-04-21)
#> Platform: aarch64-apple-darwin20 (64-bit)
#> Running under: macOS Ventura 13.4
#> 
#> Matrix products: default
#> BLAS:   /Library/Frameworks/R.framework/Versions/4.3-arm64/Resources/lib/libRblas.0.dylib 
#> LAPACK: /Library/Frameworks/R.framework/Versions/4.3-arm64/Resources/lib/libRlapack.dylib;  LAPACK version 3.11.0
#> 
#> locale:
#> [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
#> 
#> time zone: Europe/Amsterdam
#> tzcode source: internal
#> 
#> attached base packages:
#> [1] stats     graphics  grDevices utils     datasets  methods   base     
#> 
#> other attached packages:
#>  [1] datapasta_3.1.0   lubridate_1.9.2   forcats_1.0.0     stringr_1.5.0    
#>  [5] readr_2.1.4       tidyr_1.3.0       ggplot2_3.4.2     tidyverse_2.0.0  
#>  [9] survival_3.5-5    data.table_1.14.8 remify_3.0.0      remstats_3.1.0   
#> [13] tibble_3.2.1      dplyr_1.1.2       magrittr_2.0.3    furrr_0.3.1      
#> [17] future_1.32.0     purrr_1.0.1       mice_3.15.3      
#> 
#> loaded via a namespace (and not attached):
#>  [1] utf8_1.2.3        generics_0.1.3    stringi_1.7.12    lattice_0.21-8   
#>  [5] hms_1.1.3         listenv_0.9.0     digest_0.6.31     timechange_0.2.0 
#>  [9] evaluate_0.21     grid_4.3.0        fastmap_1.1.1     Matrix_1.5-4     
#> [13] backports_1.4.1   fansi_1.0.4       scales_1.2.1      codetools_0.2-19 
#> [17] cli_3.6.1         rlang_1.1.1       parallelly_1.36.0 munsell_0.5.0    
#> [21] splines_4.3.0     reprex_2.0.2      withr_2.5.0       yaml_2.3.7       
#> [25] tools_4.3.0       parallel_4.3.0    tzdb_0.4.0        colorspace_2.1-0 
#> [29] globals_0.16.2    broom_1.0.4       vctrs_0.6.2       R6_2.5.1         
#> [33] lifecycle_1.0.3   fs_1.6.2          pkgconfig_2.0.3   pillar_1.9.0     
#> [37] gtable_0.3.3      glue_1.6.2        Rcpp_1.0.10       xfun_0.39        
#> [41] tidyselect_1.2.0  rstudioapi_0.14   knitr_1.43        htmltools_0.5.5  
#> [45] rmarkdown_2.21    compiler_4.3.0

Created on 2023-06-01 with reprex v2.0.2

JesseJvdw commented 1 year ago

That didn't fix the problem. When I do that it doesn't run the coxph function to the data (which I left out in the reprex) Below is the code with the coxph in my own made function. When I run this it works fine right up to 1 complete data set where it gives this error: ! arguments imply differing number of rows: 931680, 815220. This happens because when the completed dataset gets fed to the remify function there are only 210 risk sets instead of the original 240. Below is the unedited which prints the results of the coxph without pooling. Maybe some actors are left out during imputation.

library(mice, warn.conflicts = FALSE)     # for imputation and amputation
library(purrr, warn.conflicts = FALSE)    # for functional programming
#> Warning: package 'purrr' was built under R version 4.1.3
library(furrr, warn.conflicts = FALSE)    # for functional futures
#> Warning: package 'furrr' was built under R version 4.1.3
#> Loading required package: future
#> Warning: package 'future' was built under R version 4.1.3
#> Loading required package: future
library(magrittr, warn.conflicts = FALSE) # for pipes
#> Warning: package 'magrittr' was built under R version 4.1.3
library(dplyr, warn.conflicts = FALSE)    # for data manipulation
#> Warning: package 'dplyr' was built under R version 4.1.3
library(tibble, warn.conflicts = FALSE)     # for tibbles
#> Warning: package 'tibble' was built under R version 4.1.3
library(remstats, warn.conflicts = FALSE) # for REM statistics
library(remify, warn.conflicts = FALSE)   # for converting  
library(data.table, warn.conflicts = FALSE)   
#> Warning: package 'data.table' was built under R version 4.1.3
library(survival, warn.conflicts = FALSE) # for REM analysis
#> Warning: package 'survival' was built under R version 4.1.3
library(tidyverse, warn.conflicts = FALSE)  
#> Warning: package 'tidyverse' was built under R version 4.1.3
#> Warning: package 'ggplot2' was built under R version 4.1.3
#> Warning: package 'tidyr' was built under R version 4.1.3
#> Warning: package 'readr' was built under R version 4.1.3
#> Warning: package 'stringr' was built under R version 4.1.3
#> Warning: package 'forcats' was built under R version 4.1.3
library(datapasta, warn.conflicts = FALSE)  
#> Warning: package 'datapasta' was built under R version 4.1.3
set.seed(123)

##### Load data
con <- url("https://github.com/mshafieek/ADS-Missing-data-social-network/raw/main/literature_%20REM/Tutorial_REM_REH_DATA/UUsummerschool.Rdata")
load(con)
apollo <- as_tibble(PartOfApollo_13)

##### renaming columns to work with remify
setnames(apollo, old = c('sender','receiver'), 
         new = c('actor1','actor2'))

whichcol <- c("", "actor2", "actor1")
names(whichcol) <- colnames(apollo)

# use the custom pmm method
method <- make.method(apollo)
method[c(2,3)] <- "pmm.conditional"

##### Creating missing data
##### Missing pattern
pattern <- matrix(c(1,0,1,1,1,0), nrow=2, byrow=TRUE)

##### predictor matrix
predictormatrix <- matrix(c(0,0,0,0,0,1,0,1,0), nrow=3, byrow=TRUE)

##### Model-based finite populations
mbased_finite_apollo <- list( 
  MCAR = furrr::future_map(1:5, ~ {
    apollo %>% 
      ampute(prop = .5, 
             mech = "MCAR",
             patterns = pattern) %>% .$amp %>%
      mice(m = 5, 
           maxit = 5,
           method = method,
           whichcolumn = whichcol,
           predictorMatrix = predictormatrix,
           print = FALSE)
  }, .options = furrr_options(seed = 123)))

#### 
effects <- ~ -1 + outdegreeReceiver() + indegreeSender() + reciprocity()
rehObject <- remify(apollo, model = "tie")

statsObject <- remstats(rehObject, tie_effects = effects)

### get the risk sets
risk_sets <- attr(statsObject, "riskset")
risk_sets <- risk_sets %>% select(-'id')

### Get the times
time <- apollo$time

### merge riskset with each timepoint
combined <- merge(risk_sets, time, by = NULL)
combined <- combined %>% rename("time" = "y")
combined <- lapply(combined, as.numeric)
combined <- as.data.frame(combined)

### Create matrices for subtraction to make a status column for coxph
combined_matrix <- data.matrix(combined)
matrix_rows <- nrow(combined)
repeated_df <- apollo[rep(seq_len(nrow(apollo)), each = 240), ] %>% rename("sender" = "actor1", "receiver" = "actor2")
repeated_df <- repeated_df[, c(2,3,1)]
apollo_matrix <- data.matrix(repeated_df)
status_matrix <- apollo_matrix - combined_matrix

### create a status column
status <- as.integer(rowSums(status_matrix == 0) == ncol(status_matrix))
status <- as.data.frame(status)

### Add status to the combined set
combined <- cbind(combined, status) 

###### REM analysis on completed data sets of MCAR and MAR
run_coxph <- function(df) {
  rehObject <- remify(as.data.frame(df), model = "tie")
  statsObject <- remstats(rehObject, tie_effects = effects)

  reciprocity <- as.vector(statsObject[,,3])
  outdegreeReceiver <- as.vector(statsObject[,,1])
  indegreeSender <- as.vector(statsObject[,,2])

  data <- cbind(combined, reciprocity, outdegreeReceiver, indegreeSender)

  fit <- coxph(Surv(time, status) ~ reciprocity + indegreeSender + outdegreeReceiver, data = data)
  return(fit)
}

###### running the function on all completed datasets
Results <- mbased_finite_apollo %>% 
  map(~.x %>% # for every missingness mechanism
        map(~.x %>% # for every simulated multiple imputation.... # create a list of completed data sets
              complete("all") %>%
              map(~.x %$% # for every completed data set....
                   print(with(.x, run_coxph(.x))
              ) 
        )
  ))
#> Call:
#> coxph(formula = Surv(time, status) ~ reciprocity + indegreeSender + 
#>     outdegreeReceiver, data = data)
#> 
#>                         coef  exp(coef)   se(coef)      z        p
#> reciprocity       -5.470e-05  9.999e-01  4.376e-04 -0.125    0.901
#> indegreeSender     3.923e-04  1.000e+00  7.542e-05  5.201 1.98e-07
#> outdegreeReceiver -5.963e-05  9.999e-01  7.626e-05 -0.782    0.434
#> 
#> Likelihood ratio test=28.84  on 3 df, p=2.425e-06
#> n= 931680, number of events= 3882 
#> Call:
#> coxph(formula = Surv(time, status) ~ reciprocity + indegreeSender + 
#>     outdegreeReceiver, data = data)
#> 
#>                         coef  exp(coef)   se(coef)      z        p
#> reciprocity       -1.204e-04  9.999e-01  4.523e-04 -0.266    0.790
#> indegreeSender     4.440e-04  1.000e+00  7.680e-05  5.781 7.43e-09
#> outdegreeReceiver -6.127e-05  9.999e-01  7.559e-05 -0.811    0.418
#> 
#> Likelihood ratio test=34.83  on 3 df, p=1.325e-07
#> n= 931680, number of events= 3882 
#> Call:
#> coxph(formula = Surv(time, status) ~ reciprocity + indegreeSender + 
#>     outdegreeReceiver, data = data)
#> 
#>                         coef  exp(coef)   se(coef)      z       p
#> reciprocity       -1.375e-05  1.000e+00  4.243e-04 -0.032   0.974
#> indegreeSender     5.057e-04  1.001e+00  7.603e-05  6.651 2.9e-11
#> outdegreeReceiver -6.634e-05  9.999e-01  7.428e-05 -0.893   0.372
#> 
#> Likelihood ratio test=46.56  on 3 df, p=4.318e-10
#> n= 931680, number of events= 3882 
#> Call:
#> coxph(formula = Surv(time, status) ~ reciprocity + indegreeSender + 
#>     outdegreeReceiver, data = data)
#> 
#>                         coef  exp(coef)   se(coef)      z        p
#> reciprocity       -1.082e-04  9.999e-01  4.431e-04 -0.244    0.807
#> indegreeSender     4.041e-04  1.000e+00  7.688e-05  5.257 1.47e-07
#> outdegreeReceiver -5.584e-05  9.999e-01  7.473e-05 -0.747    0.455
#> 
#> Likelihood ratio test=28.83  on 3 df, p=2.428e-06
#> n= 931680, number of events= 3882 
#> Call:
#> coxph(formula = Surv(time, status) ~ reciprocity + indegreeSender + 
#>     outdegreeReceiver, data = data)
#> 
#>                         coef  exp(coef)   se(coef)      z        p
#> reciprocity       -5.048e-05  9.999e-01  4.499e-04 -0.112    0.911
#> indegreeSender     3.241e-04  1.000e+00  7.494e-05  4.325 1.53e-05
#> outdegreeReceiver -5.843e-05  9.999e-01  7.542e-05 -0.775    0.438
#> 
#> Likelihood ratio test=20.51  on 3 df, p=0.0001328
#> n= 931680, number of events= 3882 
#> Call:
#> coxph(formula = Surv(time, status) ~ reciprocity + indegreeSender + 
#>     outdegreeReceiver, data = data)
#> 
#>                         coef  exp(coef)   se(coef)      z        p
#> reciprocity       -4.434e-05  1.000e+00  4.331e-04 -0.102    0.918
#> indegreeSender     4.319e-04  1.000e+00  7.434e-05  5.810 6.24e-09
#> outdegreeReceiver -6.534e-05  9.999e-01  7.545e-05 -0.866    0.386
#> 
#> Likelihood ratio test=35.9  on 3 df, p=7.858e-08
#> n= 931680, number of events= 3882 
#> Call:
#> coxph(formula = Surv(time, status) ~ reciprocity + indegreeSender + 
#>     outdegreeReceiver, data = data)
#> 
#>                         coef  exp(coef)   se(coef)      z        p
#> reciprocity       -4.870e-05  1.000e+00  4.423e-04 -0.110    0.912
#> indegreeSender     4.909e-04  1.000e+00  7.458e-05  6.582 4.65e-11
#> outdegreeReceiver -7.241e-05  9.999e-01  7.579e-05 -0.955    0.339
#> 
#> Likelihood ratio test=46  on 3 df, p=5.664e-10
#> n= 931680, number of events= 3882 
#> Call:
#> coxph(formula = Surv(time, status) ~ reciprocity + indegreeSender + 
#>     outdegreeReceiver, data = data)
#> 
#>                         coef  exp(coef)   se(coef)      z        p
#> reciprocity       -5.458e-05  9.999e-01  4.302e-04 -0.127    0.899
#> indegreeSender     4.147e-04  1.000e+00  7.404e-05  5.601 2.13e-08
#> outdegreeReceiver -6.408e-05  9.999e-01  7.527e-05 -0.851    0.395
#> 
#> Likelihood ratio test=33.38  on 3 df, p=2.672e-07
#> n= 931680, number of events= 3882 
#> Call:
#> coxph(formula = Surv(time, status) ~ reciprocity + indegreeSender + 
#>     outdegreeReceiver, data = data)
#> 
#>                         coef  exp(coef)   se(coef)      z        p
#> reciprocity       -3.972e-05  1.000e+00  4.250e-04 -0.093    0.926
#> indegreeSender     4.946e-04  1.000e+00  7.335e-05  6.743 1.55e-11
#> outdegreeReceiver -6.490e-05  9.999e-01  7.506e-05 -0.865    0.387
#> 
#> Likelihood ratio test=47.73  on 3 df, p=2.433e-10
#> n= 931680, number of events= 3882 
#> Call:
#> coxph(formula = Surv(time, status) ~ reciprocity + indegreeSender + 
#>     outdegreeReceiver, data = data)
#> 
#>                         coef  exp(coef)   se(coef)      z        p
#> reciprocity       -4.477e-05  1.000e+00  4.686e-04 -0.096 0.923882
#> indegreeSender     2.891e-04  1.000e+00  8.129e-05  3.557 0.000376
#> outdegreeReceiver -5.661e-05  9.999e-01  7.636e-05 -0.741 0.458500
#> 
#> Likelihood ratio test=13.87  on 3 df, p=0.003087
#> n= 931680, number of events= 3882
#> Error in `map()`:
#> i In index: 1.
#> i With name: MCAR.
#> Caused by error in `map()`:
#> i In index: 3.
#> Caused by error in `map()`:
#> i In index: 1.
#> i With name: 1.
#> Caused by error in `data.frame()`:
#> ! arguments imply differing number of rows: 931680, 815220
#> Backtrace:
#>      x
#>   1. +-mbased_finite_apollo %>% ...
#>   2. +-purrr::map(...)
#>   3. | \-purrr:::map_("list", .x, .f, ..., .progress = .progress)
#>   4. |   +-purrr:::with_indexed_errors(...)
#>   5. |   | \-base::withCallingHandlers(...)
#>   6. |   +-purrr:::call_with_cleanup(...)
#>   7. |   \-.f(.x[[i]], ...)
#>   8. |     \-.x %>% ...
#>   9. +-purrr::map(...)
#>  10. | \-purrr:::map_("list", .x, .f, ..., .progress = .progress)
#>  11. |   +-purrr:::with_indexed_errors(...)
#>  12. |   | \-base::withCallingHandlers(...)
#>  13. |   +-purrr:::call_with_cleanup(...)
#>  14. |   \-.f(.x[[i]], ...)
#>  15. |     \-.x %>% complete("all") %>% ...
#>  16. +-purrr::map(., ~.x %$% print(with(.x, run_coxph(.x))))
#>  17. | \-purrr:::map_("list", .x, .f, ..., .progress = .progress)
#>  18. |   +-purrr:::with_indexed_errors(...)
#>  19. |   | \-base::withCallingHandlers(...)
#>  20. |   +-purrr:::call_with_cleanup(...)
#>  21. |   \-.f(.x[[i]], ...)
#>  22. |     \-.x %$% print(with(.x, run_coxph(.x)))
#>  23. +-base::with(., print(with(.x, run_coxph(.x))))
#>  24. +-base::with.default(., print(with(.x, run_coxph(.x))))
#>  25. | \-base::eval(substitute(expr), data, enclos = parent.frame())
#>  26. |   \-base::eval(substitute(expr), data, enclos = parent.frame())
#>  27. |     +-base::print(with(.x, run_coxph(.x)))
#>  28. |     +-base::with(.x, run_coxph(.x))
#>  29. |     \-base::with.default(.x, run_coxph(.x))
#>  30. |       \-base::eval(substitute(expr), data, enclos = parent.frame())
#>  31. |         \-base::eval(substitute(expr), data, enclos = parent.frame())
#>  32. |           \-global run_coxph(.x)
#>  33. |             \-mice::cbind(combined, reciprocity, outdegreeReceiver, indegreeSender)
#>  34. |               \-base::cbind(...)
#>  35. |                 \-base::cbind(deparse.level, ...)
#>  36. |                   \-base::data.frame(..., check.names = FALSE)
#>  37. |                     \-base::stop(...)
#>  38. \-base::.handleSimpleError(...)
#>  39.   \-purrr (local) h(simpleError(msg, call))
#>  40.     \-cli::cli_abort(...)
#>  41.       \-rlang::abort(...)

Created on 2023-06-01 with reprex v2.0.2

gerkovink commented 1 year ago

Hi Jesse,

@mshafieek an I figured out where the error comes from. One model has

events actors  dyads 
  3882     16    240 

while the other model has

events actors  dyads 
  3882     15    210 

exactly what you identified. [n=15 actors * n-1 = 14 actors] = 210, 16*15=240. The missingness generated is the cause for this. With MCAR, this happens not so often. with MAR, there is a higher likelihood of this happening, depending on the MAR mechanism (left, right, tail, mid. There are three solutions:

  1. Make sure that the generated missingness always includes the same actors/pairs. So, if the pairs are 1514 for some, generate new incomplete sets. A for-loop with tryCatch() may help here, or only generating missingness in a part of the data, where the other part always includes the necessary actors (much simpler!*).
  2. Limit your investigations to the lowest observed dyads. i.e. if one simulation has 13*12, only focus all simulation on the subset of those 13*12.
  3. Bring the
    combined <- cbind(combined, status) 

    and all required code into your run_coxph() evaluation function, and make the evaluation of each simulation independent of the others.

I would do (1) or (2), as it is the most robust and ensures equal evaluation of analysis models over the sims. This may avoid problems later on.

Hope this helps. Let me know if you need me.

JesseJvdw commented 1 year ago

I want to do the first solution but I have no idea where to start.

gerkovink commented 1 year ago

I would do something along the lines of this:

library(mice, warn.conflicts = FALSE)     # for imputation and amputation
library(purrr, warn.conflicts = FALSE)    # for functional programming
library(furrr, warn.conflicts = FALSE)    # for functional futures
#> Loading required package: future
library(magrittr, warn.conflicts = FALSE) # for pipes
library(dplyr, warn.conflicts = FALSE)    # for data manipulation
library(tibble, warn.conflicts = FALSE)     # for tibbles
library(remstats, warn.conflicts = FALSE) # for REM statistics
library(remify, warn.conflicts = FALSE)   # for converting  
library(data.table, warn.conflicts = FALSE)   
library(survival, warn.conflicts = FALSE) # for REM analysis
library(tidyverse, warn.conflicts = FALSE)  
library(datapasta, warn.conflicts = FALSE)  

##### Load data
con <- url("https://github.com/mshafieek/ADS-Missing-data-social-network/raw/main/literature_%20REM/Tutorial_REM_REH_DATA/UUsummerschool.Rdata")
load(con)
apollo <- as_tibble(PartOfApollo_13)

##### renaming columns to work with remify
setnames(apollo, old = c('sender','receiver'), 
         new = c('actor1','actor2'))

whichcol <- c("", "actor2", "actor1")
names(whichcol) <- colnames(apollo)

#### set with sufficient actors & dyads
set.seed(123) # fix seed to realize a sufficient set
indic <- sample(1:nrow(apollo), 1500)
remify(apollo[indic, ], model = "tie") %>% dim() 
#> 
#> Warning: the `time` variable is not sorted. Sorting will be forced.
#> events actors  dyads 
#>   1500     16    240

#### Combine the sufficient set and the incomplete set
make_missing <- function(x, indic){
  sufficient <- x[indic, ]
  miss <- x[-c(indic), ] |>
    ampute(prop = .8, 
           mech = "MCAR",
           patterns = matrix(c(1,0,1,
                               1,1,0), 
                             nrow=2, 
                             byrow=TRUE)) %>% 
    .$amp
  combined <- rbind(sufficient, 
                    miss)
  return(combined[order(combined$time), ]) # sort it all like apollo
}

#### Test function
test <- make_missing(apollo, indic)
dim(test)
#> [1] 3882    3
all(apollo$time == test$time)
#> [1] TRUE
head(apollo, n=10); head(test, n=10)
#> # A tibble: 10 × 3
#>      time actor1 actor2
#>     <dbl>  <dbl>  <dbl>
#>  1 11849.     18      2
#>  2 11854.      2     18
#>  3 11885.     18      2
#>  4 11890.      2     18
#>  5 12232.      2     17
#>  6 12342.     17      2
#>  7 12345.     19     17
#>  8 12348.     17     19
#>  9 12370.     19     17
#> 10 12382.     17     19
#> # A tibble: 10 × 3
#>      time actor1 actor2
#>     <dbl>  <dbl>  <dbl>
#>  1 11849.     18      2
#>  2 11854.      2     18
#>  3 11885.     NA      2
#>  4 11890.      2     18
#>  5 12232.      2     NA
#>  6 12342.     17     NA
#>  7 12345.     19     NA
#>  8 12348.     17     19
#>  9 12370.     19     17
#> 10 12382.     17     19
tail(apollo, n=10); tail(test, n=10)
#> # A tibble: 10 × 3
#>      time actor1 actor2
#>     <dbl>  <dbl>  <dbl>
#>  1 49980.      9      7
#>  2 49983.      2     19
#>  3 49993.      4      7
#>  4 49994.      7      4
#>  5 49996.     19      2
#>  6 49997.      4      7
#>  7 50001.      2     19
#>  8 50002.      7      4
#>  9 50013.      7      4
#> 10 50015.      4      7
#> # A tibble: 10 × 3
#>      time actor1 actor2
#>     <dbl>  <dbl>  <dbl>
#>  1 49980.      9      7
#>  2 49983.      2     NA
#>  3 49993.      4      7
#>  4 49994.      7      4
#>  5 49996.     19      2
#>  6 49997.      4      7
#>  7 50001.     NA     19
#>  8 50002.     NA      4
#>  9 50013.      7      4
#> 10 50015.      4      7

#### missing data pattern
md.pattern(test)

#>      time actor1 actor2     
#> 1950    1      1      1    0
#> 966     1      1      0    1
#> 966     1      0      1    1
#>         0    966    966 1932

Created on 2023-06-02 with reprex v2.0.2

gerkovink commented 1 year ago

This will yield approx 1500+.2*2382 = 1976 observed cases and .8*2382 = 1906 missing cases. Play with the proportion in ampute() to get to the desired missingness proportion, if needed. You can use the make_missing() function in your future, as long as you specify a grand (i.e. for all sims the same) indic outside of the future.

JesseJvdw commented 1 year ago

That did the trick. Thank you.