mshafieek / ADS-Missing-data-social-network

ADS master thesis

indic #17

Closed LisanneLageweg closed 1 year ago

LisanneLageweg commented 1 year ago

Hi @gerkovink,

As you suggested, I ran the analysis on the 1500-event set in the way shown below, and I end up with the same p-values as with the original dataset. Did I do this correctly? And if so, the p-values are thus indeed lower because of the imputation process. What explanation could underlie this?

indic <- as.data.frame(indic)
class(indic)

# prepare event history
true.reh <- remify(edgelist = indic, model = "tie")

# calculate stats
stats <- remstats(tie_effects = effects, reh = true.reh)

# use the function to create the correct format of the dataframe
true.cox.set <- prepare_coxph_data(stats, PartOfApollo_13)

# fit cox model
true.cox.fit <- coxph(Surv(time, status) ~ reciprocity + indegreeSender +
                        outdegreeReceiver,
                      data = true.cox.set)
true <- coefficients(true.cox.fit)
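Note: prepare_coxph_data() is not defined in this comment. Purely as a hypothetical reconstruction, mirroring the inline riskset/divergence steps shown in the replies below (not necessarily the repository's actual definition; assumes dplyr and magrittr are loaded):

# hypothetical reconstruction of prepare_coxph_data, based on the inline
# steps shown further down in this thread
prepare_coxph_data <- function(stats, edgelist) {
  risk_sets <- attr(stats, "riskset") %>% select(-'id')
  # one row per (event time, dyad) combination
  combined <- merge(risk_sets, edgelist$time, by = NULL) %>%
    rename(time = y) %>%
    .[, c("time", "sender", "receiver")] %>%
    mutate(sender = as.numeric(sender),
           receiver = as.numeric(receiver))
  # status: TRUE only where a replicate matches the observed event exactly
  diff <- edgelist[rep(seq_len(nrow(edgelist)), each = nrow(risk_sets)), ] %>%
    data.matrix() %>%
    .[, 1:3] - combined
  combined$status <- rowSums(diff == 0) == ncol(diff)
  # attach the remstats statistics
  combined$reciprocity       <- c(stats[, , 1])
  combined$indegreeSender    <- c(stats[, , 2])
  combined$outdegreeReceiver <- c(stats[, , 3])
  combined
}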

vira-dvoriak commented 1 year ago

I also ran the code on the 1500-event set and did not get any significant p-values.

library(mice)
#> 
#> Attaching package: 'mice'
#> The following object is masked from 'package:stats':
#> 
#>     filter
#> The following objects are masked from 'package:base':
#> 
#>     cbind, rbind
require(lattice)
#> Loading required package: lattice
library(tidyverse)
library(magrittr)
#> 
#> Attaching package: 'magrittr'
#> The following object is masked from 'package:purrr':
#> 
#>     set_names
#> The following object is masked from 'package:tidyr':
#> 
#>     extract
library(dplyr) 
library(purrr)
library(furrr)  
#> Loading required package: future
library(relevent)
#> Loading required package: trust
#> Loading required package: sna
#> Loading required package: statnet.common
#> 
#> Attaching package: 'statnet.common'
#> The following objects are masked from 'package:base':
#> 
#>     attr, order
#> Loading required package: network
#> 
#> 'network' 1.18.1 (2023-01-24), part of the Statnet Project
#> * 'news(package="network")' for changes since last version
#> * 'citation("network")' for citation information
#> * 'https://statnet.org' for help, support, and other information
#> sna: Tools for Social Network Analysis
#> Version 2.7-1 created on 2023-01-24.
#> copyright (c) 2005, Carter T. Butts, University of California-Irvine
#>  For citation information, type citation("sna").
#>  Type help(package="sna") to get started.
#> Loading required package: coda
#> relevent: Relational Event Models
#> Version 1.2-1 created on 2023-01-24.
#> copyright (c) 2007, Carter T. Butts, University of California-
#> Irvine
#>  For citation information, type citation("relevent").
#>  Type help(package="relevent") to get started.
library(broom.mixed)
library(rem)
library(ggplot2)
library(survival)
#> 
#> Attaching package: 'survival'
#> The following object is masked from 'package:future':
#> 
#>     cluster
library(remify)
library(remstats)
library(devtools)
#> Loading required package: usethis
#library(remstimate)
library(tibble)
library(reprex)
library(styler)

## 1. Read the data
load("UUsummerschool.Rdata")
rm(Twitter_data_rem3, WTCPoliceCalls, ClassIntercept, ClassIsFemale,
   ClassIsTeacher, WTCPoliceIsICR, Class)

## 2. Prepare the data (this run uses the observed data, no amputation or imputation)
# renaming dataset for later convenience 
apollo.renamed <- PartOfApollo_13
apollo.renamed <- apollo.renamed %>% 
  rename(
    actor1 = sender,
    actor2 = receiver
  )

#### set with sufficient actors & dyads
set.seed(123) # note: seed 123 initially kept giving only 210 dyads, even
# after changing the sample size to 1600; the 1500-event sample below does
# cover all 240 dyads
fixed.sample <- apollo.renamed[sample(1:nrow(apollo.renamed), 1500),] 
remify(fixed.sample, model = "tie") %>% dim() 
#> 
#> Warning: the `time` variable is not sorted. Sorting will be forced.
#> events actors  dyads 
#>   1500     16    240

# creating functions 
# Define effects
effects <- ~ -1 + reciprocity(scaling = ("std")) +
  indegreeSender() + outdegreeReceiver()
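# (reciprocity, indegreeSender, and outdegreeReceiver are endogenous remstats
#  effects; scaling = "std" standardizes the reciprocity statistic)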

# Prepare event history
fixed.reh <- remify(edgelist = fixed.sample, 
                    model = "tie")
#> 
#> Warning: the `time` variable is not sorted. Sorting will be forced.
# calculate stats
stats <- remstats(tie_effects = effects, 
                  reh = fixed.reh)
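# (remstats returns a 3-dimensional array: time points x dyads x effects)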

risk_sets <- attr(stats, "riskset") %>% select(-'id')

# creating one set with all risksets for each time point 
combined <- merge(risk_sets, fixed.sample$time, by=NULL)  %>% 
  rename(time = y) %>% 
  .[, c("time", "sender", "receiver")] %>% 
  mutate(sender = as.numeric(sender), 
         receiver = as.numeric(receiver))

# GV: Calculate divergence
diff <- fixed.sample[rep(seq_len(nrow(fixed.sample)), each = 240), ] %>% 
  data.matrix() %>% 
  .[, 1:3] - combined
# GV: identify non-divergence
combined$status <- 
  rowSums(diff == 0) == ncol(diff)
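# (each sampled event is replicated once per dyad in the riskset; status is
#  TRUE only for the replicate whose time, sender, and receiver all match the
#  observed event, i.e. the "case" row in the Cox model)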

# attach the remstats statistics to the combined riskset data
combined$reciprocity <- c(stats[,,1])
combined$indegreeSender <- c(stats[,,2])
combined$outdegreeReceiver <- c(stats[,,3])

fixed.cox.fit <- coxph(Surv(time, status) ~ reciprocity + indegreeSender + 
                         outdegreeReceiver, 
                       data=combined)
fixed.cox.fit
#> Call:
#> coxph(formula = Surv(time, status) ~ reciprocity + indegreeSender + 
#>     outdegreeReceiver, data = combined)
#> 
#>                         coef  exp(coef)   se(coef)      z     p
#> reciprocity       -0.0312374  0.9692455  0.0286696 -1.090 0.276
#> indegreeSender     0.0003728  1.0003729  0.0003083  1.209 0.227
#> outdegreeReceiver -0.0003139  0.9996861  0.0003019 -1.040 0.298
#> 
#> Likelihood ratio test=4.1  on 3 df, p=0.2509
#> n= 360000, number of events= 1500

Created on 2023-06-21 with reprex v2.0.2

gerkovink commented 1 year ago

And if you run it on one imputed set, as if the imputed set were observed data?

vira-dvoriak commented 1 year ago

I hope I understood you correctly: I didn't run any simulations and changed the number of imputations to 1. After analysing just a single imputed dataset, the p-values still do not all come out significant. Only one is significant, exactly as in the fully observed dataset.

library(mice)
#> 
#> Attaching package: 'mice'
#> The following object is masked from 'package:stats':
#> 
#>     filter
#> The following objects are masked from 'package:base':
#> 
#>     cbind, rbind
require(lattice)
#> Loading required package: lattice
library(tidyverse)
library(magrittr)
#> 
#> Attaching package: 'magrittr'
#> The following object is masked from 'package:purrr':
#> 
#>     set_names
#> The following object is masked from 'package:tidyr':
#> 
#>     extract
library(dplyr) 
library(purrr)
library(furrr)  
#> Loading required package: future
library(relevent)
#> Loading required package: trust
#> Loading required package: sna
#> Loading required package: statnet.common
#> 
#> Attaching package: 'statnet.common'
#> The following objects are masked from 'package:base':
#> 
#>     attr, order
#> Loading required package: network
#> 
#> 'network' 1.18.1 (2023-01-24), part of the Statnet Project
#> * 'news(package="network")' for changes since last version
#> * 'citation("network")' for citation information
#> * 'https://statnet.org' for help, support, and other information
#> sna: Tools for Social Network Analysis
#> Version 2.7-1 created on 2023-01-24.
#> copyright (c) 2005, Carter T. Butts, University of California-Irvine
#>  For citation information, type citation("sna").
#>  Type help(package="sna") to get started.
#> Loading required package: coda
#> relevent: Relational Event Models
#> Version 1.2-1 created on 2023-01-24.
#> copyright (c) 2007, Carter T. Butts, University of California-
#> Irvine
#>  For citation information, type citation("relevent").
#>  Type help(package="relevent") to get started.
library(broom.mixed)
library(rem)
library(ggplot2)
library(survival)
#> 
#> Attaching package: 'survival'
#> The following object is masked from 'package:future':
#> 
#>     cluster
library(remify)
library(remstats)
library(devtools)
#> Loading required package: usethis
#library(remstimate)
library(tibble)
library(reprex)
library(styler)

## 1. Read the data
load("UUsummerschool.Rdata")
rm(Twitter_data_rem3, WTCPoliceCalls, ClassIntercept, ClassIsFemale,
   ClassIsTeacher, WTCPoliceIsICR, Class)

## 2. Ampute and then impute the data
# renaming dataset for later convenience 
apollo.renamed <- PartOfApollo_13
apollo.renamed <- apollo.renamed %>% 
  rename(
    actor1 = sender,
    actor2 = receiver
  )
# making the dataset a tibble 
apollo.renamed <- as_tibble(apollo.renamed)

# determine which column to condition on
whichcol <- c("", "actor2", "actor1")
names(whichcol) <- colnames(apollo.renamed)

# create predictor matrix for imputations
pred <- make.predictorMatrix(apollo.renamed)
pred[ ,"time"] <- 0
pred
#>        time actor1 actor2
#> time      0      1      1
#> actor1    0      0      1
#> actor2    0      1      0
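# (so each actor column is imputed using only the other actor as predictor,
#  and time is never used as a predictor)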

# use the custom pmm method
method <- make.method(apollo.renamed)
method[c(2,3)] <- "pmm.conditional"
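# note: pmm.conditional is a custom method defined elsewhere in this repo and
# is not shown in this reprex. purely as a hypothetical sketch (assuming its
# job is to keep an imputed actor from equalling the conditioning actor, i.e.
# no self-loops), a conditional wrapper around mice's pmm could look like
# this; mice() forwards extra named arguments such as whichcolumn to the
# univariate methods:
mice.impute.pmm.conditional <- function(y, ry, x, wy = NULL,
                                        whichcolumn = NULL, ...) {
  if (is.null(wy)) wy <- !ry
  imp <- mice::mice.impute.pmm(y, ry, x, wy = wy, ...)
  # given the predictor matrix above, x holds only the conditioning actor
  # (whichcolumn is therefore unused in this simplified sketch)
  cond <- x[wy, 1]
  bad <- which(imp == cond)
  while (length(bad) > 0) {
    # redraw until no imputed actor equals its conditioning actor
    imp[bad] <- mice::mice.impute.pmm(y, ry, x, wy = wy, ...)[bad]
    bad <- bad[imp[bad] == cond[bad]]
  }
  imp
}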

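# ampute 20% of events under MCAR (pattern c(1, 0, 1): only actor1 is made
# missing), then impute a single completed dataset (m = 1)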
MCAR_finite <- apollo.renamed %>% 
  ampute(prop = .2, 
         patterns = c(1,0,1),
         mech = "MCAR") %>% .$amp %>% 
  mice(m = 1, 
       maxit = 5,
       method = method,
       pred=pred,
       whichcolumn = whichcol,
       print = F)

one.imp <- complete(MCAR_finite)

# creating functions 
# Define effects
effects <- ~ -1 + reciprocity(scaling = ("std")) +
  indegreeSender() + outdegreeReceiver()

# Prepare event history
single.reh <- remify(edgelist = one.imp, 
                     model = "tie")
# calculate stats
single.stats <- remstats(tie_effects = effects, 
                         reh = single.reh)

risk_sets <- attr(single.stats, "riskset") %>% select(-'id')

# creating one set with all risksets for each time point 
combined <- merge(risk_sets, PartOfApollo_13$time, by=NULL)  %>% 
  rename(time = y) %>% 
  .[, c("time", "sender", "receiver")] %>% 
  mutate(sender = as.numeric(sender), 
         receiver = as.numeric(receiver))

# GV: Calculate divergence
diff <- PartOfApollo_13[rep(seq_len(nrow(PartOfApollo_13)), each = 240), ] %>% 
  data.matrix() %>% 
  .[, 1:3] - combined
# GV: identify non-divergence
combined$status <- 
  rowSums(diff == 0) == ncol(diff)

# attach the remstats statistics to the combined riskset data
combined$reciprocity <- c(single.stats[,,1])
combined$indegreeSender <- c(single.stats[,,2])
combined$outdegreeReceiver <- c(single.stats[,,3])

single.cox.fit <- coxph(Surv(time, status) ~ reciprocity + indegreeSender + 
                          outdegreeReceiver, 
                        data=combined)
single.cox.fit
#> Call:
#> coxph(formula = Surv(time, status) ~ reciprocity + indegreeSender + 
#>     outdegreeReceiver, data = combined)
#> 
#>                         coef  exp(coef)   se(coef)      z        p
#> reciprocity        2.433e-02  1.025e+00  1.873e-02  1.299    0.194
#> indegreeSender     4.305e-04  1.000e+00  7.399e-05  5.818 5.95e-09
#> outdegreeReceiver -8.757e-05  9.999e-01  7.282e-05 -1.203    0.229
#> 
#> Likelihood ratio test=41.29  on 3 df, p=5.664e-09
#> n= 931680, number of events= 3882

Created on 2023-06-22 with reprex v2.0.2

gerkovink commented 1 year ago

I think it has to do with the exclusion of the 1500 cases, which artificially deflates the between variance. I'll look into it, but don't worry too much about it 😉
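For reference, the between variance enters through Rubin's pooling rules. A minimal illustration of the mechanism (standard multiple-imputation theory, not code from this thread; pool_manual is a hypothetical helper name):

# Rubin's rules for pooling one coefficient across m imputed datasets
pool_manual <- function(est, se) {
  m    <- length(est)
  qbar <- mean(est)               # pooled point estimate
  ubar <- mean(se^2)              # average within-imputation variance
  b    <- var(est)                # between-imputation variance
  tvar <- ubar + (1 + 1 / m) * b  # total variance
  c(estimate = qbar, se = sqrt(tvar))
}
# a deflated between variance b shrinks the pooled standard error sqrt(tvar),
# which in turn pushes the pooled p-values down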