stocnet / goldfish

Actor-oriented and tie-based network event models in R
https://stocnet.github.io/goldfish/
GNU General Public License v3.0
61 stars 13 forks source link

`C` and `R` engine results differ for rate model #71

Closed auzaheta closed 1 year ago

auzaheta commented 2 years ago

Describe the bug Default C engine gives different result as R version when windows effects are used.

To Reproduce

library(goldfish)
data("Social_Evolution")

callNetwork <- defineNetwork(nodes = actors, directed = TRUE) |> # 1
  linkEvents(changeEvent = calls, nodes = actors) # 2

callsDependent <- defineDependentEvents(
  events = calls, nodes = actors,
  defaultNetwork = callNetwork
  )

windowFormulaRate <-
  callsDependent ~ 1 + indeg(callNetwork) + outdeg(callNetwork) +
                   indeg(callNetwork, window = 300) +
                   outdeg(callNetwork, window = 300) +
                   indeg(friendshipNetwork)

mod04Rate <- estimate(
  windowFormulaRate,
  model = "DyNAM", subModel = "rate",
  estimationInit = list(engine = "default_c")
)
mod04RateC <- coef(mod04Rate)

mod04Rate <- estimate(
  windowFormulaRate,
  model = "DyNAM", subModel = "rate"
)
mod04RateR <- coef(mod04Rate)

table(abs(mod04RateC - mod04RateR) < 1e-2)

> FALSE 
>     6

Desktop (please complete the following information):

marion-hoffman commented 2 years ago

Description Using your example, it seems clear that the issue comes from the right-censoring. If you remove the windowed effects but keep the frienship updates (that create one right-censored event) you get the same behavior. I tried to use the GLM library to figure out which one is right, to know where to start, but I'm a bit confused, because I get yet another result (see script below). But it still looks like the R engine is closer to the GLM library result. Maybe @auzaheta you can see what I'm missing here?!

To reproduce (after the previous one) `

First test with glm

1. we get the preprocessed stats for a simple model

FormulaSimple <- callsDependent ~ 1 + indeg(friendshipNetwork) prepData <- estimate( FormulaSimple, model = "DyNAM", subModel = "rate", preprocessingOnly = T )

2. we calculate the exposure timesand the stats for each dependent event

Ndep <- length(prepData$intervals) Nexo <- length(prepData$rightCensoredIntervals) Na <- nrow(actors)

we only need to calculate the update of the friendship net between the first and second event

updates_T1 <- prepData$dependentStatsChange[[2]][[1]] indegs_T1 <- rep(0,Na) for(a in 1:Na) { if(any(updates_T1[,1] == a)) indegs_T1[a] <- updates_T1[max(which(updates_T1[,1] == a)),3] }

times <- rep(0,(Ndep+Nexo)Na ) indegs_friend <- rep(0,(Ndep+Nexo)Na ) outcomes <- rep(0,(Ndep+Nexo)*Na ) cptdep <- 1 cptexo <- 1 for(e in 1:(Ndep+Nexo)){

dep or exo?

isdep <- prepData$orderEvents[e] == 1 if(isdep) { times[((e-1)Na+1) : (eNa)] <- prepData$intervals[cptdep] outcomes[(e-1)Na + prepData$eventSender[e]] <- 1 cptdep <- cptdep + 1 } if(!isdep) { times[((e-1)Na+1) : (eNa)] <- prepData$rightCensoredIntervals[cptexo] cptexo <- cptexo + 1 } if(e>2){ indegs_friend[((e-1)Na+1) : (e*Na)] <- indegs_T1 } }

3. check glm results

df <- data.frame(times, logtimes = log(times), indegs_friend, outcomes) df[1:(Na+1),1] <- 1 df[1:(Na+1),2] <- 0

df <- df[(Na+1):((Ndep+Nexo)*Na),]

modGLM <- glm(outcomes ~ offset(logtimes) + indegs_friend, family = poisson(link = "log"), data = df) coef(modGLM)

4. check previous models

mod04RateCSimple <- estimate( FormulaSimple, model = "DyNAM", subModel = "rate", estimationInit = list(engine = "default_c", startTime = 1220733469) ) coef(mod04RateCSimple)

mod04RateRSimple <- estimate( FormulaSimple, model = "DyNAM", subModel = "rate", estimationInit = list(engine = "default", startTime = 1220733469) ) coef(mod04RateRSimple)

coef(mod04RateCSimple) - coef(mod04RateRSimple) # 0.0061807584 -0.0009279592 coef(mod04RateCSimple) - coef(modGLM) # 0.0055274138 -0.0008136509 coef(mod04RateRSimple) - coef(modGLM) # -0.0006533446 0.0001143083

Second test with glm

1. we get the preprocessed stats for another simple model with time windows

FormulaSimple2 <- callsDependent ~ 1 + indeg(callNetwork, window = 300) prepData2 <- estimate( FormulaSimple2, model = "DyNAM", subModel = "rate", preprocessingOnly = T )

2. we calculate the exposure times and the stats for each dependent event

Nexo2 <- length(prepData2$rightCensoredIntervals)

this time we need to calculate window updates

times2 <- rep(0,(Ndep+Nexo2)Na ) indegs_window <- rep(0,(Ndep+Nexo2)Na ) outcomes2 <- rep(0,(Ndep+Nexo2)*Na ) cptdep <- 1 cptexo <- 1

updates_dep <- prepData2$dependentStatsChange updates_rc <- prepData2$dependentStatsChange current_stats <- rep(0,Na) for(e in 1:(Ndep+Nexo2)){

dep or exo?

isdep <- prepData2$orderEvents[e] == 1 if(isdep) {

update windowed stat

up <- updates_dep[[cptdep]][[1]]
for(a in 1:Na) {
  if(!is.null(up) && any(up[,1] == a))
    current_stats[a] <- up[max(which(up[,1] == a)),3]
}
indegs_window[((e-1)*Na+1) : (e*Na)] <- current_stats
times2[((e-1)*Na+1) : (e*Na)] <- prepData2$intervals[cptdep]
outcomes2[(e-1)*Na + prepData2$eventSender[e]] <- 1
cptdep <- cptdep + 1

} if(!isdep) {

update windowed stat

up <- updates_rc[[cptexo]][[1]]
for(a in 1:Na) {
  if(!is.null(up) && any(up[,1] == a))
    current_stats[a] <- up[max(which(up[,1] == a)),3]
}
indegs_window[((e-1)*Na+1) : (e*Na)] <- current_stats
times2[((e-1)*Na+1) : (e*Na)] <- prepData2$rightCensoredIntervals[cptexo]
cptexo <- cptexo + 1

} }

3. check glm results

df2 <- data.frame(times = times2, logtimes = log(times2), indegs_window, outcomes = outcomes2) df2[1:(Na+1),1] <- 1 df2[1:(Na+1),2] <- 0 df2 <- df2[1:((Ndep+Nexo2-2)*Na),] modGLM2 <- glm(outcomes ~ offset(logtimes) + indegs_window, family = poisson(link = "log"), data = df2) coef(modGLM2)

4. check previous models

mod04RateCSimple2 <- estimate( FormulaSimple2, model = "DyNAM", subModel = "rate", estimationInit = list(engine = "default_c", startTime = 1220733469) ) coef(mod04RateCSimple2)

mod04RateRSimple2 <- estimate( FormulaSimple2, model = "DyNAM", subModel = "rate", estimationInit = list(engine = "default", startTime = 1220733469) ) coef(mod04RateRSimple2)

coef(mod04RateCSimple2) - coef(mod04RateRSimple2) # 0.1557715 -0.9546496 coef(mod04RateCSimple2) - coef(modGLM2) # 0.1444659 -1.5702790 coef(mod04RateRSimple2) - coef(modGLM2) # -0.0113056 -0.6156294

`

auzaheta commented 2 years ago

Hopefully, this simulation helps with the discussion @stadtfeldethz and @marion-hoffman. It only has one exogenous event.

### small simulation
actDf <- data.frame(
  label = LETTERS,
  xVar = rbinom(length(LETTERS), 1, 0.2)
) 

xVarChange <- rbinom(length(LETTERS), 1, 0.7)

X <- cbind(1, actDf$xVar)

endTime <- 30
exEventTime <- 15
parms <- c(log(1000 / endTime / length(LETTERS)), 1 )

expXb <- exp(X %*% parms)
sumRate <- sum(expXb)

time <- 0
events <- NULL
dfglm <- NULL
first <- TRUE

while (time < endTime) {
  # cat("Time:", time, "\n")
  timeEvent <- rexp(1, sumRate)
  if (first && time + timeEvent > exEventTime) {
    dfglm <- rbind(
      dfglm,
      data.frame(
        time = time,
        diffTime = exEventTime - time,
        xVar = X[, 2],
        outcomes = 0 
      )
    )

    X <- cbind(1, xVarChange)
    expXb <- exp(X %*% parms)
    sumRate <- sum(expXb)
    time <- exEventTime
    first <- FALSE
  } else {
    time <- time + timeEvent
    sender <- sample(LETTERS, 1, prob = expXb)
    receiver <- sample(LETTERS[sender != LETTERS], 1)
    events <- rbind(
      events,
      data.frame(
        time = time,
        sender = sender,
        receiver = receiver,
        increment = 1
      )
    )

    dfglm <- rbind(
      dfglm,
      data.frame(
        time = time,
        diffTime = timeEvent,
        xVar = X[, 2],
        outcomes = 1 * (LETTERS == sender) 
      )
    )
  }
}

changeAttr <- data.frame(
  time = exEventTime,
  node = LETTERS,
  replace = xVarChange
)

actDf <- defineNodes(actDf) |> 
  linkEvents(changeEvents = changeAttr, attribute = "xVar")

netEvents <- defineNetwork(nodes = actDf) |> 
  linkEvents(changeEvents = events, nodes = actDf)

depEvents <- defineDependentEvents(events, nodes = actDf,
                                   defaultNetwork = netEvents)

# model exclude last event
preproTest <- estimate(depEvents ~ 1 + ego(actDf$xVar),
                       model = "DyNAM", subModel = "rate",
                       estimationInit = list(startTime = 0, endTime = 30))
summary(modTest)
confint(modTest)

# model exclude last event
dfglm$logtimes <- log(dfglm$diffTime)
modTestGLM <-  glm(outcomes ~ offset(logtimes) + xVar,
                   family = poisson(link = "log"),
                   data = dfglm[seq.int(1, (nrow(events)) * length(LETTERS)), ])
summary(modTestGLM)
confint(modTestGLM)

library(broom)
library(pixiedust)
tidy(modTestGLM, conf.int = TRUE)
tidy(modTest, conf.int = TRUE)

From one run

> library(broom)
> library(pixiedust)
> parms
[1] 0.2484614 1.0000000
> tidy(modTestGLM, conf.int = TRUE) |> 
+   dust() |> 
+   sprinkle(col = c(2:4, 6:7), round = 6) |> 
+   sprinkle(col = 5, fn = quote(pvalString(value)))
         term estimate std.error statistic p.value conf.low conf.high
1 (Intercept) 0.224695  0.045268  4.963687 < 0.001 0.134638  0.312124
2        xVar 1.011514  0.052867 19.133289 < 0.001 0.908709  1.115992

> tidy(modTest, conf.int = TRUE) |> 
+   dust() |> 
+   sprinkle(col = c(2:4, 6:7), round = 6) |> 
+   sprinkle(col = 5, fn = quote(pvalString(value)))
            term estimate std.error statistic p.value conf.low conf.high
1      Intercept 0.224695  0.045268  4.963685 < 0.001 0.135972  0.313419
2 actDf xVar ego 1.011514  0.052867 19.133283 < 0.001 0.907897  1.115131

> coef(modTestGLM) - coef(modTest)
  (Intercept)          xVar 
-3.640075e-10  3.640201e-10 

Seems like the last time interval during preprocessing is calculated wrong and produce the differences between glm and goldfish estimations.