auzaheta commented 2 years ago

Describe the bug Default C engine gives different result as R version when windows effects are used.

To Reproduce

library(goldfish)
data("Social_Evolution")

callNetwork <- defineNetwork(nodes = actors, directed = TRUE) |> # 1
  linkEvents(changeEvent = calls, nodes = actors) # 2

callsDependent <- defineDependentEvents(
  events = calls, nodes = actors,
  defaultNetwork = callNetwork
  )

windowFormulaRate <-
  callsDependent ~ 1 + indeg(callNetwork) + outdeg(callNetwork) +
                   indeg(callNetwork, window = 300) +
                   outdeg(callNetwork, window = 300) +
                   indeg(friendshipNetwork)

mod04Rate <- estimate(
  windowFormulaRate,
  model = "DyNAM", subModel = "rate",
  estimationInit = list(engine = "default_c")
)
mod04RateC <- coef(mod04Rate)

mod04Rate <- estimate(
  windowFormulaRate,
  model = "DyNAM", subModel = "rate"
)
mod04RateR <- coef(mod04Rate)

table(abs(mod04RateC - mod04RateR) < 1e-2)

> FALSE 
>     6

Desktop (please complete the following information):

OS: Windows
R Version 4.2.0
- Goldfish Version 1.6.2

marion-hoffman commented 2 years ago

Description Using your example, it seems clear that the issue comes from the right-censoring. If you remove the windowed effects but keep the frienship updates (that create one right-censored event) you get the same behavior. I tried to use the GLM library to figure out which one is right, to know where to start, but I'm a bit confused, because I get yet another result (see script below). But it still looks like the R engine is closer to the GLM library result. Maybe @auzaheta you can see what I'm missing here?!

To reproduce (after the previous one) `

First test with glm

1. we get the preprocessed stats for a simple model

FormulaSimple <- callsDependent ~ 1 + indeg(friendshipNetwork) prepData <- estimate( FormulaSimple, model = "DyNAM", subModel = "rate", preprocessingOnly = T )

2. we calculate the exposure timesand the stats for each dependent event

Ndep <- length(prepData$intervals) Nexo <- length(prepData$rightCensoredIntervals) Na <- nrow(actors)

we only need to calculate the update of the friendship net between the first and second event

updates_T1 <- prepData$dependentStatsChange[[2]][[1]] indegs_T1 <- rep(0,Na) for(a in 1:Na) { if(any(updates_T1[,1] == a)) indegs_T1[a] <- updates_T1[max(which(updates_T1[,1] == a)),3] }

times <- rep(0,(Ndep+Nexo)Na ) indegs_friend <- rep(0,(Ndep+Nexo)Na ) outcomes <- rep(0,(Ndep+Nexo)*Na ) cptdep <- 1 cptexo <- 1 for(e in 1:(Ndep+Nexo)){

dep or exo?

isdep <- prepData$orderEvents[e] == 1 if(isdep) { times[((e-1)Na+1) : (eNa)] <- prepData$intervals[cptdep] outcomes[(e-1)Na + prepData$eventSender[e]] <- 1 cptdep <- cptdep + 1 } if(!isdep) { times[((e-1)Na+1) : (eNa)] <- prepData$rightCensoredIntervals[cptexo] cptexo <- cptexo + 1 } if(e>2){ indegs_friend[((e-1)Na+1) : (e*Na)] <- indegs_T1 } }

3. check glm results

df <- data.frame(times, logtimes = log(times), indegs_friend, outcomes) df[1:(Na+1),1] <- 1 df[1:(Na+1),2] <- 0

df <- df[(Na+1):((Ndep+Nexo)*Na),]

modGLM <- glm(outcomes ~ offset(logtimes) + indegs_friend, family = poisson(link = "log"), data = df) coef(modGLM)

4. check previous models

mod04RateCSimple <- estimate( FormulaSimple, model = "DyNAM", subModel = "rate", estimationInit = list(engine = "default_c", startTime = 1220733469) ) coef(mod04RateCSimple)

mod04RateRSimple <- estimate( FormulaSimple, model = "DyNAM", subModel = "rate", estimationInit = list(engine = "default", startTime = 1220733469) ) coef(mod04RateRSimple)

coef(mod04RateCSimple) - coef(mod04RateRSimple) # 0.0061807584 -0.0009279592 coef(mod04RateCSimple) - coef(modGLM) # 0.0055274138 -0.0008136509 coef(mod04RateRSimple) - coef(modGLM) # -0.0006533446 0.0001143083

Second test with glm

1. we get the preprocessed stats for another simple model with time windows

FormulaSimple2 <- callsDependent ~ 1 + indeg(callNetwork, window = 300) prepData2 <- estimate( FormulaSimple2, model = "DyNAM", subModel = "rate", preprocessingOnly = T )

2. we calculate the exposure times and the stats for each dependent event

Nexo2 <- length(prepData2$rightCensoredIntervals)

this time we need to calculate window updates

times2 <- rep(0,(Ndep+Nexo2)Na ) indegs_window <- rep(0,(Ndep+Nexo2)Na ) outcomes2 <- rep(0,(Ndep+Nexo2)*Na ) cptdep <- 1 cptexo <- 1

updates_dep <- prepData2$dependentStatsChange updates_rc <- prepData2$dependentStatsChange current_stats <- rep(0,Na) for(e in 1:(Ndep+Nexo2)){

dep or exo?

isdep <- prepData2$orderEvents[e] == 1 if(isdep) {

update windowed stat

up <- updates_dep[[cptdep]][[1]]
for(a in 1:Na) {
  if(!is.null(up) && any(up[,1] == a))
    current_stats[a] <- up[max(which(up[,1] == a)),3]
}
indegs_window[((e-1)*Na+1) : (e*Na)] <- current_stats
times2[((e-1)*Na+1) : (e*Na)] <- prepData2$intervals[cptdep]
outcomes2[(e-1)*Na + prepData2$eventSender[e]] <- 1
cptdep <- cptdep + 1

} if(!isdep) {

update windowed stat

up <- updates_rc[[cptexo]][[1]]
for(a in 1:Na) {
  if(!is.null(up) && any(up[,1] == a))
    current_stats[a] <- up[max(which(up[,1] == a)),3]
}
indegs_window[((e-1)*Na+1) : (e*Na)] <- current_stats
times2[((e-1)*Na+1) : (e*Na)] <- prepData2$rightCensoredIntervals[cptexo]
cptexo <- cptexo + 1

} }

3. check glm results

df2 <- data.frame(times = times2, logtimes = log(times2), indegs_window, outcomes = outcomes2) df2[1:(Na+1),1] <- 1 df2[1:(Na+1),2] <- 0 df2 <- df2[1:((Ndep+Nexo2-2)*Na),] modGLM2 <- glm(outcomes ~ offset(logtimes) + indegs_window, family = poisson(link = "log"), data = df2) coef(modGLM2)

4. check previous models

mod04RateCSimple2 <- estimate( FormulaSimple2, model = "DyNAM", subModel = "rate", estimationInit = list(engine = "default_c", startTime = 1220733469) ) coef(mod04RateCSimple2)

mod04RateRSimple2 <- estimate( FormulaSimple2, model = "DyNAM", subModel = "rate", estimationInit = list(engine = "default", startTime = 1220733469) ) coef(mod04RateRSimple2)

coef(mod04RateCSimple2) - coef(mod04RateRSimple2) # 0.1557715 -0.9546496 coef(mod04RateCSimple2) - coef(modGLM2) # 0.1444659 -1.5702790 coef(mod04RateRSimple2) - coef(modGLM2) # -0.0113056 -0.6156294

`

auzaheta commented 2 years ago

Hopefully, this simulation helps with the discussion @stadtfeldethz and @marion-hoffman. It only has one exogenous event.

### small simulation
actDf <- data.frame(
  label = LETTERS,
  xVar = rbinom(length(LETTERS), 1, 0.2)
) 

xVarChange <- rbinom(length(LETTERS), 1, 0.7)

X <- cbind(1, actDf$xVar)

endTime <- 30
exEventTime <- 15
parms <- c(log(1000 / endTime / length(LETTERS)), 1 )

expXb <- exp(X %*% parms)
sumRate <- sum(expXb)

time <- 0
events <- NULL
dfglm <- NULL
first <- TRUE

while (time < endTime) {
  # cat("Time:", time, "\n")
  timeEvent <- rexp(1, sumRate)
  if (first && time + timeEvent > exEventTime) {
    dfglm <- rbind(
      dfglm,
      data.frame(
        time = time,
        diffTime = exEventTime - time,
        xVar = X[, 2],
        outcomes = 0 
      )
    )

    X <- cbind(1, xVarChange)
    expXb <- exp(X %*% parms)
    sumRate <- sum(expXb)
    time <- exEventTime
    first <- FALSE
  } else {
    time <- time + timeEvent
    sender <- sample(LETTERS, 1, prob = expXb)
    receiver <- sample(LETTERS[sender != LETTERS], 1)
    events <- rbind(
      events,
      data.frame(
        time = time,
        sender = sender,
        receiver = receiver,
        increment = 1
      )
    )

    dfglm <- rbind(
      dfglm,
      data.frame(
        time = time,
        diffTime = timeEvent,
        xVar = X[, 2],
        outcomes = 1 * (LETTERS == sender) 
      )
    )
  }
}

changeAttr <- data.frame(
  time = exEventTime,
  node = LETTERS,
  replace = xVarChange
)

actDf <- defineNodes(actDf) |> 
  linkEvents(changeEvents = changeAttr, attribute = "xVar")

netEvents <- defineNetwork(nodes = actDf) |> 
  linkEvents(changeEvents = events, nodes = actDf)

depEvents <- defineDependentEvents(events, nodes = actDf,
                                   defaultNetwork = netEvents)

# model exclude last event
preproTest <- estimate(depEvents ~ 1 + ego(actDf$xVar),
                       model = "DyNAM", subModel = "rate",
                       estimationInit = list(startTime = 0, endTime = 30))
summary(modTest)
confint(modTest)

# model exclude last event
dfglm$logtimes <- log(dfglm$diffTime)
modTestGLM <-  glm(outcomes ~ offset(logtimes) + xVar,
                   family = poisson(link = "log"),
                   data = dfglm[seq.int(1, (nrow(events)) * length(LETTERS)), ])
summary(modTestGLM)
confint(modTestGLM)

library(broom)
library(pixiedust)
tidy(modTestGLM, conf.int = TRUE)
tidy(modTest, conf.int = TRUE)

From one run

> library(broom)
> library(pixiedust)
> parms
[1] 0.2484614 1.0000000
> tidy(modTestGLM, conf.int = TRUE) |> 
+   dust() |> 
+   sprinkle(col = c(2:4, 6:7), round = 6) |> 
+   sprinkle(col = 5, fn = quote(pvalString(value)))
         term estimate std.error statistic p.value conf.low conf.high
1 (Intercept) 0.224695  0.045268  4.963687 < 0.001 0.134638  0.312124
2        xVar 1.011514  0.052867 19.133289 < 0.001 0.908709  1.115992

> tidy(modTest, conf.int = TRUE) |> 
+   dust() |> 
+   sprinkle(col = c(2:4, 6:7), round = 6) |> 
+   sprinkle(col = 5, fn = quote(pvalString(value)))
            term estimate std.error statistic p.value conf.low conf.high
1      Intercept 0.224695  0.045268  4.963685 < 0.001 0.135972  0.313419
2 actDf xVar ego 1.011514  0.052867 19.133283 < 0.001 0.907897  1.115131

> coef(modTestGLM) - coef(modTest)
  (Intercept)          xVar 
-3.640075e-10  3.640201e-10

Seems like the last time interval during preprocessing is calculated wrong and produce the differences between glm and goldfish estimations.

stocnet / goldfish

`C` and `R` engine results differ for rate model #71

First test with glm

1. we get the preprocessed stats for a simple model

2. we calculate the exposure timesand the stats for each dependent event

we only need to calculate the update of the friendship net between the first and second event

dep or exo?

3. check glm results

df <- df[(Na+1):((Ndep+Nexo)*Na),]

4. check previous models

Second test with glm

1. we get the preprocessed stats for another simple model with time windows

2. we calculate the exposure times and the stats for each dependent event

this time we need to calculate window updates

dep or exo?

update windowed stat

update windowed stat

3. check glm results

4. check previous models