PoonLab / clustuneR

Implementing clustering algorithms on genetic data and finding optimal parameters through the performance of predictive growth models.
GNU General Public License v3.0

Restore edge probability decay model as an option for graph clustering #22

Closed: ArtPoon closed this issue 11 months ago

ArtPoon commented 1 year ago

v1.0 GLMs are fit here: https://github.com/PoonLab/clustuneR/blob/ef294dcf1f4121f5ec389257a24faaf403da4a4f/scripts/comp_Lib.R#L238-L242

df2 is the null model, where growth is predicted from cluster size expressed as a proportion of the total number of known cases. Since the regression is over clusters, the total is just a constant that rescales the predictor, so the same regression fit is obtained from glm(Growth ~ c).
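
A minimal sketch of that equivalence on toy data (df2, c and N are placeholders for the cluster table, cluster size and total number of known cases, assuming the Poisson family used elsewhere in this thread):

set.seed(1)
df2 <- data.frame(c = rpois(50, 5) + 1)        # hypothetical cluster sizes
df2$Growth <- rpois(50, 0.2 * df2$c)           # hypothetical cluster growth
N <- 1000                                      # total number of known cases

fit.prop <- glm(Growth ~ I(c / N), data = df2, family = "poisson")  # size as proportion
fit.size <- glm(Growth ~ c,        data = df2, family = "poisson")  # raw cluster size
all.equal(fitted(fit.prop), fitted(fit.size))  # TRUE: identical fitted values
all.equal(AIC(fit.prop), AIC(fit.size))        # TRUE: same fit; only the slope is rescaled by N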

However, something goes awry when we add cluster recency. In v1.0, we derived a predictor variable from the expected decay in the probability of an edge between cases separated by time $\delta t$ (see Chato et al. 2020, equations 3 and 4). In the current branch we are taking the collection year directly as a predictor variable.

The expected edge probability is not simply a transformation of collection year because it is a function of the distance threshold.
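
Schematically (paraphrasing the compAnalyze code excerpted later in this thread, not necessarily the exact parameterization of Chato et al. 2020), the v1.0 predictor is the fitted value of a binomial regression at a fixed distance cutoff $d_{\max}$:

$$\mathrm{logit}\, P(d \le d_{\max} \mid \delta t) = \beta_0 + \beta_1 \, \delta t + \beta_2 \, \rho$$

where $\rho$ is the observed edge density at the earlier time point (oeDens in the code below). Because the counts entering this fit depend on $d_{\max}$, the resulting predictor cannot be reproduced from collection year alone.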

Originally posted by @ArtPoon in https://github.com/PoonLab/clustuneR/issues/12#issuecomment-1823412896

ArtPoon commented 1 year ago

Script to run models under v1.0 code:

require(dplyr)
source("MountainPlot/comp_Lib.R")
iFile <- "data/chato2020/naFull-subB.tn93.csv"
iG <- impTN93(iFile)  # this takes a while

# derive a set of distance cutoffs from the empirical distance distribution
steps <- head(hist(subset(iG$e, Distance<0.05)$Distance, plot=FALSE)$breaks, -5)
cutoffs <- seq(0, max(steps), max(steps)/50)
gs <- lapply(cutoffs, function(d) {dFilt(iG, d)})  # filter the graph at each cutoff

res <- lapply(gs, function(subG) {compAnalyze(subG)})  # fit growth models at each cutoff
names(res) <- cutoffs

Note I modified the impTN93 function in comp_Lib.R to convert dates to years:

  el <- data.frame(ID1=as.character(temp1), t1=year(as.Date(temp2)),  # year() from lubridate or data.table
                   ID2=as.character(temp3), t2=year(as.Date(temp4)),
                   Distance=as.numeric(idf$Distance), stringsAsFactors=FALSE)

Script to fit models to the same data with the current main branch code:

require(ape)
setwd('~/git/clustuneR')
edge.info <- read.csv("data/chato2020/naFull-subB.tn93.csv")

source("R/sequence.setup.R")
headers <- unique(c(edge.info$ID1, edge.info$ID2))
seq.info <- parse.headers(headers, sep="_", var.names = c("id", "coldate", "subtype"),
                          var.transformations = c(as.character, as.Date, as.factor))
seq.info$subtype <- NULL
seq.info$colyear <- year(seq.info$coldate)  # year() requires lubridate or data.table
seq.info$coldate <- NULL

which.new <- seq.info$colyear == max(seq.info$colyear)

source("R/graph.setup.R")
obj <- read.edges(seq.info, edge.info, which.new)

source("R/graph.clustering.R")
source("R/analysis.R")
param.list <- list()
vals <- seq(0, 0.04, 8e-4)
for (i in 1:length(vals)) {
  param.list[[i]] <- list(dist.thresh=vals[i])
}
cluster.data <- multi.cluster(obj, param.list, component.cluster)

ptrans <- list(colyear=mean)  # per-cluster transformation: mean collection year of cluster members
pmods <- list(
  "NullModel" = function(x) {
    glm(Growth ~ Size, data=x, family="poisson")
  },
  "AltModel" = function(x) {
    glm(Growth ~ Size + colyear, data=x, family="poisson")
  })

model.fits <- fit.analysis(cluster.data, ptrans, pmods)
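
A possible follow-up (not part of the script above) would be to compare the two models by AIC across thresholds, assuming model.fits$NullModel and model.fits$AltModel are lists of fitted glm objects indexed like vals, as the outputs below suggest:

aic.null <- sapply(model.fits$NullModel, AIC)  # one AIC per distance threshold
aic.alt  <- sapply(model.fits$AltModel, AIC)
plot(vals, aic.null - aic.alt, type="b",
     xlab="distance threshold", ylab="AIC(null) - AIC(alt)")
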
ArtPoon commented 1 year ago
> res[[10]]$ageFit

Call:  glm(formula = Growth ~ Pred, family = "poisson", data = df1)

Coefficients:
(Intercept)         Pred  
     -3.115        3.827  

Degrees of Freedom: 520 Total (i.e. Null);  519 Residual
Null Deviance:      304.4 
Residual Deviance: 218.6    AIC: 284.7
> model.fits$AltModel[[10]]

Call:  glm(formula = Growth ~ Size + colyear, family = "poisson", data = x)

Coefficients:
(Intercept)         Size      colyear  
  -540.6831       0.3290       0.2674  

Degrees of Freedom: 520 Total (i.e. Null);  518 Residual
Null Deviance:      304.4 
Residual Deviance: 211.1    AIC: 279.2

ArtPoon commented 1 year ago
> old.data <- res[[10]]$ageFit$data
> summary(old.data$Pred)
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
0.02221 0.03690 0.08283 0.09143 0.11043 1.38383 
> hist(old.data$Pred)

[figure: histogram of old.data$Pred]

> new.data <- model.fits$AltModel[[10]]$data
> table(new.data$colyear)

            2007           2007.5             2008           2008.5 
              36                1              105                2 
2008.66666666667 2008.77777777778             2009           2009.2 
               2                1              103                1 
          2009.5           2009.6 2009.66666666667             2010 
               7                1                4               76 
        2010.125         2010.375           2010.5        2010.5625 
               1                1                7                1 
2010.66666666667           2010.9             2011          2011.25 
               1                1               85                1 
          2011.5             2012 
               1               83 
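
The fractional years presumably come from ptrans taking the mean colyear over the members of each cluster, e.g.:

mean(c(2008, 2009, 2009))  # hypothetical three-member cluster -> 2008.667
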
ArtPoon commented 1 year ago

[two attached figures]

ArtPoon commented 1 year ago

Analysis of the original edge density model in the compAnalyze function of comp_Lib.R:

# retrieve object from list
> subG <- gs[[10]]

# start stepping through compAnalyze
>   dMax <- max(subG$e$Distance)
>   tTab <- table(subG$f$tMax)
>   vTab <- table(subG$v$Time)
>   eTab <- sapply(as.numeric(names(vTab)), function(t){
+     nrow(subset(subG$e, (t1==t|t2==t)))
+   })
>   names(eTab) <- names(vTab)

subG$e is an edge list filtered by the distance threshold:

> head(subG$e)
              ID1   t1        ID2   t2    Distance tMax tDiff
18957  KU190058.1 2007 KU190059.1 2007 0.004946230 2007     0
124296 KU190037.1 2007 KU190038.1 2007 0.004938560 2007     0
183995 KU190068.1 2007 KU190069.1 2007 0.002956920 2007     0
4      KU190152.1 2008 KU190153.1 2008 0.001970860 2008     0
38218  KU190084.1 2008 KU190099.1 2008 0.000985752 2008     0
44769  KU190033.1 2007 KU190107.1 2008 0.000985057 2008     1
> summary(subG$e$Distance)
    Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
0.000000 0.001970 0.003944 0.003574 0.005929 0.006954

subG$f is another edge list with no distance thresholding, but excluding edges to the most recent cases and requiring a time difference of at least one year between cases:

> head(subG$f)
         ID1   t1        ID2   t2  Distance tMax tDiff
1 KU190052.1 2007 KU190175.1 2008 0.0169226 2008     1
2 KU190069.1 2007 KU190152.1 2008 0.0261166 2008     1
3 KU190058.1 2007 KU190217.1 2008 0.0149402 2008     1
4 KU190042.1 2007 KU190196.1 2008 0.0334008 2008     1
5 KU190041.1 2007 KU190106.1 2008 0.0428548 2008     1
6 KU190053.1 2007 KU190129.1 2008 0.0312729 2008     1
> table(subG$e$tDiff)

  0   1   2   3   4   5 
 98 104  71  37  30  12 
> table(subG$f$tDiff)

  1   2   3   4   5 
358 179  83  30   2 
> table(subG$e$tMax)

2007 2008 2009 2010 2011 2012 2013 
   3   10   37   55   80  121   46 
> table(subG$f$tMax)
2008 2009 2010 2011 2012 
 142  142  114  124  130
> summary(subG$f$Distance)
    Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
0.000000 0.009938 0.019964 0.020968 0.031206 0.065128

ArtPoon commented 1 year ago

The data frame used to fit the decay model (ageD) is constructed in compAnalyze as follows:

  # for each year t represented in subG$f, tabulate edges by time difference
  ageD <- bind_rows(lapply(as.numeric(names(tTab)), function(t) {
    temp <- subset(subG$f, tMax==t)
    dfs <- split(temp, temp$tDiff)
    # number of pairs within the distance cutoff, per time difference
    Positive <- sapply(dfs, function(df){length(which(df$Distance<=dMax))})
    vTotal <- rep((vTab[[as.character(t)]]), length(dfs))  # cases collected in year t
    tDiff <- as.numeric(names(Positive))
    # observed edge density (edges per case) at the earlier time point
    oeDens <- sapply(tDiff, function(tD){
      oTime <- t-tD
      return(eTab[as.character(oTime)]/vTab[as.character(oTime)])
    })
    res <- data.frame(Positive=as.numeric(Positive), vTotal=vTab[[as.character(t)]], oeDens=oeDens, tDiff)
    return(res)
  }))

ArtPoon commented 12 months ago

Current unit test for fit.decay:

>   # load test fixture
>   tn93 <- read.csv(test_path("test2.tn93.csv"))
>   seqs <- ape::read.FASTA(test_path("test2.fasta"))
>   seq.info <- parse.headers(
+     names(seqs), var.names=c('accn', 'coldate', 'subtype'),
+     var.transformations=c(as.character, as.Date, as.factor))
>   seq.info$colyear <- data.table::year(seq.info$coldate)
>   which.new <- (seq.info$colyear >= 2012)
>   obj <- read.edges(tn93, seq.info, which.new)
>   edges <- obj$edge.info[obj$edge.info$Distance < 0.02, ]
>   times <- obj$seq.info$colyear
>   result <- fit.decay(edges, times)
> result$data
  dt positives total
1  1         2    19
2  2         1    10
3  3         0     6
4  4         0     2

where dt is the time difference in years, positives is the number of bipartite edges observed at that time difference, and total is the total number of possible bipartite edges.
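
For reference, a binomial decay fit on these counts could look like the sketch below (the formula actually used inside fit.decay is not shown here and may differ, e.g. by including an edge-density term as in v1.0):

decay <- glm(cbind(positives, total - positives) ~ dt,
             data = result$data, family = binomial)
predict(decay, newdata = data.frame(dt = 1:4), type = "response")  # predicted edge probability by time lag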

Running the same test data through the original v1.0 code:

> require(dplyr)
> source("MountainPlot/comp_Lib.R")
> iFile <- "tests/testthat/test2.tn93.csv"
> iG <- impTN93(iFile, minNS=1)
> subG <- dFilt(iG, 0.02)  # current test case
> 
> dMax <- max(subG$e$Distance)
> dMax
[1] 0.0199944
> 
> tTab <- table(subG$f$tMax)
> tTab

2008 2009 2010 2011 
   4    2    1    1 
> 
> vTab <- table(subG$v$Time)
> vTab

2007 2008 2009 2010 2011 2012 
   2    4    2    1    1    4 
> 
> eTab <- sapply(as.numeric(names(vTab)), function(t){
+   nrow(subset(subG$e, (t1==t|t2==t)))
+ })
> names(eTab) <- names(vTab)
> eTab
2007 2008 2009 2010 2011 2012 
   1    5    7    3    0    3
>   ageD <- bind_rows(lapply(as.numeric(names(tTab)), function(t) {
+     temp <- subset(subG$f, tMax==t)
+     dfs <- split(temp, temp$tDiff)
+     
+     Positive <- sapply(dfs, function(df){length(which(df$Distance<=dMax))})
+     vTotal <- rep((vTab[[as.character(t)]]),length(dfs))
+     tDiff <- as.numeric(names(Positive))
+     oeDens <- sapply(tDiff, function(tD){
+       oTime <- t-tD
+       return(eTab[as.character(oTime)]/vTab[as.character(oTime)])
+     })
+     
+     
+     res <- data.frame(Positive=as.numeric(Positive), vTotal=vTab[[as.character(t)]], oeDens=oeDens, tDiff)
+     return(res)
+   }))
> ageD
         Positive vTotal oeDens tDiff
2007            0      4   0.50     1
2008...2        2      2   1.25     1
2008...3        1      1   1.25     2
2008...4        0      1   1.25     3
> mod <- glm(cbind(Positive, vTotal) ~ tDiff+oeDens, data=ageD, family='binomial')
> subG$v$Weight <- predict(mod, type='response',
+                          data.frame(tDiff=max(subG$v$Time)-subG$v$Time, 
+                                     oeDens=as.numeric(eTab[as.character(subG$v$Time)]/vTab[as.character(subG$v$Time)])))
> subG$v
            ID Time       Weight
2   KU190031.1 2007 1.568653e-11
38  KU190840.1 2007 1.568653e-11
6   KU190147.1 2008 7.839703e-02
24  KU190209.1 2008 7.839703e-02
40  KU190111.1 2008 7.839703e-02
101 KU189996.1 2008 7.839703e-02
3   KU190253.1 2009 1.000000e+00
29  KU190227.1 2009 1.000000e+00
1   KU190389.1 2010 1.000000e+00
22  KU190560.1 2011 2.220446e-16
4   KU190613.1 2012 1.660245e-06
7   KU190602.1 2012 1.660245e-06
8   KU190617.1 2012 1.660245e-06
21  KU190609.1 2012 1.660245e-06