Closed ArtPoon closed 10 months ago
Script to run models under v1.0 code:
require(dplyr)
source("MountainPlot/comp_Lib.R")
iFile <- "data/chato2020/naFull-subB.tn93.csv"
iG <- impTN93(iFile) # this takes a while
steps <- head(hist(subset(iG$e, Distance<0.05)$Distance, plot=FALSE)$breaks, -5)
cutoffs <- seq(0 , max(steps), max(steps)/50)
gs <- lapply(cutoffs, function(d) {dFilt(iG, d)})
res <- lapply(gs, function(subG) {compAnalyze(subG)})
names(res) <- cutoffs
Note I modified the impTN93
function in comp_Lib.R
to convert dates to years:
el <- data.frame(ID1=as.character(temp1), t1=year(as.Date(temp2)),
ID2=as.character(temp3), t2=year(as.Date(temp4)),
Distance = as.numeric(idf$Distance), stringsAsFactors= F)
Script to fit models to same data with current main
branch code:
require(ape)
setwd('~/git/clustuneR')
edge.info <- read.csv("data/chato2020/naFull-subB.tn93.csv")
source("R/sequence.setup.R")
headers <- unique(c(edge.info$ID1, edge.info$ID2))
seq.info <- parse.headers(headers, sep="_", var.names = c("id", "coldate", "subtype"),
var.transformations = c(as.character, as.Date, as.factor))
seq.info$subtype <- NULL
seq.info$colyear <- year(seq.info$coldate)
seq.info$coldate <- NULL
which.new <- seq.info$colyear == max(seq.info$colyear)
source("R/graph.setup.R")
obj <- read.edges(seq.info, edge.info, which.new)
source("R/graph.clustering.R")
source("R/analysis.R")
param.list <- list()
vals <- seq(0, 0.04, 8e-4)
for (i in 1:length(vals)) {
param.list[[i]] <- list(dist.thresh=vals[i])
}
cluster.data <- multi.cluster(obj, param.list, component.cluster)
ptrans <- list(colyear=mean)
pmods <- list(
"NullModel" = function(x) {
glm(Growth ~ Size, data=x, family="poisson")
},
"AltModel" = function(x) {
glm(Growth ~ Size + colyear, data=x, family="poisson")
})
model.fits <- fit.analysis(cluster.data, ptrans, pmods)
> res[[10]]$ageFit
Call: glm(formula = Growth ~ Pred, family = "poisson", data = df1)
Coefficients:
(Intercept) Pred
-3.115 3.827
Degrees of Freedom: 520 Total (i.e. Null); 519 Residual
Null Deviance: 304.4
Residual Deviance: 218.6 AIC: 284.7
> model.fits$AltModel[[10]]
Call: glm(formula = Growth ~ Size + colyear, family = "poisson", data = x)
Coefficients:
(Intercept) Size colyear
-540.6831 0.3290 0.2674
Degrees of Freedom: 520 Total (i.e. Null); 518 Residual
Null Deviance: 304.4
Residual Deviance: 211.1 AIC: 279.2
> old.data <- res[[10]]$ageFit$data
> summary(old.data$Pred)
Min. 1st Qu. Median Mean 3rd Qu. Max.
0.02221 0.03690 0.08283 0.09143 0.11043 1.38383
> hist(old.data$Pred)
> new.data <- model.fits$AltModel[[10]]$data
> table(new.data$colyear)
2007 2007.5 2008 2008.5
36 1 105 2
2008.66666666667 2008.77777777778 2009 2009.2
2 1 103 1
2009.5 2009.6 2009.66666666667 2010
7 1 4 76
2010.125 2010.375 2010.5 2010.5625
1 1 7 1
2010.66666666667 2010.9 2011 2011.25
1 1 85 1
2011.5 2012
1 83
Analyze original edge density model in comp_Lib.R
compAnalyze
function:
# retrieve object from list
> subG <- gs[[10]]
# start stepping through compAnalyze
> dMax <- max(subG$e$Distance)
> tTab <- table(subG$f$tMax)
> vTab <- table(subG$v$Time)
> eTab <- sapply(as.numeric(names(vTab)), function(t){
+ nrow(subset(subG$e, (t1==t|t2==t)))
+ })
> names(eTab) <- names(vTab)
subG$e
is an edge list filtered by the distance threshold:
> head(subG$e)
ID1 t1 ID2 t2 Distance tMax tDiff
18957 KU190058.1 2007 KU190059.1 2007 0.004946230 2007 0
124296 KU190037.1 2007 KU190038.1 2007 0.004938560 2007 0
183995 KU190068.1 2007 KU190069.1 2007 0.002956920 2007 0
4 KU190152.1 2008 KU190153.1 2008 0.001970860 2008 0
38218 KU190084.1 2008 KU190099.1 2008 0.000985752 2008 0
44769 KU190033.1 2007 KU190107.1 2008 0.000985057 2008 1
> summary(subG$e$Distance)
Min. 1st Qu. Median Mean 3rd Qu. Max.
0.000000 0.001970 0.003944 0.003574 0.005929 0.006954
subG$f
is another edge list with no distance thresholding, but excluding edges to the most recent cases and with a minimum time difference of 1 between cases:
> head(subG$f)
ID1 t1 ID2 t2 Distance tMax tDiff
1 KU190052.1 2007 KU190175.1 2008 0.0169226 2008 1
2 KU190069.1 2007 KU190152.1 2008 0.0261166 2008 1
3 KU190058.1 2007 KU190217.1 2008 0.0149402 2008 1
4 KU190042.1 2007 KU190196.1 2008 0.0334008 2008 1
5 KU190041.1 2007 KU190106.1 2008 0.0428548 2008 1
6 KU190053.1 2007 KU190129.1 2008 0.0312729 2008 1
> table(subG$e$tDiff)
0 1 2 3 4 5
98 104 71 37 30 12
> table(subG$f$tDiff)
1 2 3 4 5
358 179 83 30 2
> table(subG$e$tMax)
2007 2008 2009 2010 2011 2012 2013
3 10 37 55 80 121 46
> table(subG$f$tMax)
2008 2009 2010 2011 2012
142 142 114 124 130
> summary(subG$f$Distance)
Min. 1st Qu. Median Mean 3rd Qu. Max.
0.000000 0.009938 0.019964 0.020968 0.031206 0.065128
ageD <- bind_rows(lapply(as.numeric(names(tTab)), function(t) {
temp <- subset(subG$f, tMax==t)
dfs <- split(temp, temp$tDiff)
Positive <- sapply(dfs, function(df){length(which(df$Distance<=dMax))})
vTotal <- rep((vTab[[as.character(t)]]),length(dfs))
tDiff <- as.numeric(names(Positive))
oeDens <- sapply(tDiff, function(tD){
oTime <- t-tD
return(eTab[as.character(oTime)]/vTab[as.character(oTime)])
})
res <- data.frame(Positive=as.numeric(Positive), vTotal=vTab[[as.character(t)]], oeDens=oeDens, tDiff)
return(res)
}))
tTab
is a table of max time (tMax
) values in edgelist subG$f
temp
filters rows with a given tMax
dfs
is a list of rows by tDiff
, i.e., number of years before tMax
for second case (edge parent)Positive
counts the number of edges in dfs
with distance below thresholdCurrent unit test for fit.decay
:
> # load test fixture
> tn93 <- read.csv(test_path("test2.tn93.csv"))
> seqs <- ape::read.FASTA(test_path("test2.fasta"))
> seq.info <- parse.headers(
+ names(seqs), var.names=c('accn', 'coldate', 'subtype'),
+ var.transformations=c(as.character, as.Date, as.factor))
> seq.info$colyear <- data.table::year(seq.info$coldate)
> which.new <- (seq.info$colyear >= 2012)
> obj <- read.edges(tn93, seq.info, which.new)
> edges <- obj$edge.info[obj$edge.info$Distance < 0.02, ]
> times <- obj$seq.info$colyear
> result <- fit.decay(edges, times)
> result$data
dt positives total
1 1 2 19
2 2 1 10
3 3 0 6
4 4 0 2
where dt
is time difference, positives
is the number of bipartite edges, and total
is the total number of possible bipartite edges.
Running the same test data through the original v1.0 code:
> require(dplyr)
> source("MountainPlot/comp_Lib.R")
> iFile <- "tests/testthat/test2.tn93.csv"
> iG <- impTN93(iFile, minNS=1)
> subG <- dFilt(iG, 0.02) # current test case
>
> dMax <- max(subG$e$Distance)
> dMax
[1] 0.0199944
>
> tTab <- table(subG$f$tMax)
> tTab
2008 2009 2010 2011
4 2 1 1
>
> vTab <- table(subG$v$Time)
> vTab
2007 2008 2009 2010 2011 2012
2 4 2 1 1 4
>
> eTab <- sapply(as.numeric(names(vTab)), function(t){
+ nrow(subset(subG$e, (t1==t|t2==t)))
+ })
> names(eTab) <- names(vTab)
> eTab
2007 2008 2009 2010 2011 2012
1 5 7 3 0 3
> ageD <- bind_rows(lapply(as.numeric(names(tTab)), function(t) {
+ temp <- subset(subG$f, tMax==t)
+ dfs <- split(temp, temp$tDiff)
+
+ Positive <- sapply(dfs, function(df){length(which(df$Distance<=dMax))})
+ vTotal <- rep((vTab[[as.character(t)]]),length(dfs))
+ tDiff <- as.numeric(names(Positive))
+ oeDens <- sapply(tDiff, function(tD){
+ oTime <- t-tD
+ return(eTab[as.character(oTime)]/vTab[as.character(oTime)])
+ })
+
+
+ res <- data.frame(Positive=as.numeric(Positive), vTotal=vTab[[as.character(t)]], oeDens=oeDens, tDiff)
+ return(res)
+ }))
> ageD
Positive vTotal oeDens tDiff
2007 0 4 0.50 1
2008...2 2 2 1.25 1
2008...3 1 1 1.25 2
2008...4 0 1 1.25 3
> mod <- glm(cbind(Positive, vTotal) ~ tDiff+oeDens, data=ageD, family='binomial')
> subG$v$Weight <- predict(mod, type='response',
+ data.frame(tDiff=max(subG$v$Time)-subG$v$Time,
+ oeDens=as.numeric(eTab[as.character(subG$v$Time)]/vTab[as.character(subG$v$Time)])))
> subG$v
ID Time Weight
2 KU190031.1 2007 1.568653e-11
38 KU190840.1 2007 1.568653e-11
6 KU190147.1 2008 7.839703e-02
24 KU190209.1 2008 7.839703e-02
40 KU190111.1 2008 7.839703e-02
101 KU189996.1 2008 7.839703e-02
3 KU190253.1 2009 1.000000e+00
29 KU190227.1 2009 1.000000e+00
1 KU190389.1 2010 1.000000e+00
22 KU190560.1 2011 2.220446e-16
4 KU190613.1 2012 1.660245e-06
7 KU190602.1 2012 1.660245e-06
8 KU190617.1 2012 1.660245e-06
21 KU190609.1 2012 1.660245e-06
v1.0 GLMs are fit here: https://github.com/PoonLab/clustuneR/blob/ef294dcf1f4121f5ec389257a24faaf403da4a4f/scripts/comp_Lib.R#L238-L242
df2
is the null model, where growth is predicted from the cluster size as a proportion of the total number of known cases. Since the regression is over clusters, the total numbers are irrelevant and the same regression fit is obtained fromglm(Growth ~ c)
.However, something goes awry when we add cluster recency. In v1.0, we derived a predictor variable from the expected decay in the probability of an edge between cases separated by time $\delta t$ (see Chato et al. 2020, equations 3 and 4). In the current branch we are taking the collection year directly as a predictor variable.
The expected edge probability is not simply a transformation of collection year because it is a function of the distance threshold.
Originally posted by @ArtPoon in https://github.com/PoonLab/clustuneR/issues/12#issuecomment-1823412896