OSCA-source / OSCA.multisample

The multi-sample subbook of OSCA
5 stars 7 forks source link

'"CD3D" %in% head(demo$Symbol) is not TRUE' error in BioC 3.14, 3.15, and 3.16 #6

Closed hpages closed 2 years ago

hpages commented 2 years ago

https://bioconductor.org/checkResults/3.14/books-LATEST/OSCA.multisample/nebbiolo2-buildsrc.html https://bioconductor.org/checkResults/3.15/books-LATEST/OSCA.multisample/nebbiolo1-buildsrc.html https://bioconductor.org/checkResults/3.16/books-LATEST/OSCA.multisample/nebbiolo2-buildsrc.html

I've started to look into this. Using this issue as a convenient way to share my progress.

First milestone: I managed to extract the code from tenx-filtered-pbmc3k-4k-8k.Rmd and using-corrected-values.Rmd that lead to this error. This allows me to reproduce the error in about 5 min. on my laptop (Ubuntu 22.04 LTS, 16Gb of RAM):

# Required packages: igraph, uwot, TENxPBMCData, scater, scran, BiocSingular, batchelor.

# Data loading.
library(TENxPBMCData)
all.sce <- list(
    pbmc3k=TENxPBMCData('pbmc3k'),
    pbmc4k=TENxPBMCData('pbmc4k'),
    pbmc8k=TENxPBMCData('pbmc8k')
)

# Quality control.
library(scater)
for (n in names(all.sce)) {
    current <- all.sce[[n]]
    is.mito <- grep("MT", rowData(current)$Symbol_TENx)
    stats <- perCellQCMetrics(current, subsets=list(Mito=is.mito))
    high.mito <- isOutlier(stats$subsets_Mito_percent, type="higher")
    all.sce[[n]] <- current[,!high.mito]
}

# Normalization.
all.sce <- lapply(all.sce, logNormCounts)

# Variance modelling.
library(scran)
all.dec <- lapply(all.sce, modelGeneVar)
all.hvgs <- lapply(all.dec, getTopHVGs, prop=0.1)

# Dimensionality reduction.
library(BiocSingular)
set.seed(10000)
all.sce <- mapply(FUN=runPCA, x=all.sce, subset_row=all.hvgs,
    MoreArgs=list(ncomponents=25, BSPARAM=RandomParam()),
    SIMPLIFY=FALSE)

set.seed(100000)
all.sce <- lapply(all.sce, runTSNE, dimred="PCA")

set.seed(1000000)
all.sce <- lapply(all.sce, runUMAP, dimred="PCA")

# Clustering.
for (n in names(all.sce)) {
    g <- buildSNNGraph(all.sce[[n]], k=10, use.dimred='PCA')
    clust <- igraph::cluster_walktrap(g)$membership
    colLabels(all.sce[[n]])  <- factor(clust)
}

# Intersecting the common genes.
universe <- Reduce(intersect, lapply(all.sce, rownames))
all.sce2 <- lapply(all.sce, "[", i=universe,)
all.dec2 <- lapply(all.dec, "[", i=universe,)

# Renormalizing to adjust for differences in depth.
library(batchelor)
normed.sce <- do.call(multiBatchNorm, all.sce2)

# Identifying a set of HVGs using stats from all batches.
combined.dec <- do.call(combineVar, all.dec2)
combined.hvg <- getTopHVGs(combined.dec, n=5000)

set.seed(1000101)
merged.pbmc <- do.call(fastMNN, c(normed.sce,
    list(subset.row=combined.hvg, BSPARAM=RandomParam())))

g <- buildSNNGraph(merged.pbmc, use.dimred="corrected")
colLabels(merged.pbmc) <- factor(igraph::cluster_louvain(g)$membership)

all.sce2 <- lapply(all.sce2, function(x) {
    rowData(x) <- rowData(all.sce2[[1]])
    x
})
combined <- do.call(cbind, all.sce2)
combined$batch <- rep(c("3k", "4k", "8k"), vapply(all.sce2, ncol, 0L))
clusters.mnn <- colLabels(merged.pbmc)

# Marker detection with block= set to the batch factor.
library(scran)
m.out <- findMarkers(combined, clusters.mnn, block=combined$batch,
    direction="up", lfc=1, row.data=rowData(combined)[,3,drop=FALSE])

demo <- m.out[["1"]]

# Checking that the genes  are there.
stopifnot("CD3D" %in% head(demo$Symbol))

sessionInfo():

R version 4.2.0 Patched (2022-05-04 r82318)
Platform: x86_64-pc-linux-gnu (64-bit)
Running under: Ubuntu 22.04 LTS

Matrix products: default
BLAS:   /home/hpages/R/R-4.2.r82318/lib/libRblas.so
LAPACK: /home/hpages/R/R-4.2.r82318/lib/libRlapack.so

locale:
 [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
 [3] LC_TIME=en_GB              LC_COLLATE=en_US.UTF-8    
 [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_US.UTF-8   
 [7] LC_PAPER=en_US.UTF-8       LC_NAME=C                 
 [9] LC_ADDRESS=C               LC_TELEPHONE=C            
[11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       

attached base packages:
[1] stats4    stats     graphics  grDevices utils     datasets  methods  
[8] base     

other attached packages:
 [1] batchelor_1.13.0            BiocSingular_1.13.0        
 [3] scran_1.25.0                scater_1.25.1              
 [5] ggplot2_3.3.6               scuttle_1.7.0              
 [7] TENxPBMCData_1.15.0         HDF5Array_1.25.0           
 [9] rhdf5_2.41.0                DelayedArray_0.23.1        
[11] Matrix_1.4-1                SingleCellExperiment_1.19.0
[13] SummarizedExperiment_1.27.1 Biobase_2.57.0             
[15] GenomicRanges_1.49.0        GenomeInfoDb_1.33.3        
[17] IRanges_2.31.0              S4Vectors_0.35.0           
[19] BiocGenerics_0.43.0         MatrixGenerics_1.9.0       
[21] matrixStats_0.62.0         

loaded via a namespace (and not attached):
  [1] Rtsne_0.16                    ggbeeswarm_0.6.0             
  [3] colorspace_2.0-3              ellipsis_0.3.2               
  [5] bluster_1.7.0                 XVector_0.37.0               
  [7] BiocNeighbors_1.15.0          ggrepel_0.9.1                
  [9] bit64_4.0.5                   interactiveDisplayBase_1.35.0
 [11] AnnotationDbi_1.59.0          RSpectra_0.16-1              
 [13] fansi_1.0.3                   codetools_0.2-18             
 [15] sparseMatrixStats_1.9.0       cachem_1.0.6                 
 [17] ResidualMatrix_1.7.0          cluster_2.1.3                
 [19] dbplyr_2.1.1                  png_0.1-7                    
 [21] uwot_0.1.11                   shiny_1.7.1                  
 [23] BiocManager_1.30.17           compiler_4.2.0               
 [25] httr_1.4.3                    dqrng_0.3.0                  
 [27] assertthat_0.2.1              fastmap_1.1.0                
 [29] limma_3.53.0                  cli_3.3.0                    
 [31] later_1.3.0                   htmltools_0.5.2              
 [33] tools_4.2.0                   rsvd_1.0.5                   
 [35] igraph_1.3.1                  gtable_0.3.0                 
 [37] glue_1.6.2                    GenomeInfoDbData_1.2.8       
 [39] dplyr_1.0.9                   rappdirs_0.3.3               
 [41] Rcpp_1.0.8.3                  vctrs_0.4.1                  
 [43] Biostrings_2.65.0             rhdf5filters_1.9.0           
 [45] ExperimentHub_2.5.0           DelayedMatrixStats_1.19.0    
 [47] beachmat_2.13.0               mime_0.12                    
 [49] lifecycle_1.0.1               irlba_2.3.5                  
 [51] statmod_1.4.36                AnnotationHub_3.5.0          
 [53] edgeR_3.39.1                  zlibbioc_1.43.0              
 [55] scales_1.2.0                  promises_1.2.0.1             
 [57] parallel_4.2.0                yaml_2.3.5                   
 [59] curl_4.3.2                    memoise_2.0.1                
 [61] gridExtra_2.3                 RSQLite_2.2.14               
 [63] BiocVersion_3.16.0            ScaledMatrix_1.5.0           
 [65] filelock_1.0.2                BiocParallel_1.31.3          
 [67] rlang_1.0.2                   pkgconfig_2.0.3              
 [69] bitops_1.0-7                  lattice_0.20-45              
 [71] purrr_0.3.4                   Rhdf5lib_1.19.0              
 [73] bit_4.0.4                     tidyselect_1.1.2             
 [75] RcppAnnoy_0.0.19              magrittr_2.0.3               
 [77] R6_2.5.1                      generics_0.1.2               
 [79] metapod_1.5.0                 DBI_1.1.2                    
 [81] pillar_1.7.0                  withr_2.5.0                  
 [83] KEGGREST_1.37.0               RCurl_1.98-1.6               
 [85] tibble_3.1.7                  crayon_1.5.1                 
 [87] utf8_1.2.2                    BiocFileCache_2.5.0          
 [89] viridis_0.6.2                 locfit_1.5-9.5               
 [91] grid_4.2.0                    blob_1.2.3                   
 [93] FNN_1.1.3                     digest_0.6.29                
 [95] xtable_1.8-4                  httpuv_1.6.5                 
 [97] munsell_0.5.0                 beeswarm_0.4.0               
 [99] viridisLite_0.4.0             vipor_0.4.5                  
vjcitn commented 2 years ago

Just a guess on my part: demo <- m.out[["1"]] is using the label name "1" to identify a cluster of cells that is not the intended one (in 3.14-3.16). "1" was the name of the cluster that has strong expression of CD3D in some previous version. What changed to give the desired cluster a different name is the mystery. (It would be good to go over the other elements of m.out and find which ones have (or which one has) CD3D in the head of the Symbol field.)

hpages commented 2 years ago

Exactly. This is what m.out[["1"]] used to look like (about 1 year ago):

demo <- m.out[["1"]]
as.data.frame(demo[1:10,c("Symbol", "Top", "p.value", "FDR")])
#                 Symbol Top       p.value           FDR
# ENSG00000172116   CD8B   1 7.352025e-100  8.831478e-97
# ENSG00000167286   CD3D   1 1.545028e-204 4.825433e-201
# ENSG00000111716   LDHB   1 5.445106e-146 9.447864e-143
# ENSG00000213741  RPS29   1  0.000000e+00  0.000000e+00
# ENSG00000171858  RPS21   1  0.000000e+00  0.000000e+00
# ENSG00000171223   JUNB   1 8.879883e-235 4.622275e-231
# ENSG00000177954  RPS27   2 1.045283e-296 6.529257e-293
# ENSG00000153563   CD8A   2  1.000000e+00  1.000000e+00
# ENSG00000136942  RPL35   2  0.000000e+00  0.000000e+00
# ENSG00000198851   CD3E   2 6.772585e-174 1.410143e-170

This is what we actually see in OSCA.multisample 1.1.0, the last successful build of OSCA.multisample (see "3.3 After blocking on the batch" section here).

But at some point something changed and today's m.out[["1"]] looks like this:

demo <- m.out[["1"]]
as.data.frame(demo[1:10,c("Symbol", "Top", "p.value", "FDR")])
#                 Symbol Top       p.value           FDR
# ENSG00000177606    JUN   1 3.830494e-203 1.495425e-199
# ENSG00000196154 S100A4   1 3.113656e-260 1.389225e-256
# ENSG00000227507    LTB   1  0.000000e+00  0.000000e+00
# ENSG00000008517   IL32   1  0.000000e+00  0.000000e+00
# ENSG00000171858  RPS21   1  0.000000e+00  0.000000e+00
# ENSG00000124614  RPS10   2  0.000000e+00  0.000000e+00
# ENSG00000135046  ANXA1   2  1.062389e-56  1.276174e-53
# ENSG00000167286   CD3D   2 1.883686e-277 9.805215e-274
# ENSG00000111716   LDHB   2 1.080598e-168 2.812436e-165
# ENSG00000170345    FOS   2 1.861859e-202 6.461063e-199

A very different cluster! (even though CD3D is still here but is no longer in the head).

And the old m.out[["1"]] can be seen in today's m.out[["10"]]:

demo <- m.out[["10"]]
as.data.frame(demo[1:10,c("Symbol", "Top", "p.value", "FDR")])
#                 Symbol Top       p.value           FDR
# ENSG00000172116   CD8B   1 6.165199e-101  7.405827e-98
# ENSG00000167286   CD3D   1 5.348343e-205 1.855994e-201
# ENSG00000111716   LDHB   1 4.784764e-149 8.790456e-146
# ENSG00000213741  RPS29   1  0.000000e+00  0.000000e+00
# ENSG00000171858  RPS21   1  0.000000e+00  0.000000e+00
# ENSG00000171223   JUNB   1 1.023935e-233 4.568506e-230
# ENSG00000177954  RPS27   2 2.482226e-294 1.550498e-290
# ENSG00000153563   CD8A   2  1.000000e+00  1.000000e+00
# ENSG00000124614  RPS10   2  0.000000e+00  0.000000e+00
# ENSG00000198851   CD3E   2 1.518088e-173 3.160861e-170

Looks like the clustering is taken care of by igraph::cluster_walktrap and/or igraph::cluster_louvain and it seems that these functions have changed in recent versions of igraph. If I downgrade igraph to version 1.2.6 (this is the version that was used to build OSCA.multisample 1.1.0), then I get an m.out[["1"]] that looks like the old m.out[["1"]] again.

So what should we do? Replace m.out[["1"]] with m.out[["10"]] or replace the CD3D/CD8B pair with the S100A4/JUN pair?

@LTLA ?

H.

vjcitn commented 2 years ago

Nice work. I think it would be appropriate to replace "1" by "10" at this point. But beyond this I think it would be a good idea to come up with labels that have semantic value. I don't have a general approach, but a step in this direction would be to take a digest of the top 10 gene symbols in each cluster, for example. Use those digests as the names of the components of m.out. The clustering procedure might permute the labels "1", ..., "10" for whatever reason, but we would superimpose labels that are based on the genes, and interrogate the list using digest-based labels. The digest-based label might also map to a "cell type". This should work if the only change between versions concerns permutation of the labels. If in addition the clustering/analysis procedure does not preserve the ordered gene lists, the digests in the previous version won't be present and an error would be thrown. This could obviate the need for the stopifnot() clauses.

LTLA commented 2 years ago

Yes, replacing 1->10 would be the immediate fix.

The general solution would be what Vince described, and is just something that we have to accommodate when dependencies inevitably change from under us. (In this case, igraph's updates.)

A possibly elegant solution would be to implement a chooseCluster() function somewhere that chooses a cluster based on its top markers. We could then replace all hard-coded cluster numbers with the output of chooseCluster(), run on the marker genes. Similarly, the inline text could use the output of chooseCluster() to make it look like we knew which cluster we wanted all along.

There are still some stop()s unrelated to marker genes, though those should be much less prominent.

hpages commented 2 years ago

Definitely a change in the ordering of the clusters returned by igraph::cluster_louvain(). With the following graph:

library(igraph)
g <- make_full_graph(4) %du% make_full_graph(2) %du% make_full_graph(8) %du% make_full_graph(5)
g <- add_edges(g, c(6, 15))

igraph 1.2.6 produces:

igraph::cluster_louvain(g)
# IGRAPH clustering multi level, groups: 3, mod: 0.54
# + groups:
#   $`1`
#   [1] 1 2 3 4
#  
#   $`2`
#   [1]  7  8  9 10 11 12 13 14
#   
#   $`3`
#   [1]  5  6 15 16 17 18 19

igraph 1.3.1 produces:

igraph::cluster_louvain(g)
# IGRAPH clustering multi level, groups: 3, mod: 0.54
# + groups:
#   $`1`
#   [1] 1 2 3 4
#   
#   $`2`
#   [1]  5  6 15 16 17 18 19
#   
#   $`3`
#   [1]  7  8  9 10 11 12 13 14

Also with graphs with thousands of nodes (like in the OSCA.multisample book), the function can produce clusters that are slightly different between the 2 versions of igraph:

g <- buildSNNGraph(all.sce[[n]], k=10, use.dimred='PCA')  # graph with 14948 nodes and 1612016 edges
mbship <- igraph::cluster_walktrap(g)$membership

igraph 1.2.6 produces:

sort(table(mbship))
# mbship
#    6   10    7   14    8   11   12    3    1    2    4   13    5    9 
#   23  107  155  225  331  542  623 1100 1325 1708 1852 2040 2194 2723 

igraph 1.3.1 produces:

sort(table(mbship))
# mbship
#   13   14    9   12   11    3    4    5   10    1    6    2    7    8 
#   23  103  155  225  359  525  623 1096 1324 1707 1855 2040 2197 2716 

For example cluster "1" in 1.2.6 became cluster "10" in 1.3.1 and lost one node.

Anyways, I'll do the 1->10 replacement in the book.

Thanks @vjcitn @LTLA for the feedback!

H.