Closed ixxmu closed 2 years ago
对应原版教程第10章http://bioconductor.org/books/release/OSCA/overview.html
“物以类聚,人以群分” 分群步骤即将基因表达(降维结果)相似的细胞归为同一个群体,往往对应一种特定的细胞类型或者细胞轨迹状态。从这一步开始,就可以开始叙述我们的生物学故事了~
笔记要点
1、clustering是一个显微镜 2、基于图聚类的分群 3、其它分群算法(k均值与层次聚类) 4、分群结果评价
可简单分为3步
sce.pbmc #来源参考原教程
igraph
包提供的Walktrap算法进行cluster的划分library(scran)
g <- buildSNNGraph(sce.pbmc, k=10, use.dimred = 'PCA')
clust <- igraph::cluster_walktrap(g)$membership
table(clust)
#clust
# 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16
#205 508 541 56 374 125 46 432 302 867 47 155 166 61 84 16
library(scater)
colLabels(sce.pbmc) <- factor(clust)
plotReducedDim(sce.pbmc, "TSNE", colour_by="label")
值得注意的是:t-SNE二维转换与cluster计算是基于分别PCA降维结果的独立计算的过程。经过上图的可视化呈现,可以起到相互验证的作用。但利用igraph包的系列可视化方式就是基于KNN产生的细胞间关系,进行排布的,如下代码。
set.seed(11000)
reducedDim(sce.pbmc, "force") <- igraph::layout_with_fr(g)
plotReducedDim(sce.pbmc, colour_by="label", dimred="force")
(1) Top n nearest neighbour的选择,即buildSNNGraph()
的k=
参数设置(*)
g.5 <- buildSNNGraph(sce.pbmc, k=5, use.dimred = 'PCA')
clust.5 <- igraph::cluster_walktrap(g.5)$membership
table(clust.5)
# 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22
#523 302 125 45 172 573 249 439 293 95 772 142 38 18 62 38 30 16 15 9 16 13
g.50 <- buildSNNGraph(sce.pbmc, k=50, use.dimred = 'PCA')
clust.50 <- igraph::cluster_walktrap(g.50)$membership
table(clust.50)
# 1 2 3 4 5 6 7 8 9 10
#869 514 194 478 539 944 138 175 89 45
(2)其它评价细胞间关联性(权重weight)的方法
g.num <- buildSNNGraph(sce.pbmc, use.dimred="PCA", type="number")
g.jaccard <- buildSNNGraph(sce.pbmc, use.dimred="PCA", type="jaccard")
(3)其它划分cluster的算法
igraph
包提供的各种方法clust.louvain <- igraph::cluster_louvain(g)$membership
clust.infomap <- igraph::cluster_infomap(g)$membership
clust.fast <- igraph::cluster_fast_greedy(g)$membership
clust.labprop <- igraph::cluster_label_prop(g)$membership
clust.eigen <- igraph::cluster_leading_eigen(g)$membership
Pipelines involving
scran
default to rank-based weights followed by Walktrap clustering. In contrast,Seurat
uses Jaccard-based weights followed by Louvain clustering.
bluster
包的pairwiseModularity()
函数进行如上的计算。唯一的区别是用比值ratio代替了差值。ratio <- pairwiseModularity(g, clust, as.ratio=TRUE)
dim(ratio)
library(pheatmap)
pheatmap(log2(ratio+1), cluster_rows=FALSE, cluster_cols=FALSE,
color=colorRampPalette(c("white", "blue"))(100))
如下图,理想情况是对角线的结果最明显,上三角的结果越小越好。
cluster.gr <- igraph::graph_from_adjacency_matrix(log2(ratio+1),
mode="upper", weighted=TRUE, diag=FALSE)
# Increasing the weight to increase the visibility of the lines.
set.seed(11001010)
plot(cluster.gr, edge.width=igraph::E(cluster.gr)$weight*5,
layout=igraph::layout_with_lgl)
kmeans()
函数set.seed(100)
clust.kmeans <- kmeans(reducedDim(sce.pbmc, "PCA"), centers=10)
table(clust.kmeans$cluster)
# 1 2 3 4 5 6 7 8 9 10
#548 46 408 270 539 199 148 783 163 881
#结合t-SNE进行可视化
colLabels(sce.pbmc) <- factor(clust.kmeans$cluster)
plotReducedDim(sce.pbmc, "TSNE", colour_by="label")
如上图 k均值的分群结果在t-SNE分布还是较为理想的。
ncells <- tabulate(clust.kmeans$cluster) #统计1-10群细胞数量
tab <- data.frame(wcss=clust.kmeans$withinss, ncells=ncells)
tab$rms <- sqrt(tab$wcss/tab$ncells)
tab
# wcss ncells rms
# 1 20626.970 548 6.135182
# 2 6530.806 46 11.915286
# 3 9899.650 408 4.925835
# 4 6429.640 270 4.879906
# 5 12413.267 539 4.798977
# 6 13467.078 199 8.226406
# 7 4718.144 148 5.646180
# 8 27010.471 783 5.873341
# 9 7703.558 163 6.874670
# 10 9469.524 881 3.278507
cent.tree <- hclust(dist(clust.kmeans$centers), "ward.D2")
plot(cent.tree)
cluster
包的相关函数,相关代码如下,具体原理可参考原教程。library(cluster)
set.seed(110010101)
gaps <- clusGap(reducedDim(sce.pbmc, "PCA"), kmeans, K.max=20) #耗时
best.k <- maxSE(gaps$Tab[,"gap"], gaps$Tab[,"SE.sim"])
best.k
# 8
clusterRows {bluster}
提供一种联合图聚类与k-均值聚类的方法,可明显的优势是相对于单纯图聚类大大提高了分析速度。set.seed(0101010)
kgraph.clusters <- clusterRows(reducedDim(sce.pbmc, "PCA"),
TwoStepParam(
first=KmeansParam(centers=1000), #1000个中心点
second=NNGraphParam(k=5) #5个最近邻居
)
)
table(kgraph.clusters)
# 1 2 3 4 5 6 7 8 9 10 11 12
#191 854 506 541 541 892 46 120 29 132 47 86
complete linkage aims to merge clusters with the smallest maximum distance between their elements, while Ward’s method aims to minimize the increase in within-cluster variance.
hclust()
函数实操sce.416b
#含有185个细胞的sce子集。获取方式见原教程
dist.416b <- dist(reducedDim(sce.416b, "PCA"))
tree.416b <- hclust(dist.416b, "ward.D2") #Ward’s method
# Making a prettier dendrogram.
library(dendextend)
tree.416b$labels <- seq_along(tree.416b$labels)
dend <- as.dendrogram(tree.416b, hang=0.1)
combined.fac <- paste0(sce.416b$block, ".",
sub(" .*", "", sce.416b$phenotype))
labels_colors(dend) <- c(
`20160113.wild`="blue",
`20160113.induced`="red",
`20160325.wild`="dodgerblue",
`20160325.induced`="salmon"
)[combined.fac][order.dendrogram(dend)]
plot(dend)
cutree()
函数,有两个参数,可分别设置。k=
表示想分成多少个cluster;h=
表示基于什么距离阈值(上图的纵坐标)分隔;cutree(dend,k=4)
cutree(dend,h=200)
dynamicTreeCut
包的cutreeDynamic()
函数,可自动寻找一个最佳的阈值,进行分群library(dynamicTreeCut)
# minClusterSize needs to be turned down for small datasets.
# deepSplit controls the resolution of the partitioning.
clust.416b <- cutreeDynamic(tree.416b, distM=as.matrix(dist.416b),
minClusterSize=10, deepSplit=1)
table(clust.416b)
# 1 2 3 4
#78 69 24 14
labels_colors(dend) <- clust.416b[order.dendrogram(dend)]
plot(dend)
silhouette()
函数计算sil <- silhouette(clust.416b, dist = dist.416b)
plot(sil)
approxSilhouette()
函数采用近似的方法计算所有细胞的轮廓系数。# Performing the calculations on the PC coordinates, like before.
sil.approx <- approxSilhouette(reducedDim(sce.pbmc, "PCA"), clusters=clust)
sil.data <- as.data.frame(sil.approx)
sil.data$closest <- factor(ifelse(sil.data$width > 0, clust, sil.data$other))
sil.data$cluster <- factor(clust)
ggplot(sil.data, aes(x=cluster, y=width, colour=closest)) +
ggbeeswarm::geom_quasirandom(method="smiley")
如下图,横坐标为既定的分群结果,每个点代表一个细胞,点的颜色即根据所有cluster所属细胞的轮廓系数标注的分群评价。如果轮廓系数为负值,即归为相距最近的其它类。
neighborPurity
函数pure.pbmc <- neighborPurity(reducedDim(sce.pbmc, "PCA"), clusters=clust)
pure.data <- as.data.frame(pure.pbmc)
pure.data$maximum <- factor(pure.data$maximum)
pure.data$cluster <- factor(clust)
ggplot(pure.data, aes(x=cluster, y=purity, colour=maximum)) +
ggbeeswarm::geom_quasirandom(method="smiley")
如下图,横坐标为既定的分群结果,每个点代表一个细胞,点的颜色即根据每个细胞的近邻细胞中所属cluster占比最多的所决定。
tab <- table(Walktrap=clust, Louvain=clust.louvain) #2.3
#行为Walktrap, 列为Louvain
tab <- tab/rowSums(tab) #Louvain结果相对于Walktrap的分布比例(按行看)
pheatmap(tab, color=viridis::viridis(100), cluster_cols=FALSE, cluster_rows=FALSE)
library(clustree)
combined <- cbind(k.50=clust.50, k.10=clust, k.5=clust.5)
clustree(combined, prefix="k.", edge_arrow=FALSE)
pairwiseRand(clust, clust.5, mode="index")
# 0.7796
最后关于subclustering,即在clustering结果基础上,对某一个cluster进一步分群,是提高细胞亚群分辨率的一种方法。步骤算法基本同上,但可能也会遇到一些坑,详见原教程最后,这里就不展开介绍了。
以上学习了scRNA-seq分群涉及到的一些知识点。下一步就是找出每个cluster的marker gene了~
如果你对单细胞转录组研究感兴趣,但又不知道如何入门,也许你可以关注一下下面的课程
看完记得顺手点个“在看”哦!
长按扫码可关注
https://mp.weixin.qq.com/s/4hOqpPXnwH7bSgCnCBCijw