ScType代码实操：快来试试这款全自动的单细胞细胞类型工具吧

ScType代码实操：快来试试这款全自动的单细胞细胞类型工具吧 by 单细胞天地

前言

在分析单细胞数据时，细胞类型注释是最基础也最关键的步骤之一，相比于耗时且对生物学知识要求较高的纯人工细胞注释，使用准确的自动细胞注释工具不失为一种较好的选择。在现实操作中，很多研究者更倾向于先使用自动软件鉴定出大群，然后联合人工注释，这样才不失为一种最好的方式。

在前面的推文：ScType--超快准狠的全自动scRNA-seq数据细胞类型注释工具中，Immugent介绍了ScType的功能框架及其相对于其它自动注释软件的优势。那么本期推文，Immugent将通过代码实操的方式来介绍ScType的使用技巧。话不多说，让我们一起来学习一下如何使用ScType对 scRNA-seq 数据进行全自动的细胞类型注释吧。

代码流程

1. 数据准备

首先让我们加载来自seurat官网的 PBMC 3k 示例数据集。

数据下载：https://cf.10xgenomics.com/samples/cell/pbmc3k/pbmc3k_filtered_gene_bc_matrices.tar.gz

# load libraries
lapply(c("dplyr","Seurat","HGNChelper"), library, character.only = T)

# Load the PBMC dataset
pbmc.data <- Read10X(data.dir = "./filtered_gene_bc_matrices/hg19/")
# Initialize the Seurat object with the raw (non-normalized data).
pbmc <- CreateSeuratObject(counts = pbmc.data, project = "pbmc3k", min.cells = 3, min.features = 200)

# normalize data
pbmc[["percent.mt"]] <- PercentageFeatureSet(pbmc, pattern = "^MT-")
# pbmc <- subset(pbmc, subset = nFeature_RNA > 200 & nFeature_RNA < 2500 & percent.mt < 5) # make some filtering based on QC metrics visualizations, see Seurat tutorial: https://satijalab.org/seurat/articles/pbmc3k_tutorial.html
pbmc <- NormalizeData(pbmc, normalization.method = "LogNormalize", scale.factor = 10000)
pbmc <- FindVariableFeatures(pbmc, selection.method = "vst", nfeatures = 2000)

# scale and run PCA
pbmc <- ScaleData(pbmc, features = rownames(pbmc))
pbmc <- RunPCA(pbmc, features = VariableFeatures(object = pbmc))

# Check number of PC components (we selected 10 PCs for downstream analysis, based on Elbow plot)
ElbowPlot(pbmc)

# cluster and visualize
pbmc <- FindNeighbors(pbmc, dims = 1:10)
pbmc <- FindClusters(pbmc, resolution = 0.8)
pbmc <- RunUMAP(pbmc, dims = 1:10)
DimPlot(pbmc, reduction = "umap")

2. 细胞类型注释

# 首先加载两个相关函数
# load gene set preparation function
source("https://raw.githubusercontent.com/IanevskiAleksandr/sc-type/master/R/gene_sets_prepare.R")
# load cell type annotation function
source("https://raw.githubusercontent.com/IanevskiAleksandr/sc-type/master/R/sctype_score_.R")

# 接下来，从输入的细胞标记文件中准备基因集。默认情况使用内置的细胞标记 DB。
# 但ScType支持使用自己的数据。只要准备一个格式与ScType数据库文件相同的 XLSX 文件。
# DB 文件应该包含四列(组织类型-组织类型，cellName-细胞类型，基因 Symbolmore 1阳性标记基因，基因 Symbolmore 2-阴性标记基因)

# DB file
db_ = "https://raw.githubusercontent.com/IanevskiAleksandr/sc-type/master/ScTypeDB_full.xlsx";
# 提供数据所属的组织类型
tissue = "Immune system" # e.g. Immune system,Pancreas,Liver,Eye,Kidney,Brain,Lung,Adrenal,Heart,Intestine,Muscle,Placenta,Spleen,Stomach,Thymus 

# prepare gene sets
gs_list = gene_sets_prepare(db_, tissue)

# get cell-type by cell matrix
es.max = sctype_score(scRNAseqData = pbmc[["RNA"]]@scale.data, scaled = TRUE, 
                      gs = gs_list$gs_positive, gs2 = gs_list$gs_negative) 

# NOTE: scRNAseqData parameter should correspond to your input scRNA-seq matrix. 
# In case Seurat is used, it is either pbmc[["RNA"]]@scale.data (default), pbmc[["SCT"]]@scale.data, in case sctransform is used for normalization,
# or pbmc[["integrated"]]@scale.data, in case a joint analysis of multiple single-cell datasets is performed.

# merge by cluster
cL_resutls = do.call("rbind", lapply(unique(pbmc@meta.data$seurat_clusters), function(cl){
    es.max.cl = sort(rowSums(es.max[ ,rownames(pbmc@meta.data[pbmc@meta.data$seurat_clusters==cl, ])]), decreasing = !0)
    head(data.frame(cluster = cl, type = names(es.max.cl), scores = es.max.cl, ncells = sum(pbmc@meta.data$seurat_clusters==cl)), 10)
}))
sctype_scores = cL_resutls %>% group_by(cluster) %>% top_n(n = 1, wt = scores)  

# set low-confident (low ScType score) clusters to "unknown"
sctype_scores$type[as.numeric(as.character(sctype_scores$scores)) < sctype_scores$ncells/4] = "Unknown"
print(sctype_scores[,1:3])
# A tibble: 11 × 3
# Groups:   cluster [11]
   cluster type                    scores
   <fct>   <chr>                    <dbl>
 1 1       Memory CD4+ T cells      498. 
 2 2       Naive B cells           1172. 
 3 7       Non-classical monocytes  532. 
 4 8       Natural killer  cells    631. 
 5 3       CD8+ NKT-like cells      573. 
 6 6       Naive CD8+ T cells        72.8
 7 0       Naive CD8+ T cells       444. 
 8 4       Classical Monocytes      680. 
 9 5       Classical Monocytes      574. 
10 9       Myeloid Dendritic cells  181. 
11 10      Platelets                284. 
# sctype_score 函数通过 gs 和 gs2参数接受正标记和负标记。
# 如果没有负标记(即标记提供证据证明细胞具有特定的细胞类型) ，只需将 gs2参数设置为 NULL (即gs2 = NULL)。

3. 可视化

# 绘制UMAP图
pbmc@meta.data$customclassif = ""
for(j in unique(sctype_scores$cluster)){
  cl_type = sctype_scores[sctype_scores$cluster==j,]; 
  pbmc@meta.data$customclassif[pbmc@meta.data$seurat_clusters == j] = as.character(cl_type$type[1])
}

DimPlot(pbmc, reduction = "umap", label = TRUE, repel = TRUE, group.by = 'customclassif')

在这里，还可以通过气泡图显示 ScType 注释的所有细胞类型。

# load libraries
lapply(c("ggraph","igraph","tidyverse", "data.tree"), library, character.only = T)

# prepare edges
cL_resutls=cL_resutls[order(cL_resutls$cluster),]; edges = cL_resutls; edges$type = paste0(edges$type,"_",edges$cluster); edges$cluster = paste0("cluster ", edges$cluster); edges = edges[,c("cluster", "type")]; colnames(edges) = c("from", "to"); rownames(edges) <- NULL

# prepare nodes
nodes_lvl1 = sctype_scores[,c("cluster", "ncells")]; nodes_lvl1$cluster = paste0("cluster ", nodes_lvl1$cluster); nodes_lvl1$Colour = "#f1f1ef"; nodes_lvl1$ord = 1; nodes_lvl1$realname = nodes_lvl1$cluster; nodes_lvl1 = as.data.frame(nodes_lvl1); nodes_lvl2 = c(); 
ccolss= c("#5f75ae","#92bbb8","#64a841","#e5486e","#de8e06","#eccf5a","#b5aa0f","#e4b680","#7ba39d","#b15928","#ffff99", "#6a3d9a","#cab2d6","#ff7f00","#fdbf6f","#e31a1c","#fb9a99","#33a02c","#b2df8a","#1f78b4","#a6cee3")
for (i in 1:length(unique(cL_resutls$cluster))){
  dt_tmp = cL_resutls[cL_resutls$cluster == unique(cL_resutls$cluster)[i], ]; nodes_lvl2 = rbind(nodes_lvl2, data.frame(cluster = paste0(dt_tmp$type,"_",dt_tmp$cluster), ncells = dt_tmp$scores, Colour = ccolss[i], ord = 2, realname = dt_tmp$type))
}
nodes = rbind(nodes_lvl1, nodes_lvl2); nodes$ncells[nodes$ncells<1] = 1;
files_db = openxlsx::read.xlsx(db_)[,c("cellName","shortName")]; files_db = unique(files_db); nodes = merge(nodes, files_db, all.x = T, all.y = F, by.x = "realname", by.y = "cellName", sort = F)
nodes$shortName[is.na(nodes$shortName)] = nodes$realname[is.na(nodes$shortName)]; nodes = nodes[,c("cluster", "ncells", "Colour", "ord", "shortName", "realname")]

mygraph <- graph_from_data_frame(edges, vertices=nodes)

# Make the graph
gggr<- ggraph(mygraph, layout = 'circlepack', weight=I(ncells)) + 
  geom_node_circle(aes(filter=ord==1,fill=I("#F5F5F5"), colour=I("#D3D3D3")), alpha=0.9) + geom_node_circle(aes(filter=ord==2,fill=I(Colour), colour=I("#D3D3D3")), alpha=0.9) +
  theme_void() + geom_node_text(aes(filter=ord==2, label=shortName, colour=I("#ffffff"), fill="white", repel = !1, parse = T, size = I(log(ncells,25)*1.5)))+ geom_node_label(aes(filter=ord==1,  label=shortName, colour=I("#000000"), size = I(3), fill="white", parse = T), repel = !0, segment.linetype="dotted")
library(cowplot)
library(gridExtra)
library(patchwork)
p1 <- DimPlot(pbmc, reduction = "umap", label = TRUE, repel = TRUE, cols = ccolss)
p_combined = p1 + gggr + plot_layout(ncol = 2)
print(p_combined)

外部(灰色)气泡对应于每个簇(气泡越大，簇中的细胞越多) ，而内部气泡对应于每个簇所考虑的细胞类型，最大的气泡对应于指定的细胞类型。

小结

在本期推文中，Immugent介绍了如何使用ScType对scRNA-seq数据进行细胞注释。我们可以看出ScType使用起来十分轻便，对新手十分友好。在具体进行细胞注释的时候，我们可以通过输入相应的组织信息以及阳性标记基因和阴性标记基因，这使得细胞注释的结果是比较准确的。因此，在大家手动注释遇到瓶颈时不妨来试一试这个高效且准确的细胞类型注释工具吧。

好啦，本期的分享到这里就结束了，我们下期再会~