你还缺scRNA-seq的workflow吗？

你还缺scRNA-seq的workflow吗？ by 生信菜鸟团

前言

之前曾老师给我看了一位在pipebio工作的生信工程师Roman Hillje的scRNA-seq的workflow，今天整理一下分享给大家。

Roman Hillje的github主页：https://github.com/romanhaa

workflow：https://romanhaa.github.io/projects/scrnaseq_workflow/

示例数据来源：GSE120221 (https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE120221) 下的前三个样品:

GSM3396161Bone marrow A
GSM3396162Bone marrow B
GSM3396163Bone marrow C1

1.准备工作

#1.1 加载R包
rm(list = ls())
library(tidyverse)
library(Seurat)
library(scran)
library(patchwork)
library(viridis)
library(ggforce)
library(gghalves)
library(ggridges)
library(scDblFinder)
library(SingleR)
library(intrinsicDimension)

#1.2 设置可视化配色
custom_colors <- list()

colors_dutch <- c(
  '#FFC312','#C4E538','#12CBC4','#FDA7DF','#ED4C67',
  '#F79F1F','#A3CB38','#1289A7','#D980FA','#B53471',
  '#EE5A24','#009432','#0652DD','#9980FA','#833471',
  '#EA2027','#006266','#1B1464','#5758BB','#6F1E51'
)

colors_spanish <- c(
  '#40407a','#706fd3','#f7f1e3','#34ace0','#33d9b2',
  '#2c2c54','#474787','#aaa69d','#227093','#218c74',
  '#ff5252','#ff793f','#d1ccc0','#ffb142','#ffda79',
  '#b33939','#cd6133','#84817a','#cc8e35','#ccae62'
)

custom_colors$discrete <- c(colors_dutch, colors_spanish)

custom_colors$cell_cycle <- setNames(
  c('#45aaf2', '#f1c40f', '#e74c3c', '#7f8c8d'),
  c('G1',      'S',       'G2M',     '-')
)

#1.3 定义一些函数
load10xData <- function(
    sample_name,
    path,
    max_number_of_cells = NULL
) {
  
  ## read transcript count matrix
  transcript_counts <- list.files(
    path,
    recursive = TRUE,
    full.names = TRUE
  ) %>%
    .[grep(., pattern = 'mtx')] %>%
    Matrix::readMM()
  
  ## read cell barcodes and assign as column names of transcript count matrix
  colnames(transcript_counts) <- list.files(
    path,
    recursive = TRUE,
    full.names = TRUE
  ) %>%
    .[grep(., pattern = 'barcodes')] %>%
    .[grep(., pattern = 'tsv')] %>%
    read_tsv(col_names = FALSE, col_types = cols()) %>%
    pull(1) %>%
    gsub(., pattern = '-[0-9]{1,2}', replacement = '') %>%
    paste0(., '-', sample_name)
  
  ## read gene names
  gene_names <- list.files(
    path,
    recursive = TRUE,
    full.names = TRUE
  ) %>%
    .[grep(., pattern = 'genes|features')] %>%
    read_tsv(col_names = FALSE, col_types = cols()) %>%
    pull(ifelse(ncol(.) > 1, 2, 1))
  
  ## if necessary, merge counts from different transcripts of the same gene
  if ( any(duplicated(gene_names)) == TRUE ) {
    
    ## get names of genes with multiple entries
    duplicated_gene_names <- unique(gene_names[which(duplicated(gene_names))])
    
    ## log message
    message(
      glue::glue(
        "{format(Sys.time(), '[%Y-%m-%d %H:%M:%S]')} Summing up counts for ",
        "{length(duplicated_gene_names)} genes with multiple entries."
      )
    )
    
    ## go through genes with multiple entries
    for ( i in duplicated_gene_names ) {
      
      ## extract transcripts counts for current gene
      tmp_counts <- transcript_counts[which(gene_names == i),]
      
      ## make sure at least 2 rows are present
      if ( nrow(tmp_counts) >= 2 ) {
        
        ## sum up counts in each cell
        tmp_counts_sum <- Matrix::colSums(tmp_counts)
        
        ## remove original counts from transcript matrix and the gene name from
        ## the list of gene names
        transcript_counts <- transcript_counts[-c(which(gene_names == i)),]
        gene_names <- gene_names[-c(which(gene_names == i))]
        
        ## add summed counts to end of transcript count matrix and current gene
        ## name to end of gene names
        transcript_counts <- rbind(transcript_counts, tmp_counts_sum)
        gene_names <- c(gene_names, i)
      }
    }
  }
  
  ## assign gene names as row names
  rownames(transcript_counts) <- gene_names
  
  ## down-sample cells if more cells than `max_number_of_cells` are present in
  ## count matrix
  if (
    !is.null(max_number_of_cells) &&
    is.numeric(max_number_of_cells) &&
    max_number_of_cells < ncol(transcript_counts)
  ) {
    temp_cells_to_keep <- base::sample(1:ncol(transcript_counts), max_number_of_cells)
    transcript_counts <- transcript_counts[,temp_cells_to_keep]
  }
  
  ##
  message(
    glue::glue(
      "{format(Sys.time(), '[%Y-%m-%d %H:%M:%S]')} Loaded counts for sample "
    )
  )
  
  ##
  return(transcript_counts)
}

#When provided with a list of transcript count matrices, 
#this function checks whether the gene names are the same in all the count matrices, 
#which is an important premise for merging them.

#当提供许多待合并的转录组count矩阵后，这个函数检查所有count矩阵中基因名是否相同，
#这是合并他们的重要前提

sameGeneNames <- function(counts) {
  
  ## create place holder for gene names from each count matrix
  gene_names <- list()
  
  ## go through count matrices
  for ( i in names(counts) ) {
    
    ## extract gene names
    gene_names[[i]] <- rownames(counts[[i]])
  }
  
  ## check if all gene names are the same (using the first matrix as a reference
  ## for the others)
  return(all(sapply(gene_names, FUN = identical, gene_names[[1]])))
}

#This function outputs a table of mean and median transcripts and expressed genes for every sample in our data set.
#这个函数输出数据集中每个样本产生转录表达的平均数和中位数

averagenCountnFeature <- function(cells) {
  output <- tibble(
    'sample' = character(),
    'mean(nCount)' = numeric(),
    'median(nCount)' = numeric(),
    'mean(nFeature)' = numeric(),
    'median(nFeature)' = numeric()
  )
  for ( i in levels(cells$sample) ) {
    tmp <- tibble(
      'sample' = i,
      'mean(nCount)' = cells %>% filter(sample == i) %>% pull(nCount) %>% mean(),
      'median(nCount)' = cells %>% filter(sample == i) %>% pull(nCount) %>% median(),
      'mean(nFeature)' = cells %>% filter(sample == i) %>% pull(nFeature) %>% mean(),
      'median(nFeature)' = cells %>% filter(sample == i) %>% pull(nFeature) %>% median()
    )
    output <- bind_rows(output, tmp)
  }
  return(output)
}

#1.4 创建文件夹
dir.create('data')
dir.create('plots')

2.载入数据

首先用之前定义的load10xData函数读取10x样本。我们这里使用参数max_number_of_cells随机取2000个细胞为了快速演示这个流程

#2.1 读入10x数据
sample_names <- list.dirs('RAW', recursive = FALSE) %>% basename()

transcripts <- list(
  raw = list()
)

for ( i in sample_names ) {
  transcripts$raw[[i]] <- load10xData(
    i, paste0('RAW/', i, '/'),
    max_number_of_cells = 2000
  )
}

#2.2 合并样本
#在合并count矩阵之前，需要检查他们的基因名是否相同
sameGeneNames(transcripts$raw)
# TRUE
#本文数据来源的三个样本经过同样的上游处理
#他们之间的基因名相同所以我们可以直接合并
transcripts$raw$merged <- do.call(cbind, transcripts$raw)

dim(transcripts$raw$merged)
save(transcripts,file = "./data/raw_merged_count.RData")
# 33660  6000

#注意:如果基因名称不相同，有两种策略。
#您可以将转录组count矩阵转换为数据框格式，然后使用full_join()函数合并它们。
#根据你正在处理的数据集的大小，这可能会导致内存问题。
#或者，你将缺失的、充满0计数的基因添加到缺失基因的矩阵中。
#在合并之前，必须确保基因的顺序相同。
#你可以在下面找到第一种方法的大纲:
# for ( i in sample_names ) {
#   gene_names <- rownames(transcripts$raw[[i]])
#   transcripts$raw[[i]] <- as.data.frame(as.matrix(transcripts$raw[[i]]))
#   transcripts$raw[[i]] <- transcripts$raw[[i]] %>%
#     mutate(gene = gene_names) %>%
#     select(gene, everything())
# }
# 
# transcripts$raw$merged <- full_join(transcripts$raw$A, transcripts$raw$B, by = "gene") %>%
#   full_join(., transcripts$raw$E, by = "gene")
# 
# transcripts$raw$merged[ is.na(transcripts$raw$merged) ] <- 0

3.质控

在我们进行分析之前，我们将从合并的count矩阵中移除低质量的细胞，以及低表达的基因

## 3.1 Filter cells过滤细胞
### 3.1.1 Prepare data准备数据
我们先获取转录组count，每个细胞表达的基因数量，以及每个细胞线粒体转录本占总的百分比
load("./data/raw_merged_count.RData")
cells <- data.frame(
  cell = colnames(transcripts$raw$merged),
  sample = colnames(transcripts$raw$merged) %>%
    strsplit('-') %>%
    vapply(FUN.VALUE = character(1), `[`, 2) %>%
    factor(levels = sample_names),
  nCount = colSums(transcripts$raw$merged),
  nFeature = colSums(transcripts$raw$merged != 0)
)

#这里他给的代码载入了一个含有研究物种对应线粒体基因名的配置文件，
#可我们没有啊，我们只能粗略根据"^MT-"找了

#内存不够的话不要这样写
transcripts$raw$merged -> merged_count
library(stringr)
cells$percent_MT <- apply(merged_count[str_detect(rownames(merged_count),"^MT-"),],2,sum)/apply(merged_count,2,sum)
rm(merged_count)

### 3.1.2 Doublets去除双细胞
#使用scDblFinder找doublets
sce <- SingleCellExperiment(assays = list(counts = transcripts$raw$merged))
#这里笔者遇到一个matrix报错：
#程序包'as_cholmod_sparse'不提供'Matrix'这样的函数
#太玄学了，重装irlba包和Matrix包就ok了
# remove.packages("Matrix")
# remove.packages("irlba")
# BiocManager::install("irlba",force = T)
# BiocManager::install("Matrix",force = T)
#remotes::install_version("Matrix", version = "1.6-1.1")
sce <- scDblFinder(sce)
#"SingleCellExperiment"对象
class(sce)
#在multiplet_class列标注出单细胞还是双细胞
cells$multiplet_class <- colData(sce)$scDblFinder.class
#可视化,各个样本中双细胞数目
cells[cells$multiplet_class != 'singlet',"sample"] %>% table %>%
  as.data.frame() ->plotdf
colnames(plotdf)<-c("sample","count")

p<-ggplot(plotdf ,aes(x = sample, y = count, fill = sample)) +
  geom_col(color = 'black') +
  theme_bw() +
  scale_fill_manual(values = custom_colors$discrete) +
  scale_x_discrete(limits = rev(levels(cells$sample))) +
  scale_y_continuous(name = 'Number of doublets', labels = scales::comma) +
  theme(
    axis.title.y = element_blank(),
    panel.grid.major.y = element_blank(),
    panel.grid.minor.y = element_blank(),
    legend.position = 'none'
  ) +
  coord_flip()

p

ggsave(
  'plots/qc_number_of_doublets_by_sample.png',
  p, height = 3, width = 6
)
#接下来我们可视化单细胞和双细胞中nCount和nFeature的分布
p1 <- ggplot(cells, aes(x = multiplet_class, y = nCount, fill = multiplet_class)) +
  geom_violin(draw_quantiles = c(0.5), scale = 'area', trim = FALSE) +
  geom_jitter(aes(color=multiplet_class),width = 0.4,size=0.3,alpha=0.7)+
  theme_bw() +
  scale_fill_manual(values = custom_colors$discrete) +
  scale_x_discrete(limits = rev(levels(cells$multiplet_class))) +
  scale_y_continuous(labels = scales::comma) +
  labs(title = 'Number of transcripts', subtitle = 'linear scale') +
  theme(
    axis.title = element_blank(),
    panel.grid.major.y = element_blank(),
    panel.grid.minor.y = element_blank(),
    legend.position = 'none'
  ) +
  coord_flip()
p2 <- ggplot(cells, aes(x = multiplet_class, y = nCount, fill = multiplet_class)) +
  geom_violin(draw_quantiles = c(0.5), scale = 'area', trim = FALSE) +
  geom_jitter(aes(color=multiplet_class),width = 0.4,size=0.3,alpha=0.7)+
  theme_bw() +
  scale_fill_manual(values = custom_colors$discrete) +
  scale_x_discrete(limits = rev(levels(cells$multiplet_class))) +
  scale_y_log10(labels = scales::comma) +
  labs(title = 'Number of transcripts', subtitle = 'log-scale') +
  theme(
    axis.title = element_blank(),
    panel.grid.major.y = element_blank(),
    panel.grid.minor.y = element_blank(),
    legend.position = 'none'
  ) +
  coord_flip()
p3 <- ggplot(cells, aes(x = multiplet_class, y = nFeature, fill = multiplet_class)) +
  geom_violin(draw_quantiles = c(0.5), scale = 'area', trim = FALSE) +
  geom_jitter(aes(color=multiplet_class),width = 0.4,size=0.3,alpha=0.7)+
  theme_bw() +
  scale_fill_manual(values = custom_colors$discrete) +
  scale_x_discrete(limits = rev(levels(cells$multiplet_class))) +
  scale_y_continuous(labels = scales::comma) +
  labs(title = 'Number of expressed genes', subtitle = 'linear scale') +
  theme(
    axis.title = element_blank(),
    panel.grid.major.y = element_blank(),
    panel.grid.minor.y = element_blank(),
    legend.position = 'none'
  ) +
  coord_flip()
p4 <- ggplot(cells, aes(x = multiplet_class, y = nFeature, fill = multiplet_class)) +
  geom_violin(draw_quantiles = c(0.5), scale = 'area', trim = FALSE) +
  geom_jitter(aes(color=multiplet_class),width = 0.4,size=0.3,alpha=0.7)+
  theme_bw() +
  scale_fill_manual(values = custom_colors$discrete) +
  scale_x_discrete(limits = rev(levels(cells$multiplet_class))) +
  scale_y_log10(labels = scales::comma) +
  labs(title = 'Number of expressed genes', subtitle = 'log-scale') +
  theme(
    axis.title = element_blank(),
    panel.grid.major.y = element_blank(),
    panel.grid.minor.y = element_blank(),
    legend.position = 'none'
  ) +
  coord_flip()


p1 + p3 +
    p2 + p4 + plot_layout(ncol = 2)

ggsave(
  'plots/qc_ncount_nfeature_by_multiplet_class.png',
  p1 + p3 +
    p2 + p4 + plot_layout(ncol = 2),
  height = 14, width = 20
)
#去除双细胞
cells <- cells[cells$multiplet_class == 'singlet',]

在去除任何细胞之前，我们先看看转录本count的分布，每个细胞表达的基因数，以及每个细胞线粒体转录本的百分比。这些分布可以用不同的可视化方式表示。哪一个看起来最好取决于个人习惯和数据集。原文有四种可视化方法，展现出大佬雄厚的ggplot2绘图功力，我选我最喜欢的一种放在下面。

### 3.1.3 Before filtering

p1 <- ggplot(cells, aes(nCount, fill = sample)) +
  geom_histogram(bins = 200) +
  theme_bw() +
  scale_fill_manual(values = custom_colors$discrete) +
  scale_x_continuous(labels = scales::comma) +
  labs(title = 'Number of transcripts', subtitle = 'linear scale', y = 'Number of cells') +
  theme(legend.position = 'none', axis.title.x = element_blank())

p2 <- ggplot(cells, aes(nCount, fill = sample)) +
  geom_histogram(bins = 200) +
  theme_bw() +
  scale_fill_manual(values = custom_colors$discrete) +
  scale_x_log10(labels = scales::comma) +
  labs(title = 'Number of transcripts', subtitle = 'log-scale', y = 'Number of cells') +
  theme(legend.position = 'none', axis.title.x = element_blank())

p3 <- ggplot(cells, aes(nFeature, fill = sample)) +
  geom_histogram(bins = 200) +
  theme_bw() +
  scale_fill_manual(values = custom_colors$discrete) +
  scale_x_continuous(labels = scales::comma) +
  labs(title = 'Number of expressed genes', subtitle = 'linear scale', y = 'Number of cells') +
  theme(legend.position = 'none',axis.title.x = element_blank())

p4 <- ggplot(cells, aes(nFeature, fill = sample)) +
  geom_histogram(bins = 200) +
  theme_bw() +
  scale_fill_manual(values = custom_colors$discrete) +
  scale_x_log10(labels = scales::comma) +
  labs(title = 'Number of expressed genes', subtitle = 'log-scale', y = 'Number of cells', fill = 'Sample') +
  theme(legend.position = 'none', axis.title.x = element_blank())

p5 <- ggplot(cells, aes(percent_MT, fill = sample)) +
  geom_histogram(bins = 200) +
  theme_bw() +
  scale_x_continuous(labels = scales::percent) +
  scale_fill_manual(name = 'Sample', values = custom_colors$discrete) +
  labs(title = 'Percent MT transcripts', subtitle = 'linear scale', y = 'Number of cells') +
  theme(legend.position = 'right', axis.title.x = element_blank(), legend.text = element_text(size = 8))

p1 + p3 + p5 +
    p2 + p4 + plot_layout(ncol = 3)

ggsave(
  'plots/qc_histogram_ncount_nfeature_percentmt_before_filtering_3.png',
  p1 + p3 + p5 +
    p2 + p4 + plot_layout(ncol = 3),
  height = 7, width = 10
)

另一个有趣的关系是每个细胞中表达基因的数量比转录本的数量。这通常会产生如下所示的曲线。有时有许多细胞具有特定的模式，即表达很多基因和高转录本数量的细胞可能是垂死的细胞。

p <- ggplot(cells, aes(nCount, nFeature, color = percent_MT)) +
  geom_point(size = 0.5) +
  scale_x_continuous(name = 'Number of transcripts', labels = scales::comma) +
  scale_y_continuous(name = 'Number of expressed genes', labels = scales::comma) +
  theme_bw() +
  scale_color_viridis(
    name = 'Percent MT\ntranscripts',
    limits = c(0,1),
    labels = scales::percent,
    guide = guide_colorbar(frame.colour = 'black', ticks.colour = 'black')
  )

p

ggsave('plots/qc_nfeature_over_ncount_before_filtering.png', p, height = 4, width = 6)

现在我们需要确定我们正在处理的矩阵的上下限。有很多种办法来做，一种就是肉眼看，另一种则是估计中位数和MAD(中位数绝对偏差)。如果你决定使用后者处理，一般建议保持在中位数附近的3-5MADs范围内。在本文中，我们将慷慨地接受任何低于中位数加上5倍的MAD。我们不需要下限，因为矩阵显然已经预先用每个细胞转录本和表达基因数量的可接受下限进行了过滤。

### 3.1.4 Define thresholds确定阈值
median_nCount <- median(cells$nCount)
# 4255
mad_nCount <- mad(cells$nCount)
# 2630.874

median_nFeature <- median(cells$nFeature)
# 803
mad_nFeature <- mad(cells$nFeature)
# 544.1142

median_percent_MT <- median(cells$percent_MT)
# 0.02760973
mad_percent_MT <- mad(cells$percent_MT)
# 0.02103674

thresholds_nCount <- c(0, median_nCount + 5*mad_nCount)
# 0.00 17409.37
thresholds_nFeature <- c(0, median_nFeature + 5*mad_nFeature)
# 0.000 3523.571
thresholds_percent_MT <- c(0, median_percent_MT + 5*mad_percent_MT)
# 0.0000000 0.1327934
save(thresholds_nCount,thresholds_nFeature,thresholds_percent_MT,file = "thresholds.RData")

p1 <- ggplot(cells, aes(nCount, fill = sample)) +
  geom_histogram(bins = 200) +
  geom_vline(xintercept = median_nCount, color = 'black') +
  geom_vline(xintercept = thresholds_nCount, color = 'red') +
  theme_bw() +
  scale_fill_manual(values = custom_colors$discrete) +
  scale_x_continuous(labels = scales::comma) +
  labs(title = 'Number of transcripts', subtitle = 'linear scale', y = 'Number of cells') +
  theme(legend.position = 'none', axis.title.x = element_blank())

p2 <- ggplot(cells, aes(nCount, fill = sample)) +
  geom_histogram(bins = 200) +
  geom_vline(xintercept = median_nCount, color = 'black') +
  geom_vline(xintercept = thresholds_nCount, color = 'red') +
  theme_bw() +
  scale_fill_manual(values = custom_colors$discrete) +
  scale_x_log10(labels = scales::comma) +
  labs(title = 'Number of transcripts', subtitle = 'log-scale', y = 'Number of cells') +
  theme(legend.position = 'none', axis.title.x = element_blank())

p3 <- ggplot(cells, aes(nFeature, fill = sample)) +
  geom_histogram(bins = 200) +
  geom_vline(xintercept = median_nFeature, color = 'black') +
  geom_vline(xintercept = thresholds_nFeature, color = 'red') +
  theme_bw() +
  scale_fill_manual(values = custom_colors$discrete) +
  scale_x_continuous(labels = scales::comma) +
  labs(title = 'Number of expressed genes', subtitle = 'linear scale', y = 'Number of cells') +
  theme(legend.position = 'none',axis.title.x = element_blank())

p4 <- ggplot(cells, aes(nFeature, fill = sample)) +
  geom_histogram(bins = 200) +
  geom_vline(xintercept = median_nFeature, color = 'black') +
  geom_vline(xintercept = thresholds_nFeature, color = 'red') +
  theme_bw() +
  scale_fill_manual(values = custom_colors$discrete) +
  scale_x_log10(labels = scales::comma) +
  labs(title = 'Number of expressed genes', subtitle = 'log-scale', y = 'Number of cells', fill = 'Sample') +
  theme(legend.position = 'none', axis.title.x = element_blank())

p5 <- ggplot(cells, aes(percent_MT, fill = sample)) +
  geom_histogram(bins = 200) +
  geom_vline(xintercept = median_percent_MT, color = 'black') +
  geom_vline(xintercept = thresholds_percent_MT, color = 'red') +
  theme_bw() +
  scale_x_continuous(labels = scales::percent) +
  scale_fill_manual(name = 'Sample', values = custom_colors$discrete) +
  labs(title = 'Percent MT transcripts', subtitle = 'linear scale', y = 'Number of cells') +
  theme(legend.position = 'right', axis.title.x = element_blank(), legend.text = element_text(size = 8))

p1 + p3 + p5 +
    p2 + p4 + plot_layout(ncol = 3)

ggsave(
  'plots/qc_histogram_ncount_nfeature_percentmt_thresholds.png',
  p1 + p3 + p5 +
    p2 + p4 + plot_layout(ncol = 3),
  height = 7, width = 10
)

#对另一种表现方法的图加入阈值

p <- ggplot(cells, aes(nCount, nFeature, color = percent_MT)) +
  geom_point(size = 0.5) +
  geom_hline(yintercept = thresholds_nFeature, color = 'red') +
  geom_vline(xintercept = thresholds_nCount, color = 'red') +
  scale_x_continuous(name = 'Number of transcripts', labels = scales::comma) +
  scale_y_continuous(name = 'Number of expressed genes', labels = scales::comma) +
  theme_bw() +
  scale_color_viridis(
    name = 'Percent MT\ntranscripts',
    limits = c(0,1),
    labels = scales::percent,
    guide = guide_colorbar(frame.colour = 'black', ticks.colour = 'black')
  )

p

ggsave('plots/qc_nfeature_over_ncount_thresholds.png', p, height = 4, width = 6)
```

我们识别出低质量的细胞，现在进行过滤

### 3.1.5 After filtering过滤
cells_filtered <- cells[cells$nCount >= thresholds_nCount[1]&
    cells$nCount <= thresholds_nCount[2]&
    cells$nFeature >= thresholds_nFeature[1]&
    cells$nFeature <= thresholds_nFeature[2]&
    cells$percent_MT >= thresholds_percent_MT[1]&
    cells$percent_MT <= thresholds_percent_MT[2],]

p1 <- ggplot(cells_filtered, aes(nCount, fill = sample)) +
  geom_histogram(bins = 200) +
  geom_vline(xintercept = median_nCount, color = 'black') +
  geom_vline(xintercept = thresholds_nCount, color = 'red') +
  theme_bw() +
  scale_fill_manual(values = custom_colors$discrete) +
  scale_x_continuous(labels = scales::comma) +
  labs(title = 'Number of transcripts', subtitle = 'linear scale', y = 'Number of cells') +
  theme(legend.position = 'none', axis.title.x = element_blank())

p2 <- ggplot(cells_filtered, aes(nCount, fill = sample)) +
  geom_histogram(bins = 200) +
  geom_vline(xintercept = median_nCount, color = 'black') +
  geom_vline(xintercept = thresholds_nCount, color = 'red') +
  theme_bw() +
  scale_fill_manual(values = custom_colors$discrete) +
  scale_x_log10(labels = scales::comma) +
  labs(title = 'Number of transcripts', subtitle = 'log-scale', y = 'Number of cells') +
  theme(legend.position = 'none', axis.title.x = element_blank())

p3 <- ggplot(cells_filtered, aes(nFeature, fill = sample)) +
  geom_histogram(bins = 200) +
  geom_vline(xintercept = median_nFeature, color = 'black') +
  geom_vline(xintercept = thresholds_nFeature, color = 'red') +
  theme_bw() +
  scale_fill_manual(values = custom_colors$discrete) +
  scale_x_continuous(labels = scales::comma) +
  labs(title = 'Number of expressed genes', subtitle = 'linear scale', y = 'Number of cells') +
  theme(legend.position = 'none',axis.title.x = element_blank())

p4 <- ggplot(cells_filtered, aes(nFeature, fill = sample)) +
  geom_histogram(bins = 200) +
  geom_vline(xintercept = median_nFeature, color = 'black') +
  geom_vline(xintercept = thresholds_nFeature, color = 'red') +
  theme_bw() +
  scale_fill_manual(values = custom_colors$discrete) +
  scale_x_log10(labels = scales::comma) +
  labs(title = 'Number of expressed genes', subtitle = 'log-scale', y = 'Number of cells', fill = 'Sample') +
  theme(legend.position = 'none', axis.title.x = element_blank())

p5 <- ggplot(cells_filtered, aes(percent_MT, fill = sample)) +
  geom_histogram(bins = 200) +
  geom_vline(xintercept = median_percent_MT, color = 'black') +
  geom_vline(xintercept = thresholds_percent_MT, color = 'red') +
  theme_bw() +
  scale_x_continuous(labels = scales::percent) +
  scale_fill_manual(name = 'Sample', values = custom_colors$discrete) +
  labs(title = 'Percent MT transcripts', subtitle = 'linear scale', y = 'Number of cells') +
  theme(legend.position = 'right', axis.title.x = element_blank(), legend.text = element_text(size = 8))

p1 + p3 + p5 +
    p2 + p4 + plot_layout(ncol = 3)

ggsave(
  'plots/qc_histogram_ncount_nfeature_percentmt_filtered.png',
  p1 + p3 + p5 +
    p2 + p4 + plot_layout(ncol = 3),
  height = 7, width = 10
)

如前文提到，我们应当过滤掉在小于五个细胞中表达的基因

## 3.2 Filter genes过滤基因
genes <- data.frame(
  gene = rownames(transcripts$raw$merged),
  count = Matrix::rowSums(transcripts$raw$merged),
  cells = Matrix::rowSums(transcripts$raw$merged != 0)
)

genes_to_keep <- genes[genes$cells >= 5,"gene"]
length(genes_to_keep)
#33000个过滤到16000个

## 3.3 Generate filtered transcript matrix生成过滤后的转录本矩阵
transcripts$raw$filtered <- transcripts$raw$merged[genes_to_keep,cells_to_keep]
save(transcripts,file = "./data/filtered_transcripts_list.RData")

cells_per_sample_after_filtering <- data.frame(
  sample = character(),
  before = numeric(),
  after = numeric()
)

for ( i in sample_names ) {
  tmp <- data.frame(
    sample = i,
    before = grep(colnames(transcripts$raw$merged), pattern = paste0('-', i)) %>% length(),
    after = grep(colnames(transcripts$raw$filtered), pattern = paste0('-', i)) %>% length()
  )
  cells_per_sample_after_filtering <- rbind(cells_per_sample_after_filtering, tmp)
}

knitr::kable(cells_per_sample_after_filtering)
#在下表中，我们可以看到过滤前后每个样品中有多少细胞:

4.生成Seurat对象

#准备好转录组count矩阵后，我们现在可以初始化我们的Seurat对象。之后，我们在meta data中存储每个样本赋值。
seurat <- CreateSeuratObject(
  counts = transcripts$raw$filtered,
  min.cells = 0,
  min.features = 0
)

seurat@meta.data$sample <- Cells(seurat) %>%
  strsplit('-') %>%
  vapply(FUN.VALUE = character(1), `[`, 2) %>%
  factor(levels = sample_names)

#为了确保万无一失，让我们再看一次样本中每个细胞转录本和表达基因的分布。
temp_labels <- seurat@meta.data$sample %>%
  table %>% as.data.frame()
colnames(temp_labels) <- c("sample","n")

p1 <- ggplot() +
  geom_half_violin(
    data = seurat@meta.data, aes(sample, nCount_RNA, fill = sample),
    side = 'l', show.legend = FALSE, trim = FALSE
  ) +
  geom_half_boxplot(
    data = seurat@meta.data, aes(sample, nCount_RNA, fill = sample),
    side = 'r', outlier.color = NA, width = 0.4, show.legend = FALSE
  ) +
  geom_text(
    data = temp_labels,
    aes(x = sample, y = -Inf, label = paste0('n = ', format(n, big.mark = ',', trim = TRUE)), vjust = -1),
    color = 'black', size = 2.8
  ) +
  scale_color_manual(values = custom_colors$discrete) +
  scale_fill_manual(values = custom_colors$discrete) +
  scale_y_continuous(labels = scales::comma, expand = c(0.08,0)) +
  theme_bw() +
  labs(x = '', y = 'Number of transcripts') +
  theme(
    panel.grid.major.x = element_blank(),
    axis.title.x = element_blank()
  )

p2 <- ggplot() +
  geom_half_violin(
    data = seurat@meta.data, aes(sample, nFeature_RNA, fill = sample),
    side = 'l', show.legend = FALSE, trim = FALSE
  ) +
  geom_half_boxplot(
    data = seurat@meta.data, aes(sample, nFeature_RNA, fill = sample),
    side = 'r', outlier.color = NA, width = 0.4, show.legend = FALSE
  ) +
  geom_text(
    data = temp_labels,
    aes(x = sample, y = -Inf, label = paste0('n = ', format(n, big.mark = ',', trim = TRUE)), vjust = -1),
    color = 'black', size = 2.8
  ) +
  scale_color_manual(values = custom_colors$discrete) +
  scale_fill_manual(values = custom_colors$discrete) +
  scale_y_continuous(
    name = 'Number of expressed genes',
    labels = scales::comma, expand = c(0.08,0)
  ) +
  theme_bw() +
  theme(
    panel.grid.major.x = element_blank(),
    axis.title.x = element_blank()
  )

p1 + p2 + plot_layout(ncol = 2)

ggsave(
  'plots/ncount_nfeature_by_sample.png',
  p1 + p2 + plot_layout(ncol = 2), height = 4, width = 10
)

5.Normalize expression data归一化

接下来，我们将原始转录本count归一化(以便每个细胞具有相同数量的转录本)并使用对数刻度。这可以用不同的方法和不同的工具来完成。Seurat包中的SCTransform()和NormalizeData()方法以及scran中的logNormCounts()函数有着很好的运行效果。

这个函数调用sctransform::vst。sctransform包可从 https://github.com/satijalab/sctransform 获得。可使用此函数作为NormalizeData、FindVariableFeatures、ScaleData workflow的替代方法。结果保存在新的assay中(默认命名为SCT)，counts为校正后counts，data为log1p(counts)，scale.data为Pearson残差。sctransform::vst的中间结果保存在新的assay的misc槽中。

if(!require(glmGamPoi)) {
  BiocManager::install('glmGamPoi')
}
library(glmGamPoi)
seurat <- SCTransform(seurat, assay = 'RNA')
save(seurat,file = "SCT_seurat.RData")

6. Data integration 数据整合

在一些情况下，在不同预处理之后显示有明显批次效应，整合数据集则是有意义的。同样，有许多方法可以执行此任务。Seurat包中内置的数据整合过程已经被证明有很好的表现，我个人也有过很好的使用经验。对于这里分析的样本，我们不需要它，因此将跳过这一部分。

7. Principal component analysis 主成分分析

主成分分析(PCA)经常被用作数据集降维的第一步。这里，我们计算前50个主成分。然后，我们必须选择用于继续分析的主成分数量。在最近的一篇论文中Germain等人建议使用intrinsicDimension包中的maxLikGlobalDimEst()函数来测试数据集的维数。为此，还必须为参数k选择一个值。根据作者的经验，10-20范围内的值给出了合理且没有太大差异的结果。在下面的PCA结果图中，作者突出显示了maxLikGlobalDimEst()(蓝线)的输出值，它在这里接近8。在作者看来，这个估计偏低了，所以将继续使用前15的主成分(红线)。此外，通常情况下，与使用太少的主成分相比，使用更多的主成分要好得多。

seurat <- RunPCA(seurat, assay = 'SCT', npcs = 50)

intrinsicDimension::maxLikGlobalDimEst(seurat@reductions$pca@cell.embeddings, k = 10)
# 8.202795 

p <- data.frame(
    PC = 1:50,
    stdev = seurat@reductions$pca@stdev
  ) %>%
  ggplot(aes(PC, stdev)) +
  geom_point() +
  geom_vline(xintercept = 8, color = 'blue') +
  geom_vline(xintercept = 15, color = 'red') +
  theme_bw() +
  labs(x = 'Principal components', y = 'Standard deviation')

p

ggsave('plots/principal_components.png', p, height = 4, width = 5)

8 Clustering聚类

鉴定具有相似转录谱的细胞亚群

## 8.1 Define cluster聚类
#我们使用Seurat的FindNeighbors和FindClusters聚类我们做的数据集中的细胞
seurat <- FindNeighbors(seurat, reduction = 'pca', dims = 1:15)
seurat <- FindClusters(seurat, resolution = 0.8)

## 8.2 Number of transcripts and expressed genes
如前文我们对样本执行过的操作，我们检查每一个聚类中的平均转录本和表达的基因数量。
temp_labels <- seurat@meta.data[,"seurat_clusters"] %>% 
  table %>% as.data.frame
colnames(temp_labels)<-c("seurat_clusters","n")

p1 <- ggplot() +
  geom_half_violin(
    data = seurat@meta.data, aes(seurat_clusters, nCount_RNA, fill = seurat_clusters),
    side = 'l', show.legend = FALSE, trim = FALSE
  ) +
  geom_half_boxplot(
    data = seurat@meta.data, aes(seurat_clusters, nCount_RNA, fill = seurat_clusters),
    side = 'r', outlier.color = NA, width = 0.4, show.legend = FALSE
  ) +
  geom_text(
    data = temp_labels,
    aes(x = seurat_clusters, y = -Inf, label = paste0('n = ', format(n, big.mark = ',', trim = TRUE)), vjust = -1),
    color = 'black', size = 2.8
  ) +
  scale_color_manual(values = custom_colors$discrete) +
  scale_fill_manual(values = custom_colors$discrete) +
  scale_y_continuous(name = 'Number of transcripts', labels = scales::comma, expand = c(0.08, 0)) +
  theme_bw() +
  theme(
    panel.grid.major.x = element_blank(),
    axis.title.x = element_blank()
  )

p2 <- ggplot() +
  geom_half_violin(
    data = seurat@meta.data, aes(seurat_clusters, nFeature_RNA, fill = seurat_clusters),
    side = 'l', show.legend = FALSE, trim = FALSE
  ) +
  geom_half_boxplot(
    data = seurat@meta.data, aes(seurat_clusters, nFeature_RNA, fill = seurat_clusters),
    side = 'r', outlier.color = NA, width = 0.4, show.legend = FALSE
  ) +
  geom_text(
    data = temp_labels,
    aes(x = seurat_clusters, y = -Inf, label = paste0('n = ', format(n, big.mark = ',', trim = TRUE)), vjust = -1),
    color = 'black', size = 2.8
  ) +
  scale_color_manual(values = custom_colors$discrete) +
  scale_fill_manual(values = custom_colors$discrete) +
  scale_y_continuous(name = 'Number of expressed genes', labels = scales::comma, expand = c(0.08, 0)) +
  theme_bw() +
  theme(
    panel.grid.major.x = element_blank(),
    axis.title.x = element_blank()
  )

p1 + p2 + plot_layout(ncol = 1)

ggsave(
  'plots/ncount_nfeature_by_cluster.png',
  p1 + p2 + plot_layout(ncol = 1),
  height = 7, width = 14
)

9.Cell cycle analysis细胞周期分析

使用Seurat包的CellCycleScoring()函数，我们可以为每个细胞分配细胞周期：G1, S, G2M

## 9.1 Infer cell cycle status推断细胞周期状态
seurat <- CellCycleScoring(
  seurat,
  assay = 'SCT',
  s.features = cc.genes$s.genes,
  g2m.features = cc.genes$g2m.genes
)

seurat@meta.data$cell_cycle_seurat <- seurat@meta.data$Phase
seurat@meta.data$Phase <- NULL
#字符串转因子
seurat@meta.data$cell_cycle_seurat <- factor(
  seurat@meta.data$cell_cycle_seurat, levels = c('G1', 'S', 'G2M')
)

## 9.2 Composition of samples and clusters by cell cycle样本和聚类中细胞的细胞周期组成
#现在检查每个样本和聚类中的细胞周期阶段分布
#样本中：
table_samples_by_cell_cycle<-data.frame()
for(i in unique(seurat@meta.data$sample)){
  seurat@meta.data[seurat@meta.data$sample==i,"cell_cycle_seurat"] %>% table -> nCycle
  c(i,sum(nCycle),nCycle) ->nCycle
  names(nCycle)[1:2]<-c("sample","total_cell_count")
  rbind(table_samples_by_cell_cycle,nCycle)->table_samples_by_cell_cycle
}
colnames(table_samples_by_cell_cycle)<-names(nCycle)
for (i in 2:5) {
  as.numeric(table_samples_by_cell_cycle[,i])->table_samples_by_cell_cycle[,i]
}

knitr::kable(table_samples_by_cell_cycle)

#聚类中：
table_clusters_by_cell_cycle<-data.frame()
for(i in unique(seurat@meta.data$seurat_clusters)){
  seurat@meta.data[seurat@meta.data$seurat_clusters==i,"cell_cycle_seurat"] %>% table -> nCycle
  c(i,sum(nCycle),nCycle) ->nCycle
  names(nCycle)[1:2]<-c("cluster","total_cell_count")
  rbind(table_clusters_by_cell_cycle,nCycle)->table_clusters_by_cell_cycle
}
colnames(table_clusters_by_cell_cycle)<-names(nCycle)

for (i in 1:5) {
  as.numeric(table_clusters_by_cell_cycle[,i])->table_clusters_by_cell_cycle[,i]
}

table_clusters_by_cell_cycle[order(table_clusters_by_cell_cycle$cluster,decreasing = F),]->table_clusters_by_cell_cycle
rownames(table_clusters_by_cell_cycle)<-NULL

knitr::kable(table_clusters_by_cell_cycle)

#不同阶段细胞所占比例
temp_labels <- seurat@meta.data$sample %>% table %>% as.data.frame
colnames(temp_labels)<-c("sample","n")

p1 <- table_samples_by_cell_cycle %>%
  select(-c('total_cell_count')) %>%
  reshape2::melt(id.vars = 'sample') %>%
  mutate(sample = factor(sample, levels = levels(seurat@meta.data$sample))) %>%
  ggplot(aes(sample, value)) +
  geom_bar(aes(fill = variable), position = 'fill', stat = 'identity') +
  geom_text(
    data = temp_labels,
    aes(x = sample, y = Inf, label = paste0('n = ', format(n, big.mark = ',', trim = TRUE)), vjust = -1),
    color = 'black', size = 2.8
  ) +
  scale_fill_manual(name = 'Cell cycle', values = custom_colors$cell_cycle) +
  scale_y_continuous(name = 'Percentage [%]', labels = scales::percent_format(), expand = c(0.01,0)) +
  coord_cartesian(clip = 'off') +
  theme_bw() +
  theme(
    legend.position = 'none',
    plot.title = element_text(hjust = 0.5),
    text = element_text(size = 16),
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank(),
    axis.title.x = element_blank(),
    axis.text.x = element_text(angle = 45, hjust = 1, vjust = 1),
    plot.margin = margin(t = 20, r = 0, b = 0, l = 0, unit = 'pt')
  )

p1

ggsave(
  'plots/composition_samples_clusters_by_cell_cycle_by_percent.png',
  p1 + p2 +
  plot_layout(ncol = 2, widths = c(
    seurat@meta.data$sample %>% unique() %>% length(),
    seurat@meta.data$seurat_clusters %>% unique() %>% length()
  )),
  width = 18, height = 8
)

save(seurat,file = "Cyclin_seurat.RData")

10.Dimensional reduction降维

在下文中我们讲数据集中细胞的表达谱投影到二维或三维中展示，通常使用t-SNE或UMAP的方法

##10.1 t-SNE图
library(ggnetwork)
load("Cyclin_seurat.RData")
SCT_snn <- seurat@graphs$SCT_snn %>%
  as.matrix() %>%
  ggnetwork() %>%
  left_join(seurat@meta.data %>% mutate(vertex.names = rownames(.)), by = 'vertex.names')

p1 <- ggplot(SCT_snn, aes(x = x, y = y, xend = xend, yend = yend)) +
  geom_edges(color = 'grey50', alpha = 0.05) +
  geom_nodes(aes(color = sample), size = 0.5) +
  scale_color_manual(
    name = 'Sample', values = custom_colors$discrete,
    guide = guide_legend(ncol = 1, override.aes = list(size = 2))
  ) +
  theme_blank() +
  theme(legend.position = 'left') +
  annotate(
    geom = 'text', x = Inf, y = -Inf,
    label = paste0('n = ', format(nrow(seurat@meta.data), big.mark = ',', trim = TRUE)),
    vjust = -1.5, hjust = 1.25, color = 'black', size = 2.5
  )

p2 <- ggplot(SCT_snn, aes(x = x, y = y, xend = xend, yend = yend)) +
  geom_edges(color = 'grey50', alpha = 0.05) +
  geom_nodes(aes(color = seurat_clusters), size = 0.5) +
  scale_color_manual(
    name = 'Cluster', values = custom_colors$discrete,
    guide = guide_legend(ncol = 1, override.aes = list(size = 2))
  ) +
  theme_blank() +
  theme(legend.position = 'right') +
  annotate(
    geom = 'text', x = Inf, y = -Inf,
    label = paste0('n = ', format(nrow(seurat@meta.data), big.mark = ',', trim = TRUE)),
    vjust = -1.5, hjust = 1.25, color = 'black', size = 2.5
  )

ggsave(
  'plots/snn_graph_by_sample_cluster.png',
  p1 + p2 + plot_layout(ncol = 2),
  height = 5, width = 11
)

## 10.2 UMAP
### 10.2.1 Calculate UMAP计算UMAP
#我们使用与前文用于聚类一致的15个主成分进行UMAP计算
seurat <- RunUMAP(
  seurat,
  reduction.name = 'UMAP',
  reduction = 'pca',
  dims = 1:15,
  n.components = 2,
  seed.use = 100
)

### 10.2.2 Overview预览
#在计算完UMAP之后，我们分别根据转录本数量，样本来源，聚类结果，和细胞周期对其进行上色

plot_umap_by_nCount <- bind_cols(seurat@meta.data, as.data.frame(seurat@reductions$UMAP@cell.embeddings)) %>%
  ggplot(aes(UMAP_1, UMAP_2, color = nCount_RNA)) +
  geom_point(size = 0.2) +
  theme_bw() +
  scale_color_viridis(
    guide = guide_colorbar(frame.colour = 'black', ticks.colour = 'black'),
    labels = scales::comma,
  ) +
  labs(color = 'Number of\ntranscripts') +
  theme(legend.position = 'left') +
  coord_fixed() +
  annotate(
    geom = 'text', x = Inf, y = -Inf,
    label = paste0('n = ', format(nrow(seurat@meta.data), big.mark = ',', trim = TRUE)),
    vjust = -1.5, hjust = 1.25, color = 'black', size = 2.5
  )

plot_umap_by_sample <- bind_cols(seurat@meta.data, as.data.frame(seurat@reductions$UMAP@cell.embeddings)) %>%
  ggplot(aes(UMAP_1, UMAP_2, color = sample)) +
  geom_point(size = 0.2) +
  theme_bw() +
  scale_color_manual(values = custom_colors$discrete) +
  labs(color = 'Sample') +
  guides(colour = guide_legend(override.aes = list(size = 2))) +
  theme(legend.position = 'right') +
  coord_fixed() +
  annotate(
    geom = 'text', x = Inf, y = -Inf,
    label = paste0('n = ', format(nrow(seurat@meta.data), big.mark = ',', trim = TRUE)),
    vjust = -1.5, hjust = 1.25, color = 'black', size = 2.5
  )

plot_umap_by_cluster <- bind_cols(seurat@meta.data, as.data.frame(seurat@reductions$UMAP@cell.embeddings)) %>%
  ggplot(aes(UMAP_1, UMAP_2, color = seurat_clusters)) +
  geom_point(size = 0.2) +
  theme_bw() +
  scale_color_manual(
    name = 'Cluster', values = custom_colors$discrete,
    guide = guide_legend(ncol = 2, override.aes = list(size = 2))
  ) +
  theme(legend.position = 'left') +
  coord_fixed() +
  annotate(
    geom = 'text', x = Inf, y = -Inf,
    label = paste0('n = ', format(nrow(seurat@meta.data), big.mark = ',', trim = TRUE)),
    vjust = -1.5, hjust = 1.25, color = 'black', size = 2.5
  )

plot_umap_by_cell_cycle <- bind_cols(seurat@meta.data, as.data.frame(seurat@reductions$UMAP@cell.embeddings)) %>%
  ggplot(aes(UMAP_1, UMAP_2, color = cell_cycle_seurat)) +
  geom_point(size = 0.2) +
  theme_bw() +
  scale_color_manual(values = custom_colors$cell_cycle) +
  labs(color = 'Cell cycle') +
  guides(colour = guide_legend(override.aes = list(size = 2))) +
  theme(legend.position = 'right') +
  coord_fixed() +
  annotate(
    geom = 'text', x = Inf, y = -Inf,
    label = paste0('n = ', format(nrow(seurat@meta.data), big.mark = ',', trim = TRUE)),
    vjust = -1.5, hjust = 1.25, color = 'black', size = 2.5
  )

ggsave(
  'plots/umap.png',
  plot_umap_by_nCount + plot_umap_by_sample +
  plot_umap_by_cluster + plot_umap_by_cell_cycle +
  plot_layout(ncol = 2),
  height = 6,
  width = 8.5
)

### 10.2.3 Samples样本间UMAP
#为了探究不同样本的结果是否能较好重合，我们根据样本划分板块再次做UMAP。
temp_labels <- seurat@meta.data %>%
  group_by(sample) %>%
  tally()

p <- bind_cols(seurat@meta.data, as.data.frame(seurat@reductions$UMAP@cell.embeddings)) %>%
  ggplot(aes(UMAP_1, UMAP_2, color = sample)) +
  geom_point(size = 0.2, show.legend = FALSE) +
  geom_text(
    data = temp_labels,
    aes(x = Inf, y = -Inf, label = paste0('n = ', format(n, big.mark = ',', trim = TRUE)), vjust = -1.5, hjust = 1.25),
    color = 'black', size = 2.8
  ) +
  theme_bw() +
  scale_color_manual(values = custom_colors$discrete) +
  coord_fixed() +
  theme(
    legend.position = 'none',
    strip.text = element_text(face = 'bold', size = 10)
  ) +
  facet_wrap(~sample, ncol = 5)

ggsave(
  'plots/umap_by_sample_split.png',
  p,
  height = 4,
  width = 10
)

### 10.2.4 聚类
#对于聚类，我们在每个聚类的几何中心添个标签或自己的注释。
UMAP_centers_cluster <- tibble(
    UMAP_1 = as.data.frame(seurat@reductions$UMAP@cell.embeddings)$UMAP_1,
    UMAP_2 = as.data.frame(seurat@reductions$UMAP@cell.embeddings)$UMAP_2,
    cluster = seurat@meta.data$seurat_clusters
  ) %>%
  group_by(cluster) %>%
  summarize(x = median(UMAP_1), y = median(UMAP_2))

p <- bind_cols(seurat@meta.data, as.data.frame(seurat@reductions$UMAP@cell.embeddings)) %>%
  ggplot(aes(UMAP_1, UMAP_2, color = seurat_clusters)) +
  geom_point(size = 0.2) +
  geom_label(
    data = UMAP_centers_cluster,
    mapping = aes(x, y, label = cluster),
    size = 4.5,
    fill = 'white',
    color = 'black',
    fontface = 'bold',
    alpha = 0.5,
    label.size = 0,
    show.legend = FALSE
  ) +
  theme_bw() +
  scale_color_manual(
    name = 'Cluster', values = custom_colors$discrete,
    guide = guide_legend(override.aes = list(size = 2))
  ) +
  theme(legend.position = 'right') +
  coord_fixed() +
  annotate(
    geom = 'text', x = Inf, y = -Inf,
    label = paste0('n = ', format(nrow(seurat@meta.data), big.mark = ',', trim = TRUE)),
    vjust = -1.5, hjust = 1.25, color = 'black', size = 2.5
  )

ggsave(
  'plots/umap_by_clusters.png',
  p, height = 5.5, width = 5.5
)

11 Cell type annotation with SingleR使用SingleR进行细胞类型注释

细胞类型的推断有多种软件包可以实现，或者根据自己生物学背景进行手动注释。作者使用SingleR包进行了自动注释

##### 11.1 Assign labels分配标签
#首先我们给每个细胞分配了标签作为参考。然后提取标签和分配分数，并将它们添加到Seurat对象中。
library(SingleR)

singler_ref <- celldex::BlueprintEncodeData()

singler_results_blueprintencode_main <- SingleR::SingleR(
  test = GetAssayData(seurat, assay = 'SCT', layer = 'data'),
  ref = singler_ref,
  labels = singler_ref@colData@listData$label.main
)

saveRDS(singler_results_blueprintencode_main, "data/singler_results_blueprintencode_main.rds")

p <- plotScoreHeatmap(
    singler_results_blueprintencode_main,
    show.labels = TRUE,
    annotation_col = data.frame(
      donor = seurat@meta.data$sample,
      row.names = rownames(singler_results_blueprintencode_main)
    )
  )

p

ggsave(
  'plots/singler_score_heatmap_blueprintencode_main.png', p,
  height = 11, width = 14
)

seurat@meta.data$cell_type_singler_blueprintencode_main <- singler_results_blueprintencode_main@listData$labels

singler_scores <- singler_results_blueprintencode_main@listData$scores %>%
  as_tibble() %>%
  dplyr::mutate(assigned_score = NA)

for ( i in seq_len(nrow(singler_scores)) ) {
  singler_scores$assigned_score[i] <- singler_scores[[singler_results_blueprintencode_main@listData$labels[i]]][i]
}

seurat@meta.data$cell_type_singler_blueprintencode_main_score <- singler_scores$assigned_score

save(seurat,file="labels_assinged_seurat.Rdata")

## 11.2 Assignment score by sample, cluster and cell type
#在这里，我们绘制了每个样本、聚类和细胞类型的分配打分分布。
load("labels_assinged_seurat.Rdata")
temp_labels <- seurat@meta.data %>%
  group_by(sample) %>%
  tally()
as.data.frame(temp_labels) -> temp_labels

p1 <- ggplot() +
  geom_half_violin(
    data = seurat@meta.data,
    aes(
      x = sample,
      y = cell_type_singler_blueprintencode_main_score,
      fill = sample
    ),
    side = 'l', show.legend = FALSE, trim = FALSE
  ) +
  geom_half_boxplot(
    data = seurat@meta.data,
    aes(
      x = sample,
      y = cell_type_singler_blueprintencode_main_score,
      fill = sample
    ),
    side = 'r', outlier.color = NA, width = 0.4, show.legend = FALSE
  ) +
  geom_text(
    data = temp_labels,
    aes(
      x = sample,
      y = -Inf,
      label = paste0('n = ', format(n, big.mark = ',', trim = TRUE)),
      vjust = -1
    ),
    color = 'black', size = 2.8
  ) +
  scale_color_manual(values = custom_colors$discrete) +
  scale_fill_manual(values = custom_colors$discrete) +
  scale_y_continuous(
    name = 'Assignment score',
    labels = scales::comma,
    limits = c(0,1)
  ) +
  theme_bw() +
  labs(title = 'Samples') +
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1, vjust = 1),
    axis.title.x = element_blank(),
    panel.grid.major.x = element_blank()
  )

temp_labels <- seurat@meta.data %>%
  group_by(seurat_clusters) %>%
  tally()
as.data.frame(temp_labels) -> temp_labels

p2 <- ggplot() +
  geom_half_violin(
    data = seurat@meta.data,
    aes(
      x = seurat_clusters,
      y = cell_type_singler_blueprintencode_main_score,
      fill = seurat_clusters
    ),
    side = 'l', show.legend = FALSE, trim = FALSE
  ) +
  geom_half_boxplot(
    data = seurat@meta.data,
    aes(
      x = seurat_clusters,
      y = cell_type_singler_blueprintencode_main_score,
      fill = seurat_clusters
    ),
    side = 'r', outlier.color = NA, width = 0.4, show.legend = FALSE
  ) +
  geom_text(
    data = temp_labels,
    aes(
      x = seurat_clusters,
      y = -Inf,
      label = paste0('n = ', format(n, big.mark = ',', trim = TRUE)),
      vjust = -1
    ),
    color = 'black', size = 2.8
  ) +
  scale_color_manual(values = custom_colors$discrete) +
  scale_fill_manual(values = custom_colors$discrete) +
  scale_y_continuous(
    name = 'Assignment score',
    labels = scales::comma,
    limits = c(0,1)
  ) +
  labs(title = 'Clusters') +
  theme_bw() +
  theme(
    axis.title.x = element_blank(),
    panel.grid.major.x = element_blank()
  )

temp_labels <- seurat@meta.data %>%
  group_by(cell_type_singler_blueprintencode_main) %>%
  tally()
as.data.frame(temp_labels) -> temp_labels

temp_labels[temp_labels[,1]!="Fibroblasts",] -> temp_labels

#table(seurat@meta.data$cell_type_singler_blueprintencode_main)
#DC类细胞只有一个画不了图，得删掉
data.frame(seurat@meta.data$cell_type_singler_blueprintencode_main,
seurat@meta.data$cell_type_singler_blueprintencode_main_score) -> tmpPlotDf
tmpPlotDf[tmpPlotDf[,1]!="DC",] -> tmpPlotDf
table(tmpPlotDf[,1])
as.character(tmpPlotDf[,1]) -> tmpPlotDf[,1]

p3 <- ggplot() +
  geom_half_violin(
    data = tmpPlotDf, #seurat@meta.data
    aes(
      x = seurat.meta.data.cell_type_singler_blueprintencode_main,
      y = seurat.meta.data.cell_type_singler_blueprintencode_main_score,
      fill = seurat.meta.data.cell_type_singler_blueprintencode_main
    ),
    side = 'l', show.legend = FALSE, trim = FALSE,
  ) +
  geom_half_boxplot(
    data = tmpPlotDf,
    aes(
      x = seurat.meta.data.cell_type_singler_blueprintencode_main,
      y = seurat.meta.data.cell_type_singler_blueprintencode_main_score,
      fill = seurat.meta.data.cell_type_singler_blueprintencode_main
    ),
    side = 'r', outlier.color = NA, width = 0.4, show.legend = FALSE
  ) +
  geom_text(
    data = temp_labels,
    aes(
      x = cell_type_singler_blueprintencode_main,
      y = -Inf,
      label = paste0('n = ', format(n, big.mark = ',', trim = TRUE)),
      vjust = -1
    ),
    color = 'black', size = 2.8
  ) +
  scale_color_manual(values = custom_colors$discrete) +
  scale_fill_manual(values = custom_colors$discrete) +
  scale_y_continuous(
    name = 'Assignment score',
    labels = scales::comma,
    limits = c(0,1)
  ) +
  labs(title = 'Cell types') +
  theme_bw() +
  theme(
    axis.title.x = element_blank(),
    panel.grid.major.x = element_blank()
  )

(p1 | p3) / p2 + plot_layout(ncol = 1)

#若出现以下报错，说明数据不足以画小提琴图，可能是某个种类细胞只有一个，剔除后就正常了
# #Error in `geom_half_violin()`:
# ! Problem while converting geom to grob.
# ℹ Error occurred in the 1st layer.
# Caused by error in `if ((is_panel & (side[1] == "l")) | is_group) ...`:
# ! 需要TRUE/FALSE值的地方不可以用缺少值
# Run `rlang::last_trace()` to see where the error occurred.
# Warning message:
# Groups with fewer than two datapoints have been dropped.
# ℹ Set `drop = FALSE` to consider such groups for position adjustment purposes. 

ggsave(
  'plots/singler_blueprintencode_main_assignment_scores_by_sample_cluster_cell_type.png',
  (p1 | p3) / p2 + plot_layout(ncol = 1), height = 13, width = 16
)

我们然后检查样本和聚类在细胞类型方面是怎样构成的

## 11.3 Composition of samples and clusters by cell
table_samples_by_cell_type <- seurat@meta.data %>%
  dplyr::group_by(sample, cell_type_singler_blueprintencode_main) %>%
  dplyr::summarize(count = n()) %>%
  tidyr::spread(cell_type_singler_blueprintencode_main, count, fill = 0) %>%
  dplyr::ungroup() %>%
  dplyr::mutate(total_cell_count = rowSums(.[c(2:ncol(.))])) %>%
  dplyr::select(c("sample", "total_cell_count", dplyr::everything()))

table_samples_by_cell_type %>% knitr::kable()

table_clusters_by_cell_type <- seurat@meta.data %>%
  dplyr::group_by(seurat_clusters, cell_type_singler_blueprintencode_main) %>%
  dplyr::rename(cluster = seurat_clusters) %>%
  dplyr::summarize(count = n()) %>%
  tidyr::spread(cell_type_singler_blueprintencode_main, count, fill = 0) %>%
  dplyr::ungroup() %>%
  dplyr::mutate(total_cell_count = rowSums(.[c(2:ncol(.))])) %>%
  dplyr::select(c("cluster", "total_cell_count", dplyr::everything()))

table_clusters_by_cell_type %>% knitr::kable()

#下图是按样本/聚类的细胞百分比组成。
temp_labels <- seurat@meta.data %>%
  group_by(sample) %>%
  tally()

p1 <- table_samples_by_cell_type %>%
  select(-c('total_cell_count')) %>%
  reshape2::melt(id.vars = 'sample') %>%
  mutate(sample = factor(sample, levels = levels(seurat@meta.data$sample))) %>%
  ggplot(aes(sample, value)) +
  geom_bar(aes(fill = variable), position = 'fill', stat = 'identity', show.legend = FALSE) +
  geom_text(
    data = temp_labels,
    aes(
      x = sample,
      y = Inf,
      label = paste0('n = ', format(n, big.mark = ',', trim = TRUE)),
      vjust = -1
    ),
    color = 'black', size = 2.8
  ) +
  scale_fill_manual(name = 'Cell type', values = custom_colors$discrete) +
  scale_y_continuous(
    name = 'Percentage [%]',
    labels = scales::percent_format(),
    expand = c(0.01,0)
  ) +
  coord_cartesian(clip = 'off') +
  theme_bw() +
  theme(
    legend.position = 'none',
    plot.title = element_text(hjust = 0.5),
    text = element_text(size = 16),
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank(),
    axis.title.x = element_blank(),
    axis.text.x = element_text(angle = 45, hjust = 1, vjust = 1),
    plot.margin = margin(t = 20, r = 0, b = 0, l = 0, unit = 'pt')
  )

temp_labels <- seurat@meta.data %>%
  group_by(seurat_clusters) %>%
  tally() %>%
  dplyr::rename('cluster' = seurat_clusters)

p2 <- table_clusters_by_cell_type %>%
  select(-c('total_cell_count')) %>%
  reshape2::melt(id.vars = 'cluster') %>%
  mutate(cluster = factor(cluster, levels = levels(seurat@meta.data$seurat_clusters))) %>%
  ggplot(aes(cluster, value)) +
  geom_bar(aes(fill = variable), position = 'fill', stat = 'identity') +
  geom_text(
    data = temp_labels,
    aes(
      x = cluster,
      y = Inf,
      label = paste0('n = ', format(n, big.mark = ',', trim = TRUE)),
      vjust = -1
    ),
    color = 'black', size = 2.8
  ) +
  scale_fill_manual(name = 'Cell type', values = custom_colors$discrete) +
  scale_y_continuous(
    name = 'Percentage [%]',
    labels = scales::percent_format(),
    expand = c(0.01,0)
  ) +
  coord_cartesian(clip = 'off') +
  theme_bw() +
  theme(
    legend.position = 'right',
    plot.title = element_text(hjust = 0.5),
    text = element_text(size = 16),
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank(),
    axis.title = element_blank(),
    plot.margin = margin(t = 20, r = 0, b = 0, l = 10, unit = 'pt')
  )

p1 + p2 +
  plot_layout(ncol = 2, widths = c(
    seurat@meta.data$sample %>% unique() %>% length(),
    seurat@meta.data$seurat_clusters %>% unique() %>% length()
  ))

ggsave(
  'plots/composition_samples_clusters_by_cell_type_singler_blueprintencode_main_by_percent.png',
  p1 + p2 +
  plot_layout(ncol = 2, widths = c(
    seurat@meta.data$sample %>% unique() %>% length(),
    seurat@meta.data$seurat_clusters %>% unique() %>% length()
  )),
  width = 18, height = 8
)

冲积图

看细胞类型在样本中的分布

### 11.3.1 Alluvial plots冲积图
## get sample and cluster names
samples <- levels(seurat@meta.data$sample)
clusters <- levels(seurat@meta.data$seurat_clusters)

## create named vector holding the color assignments for both samples and
## clusters
color_assignments <- setNames(
  c(custom_colors$discrete[1:length(samples)], custom_colors$discrete[1:length(clusters)]),
  c(samples,clusters)
)

## prepare data for the plot; factor() calls are necessary for the right order
## of columns (first samples then clusters) and boxes within each column (
## cluster 1, 2, 3, ..., not 1, 10, 11, ...)
data <- seurat@meta.data %>%
  group_by(sample,seurat_clusters) %>%
  tally() %>%
  ungroup() %>%
  gather_set_data(1:2) %>%
  dplyr::mutate(
    x = factor(x, levels = unique(x)),
    y = factor(y, levels = unique(y))
  )

DataFrame(data)

data_labels <- tibble(
    group = c(
      rep('sample', length(samples)),
      rep('seurat_clusters', length(clusters))
    )
 ) %>%
  mutate(
    hjust = ifelse(group == 'sample', 1, 0),
    nudge_x = ifelse(group == 'sample', -0.1, 0.1)
  )

DataFrame(data_labels)

## create plot
p1 <- ggplot(data, aes(x, id = id, split = y, value = n)) +
  geom_parallel_sets(aes(fill = seurat_clusters), alpha = 0.75, axis.width = 0.15) +
  geom_parallel_sets_axes(aes(fill = y), color = 'black', axis.width = 0.1) +
  geom_text(
    aes(y = n, split = y), stat = 'parallel_sets_axes', fontface = 'bold',
    hjust = data_labels$hjust, nudge_x = data_labels$nudge_x
  ) +
  scale_x_discrete(labels = c('Sample','Cluster')) +
  scale_fill_manual(values = color_assignments) +
  theme_bw() +
  theme(
    legend.position = 'none',
    axis.title = element_blank(),
    axis.text.x = element_text(face = 'bold', colour = 'black', size = 15),
    axis.text.y = element_blank(),
    axis.ticks = element_blank(),
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank(),
    panel.border = element_blank()
  )
```

聚类结果对细胞类型
```{r}
clusters <- levels(seurat@meta.data$seurat_clusters)
cell_types <- sort(unique(seurat@meta.data$cell_type_singler_blueprintencode_main))

color_assignments <- setNames(
  c(custom_colors$discrete[1:length(clusters)], custom_colors$discrete[1:length(cell_types)]),
  c(clusters,cell_types)
)

data <- seurat@meta.data %>%
  dplyr::rename(cell_type = cell_type_singler_blueprintencode_main) %>%
  dplyr::mutate(cell_type = factor(cell_type, levels = cell_types)) %>%
  group_by(seurat_clusters, cell_type) %>%
  tally() %>%
  ungroup() %>%
  gather_set_data(1:2) %>%
  dplyr::mutate(
    x = factor(x, levels = unique(x)),
    y = factor(y, levels = c(clusters,cell_types))
  )

data_labels <- tibble(
    group = c(
      rep('seurat_clusters', length(clusters)),
      rep('cell_type', length(cell_types))
    )
 ) %>%
  mutate(
    hjust = ifelse(group == 'seurat_clusters', 1, 0),
    nudge_x = ifelse(group == 'seurat_clusters', -0.1, 0.1)
  )

p2 <- ggplot(data, aes(x, id = id, split = y, value = n)) +
  geom_parallel_sets(aes(fill = seurat_clusters), alpha = 0.75, axis.width = 0.15) +
  geom_parallel_sets_axes(aes(fill = y), color = 'black', axis.width = 0.1) +
  geom_text(
    aes(y = n, split = y), stat = 'parallel_sets_axes', fontface = 'bold',
    hjust = data_labels$hjust, nudge_x = data_labels$nudge_x
  ) +
  scale_x_discrete(labels = c('Cluster','Cell type')) +
  scale_fill_manual(values = color_assignments) +
  theme_bw() +
  theme(
    legend.position = 'none',
    axis.title = element_blank(),
    axis.text.x = element_text(face = 'bold', colour = 'black', size = 15),
    axis.text.y = element_blank(),
    axis.ticks = element_blank(),
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank(),
    panel.border = element_blank()
  )
p1 + p2 + plot_layout(ncol = 2)

ggsave(
  'plots/samples_clusters_cell_types_alluvial.png',
  p1 + p2 + plot_layout(ncol = 2),
  height = 6, width = 8
)

类似于根据聚类结果做UMAP，我们在UMAP中加入细胞类型标签

## 11.4 UMAP by cell type根据细胞类型画UMAP
UMAP_centers_cell_type <- tibble(
    UMAP_1 = as.data.frame(seurat@reductions$UMAP@cell.embeddings)$UMAP_1,
    UMAP_2 = as.data.frame(seurat@reductions$UMAP@cell.embeddings)$UMAP_2,
    cell_type = seurat@meta.data$cell_type_singler_blueprintencode_main
  ) %>%
  group_by(cell_type) %>%
  summarize(x = median(UMAP_1), y = median(UMAP_2))

p <- bind_cols(seurat@meta.data, as.data.frame(seurat@reductions$UMAP@cell.embeddings)) %>%
  ggplot(aes(UMAP_1, UMAP_2, color = cell_type_singler_blueprintencode_main)) +
  geom_point(size = 0.2) +
  geom_label(
    data = UMAP_centers_cell_type,
    mapping = aes(x, y, label = cell_type),
    size = 3.5,
    fill = 'white',
    color = 'black',
    fontface = 'bold',
    alpha = 0.5,
    label.size = 0,
    show.legend = FALSE
  ) +
  theme_bw() +
  expand_limits(x = c(-22,15)) +
  scale_color_manual(values = custom_colors$discrete) +
  labs(color = 'Cell type') +
  guides(colour = guide_legend(override.aes = list(size = 2))) +
  theme(legend.position = 'right') +
  coord_fixed() +
  annotate(
    geom = 'text', x = Inf, y = -Inf,
    label = paste0('n = ', format(nrow(seurat@meta.data), big.mark = ',', trim = TRUE)),
    vjust = -1.5, hjust = 1.25, color = 'black', size = 2.5
  )

p

ggsave(
  'plots/umap_by_cell_type_singler_blueprintencode_main.png',
  p,
  height = 4,
  width = 6
)

为了了解UMAP中的细胞类型是否重叠(这在之前的图中很难看到)，我们并按细胞类型拆分画板绘制了UMAP图。

temp_labels <- seurat@meta.data %>%
  group_by(cell_type_singler_blueprintencode_main) %>%
  tally()

p <- bind_cols(seurat@meta.data, as.data.frame(seurat@reductions$UMAP@cell.embeddings)) %>%
  ggplot(aes(UMAP_1, UMAP_2, color = cell_type_singler_blueprintencode_main)) +
  geom_point(size = 0.2, show.legend = FALSE) +
  geom_text(
    data = temp_labels,
    aes(
      x = Inf,
      y = -Inf,
      label = paste0('n = ', format(n, big.mark = ',', trim = TRUE)),
      vjust = -1.5,
      hjust = 1.25
    ),
    color = 'black', size = 2.8
  ) +
  theme_bw() +
  scale_color_manual(values = custom_colors$discrete) +
  labs(color = 'Cell type') +
  guides(colour = guide_legend(override.aes = list(size = 2))) +
  coord_fixed() +
  facet_wrap(~cell_type_singler_blueprintencode_main, ncol = 4) +
  theme(
    legend.position = 'right',
    strip.text = element_text(face = 'bold', size = 10)
  )

p

ggsave(
  'plots/umap_by_cell_type_singler_blueprintencode_main_split.png',
  p, height = 5, width = 8
)

我们最后做一次UMAP，根据细胞注释时的分配分数

## 11.5 UMAP by assignment score根据分配分数进行UMAP
p <- bind_cols(seurat@meta.data, as.data.frame(seurat@reductions$UMAP@cell.embeddings)) %>%
  ggplot(aes(UMAP_1, UMAP_2, color = cell_type_singler_blueprintencode_main_score)) +
  geom_point(size = 0.2) +
  theme_bw() +
  scale_color_viridis(
    name = 'Score', limits = c(0,1), oob = scales::squish,
    guide = guide_colorbar(frame.colour = 'black', ticks.colour = 'black')
  ) +
  labs(color = 'Cell type') +
  theme(legend.position = 'right') +
  coord_fixed() +
  annotate(
    geom = 'text', x = Inf, y = -Inf,
    label = paste0('n = ', format(nrow(seurat@meta.data), big.mark = ',', trim = TRUE)),
    vjust = -1.5, hjust = 1.25, color = 'black', size = 2.5
  )

p

ggsave(
  'plots/umap_by_cell_type_singler_blueprintencode_main_score.png',
  p,
  height = 5,
  width = 5.5
)

后续的进阶分析可以在seurat对象上进行，这里篇幅有限就不做赘述了，感兴趣的可可以跟着原github的workflow做一做。但是因为他是基于seurat4的workflow作者还没有更新，所以需要进行修改才能跑通。下期会更新后续的marker基因的点图、热图，以及GSVA分析等。

ixxmu / mp_duty