Supercell代码实操（二）：COVID-19数据的整合分析

Supercell代码实操（二）：COVID-19数据的整合分析 by 生信宝库

前言

在上一期推文：Supercell代码实操（一）：5种肿瘤细胞系的整合分析中，Immugent已经初步带大家感受了一下基于Supercell流程的快捷，本期推文小编将会通过一个更大的数据集带大家更加深入的感受基于元细胞分析的重要性。

说到大的单细胞数据集，除了各种大型图谱之外，就数新冠相关研究的数据了。说到新冠研究的相关数据集，小到几千个细胞的数量级，大到上百万的数量级。Immugent本来想为大家展示的是1.4 millions of cells distributed in 284 samples coming from 196 patients的数据。但考虑到有很多小伙伴没有服务器，于是Immugent退而求其次，下面就使用了其中26个样本的数据进行实操。

主要内容

Analysis at the single cell level with Seurat

library(SuperCell)
library(Seurat)
library(SingleCellExperiment)
library(ggplot2)
library(harmony)
library(reshape2)

gene_blacklist <- readRDS("../data/gene_blacklist.rds")
pbmc <- readRDS("../data/pbmc_COVID19_sce/S-S086-2_sce.rds")

sceRawToSeurat<- function(pbmc,nfeatures = 1500) {
pbmc <- CreateSeuratObject(counts = assay(pbmc),meta.data = data.frame(colData(pbmc)))
pbmc <- FindVariableFeatures(pbmc,nfeatures = nfeatures)
VariableFeatures(pbmc) <- VariableFeatures(pbmc)[!VariableFeatures(pbmc) %in% gene_blacklist]
pbmc <- NormalizeData(pbmc)
return(pbmc)
}

pbmc <- sceRawToSeurat(pbmc)
pbmc <- ScaleData(pbmc)
pbmc <- RunPCA(pbmc, npcs = 20)
ElbowPlot(pbmc) 
PCAPlot(pbmc,group.by = "majorType")
pbmc <-RunUMAP(pbmc, dims = c(1:20))
UMAPPlot(pbmc, group.by = "majorType")

We compute purity of metacells according to their assignment to the majorType annotation.

makeSeuratMC <- function(pbmc,
                         genes = NULL,
                         metaFields = c("majorType", "sampleID", "Age", "Sex", "celltype", "PatientID", "SARS.CoV.2", "Outcome", "datasets", "CoVID.19.severity", "Sample.time"),
                         returnMC = F) {
  
  if(is.null(genes)) {
    genes <- VariableFeatures(pbmc)
  }

MC <- SCimplify(GetAssayData(pbmc,slot = "data"),  # normalized gene expression matrix 
                 n.pc = 20,
                 k.knn = 5, # number of nearest neighbors to build kNN network
                 gamma = 20, # graining level
                 genes.use = genes )# will be the ones used for integration if input is seurat integrated data


MC$purity <- supercell_purity(clusters = pbmc$majorType,
                           supercell_membership = MC$membership)


for (m in metaFields) {
MC[[m]]<- supercell_assign(clusters = pbmc@meta.data[,m], # single-cell assigment to cell lines (clusters)
                                    supercell_membership = MC$membership, # single-cell assignment to super-cells
                                    method = "absolute")
}

GE <- supercell_GE(as.matrix(GetAssayData(pbmc,slot = "data")),groups = MC$membership)

seuratMC <- supercell_2_Seurat(SC.GE = GE,MC,fields = c(metaFields,"purity"))

res <- seuratMC

if (returnMC) {
  res <- list(seuratMC = seuratMC,SC = MC)
}


return(res)
}

supercells <- makeSeuratMC(pbmc,returnMC = T)
seuratMC <- supercells$seuratMC
seuratMC <- RunUMAP(seuratMC,dims = c(1:20))
UMAPPlot(seuratMC,group.by = "majorType")

gene<-as.matrix(GetAssayData(seuratMC)["CXCL8",])
correlations<-apply(as.matrix(GetAssayData(seuratMC)),1,function(x){cor(gene,x)})

tf <- read.table("../data/transcription.factor.activity.GO0003700.symbol.list")

correlations <- correlations[names(correlations) %in% tf$V1]
head(sort(abs(correlations),decreasing = T),n = 10)
##     KLF10    ZNF467   ZNF385A      GAS7      KLF4      SPI1     CREB5       FOS 
## 0.7478192 0.7443197 0.7143247 0.7117197 0.7097348 0.7096040 0.7070228 0.7046132 
##     CEBPD      LMO2 
## 0.6903713 0.6868367

FeatureScatter(object = pbmc, feature1 = "CXCL8", feature2 = 'KLF10',group.by = "majorType")
FeatureScatter(object = seuratMC, feature1 = "CXCL8", feature2 = 'KLF10',group.by = "majorType")

supercell_GeneGenePlot(GetAssayData(seuratMC),
                       gene_x = "CXCL8",gene_y = "FOS",
                       supercell_size = seuratMC$size,
                       clusters = seuratMC$majorType)

files <- list.files("../data/pbmc_COVID19_sce",full.names = T)

# Uncomment if you have less than 20GB of memory
 files <- readRDS("../data/lowMemFileList.rds")

# Uncomment if you less than 16GB of memory
files <- files[c(1:6)]

MC.list <- list()
MC.list[["S-S086-2"]] <- seuratMC
for (f in files[which(files != "../data/pbmc_COVID19_sce/S-S086-2_sce.rds")]) {
  smp <- readRDS(f)
  smp <- sceRawToSeurat(smp)
  seuratMC <- makeSeuratMC(smp)
  MC.list[[seuratMC$sampleID[1]]] <- seuratMC
}
features <- SelectIntegrationFeatures(MC.list,nfeatures = 1500)
features <- features[!features %in% gene_blacklist]
nSingleCells <- 0
for (i in 1:length(MC.list)) {
  MC.list[[i]] <- RenameCells(MC.list[[i]],add.cell.id = unique(MC.list[[i]]$sampleID))
  MC.list[[i]] <- ScaleData(MC.list[[i]],features = features)
  MC.list[[i]] <- RunPCA(MC.list[[i]] ,features = features,npcs = 20)
  nSingleCells <- nSingleCells + sum(MC.list[[i]]$size)
}
nSingleCells  ## [1] 51595

reference <- which(c("S-S086-2","S-M064") %in% names(MC.list)) 
  
anchors <- FindIntegrationAnchors(object.list = MC.list,
                                  reference = reference, 
                                  reduction = "rpca",
                                  anchor.features = features,
                                  dims = 1:20)

integrated <- IntegrateData(anchorset = anchors, dims = 1:20)
DefaultAssay(integrated) = "integrated"
integrated <- ScaleData(integrated, verbose = T)
integrated <- RunPCA(integrated, verbose = FALSE)
integrated <- RunUMAP(integrated, dims = 1:20)
UMAPPlot(integrated,group.by = "sampleID")
UMAPPlot(integrated,group.by = "majorType")

majorTypeCounts <- aggregate(integrated$size, by=list(majorType = integrated$majorType,Severity = integrated$CoVID.19.severity,Time = integrated$Sample.time), FUN=sum)

majorTypeCounts$group = paste(majorTypeCounts$Severity,majorTypeCounts$Time,sep = "_")
contingencyTable <- xtabs(x ~ group + majorType,data = majorTypeCounts)
res <- chisq.test(contingencyTable)
Roe <- res$observed/res$expected

melted_Roe <- melt(Roe, na.rm = TRUE)
# Heatmap
ggplot(data = melted_Roe, aes(majorType,group,  fill = value))+
 geom_tile() + scale_fill_gradient2(low = "blue", high = "red", mid = "white", 
   midpoint = 1, space = "Lab",
   name="Ro/e") +
  theme_minimal()+ 
 theme(axis.text.x = element_text(angle = 45, vjust = 1, 
    size = 12, hjust = 1))+
 coord_fixed()

Idents(integrated) <- "majorType"
markersMega <- FindMarkers(integrated,ident.1 = "Mega", only.pos = T)
markersMega <- markersMega[markersMega$p_val_adj < 0.05, ]
markersMega[order(markersMega$avg_log2FC, decreasing = T), ]
cytokines_Mega_ori_study <- c("PDGFA", "TGFB1", "TNFSF4", "PF4V1", "PF4", "PPBP")
cytokines_Mega_ori_study %in% rownames(markersMega)

Idents(integrated) <- "majorType"
VlnPlot(integrated, features = cytokines_Mega_ori_study, pt.size = 0.1)
Idents(MC.list[["S-S086-2"]]) <- "majorType"

pbmc2 <- pbmc[, pbmc$majorType %in% levels(MC.list[["S-S086-2"]])] #to have the same majorType in MC and single cell object see below
pbmc2$majorType <- factor(pbmc2$majorType,levels = levels(MC.list[["S-S086-2"]]))
Idents(pbmc2) <- "majorType"

VlnPlot(pbmc2, features = cytokines_Mega_ori_study, pt.size = 0.1)

About metacell purity

Neu <- colnames(pbmc[,pbmc$majorType == "Neu"])
Neu
supercells$seuratMC$majorType[supercells$SC$membership[Neu]]

MC_majorTypes <- lapply(X = unique(supercells$seuratMC$majorType), FUN = function(x){SC <- SCimplify(GetAssayData(pbmc[,pbmc$majorType == x],slot = "data"),
       n.pc = 20,
       k.knn = 5, # number of nearest neighbors to build kNN network
       gamma = 20 # graining level
);SC$majorType = rep(x,SC$N.SC);return(SC)})



Neu_MC <- SCimplify(GetAssayData(pbmc[,pbmc$majorType == "Neu"],slot = "data"),
       n.pc = 3,
       k.knn = 4, # number of nearest neighbors to build kNN network
       gamma = 1 # graining level
)

Neu_MC$majorType <- rep("Neu",Neu_MC$N.SC)

MC_majorTypes <- c(MC_majorTypes,list(Neu_MC))

MC_majorTypes2 <- supercell_merge(MC_majorTypes, fields = "majorType")

MC_majorTypes_GE <- supercell_GE(as.matrix(GetAssayData(pbmc)[,names(MC_majorTypes2$membership)]),groups = MC_majorTypes2$membership)

MC_majorTypes2$genes.use <- VariableFeatures(pbmc)

seuratPureMC <- supercell_2_Seurat(SC.GE = MC_majorTypes_GE,
                                   SC = MC_majorTypes2,
                                   fields = c("majorType"))

cell.split.condition option of the SCimplify function that is faster but lead to a less good separation of T, B and NK cells at the metacell level…

seuratPureMC <- RunUMAP(seuratPureMC,dims = c(1:20))
UMAPPlot(seuratPureMC,group.by = "majorType")
pureMC <- SCimplify(GetAssayData(pbmc,slot = "data"),  # normalized gene expression matrix
                 n.pc = 20,
                 k.knn = 5, # number of nearest neighbors to build kNN network
                 gamma = 20, # graining level
                 genes.use = VariableFeatures(pbmc),
                 cell.split.condition = pbmc$majorType)




pureMC[["majorType"]]<- supercell_assign(clusters = pbmc@meta.data[,"majorType"], # single-cell assignment to cell lines (clusters)
                                    supercell_membership = pureMC$membership, # single-cell assignment to super-cells
                                    method = "absolute")


GE <- supercell_GE(as.matrix(GetAssayData(pbmc,slot = "data")),groups = pureMC$membership)


seuratPureMC <- supercell_2_Seurat(SC.GE = GE,
                                   SC = pureMC,
                                   fields = c("majorType"))
                                   
seuratPureMC <- RunUMAP(seuratPureMC,dims = c(1:20))
UMAPPlot(seuratPureMC,group.by = "majorType")                     

seuratPureMC <- ScaleData(seuratPureMC)
seuratPureMC <- RunPCA(seuratPureMC, npcs = 20)
seuratPureMC <- RunUMAP(seuratPureMC,dims = c(1:20))
UMAPPlot(seuratPureMC,group.by = "majorType")

展望

想必跟着今天的代码进行实操的小伙伴就会有更深刻的体会，使用原始数据跑seurat流程和使用supercell整合后的元细胞数据跑seurat流程有天差地别的时间体验。对于supercell整合后的元细胞数据的其它优点，Immugent在前面一篇推文中就已经介绍过了，这里就不再赘述。

此外，我们还需要注意的是并不是所有的单细胞数据分析流程都是适合使用supercell进行整合的，很大一定程度上取决于要解决的科学问题。比如，想通过单细胞数据来找到一群占比很少的稀有细胞，那么就不能使用supercell进行整合，因为这样很可能就会丢失这部分细胞的信息。相反，如果只是想基于单细胞数据分析找到某一种处理或者疾病状态下对主要细胞类型的影响，那使用upercell进行数据整合是非常具有优势的。

好啦，本期分享到这就结束了，我们下期再会~~

ixxmu / mp_duty

Supercell代码实操（二）：COVID-19数据的整合分析 #2877

Supercell代码实操（二）：COVID-19数据的整合分析 by 生信宝库

前言

主要内容

展望

关注下方公众号下回更新不迷路