Closed ixxmu closed 2 years ago
之前我有一个帖子写了R包scMetabolism
,详见 跟着Cancer Discovery学代谢 R包scMetabolism。
但是这个包有一个小问题,只能用来跑两个固定的代谢相关通路。所以,有个朋友希望我能出一个帖子,把这个R包改造一下,适用于跑其他的任何通路。同时他还发现这个包有几个小问题,例如可视化无法修改x轴和y轴的标签顺序,希望我修复一下。
其实修改R包源代码为己用,本身是举一反三的事,本帖也希望能够抛砖引玉。需要强调的是,修改R包源代码是为了更好满足自己的实际需求,切勿剽窃人家的代码!
scMetabolism
的文章出处,原理和环境部署,详见 跟着Cancer Discovery学代谢 R包scMetabolism及作者的原文介绍《Spatiotemporal Immune Landscape of Colorectal Cancer Liver Metastasis at Single-Cell Level》。
网页版工具https://www.cancerdiversity.asia/scMetabolism。
测试数据在:https://figshare.com/articles/dataset/scMetabolism_-_pbmc_demo_rda/13670038
github:https://github.com/wu-yc/scMetabolism
下面是R代码实现:
library(scMetabolism)
library(ggplot2)
library(rsvd)
library(Seurat)
library(dplyr)
读入测试数据:
load(file = "~/tmp_data/pbmc_demo.rda")
DimPlot(countexp.Seurat)
运行scMetabolism
计算代谢通路活性:
## Quantify single-cell metabolism with Seurat (Recommended)
countexp.Seurat<-sc.metabolism.Seurat(obj = countexp.Seurat,
method = "VISION",
imputation = F,
ncores = 2,
metabolism.type = "KEGG")
我们看一下这个函数的底层代码,源代码有一点长,大家可以自行查看。可见源代码的metabolism.type
参数选择“KEGG”或者“REACTOME”即可加载内置的基因list。优点在于非常方便用户使用,缺点在于缺少一点自由度。
如果不会查看函数的底层代码,百度一下吧。
让我们直接来改造这个函数,以便像Y叔的clusterProfiler
富集分析R包那样,能够读入自定义的基因集,这里以50条Hallmark gene list的gmt文件为例:
sc.metabolism.Seurat.2 = function (obj, method = "VISION", imputation = F, ncores = 2,
geneList = "KEGG")
{
countexp <- obj@assays$RNA@counts
countexp <- data.frame(as.matrix(countexp))
signatures_KEGG_metab <- system.file("data", "KEGG_metabolism_nc.gmt",
package = "scMetabolism")
signatures_REACTOME_metab <- system.file("data", "REACTOME_metabolism.gmt",
package = "scMetabolism")
if (geneList == "KEGG") {
gmtFile <- signatures_KEGG_metab
cat("Your choice is: KEGG\n")
}
if (geneList == "REACTOME") {
gmtFile <- signatures_REACTOME_metab
cat("Your choice is: REACTOME\n")
}
##########
if(!geneList %in% c("KEGG","REACTOME")){
gmtFile <- geneList
cat("Custom gene sets by users\n")
}
##########
if (imputation == F) {
countexp2 <- countexp
}
if (imputation == T) {
cat("Start imputation...\n")
cat("Citation: George C. Linderman, Jun Zhao, Yuval Kluger. Zero-preserving imputation of scRNA-seq data using low-rank approximation. bioRxiv. doi: https://doi.org/10.1101/397588 \n")
result.completed <- alra(as.matrix(countexp))
countexp2 <- result.completed[[3]]
row.names(countexp2) <- row.names(countexp)
}
cat("Start quantify the pathway activity...\n")
if (method == "VISION") {
library(VISION)
n.umi <- colSums(countexp2)
scaled_counts <- t(t(countexp2)/n.umi) * median(n.umi)
vis <- Vision(scaled_counts, signatures = gmtFile)
options(mc.cores = ncores)
vis <- analyze(vis)
signature_exp <- data.frame(t(vis@SigScores))
}
if (method == "AUCell") {
library(AUCell)
library(GSEABase)
cells_rankings <- AUCell_buildRankings(as.matrix(countexp2),
nCores = ncores, plotStats = F)
geneSets <- getGmt(gmtFile)
cells_AUC <- AUCell_calcAUC(geneSets, cells_rankings)
signature_exp <- data.frame(getAUC(cells_AUC))
}
if (method == "ssGSEA") {
library(GSVA)
library(GSEABase)
geneSets <- getGmt(gmtFile)
gsva_es <- gsva(as.matrix(countexp2), geneSets, method = c("ssgsea"),
kcdf = c("Poisson"), parallel.sz = ncores)
signature_exp <- data.frame(gsva_es)
}
if (method == "gsva") {
library(GSVA)
library(GSEABase)
geneSets <- getGmt(gmtFile)
gsva_es <- gsva(as.matrix(countexp2), geneSets, method = c("gsva"),
kcdf = c("Poisson"), parallel.sz = ncores)
signature_exp <- data.frame(gsva_es)
}
obj@assays$METABOLISM$score <- signature_exp
obj
}
50条Hallmark gene list的gmt文件下载自GSEA官网http://www.gsea-msigdb.org/gsea/downloads.jsp,放到当前文件夹下,然后运行上述写好的sc.metabolism.Seurat.2
函数即可:
后台回复
单细胞实战
也可以获取我下载好的一些常用的gene list gmt文件。
countexp.Seurat<-sc.metabolism.Seurat.2(obj = countexp.Seurat,
method = "VISION",
imputation = F,
ncores = 1,
geneList = "./h.all.v7.2.symbols.gmt")
随机挑了几条通路进行可视化,看一下运行结果:
input.pathway<-c("HALLMARK_TNFA_SIGNALING_VIA_NFKB",
"HALLMARK_HYPOXIA",
"HALLMARK_CHOLESTEROL_HOMEOSTASIS")
DotPlot.metabolism(obj = countexp.Seurat,
pathway = input.pathway,
phenotype = "ident",
norm = "y")
第二个问题是,这个R包自带的可视化函数,似乎不能够自由调整x轴和y轴的标签顺序,例如上图x轴的celltype是按照首字母排序的,此时同上述改造代码的逻辑一样,只需要:
这里的底层代码非常长,自行查看底层代码,然后我们直接对其进行改造:
看不懂下述代码也没关系,直接用我改好的也行。
DotPlot.metabolism.2 = function (obj, pathway, phenotype, norm = "y")
{
input.norm = norm
input.pathway <- as.character(pathway)
input.parameter <- phenotype
metadata <- obj@meta.data
metabolism.matrix <- obj@assays$METABOLISM$score
metadata[, input.parameter] <- as.character(metadata[, input.parameter])
metabolism.matrix_sub <- t(metabolism.matrix[input.pathway,
])
gg_table <- c()
for (i in 1:length(input.pathway)) {
gg_table <- rbind(gg_table, cbind(metadata[, input.parameter],
input.pathway[i], metabolism.matrix_sub[, i]))
}
gg_table <- data.frame(gg_table)
gg_table_median <- c()
input.group.x <- unique(as.character(gg_table[, 1]))
input.group.y <- unique(as.character(gg_table[, 2]))
for (x in 1:length(input.group.x)) {
for (y in 1:length(input.group.y)) {
gg_table_sub <- subset(gg_table, gg_table[, 1] ==
input.group.x[x] & gg_table[, 2] == input.group.y[y])
gg_table_median <- rbind(gg_table_median, cbind(input.group.x[x],
input.group.y[y], median(as.numeric(as.character(gg_table_sub[,
3])))))
}
}
gg_table_median <- data.frame(gg_table_median)
gg_table_median[, 3] <- as.numeric(as.character(gg_table_median[,
3]))
gg_table_median_norm <- c()
input.group.x <- unique(as.character(gg_table[, 1]))
input.group.y <- unique(as.character(gg_table[, 2]))
range01 <- function(x) {
(x - min(x))/(max(x) - min(x))
}
if (input.norm == "y")
for (y in 1:length(input.group.y)) {
gg_table_median_sub <- subset(gg_table_median, gg_table_median[,
2] == input.group.y[y])
norm_value <- range01(as.numeric(as.character(gg_table_median_sub[,
3])))
gg_table_median_sub[, 3] <- norm_value
gg_table_median_norm <- rbind(gg_table_median_norm,
gg_table_median_sub)
}
if (input.norm == "x")
for (x in 1:length(input.group.x)) {
gg_table_median_sub <- subset(gg_table_median, gg_table_median[,
1] == input.group.x[x])
norm_value <- range01(as.numeric(as.character(gg_table_median_sub[,
3])))
gg_table_median_sub[, 3] <- norm_value
gg_table_median_norm <- rbind(gg_table_median_norm,
gg_table_median_sub)
}
if (input.norm == "na")
gg_table_median_norm <- gg_table_median
gg_table_median_norm <- data.frame(gg_table_median_norm)
gg_table_median_norm[, 3] <- as.numeric(as.character(gg_table_median_norm[,
3]))
library(wesanderson)
pal <- wes_palette("Zissou1", 100, type = "continuous")
if(is.factor(pathway)){
gg_table_median_norm$X2 = factor(gg_table_median_norm$X2 ,levels = levels(pathway))
}
if(is.factor(countexp.Seurat@meta.data[,phenotype])){
gg_table_median_norm$X1 = factor(gg_table_median_norm$X1 ,
levels = levels(countexp.Seurat@meta.data[,phenotype]))
}
ggplot(data = gg_table_median_norm, aes(x = gg_table_median_norm[,
1], y = gg_table_median_norm[, 2], color = gg_table_median_norm[,
3])) + geom_point(data = gg_table_median_norm, aes(size = gg_table_median_norm[,
3])) + ylab("Metabolic Pathway") + xlab(input.parameter) +
theme_bw() + theme(axis.text.x = element_text(angle = 45,
hjust = 1), panel.grid.minor = element_blank(), panel.grid.major = element_blank()) +
scale_color_gradientn(colours = pal) + labs(color = "Value",
size = "Value") + NULL
}
然后进行x和y轴的因子水平设置:
### y轴
input.pathway<-c("HALLMARK_TNFA_SIGNALING_VIA_NFKB",
"HALLMARK_HYPOXIA",
"HALLMARK_CHOLESTEROL_HOMEOSTASIS") %>%
factor(levels = c("HALLMARK_TNFA_SIGNALING_VIA_NFKB",
"HALLMARK_CHOLESTEROL_HOMEOSTASIS",
"HALLMARK_HYPOXIA"))
### x轴
countexp.Seurat$ident = factor(countexp.Seurat$ident,
levels = c("Naive CD4 T",
"Memory CD4 T",
"CD14+ Mono",
"B",
"CD8 T",
"FCGR3A+ Mono",
"NK",
"DC",
"Platelet"))
在ggplot2中因子水平即代表可视化图表的最终标签的顺序:
DotPlot.metabolism2(obj = countexp.Seurat,
pathway = input.pathway,
phenotype = "ident",
norm = "y")
如上可见,x轴和y轴按照我想要的顺序进行了排列。
- END -
https://mp.weixin.qq.com/s/nSXm3gDBu9be2vogFnYxbQ