ixxmu / mp_duty

抓取网络文章到github issues保存
https://archives.duty-machine.now.sh/
127 stars 32 forks source link

ComplexHeatmap| 三、用富集分析结果做成词云注释热图 #2806

Closed ixxmu closed 2 years ago

ixxmu commented 2 years ago

https://mp.weixin.qq.com/s/_1MIoSoq5EC1wvssC51ZXQ

ixxmu commented 2 years ago

ComplexHeatmap| 三、用富集分析结果做成词云注释热图 by Biomamba 生信基地

零、写在前面

其实原作者已经开发出了anno_GO_keywords()一方面这个功能还在调试中(https://github.com/jokergoo/KeywordsEnrichment);另一方面anno_GO_keywords()只支持GO富集分析大家肯定也有做KEGG注释的需求,因此我就DIY了这么一个流程,可以在热图中以词云的形式注释上基因群富集分析的结果,并且可以按照富集的显著性调整字体大小


测试文件下载链接:

不要再说我的链接失效啦,我也不知道为啥腾讯和百度云不对付,微信页面或腾讯浏览器时常打不开百度云链接,链接最好在谷歌浏览器中打开,真的失效了再提醒客服更换链接哦。也不要说我的链接里没代码,html格式的文件大家用浏览器打开就可以看到整理好的笔记。

https://pan.baidu.com/s/1QU2gNom74wdBf3fnwT-QyQ?pwd=g5ef 



一、基本数据情况
这是一个normalized后的RNA-Seq表达矩阵,正好可以用来做此次的练习数据。
#数据:
mydata <- read.csv('mydata.csv',header = T,row.names = 1)
head(mydata)
## GroupA_1 GroupA_2 GroupA_3 GroupB_1 GroupB_2 GroupB_3
## Tbc1d19 7.749670 7.934043 7.785975 7.886533 6.771363 7.735374
## Cfc1 2.010601 2.023136 2.063938 3.473182 2.079826 2.448522
## Foxo3 5.173732 5.356384 5.217325 5.392354 5.810170 6.120307
## Spag9 7.747689 7.870940 7.996499 7.491094 7.949187 7.735845
## RGD1311899 11.910350 11.832828 11.772460 11.628933 12.640646 12.273074
## Olr390 7.750523 7.739519 7.860533 8.587599 8.408102 7.996499
#分组信息就藏在样本名里啦:
library(dplyr)
##
## 载入程辑包:'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
group4diff <- strsplit(colnames(mydata),'_') %>%
lapply(., function(x){unlist(x)[1]}) %>% unlist()
group4diff
## [1] "GroupA" "GroupA" "GroupA" "GroupB" "GroupB" "GroupB"

二、差异分析
library(limma)
mod <- model.matrix(~ factor(group4diff))
colnames(mod) <- c(unique(group4diff)[1],
paste0(unique(group4diff)[1],'_vs_',
unique(group4diff)[2]))
colnames(mod)
## [1] "GroupA" "GroupA_vs_GroupB"
fit <- lmFit(mydata, mod)
fit <- eBayes(fit)
res <- decideTests(fit, p.value=0.01)
summary(res)
## GroupA GroupA_vs_GroupB
## Down 0 1912
## NotSig 52 16479
## Up 20218 1879
tt <- topTable(fit, coef=2, n=Inf)
head(tt)
## logFC AveExpr t P.Value adj.P.Val B
## Lipg 5.981587 4.929710 99.85876 9.574700e-10 1.940792e-05 11.06382
## Pde6a.1 6.264585 5.045929 81.06664 2.831943e-09 2.870174e-05 10.72319
## Adra1b 7.916472 8.844697 65.71015 8.440524e-09 4.610638e-05 10.26517
## Lgi2 -5.866347 4.905960 -64.76827 9.098446e-09 4.610638e-05 10.22902
## Tmem9 4.004675 9.735786 60.08272 1.344310e-08 4.700143e-05 10.03094
## Adam19.1 3.654913 11.428474 59.68721 1.391261e-08 4.700143e-05 10.01270
res.name <- paste0(unique(group4diff)[2],'vs',
unique(group4diff)[1],'.csv')
res.name
## [1] "GroupBvsGroupA.csv"
write.csv(tt,res.name)


三、浅画个火山图看看
m <- tt
m$Gene <- rownames(m)
m$type[m$P.Value < 0.05 & m$logFC > 1] = "up"
m$type[m$P.Value < 0.05 & m$logFC < -1] = "down"
m$type[m$P.Value < 0.05 & abs(m$logFC) <=1] = "non" ##abs绝对值
m$type[m$P.Value >= 0.05] = "non"
library(ggplot2)
library(ggthemes)
m$P.Value[-log10(m$P.Value)>200]<-10**(-200)#将p值超出范围的值同意等于200
p<-ggplot(m,aes(x=logFC,y=-1*log10(P.Value),colour=type))+
xlab("log2(Fold Change)")+
ylab("-log10(P.Value)")+
geom_point(size=2,alpha=0.6)+ # 设定点的大小
scale_color_manual(values =c("blue","grey","red"))+ #设定上下调颜色
geom_hline(yintercept=1.30103,linetype=3)+ # 增加水平间隔线
geom_vline(xintercept=c(-1,1),linetype=3)+ #增加垂直间隔线
theme_few()+ # 去掉网格背景
theme(legend.title = element_blank()) # 去掉图注标签
p

#标上上下调最显著的五个基因
top_m <- m %>% group_by(type) %>% top_n(n = 5, wt = -1*log10(P.Value)) %>% filter(type !='non') # 取上下调中显著差异表达基因前5个

up.p <- p + geom_text(data=top_m,aes(x=logFC, y=-1*log10(P.Value),label=as.character(Gene)),size=3)
print(up.p)


四、画热图,画热图,画热图
  • 4.1 基础的pheatmap

这部分内容我们也画专门的篇幅讲解过,可以看B站视频教程(企业级pheatmap 手把手教你画热图),与推送(企业级pheatmap教程)
suppressMessages(library(dplyr))
heatmapGene <- filter(m,adj.P.Val<0.05) %>%
filter(type %in% c('down','up')) %>%
group_by(type) %>% top_n(100,wt = logFC)

library(pheatmap)
gene4hmp <- heatmapGene$Gene
pheatmap(as.matrix(mydata[gene4hmp,]),scale = 'row',
fontsize_row = 2,cellheight = 3)
  • 4.2 ComplexHeatmap

  • 4.2.1 画一个单调的热图

suppressMessages(library(ComplexHeatmap))
Heatmap(mydata[gene4hmp,])#显然这个过于单调,啥也看不出来
## Warning: The input is a data frame-like object, convert it to a matrix.

#我们按行scale突出组间的差别而非基因间的差异
hmp.body <- Heatmap(t(scale(t(mydata[gene4hmp,]))),
column_title = "I'm a ComplexHeatmap designed by Biomamba" ,
row_km = 2,
column_split = c(rep('GroupA',3), rep('GroupB',3)),
border = TRUE)
draw(hmp.body)
加上分组变量的注释
ha = HeatmapAnnotation(foo = anno_block(gp = gpar(fill = 2:3),
labels = c('GroupA', 'GroupB'),#标签的值
labels_gp = gpar(col = "white", fontsize = 10))
)


Heatmap(t(scale(t(mydata[gene4hmp,]))),#按行scale一下,突出组间的差别而非基因间的
column_title = "I'm a ComplexHeatmap designed by Biomamba" ,
row_km = 2,
column_split = c(rep('GroupA',3), rep('GroupB',3)),
border = TRUE,
top_annotation = ha)
  • 4.2.2 基因标注

从上图可以看出,差异基因有时过多,字大了叠在一起,字小了又看不清,那就挑几个重要的标注一下吧。
这里先做一步富集分析,没有这部分知识的同学可以看这里:
基因集富集分析
#富集分析,这是一个大鼠的基因集
suppressMessages(library(org.Rn.eg.db))
suppressMessages(library(clusterProfiler))
gene.df <- bitr(gsub('\\.\\d$','',heatmapGene$Gene),fromType="SYMBOL",
toType=c("ENTREZID","ENSEMBL"),
OrgDb = org.Rn.eg.db)
## 'select()' returned 1:1 mapping between keys and columns
## Warning in bitr(gsub("\\.\\d$", "", heatmapGene$Gene), fromType = "SYMBOL", :
## 3.19% of input gene IDs are fail to map...
ekegg <- enrichKEGG(unique(gene.df$ENTREZID), organism='rno',
pvalueCutoff=0.05,pAdjustMethod='BH',
minGSSize=10,maxGSSize=500,use_internal_data=F)
## Reading KEGG annotation online:
## Warning in utils::download.file(url, quiet = TRUE, method = method, ...): the
## 'wininet' method is deprecated for http:// and https:// URLs
## Reading KEGG annotation online:
## Warning in utils::download.file(url, quiet = TRUE, method = method, ...): the
## 'wininet' method is deprecated for http:// and https:// URLs
ekegg <- setReadable(ekegg,'org.Rn.eg.db','ENTREZID')
my.kegg.res <- as.data.frame(ekegg@result)
head(my.kegg.res)
## ID Description
## rno00603 rno00603 Glycosphingolipid biosynthesis - globo and isoglobo series
## rno04610 rno04610 Complement and coagulation cascades
## rno04270 rno04270 Vascular smooth muscle contraction
## rno04350 rno04350 TGF-beta signaling pathway
## rno04934 rno04934 Cushing syndrome
## rno04530 rno04530 Tight junction
## GeneRatio BgRatio pvalue p.adjust qvalue
## rno00603 2/90 16/8964 0.01091743 0.514412 0.4912998
## rno04610 4/90 87/8964 0.01123072 0.514412 0.4912998
## rno04270 5/90 139/8964 0.01279988 0.514412 0.4912998
## rno04350 4/90 94/8964 0.01459563 0.514412 0.4912998
## rno04934 5/90 158/8964 0.02115092 0.514412 0.4912998
## rno04530 5/90 167/8964 0.02611845 0.514412 0.4912998
## geneID Count
## rno00603 Sec1/B3galt5 2
## rno04610 A2m/Serpinc1/C8b/Serpina1 4
## rno04270 Adra1b/Calml3/Gucy1b1/Avpr1a/Agtr1a 5
## rno04350 Cdkn2b/Fst/Nbl1/Tgif2 4
## rno04934 Cdkn1a/Cdkn2b/Wnt10b/Agtr1a/Scarb1 5
## rno04530 Cldn2/Runx1/Cldn4/Tuba8/Epb41l4b 5
#就挑富集分析里最显著通路包含的基因
marker.gene <- top_n(my.kegg.res,1, GeneRatio) %>%
pull(geneID) %>% .[1] %>%
strsplit(.,'/') %>% unlist()
marker.gene
## [1] "Adra1b" "Tac4" "Gal" "Chrng" "Chrnd" "Chrna1" "Avpr1a" "Agtr1a"
#加上标注:
Heatmap(t(scale(t(mydata[gene4hmp,]))),
column_title = "I'm a ComplexHeatmap designed by Biomamba" ,#加上标题
row_km = 2,#聚为两类
column_split = c(rep('GroupA',3), rep('GroupB',3)),#按照分组拆分
border = TRUE,#加上边框
top_annotation = ha,#加上顶部注释
right_annotation = rowAnnotation(
most.enrich.gene= anno_mark(at =
(1:length(heatmapGene$Gene))[heatmapGene$Gene %in% marker.gene]
, labels = marker.gene)),#加上指定基因的标注
show_row_names = F #堆叠的行名咱就不要了
)
  • 4.2.3 制作文本注释

#上调基因富集到最显著的10个通路
#下调基因富集到的最显著的10个通路
upgene <- filter(heatmapGene,type=='up') %>% pull(Gene)
up.gene.df <- bitr(gsub('\\.\\d$','',upgene),fromType="SYMBOL",
toType=c("ENTREZID","ENSEMBL"),
OrgDb = org.Rn.eg.db)
## 'select()' returned 1:1 mapping between keys and columns
up.ekegg <- enrichKEGG(unique(up.gene.df$ENTREZID), organism='rno',
pvalueCutoff=0.05,pAdjustMethod='BH',
minGSSize=10,maxGSSize=500,use_internal_data=F)


up.ekegg <- setReadable(up.ekegg,'org.Rn.eg.db','ENTREZID')
my.up.ekegg.res <- as.data.frame(up.ekegg@result)
head(my.up.ekegg.res)
## ID Description
## rno00603 rno00603 Glycosphingolipid biosynthesis - globo and isoglobo series
## rno05214 rno05214 Glioma
## rno05226 rno05226 Gastric cancer
## rno05220 rno05220 Chronic myeloid leukemia
## rno04744 rno04744 Phototransduction
## rno00601 rno00601 Glycosphingolipid biosynthesis - lacto and neolacto series
## GeneRatio BgRatio pvalue p.adjust qvalue
## rno00603 2/42 16/8964 0.002467033 0.157417 0.1464344
## rno05214 3/42 74/8964 0.004921876 0.157417 0.1464344
## rno05226 4/42 151/8964 0.005268313 0.157417 0.1464344
## rno05220 3/42 78/8964 0.005701901 0.157417 0.1464344
## rno04744 2/42 26/8964 0.006486425 0.157417 0.1464344
## rno00601 2/42 29/8964 0.008031452 0.157417 0.1464344
## geneID Count
## rno00603 Sec1/B3galt5 2
## rno05214 Cdkn1a/Shc3/Calml3 3
## rno05226 Cdkn1a/Cdkn2b/Shc3/Wnt10b 4
## rno05220 Runx1/Cdkn1a/Shc3 3
## rno04744 Pde6a/Calml3 2
## rno00601 Sec1/B3galt5 2
up.pathway <- top_n(my.up.ekegg.res,-10, pvalue)


downgene <- filter(heatmapGene,type=='down') %>% pull(Gene)
down.gene.df <- bitr(gsub('\\.\\d$','',downgene),fromType="SYMBOL",
toType=c("ENTREZID","ENSEMBL"),
OrgDb = org.Rn.eg.db)
## 'select()' returned 1:1 mapping between keys and columns
down.ekegg <- enrichKEGG(unique(down.gene.df$ENTREZID), organism='rno',
pvalueCutoff=0.05,pAdjustMethod='BH',
minGSSize=10,maxGSSize=500,use_internal_data=F)


down.ekegg <- setReadable(down.ekegg,'org.Rn.eg.db','ENTREZID')
my.down.ekegg.res <- as.data.frame(down.ekegg@result)
head(my.down.ekegg.res)
## ID Description GeneRatio BgRatio
## rno05415 rno05415 Diabetic cardiomyopathy 5/48 214/8964
## rno00020 rno00020 Citrate cycle (TCA cycle) 2/48 31/8964
## rno00620 rno00620 Pyruvate metabolism 2/48 46/8964
## rno04714 rno04714 Thermogenesis 4/48 237/8964
## rno04270 rno04270 Vascular smooth muscle contraction 3/48 139/8964
## rno04913 rno04913 Ovarian steroidogenesis 2/48 58/8964
## pvalue p.adjust qvalue geneID Count
## rno05415 0.005501832 0.4702128 0.4601589 Uqcr11/Ndufb4/Pdha1/Gys1/Agtr1a 5
## rno00020 0.011827091 0.4702128 0.4601589 Cs/Pdha1 2
## rno00620 0.025021909 0.4702128 0.4601589 Pdha1/Adh4 2
## rno04714 0.037417089 0.4702128 0.4601589 Uqcr11/Ndufb4/Coa7/Ndufaf4 4
## rno04270 0.038018231 0.4702128 0.4601589 Gucy1b1/Avpr1a/Agtr1a 3
## rno04913 0.038380010 0.4702128 0.4601589 Scarb1/Acot4 2
down.pathway <- top_n(my.down.ekegg.res,-10, pvalue)

#构建文本注释
text = list('up'=up.pathway$Description,'down'=down.pathway$Description)
text#就是一个list
## $up
## [1] "Glycosphingolipid biosynthesis - globo and isoglobo series"
## [2] "Glioma"
## [3] "Gastric cancer"
## [4] "Chronic myeloid leukemia"
## [5] "Phototransduction"
## [6] "Glycosphingolipid biosynthesis - lacto and neolacto series"
## [7] "Neuroactive ligand-receptor interaction"
## [8] "Estrogen signaling pathway"
## [9] "Mineral absorption"
## [10] "Adrenergic signaling in cardiomyocytes"
## [11] "Breast cancer"
##
## $down
## [1] "Diabetic cardiomyopathy" "Citrate cycle (TCA cycle)"
## [3] "Pyruvate metabolism" "Thermogenesis"
## [5] "Vascular smooth muscle contraction" "Ovarian steroidogenesis"
## [7] "Long-term depression" "Spinocerebellar ataxia"
## [9] "Amyotrophic lateral sclerosis" "Glycolysis / Gluconeogenesis"
## [11] "Cortisol synthesis and secretion"
library(ggsci)#答应大家的以后配色都用ggsci解决
Heatmap(t(scale(t(mydata[gene4hmp,]))),#按行scale一下,突出组间的差别而非基因间的
column_title = "I'm a ComplexHeatmap designed by Biomamba" ,
row_km = 2,
column_split = c(rep('GroupA',3), rep('GroupB',3)),
border = TRUE,
top_annotation = ha,
right_annotation = rowAnnotation(
most.enrich.gene= anno_mark(at =
(1:length(heatmapGene$Gene))[heatmapGene$Gene %in% marker.gene], labels = marker.gene)),
show_row_names = F,
left_annotation = rowAnnotation(
textbox = anno_textbox(heatmapGene$type,#词云注释的'break
text, #文本
by = "anno_block",
add_new_line = T,#让每一种通路都单独成行展示
gp = gpar(col = c(pal_npg()(10),pal_nejm()(10))#ggsci配色
),
round_corners = TRUE#圆角文本矩形
)
)
)

还不能让人满意,我希望能够按显著性给这些字加上大小
myfontsize = c(-log2(up.pathway$pvalue),-log2(up.pathway$pvalue))*1.7 #按显著性分配字体大小,约显著的通路字体越大
fln.hmp <- Heatmap(t(scale(t(mydata[gene4hmp,]))),#按行scale一下,突出组间的差别而非基因间的
column_title = "I'm a ComplexHeatmap designed by Biomamba" ,
row_km = 2,
column_split = c(rep('GroupA',3), rep('GroupB',3)),
border = TRUE,
top_annotation = ha,
right_annotation = rowAnnotation(
most.enrich.gene= anno_mark(at = (1:length(heatmapGene$Gene))[heatmapGene$Gene %in% marker.gene]
, labels = marker.gene)),
show_row_names = F,
left_annotation = rowAnnotation(
textbox = anno_textbox(heatmapGene$type, text,
by = "anno_block",
add_new_line = T,
gp = gpar(col = c(pal_npg()(10),pal_nejm()(8),pal_aaas()(4))
,fontsize=myfontsize),#输入fontsize,调整字体大小
round_corners = TRUE,
max_width= unit(3,'npc'))
)
)
draw(fln.hmp)



往期回顾

热图绘制天花板:ComplexHeatmap| 一、初探

热图绘制天花板| 二、两万字长文探索注释功能



如何联系我们


公众号后台消息更新不及时,超过48h便不允许回复读者消息,这里给大家留一下领取资料、免费服务器的微信号,方便随时交流、提建议(由助理接待)。此外呼声一直很高的交流群也建好了,欢迎大家入群讨论:
永久免费的千人生信、科研交流群

大家可以阅读完这几篇之后添加
如何搜索公众号过往发布内容
您点的每个赞,我都认真当成了喜欢