Closed ixxmu closed 2 years ago
上一周,我们做完差异分析后,看到了一个p值窜天高的现象,当时的我感觉它很新奇,具体链接见P值竟窜天高,我的流程错了吗。但这些天,我是出奇的幸运,曾老师根据差异基因数量随手一指,我竟又获得了一个P值串天高的数据集GSE186262。于是,我就想总结一下P值窜天高现象背后隐藏的内容,并且尝试探究P值窜天高现象出现的原因。起初的我只考虑到从看看P值窜天高的样本分组相关性如何、在不同分组的表达情况以及差异基因数量「三个角度去看」,曾老师告诉我,可以看看这些P值窜天高差异基因的通路富集情况,看看其富集的通路是否有什么特点,于是我又喜添一个新颖角度。所以,接下来,我们从上述谈到的「四个角度开始探究吧」。
今天,我们使用标题为 RNA-seq analysis of triple negative breast cancer cell line MDA-MB-468 with knockdown of cytokeratin 5 (CK5) 的数据集GSE186262进行探究,数据集的介绍链接如下 https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE186262,感兴趣的小伙伴可以进点进去看作者的总体设计。想具体了解文章内容的,可以看看文章 Cytokeratins 5 and 17 Maintain an Aggressive Epithelial State in Basal-Like Breast Cancer,从文章中去了解一下CK5与三阴乳腺癌细胞侵袭的关系。
GSE186262数据集的样本分组如下, 3个样本是在三阴乳腺癌细胞系中敲低无义RNA作为对照组,3个样本是在三阴乳腺癌细胞系中敲低CK5作为实验组处理数据的话,作者上传了 Ensenmble count矩阵,我们就可以直接走上次Ensenmble count数据处理的差异分析流程进行分析,链接见 转录组数据三分组的差异分析方式 。
以下为GSE186262数据集 Ensenmble count矩阵的下载链接:https://www.ncbi.nlm.nih.gov/geo/download/?acc=GSE186262&format=file&file=GSE186262%5FGene%5Fraw%5Fread%5Fcounts%5F468%2Etxt%2Egz感兴趣的小伙伴可以下载下来,跑跑试试。
rm(list = ls())
library(edgeR)
library(DESeq2)
library(FactoMineR)
library(factoextra)
library(clusterProfiler)
library(org.Hs.eg.db)
library(org.Mm.eg.db)
library(stringr)
library(stringi)
library(tidyverse)
library(ggplot2)
library(patchwork)
library(pheatmap)
library(VennDiagram)
library(RColorBrewer)
library(patchwork)
library(ggplotify)
rawcount <- read.table("./all.id.txt",header = T,sep="\t",row.names = 1)
rawcount <- rawcount[,6:11] #根据列名,提取表达列与命名列名
colnames(rawcount)=c("shCon1","shCon2","shCon3","shCK51","shCK52","shCK53")
colnames(rawcount)
## [1] "shCon1" "shCon2" "shCon3" "shCK51" "shCK52" "shCK53"
rawcount[1:4,1:4]
## shCon1 shCon2 shCon3 shCK51
## ENSG00000186827 2 0 2 0
## ENSG00000186891 4 12 12 0
## ENSG00000160072 2327 2733 2752 1645
## ENSG00000260179 8 14 6 12
rawcount$median <- apply(rawcount,1,median) #先排序而后降重,保留最大值,一降版本号,二降多个ID对应一个Ensemble ID
rawcount$Ensmble=rownames(rawcount)
rawcount=rawcount[order(rawcount$Ensmble,rawcount$median,decreasing = T),]
rawcount=rawcount[!duplicated(rawcount$Ensmble),]#排序后,降EnsemleID的重,降完EnsembleID转换
id2symbol <- bitr(rownames(rawcount), ##转换ID,22%有没有对应的symbole。
fromType = "ENSEMBL",
toType = "SYMBOL",
OrgDb = org.Hs.eg.db)
rawcount <- cbind(GeneID=rownames(rawcount),rawcount) #多一列换成基因ID
rawcount <- merge(id2symbol,rawcount,
by.x="ENSEMBL",by.y="GeneID",all.y=T) #有没匹配的,需去除NA
rawcount=rawcount[!duplicated(rawcount$SYMBOL),]
rawcount <- drop_na(rawcount) #去除含有NA值的行
rawcount <- rawcount[,!colnames(rawcount)%in%c("ENSEMBL","Ensmble","median")]#正式获得基因rawcount矩阵
rownames(rawcount) <- rawcount$SYMBOL
rawcount <- rawcount[,2:7]
keep <- rowSums(rawcount>0) >= floor(0.75*ncol(rawcount))
filter_count <- rawcount[keep,] #获得filter_count矩阵
express_cpm <- log2(cpm(filter_count)+ 1)
express_cpm[1:4,1:4] #获得cpm矩阵
## shCon1 shCon2 shCon3 shCK51
## TSPAN6 3.894868 3.909638 3.898333 4.973152
## DPM1 6.094633 6.023308 6.107927 6.271558
## SCYL3 3.226588 3.156117 3.333095 3.365107
## C1orf112 4.977671 4.996168 4.777207 4.675710
#根据列的信息,提取分组信息
colnames(rawcount)
## [1] "shCon1" "shCon2" "shCon3" "shCK51" "shCK52" "shCK53"
group=rep(c("shCon","shCK5"),each=3)
group
## [1] "shCon" "shCon" "shCon" "shCK5" "shCK5" "shCK5"
group_list=factor(group,levels = c("shCon","shCK5"))
table(group_list)#检查一下组别数量
## group_list
## shCon shCK5
## 3 3
## 01绘制箱线图
exprSet <- express_cpm
data <- data.frame(expression=c(exprSet),
sample=rep(colnames(exprSet),each=nrow(exprSet)))
p1 <- ggplot(data = data,aes(x=sample,y=expression,fill=sample))+ geom_boxplot() + theme(axis.text.x = element_text(angle = 90)) +
xlab(NULL) + ylab("log2(CPM+1)")+theme_bw()
## 02绘制PCA图
dat <- express_cpm
dat <- as.data.frame(t(dat))
dat <- na.omit(dat)
dat$group_list <- group_list
dat_pca <- PCA(dat[,-ncol(dat)], graph = FALSE)#画图仅需数值型数据,去掉最后一列的分组信息
p2 <- fviz_pca_ind(dat_pca,
geom.ind = "point", # 只显示点,不显示文字
col.ind = dat$group_list, # 用不同颜色表示分组
palette = c("#00AFBB", "#E7B800"),
addEllipses = T, # 是否圈起来,少于4个样圈不起来
legend.title = "Groups") + theme_bw()
p1+p2
## p3CorPheatmap(样本相关性热图)
exprSet <- express_cpm
library(corrplot)
# 先计算相关性
M <- cor(exprSet)
# 绘制样本相关性的热图
library(pheatmap)
anno <- data.frame(sampleType=group_list)
rownames(anno) <- colnames(exprSet)
p3 <- pheatmap::pheatmap(M,display_numbers = T,
annotation_col = anno,
fontsize = 12,cellheight = 30,
cellwidth = 30,cluster_rows = T,
cluster_cols = T)
p3
重点看head对应的P值为零的情况以及P值为零的原始count值表达情况,从中可以看出上调基因组中的shCon组相较于shCK5存在巨大的差异。table(DEG_edgeR$regulated)中有3470个下调与4554个上调;20000多个基因竟然有将近8000个差异基因,这说明大量P值为零的话,存在大量差异基(超过5000个)以及P值为零时正常组与疾病组基因表达量存在显著差异,两组中可能很多表达为零或者表达量显著下调。
exprSet <- filter_count
design <- model.matrix(~0+rev(factor(group_list)))
rownames(design) <- colnames(exprSet)
colnames(design) <- levels(factor(group_list))
DEG <- DGEList(counts=exprSet, #构建edgeR的DGEList对象
group=factor(group_list))
DEG$samples$lib.size
## [1] 68177629 72779922 76340908 60835268 67425139 63200342
DEG <- calcNormFactors(DEG)
DEG$samples$norm.factors
## [1] 0.9629136 0.9504745 0.9445087 1.0515069 1.0510786 1.0466920
# 计算线性模型的参数
DEG <- estimateGLMCommonDisp(DEG,design)
DEG <- estimateGLMTrendedDisp(DEG, design)
DEG <- estimateGLMTagwiseDisp(DEG, design)
# 拟合线性模型
fit <- glmFit(DEG, design)
lrt <- glmLRT(fit, contrast=c(1,-1))
# 提取过滤差异分析结果
DEG_edgeR <- as.data.frame(topTags(lrt, n=nrow(DEG)))
head(DEG_edgeR)
## logFC logCPM LR PValue FDR
## MGRN1 -13.52685 5.335601 10459.305 0 0
## ADAMTS12 -13.36525 5.963147 13911.774 0 0
## TCEAL9 -13.33841 5.148001 7789.511 0 0
## COL6A1 -13.05175 7.776522 26250.568 0 0
## BEX3 -12.93606 5.535200 10792.648 0 0
## IGFBP1 -12.64965 4.462320 4949.084 0 0
# 设定阈值,筛选显著上下调差异基因
fc <- 2.0
p <- 0.05
DEG_edgeR$regulated <- ifelse(DEG_edgeR$logFC>log2(fc)&DEG_edgeR$PValue<p,
"up",ifelse(DEG_edgeR$logFC<(-log2(fc))&DEG_edgeR$PValue<p,"down","normal"))
## 取出其p值为零的情况,看其整体表达情况
head(DEG_edgeR[DEG_edgeR$regulated=='up',])
## logFC logCPM LR PValue FDR regulated
## CLCA2 8.002057 3.339633 2229.040 0 0 up
## IGFBP5 7.703890 8.270502 23674.848 0 0 up
## S100A8 7.614268 5.160717 6688.843 0 0 up
## FRY 7.433570 4.839635 7398.049 0 0 up
## APCDD1 7.370616 4.597174 5841.049 0 0 up
## EN1 7.331482 2.485849 1617.960 0 0 up
pgene=c('CLCA2','IGFBP5','S100A8','FRY','APCDD1','EN1')
filter_count[rownames(filter_count)%in%pgene,]
## shCon1 shCon2 shCon3 shCK51 shCK52 shCK53
## FRY 24 19 25 3628 4117 3741
## IGFBP5 189 198 223 38124 43886 41643
## CLCA2 7 4 5 1404 1313 1352
## S100A8 28 26 21 4814 5055 4472
## APCDD1 19 25 16 2998 3430 3281
## EN1 2 12 0 721 765 762
table(DEG_edgeR$regulated)
##
## down normal up
## 3470 11207 4554
## 01绘制普通版火山图
data <- DEG_edgeR
colnames(data)
## [1] "logFC" "logCPM" "LR" "PValue" "FDR" "regulated"
library(ggplot2)
p4 <- ggplot(data=data, aes(x=logFC, y=-log10(PValue),color=regulated)) +
geom_point(alpha=0.5, size=1.8) +
theme_set(theme_set(theme_bw(base_size=20))) +
xlab("log2FC") + ylab("-log10(Pvalue)") +
scale_colour_manual(values = c('blue','black','red'))+theme_bw()
## 02绘制edgeR利用p值进行差异分析的热图
edgeR1_sigGene1 <- DEG_edgeR[DEG_edgeR$regulated!="normal",]
edgeR1_sigGene1 <-rownames(edgeR1_sigGene1)
dat1 <- express_cpm[match(edgeR1_sigGene1,rownames(express_cpm)),]
dat1[1:4,1:4]
## shCon1 shCon2 shCon3 shCK51
## MGRN1 6.289553 6.268774 6.276105 0.02352198
## ADAMTS12 6.928395 6.874852 6.899380 0.04666660
## TCEAL9 6.055717 6.073327 6.144717 0.00000000
## COL6A1 8.690808 8.705545 8.728090 0.00000000
group1 <- data.frame(group=group_list)
rownames(group1)=colnames(dat1)
# 绘制热图
library(pheatmap)
p5 <- pheatmap(dat1,scale = "row",show_colnames =T,show_rownames = F,
cluster_cols = T,
annotation_col=group1,
main = "edgeR's DEG",
color = colorRampPalette(colors = c("blue","white","red"))(100))
p5 <- as.ggplot(p5)
(plot_spacer()/p4+plot_layout(heights = c(1, 8)))|(p5)
## 00.清空环境,加载包
library(clusterProfiler)
library(org.Hs.eg.db)
R.utils::setOption( "clusterProfiler.download.method",'auto' ) #富集必须
## 01.输入数据准备
DEG_edgeR1=DEG_edgeR
edgeR1_sigGene <- DEG_edgeR1[DEG_edgeR1$regulated!="normal",]
DEG <- rownames(edgeR1_sigGene)
gene_all <- rownames(filter_count)
keytypes(org.Hs.eg.db) #可看具体能够转换的类型
## [1] "ACCNUM" "ALIAS" "ENSEMBL" "ENSEMBLPROT" "ENSEMBLTRANS"
## [6] "ENTREZID" "ENZYME" "EVIDENCE" "EVIDENCEALL" "GENENAME"
## [11] "GENETYPE" "GO" "GOALL" "IPI" "MAP"
## [16] "OMIM" "ONTOLOGY" "ONTOLOGYALL" "PATH" "PFAM"
## [21] "PMID" "PROSITE" "REFSEQ" "SYMBOL" "UCSCKG"
## [26] "UNIPROT"
## 02将Symble ID转换为Entreze ID
allID <- bitr(gene_all, fromType = "SYMBOL",
toType = c( "ENTREZID" ),
OrgDb = org.Hs.eg.db )
degID <- bitr(DEG, fromType = "SYMBOL",
toType = c( "ENTREZID" ),
OrgDb = org.Hs.eg.db )
## 03富集并绘制差异基因的KEGG富集结果与GO富集结果
enrich <- enrichKEGG(gene =degID[,2],
organism='hsa',
universe=allID[,2],
pvalueCutoff=1,
qvalueCutoff=1)
p6 <- barplot(enrich,showCategory=10,title="KEGG Pathway",
colorBy="p.adjust",font.size = 21,label_format = 15)+scale_fill_gradient(low="#DC0000B2",high ="#00A087B2" )
# GO富集分析
enrich_all <- enrichGO(gene = degID[,2],OrgDb= 'org.Hs.eg.db',ont = "ALL",
universe=allID[,2],pvalueCutoff=1,qvalueCutoff=1)#GO的3个过程都画
p7 <- barplot(enrich_all ,showCategory=10,title="GO enrichment",colorBy="p.adjust",font.size = 21,label_format = 15)+
scale_fill_gradient(low="#925E9FFF",high ="#42B540FF" )
p6+p7
# 1.先看下调差异基因的GO与KEGG富集情况
## 01.输入数据准备
edgeR1_sigGenedown <- DEG_edgeR1[DEG_edgeR1$regulated=="down",]
DEGdown <- rownames(edgeR1_sigGenedown)
gene_all <- rownames(filter_count)
keytypes(org.Hs.eg.db) #可看具体能够转换的类型
## 02将Symble ID转换为Entreze ID
degIDdown <- bitr(DEGdown, fromType = "SYMBOL",
toType = c( "ENTREZID" ),
OrgDb = org.Hs.eg.db )
## 03富集并绘制差异基因的KEGG富集结果与GO富集结果
enrich_down <- enrichKEGG(gene =degIDdown[,2],
organism='hsa',
universe=allID[,2],
pvalueCutoff=1,
qvalueCutoff=1)
p9 <- barplot(enrich_down,showCategory=10,title="KEGG Pathway",
colorBy="p.adjust",font.size = 21,label_format = 15 )+scale_fill_gradient(low="#DC0000B2",high ="#00A087B2" )
p9
enrich_all_down <- enrichGO(gene = degIDdown[,2],OrgDb= 'org.Hs.eg.db',ont = "ALL",
universe=allID[,2],pvalueCutoff=1,qvalueCutoff=1)#GO的3个过程都画
p10 <- barplot(enrich_all_down ,showCategory=10,title="GO enrichment",colorBy="p.adjust",font.size = 21,label_format = 15)+
scale_fill_gradient(low="#925E9FFF",high ="#42B540FF" )
p10
p9+p10
# 2.再看上调差异基因的富集情况
## 01.输入数据准备
edgeR1_sigGeneup <- DEG_edgeR1[DEG_edgeR1$regulated=="up",]
DEGup <- rownames(edgeR1_sigGeneup)
gene_all <- rownames(filter_count)
keytypes(org.Hs.eg.db) #可看具体能够转换的类型
## 02将Symble ID转换为Entreze ID
degIDup <- bitr(DEGup, fromType = "SYMBOL",
toType = c( "ENTREZID" ),
OrgDb = org.Hs.eg.db )
## 03富集并绘制差异基因的KEGG富集结果与GO富集结果
enrich_up <- enrichKEGG(gene =degIDup[,2],
organism='hsa',
universe=allID[,2],
pvalueCutoff=1,
qvalueCutoff=1)
p11 <- barplot(enrich_up,showCategory=10,title="KEGG Pathway",
colorBy="p.adjust",font.size = 21,label_format = 20)+scale_fill_gradient(low="#DC0000B2",high ="#00A087B2" )
p11
enrich_all_up <- enrichGO(gene = degIDup[,2],OrgDb= 'org.Hs.eg.db',ont = "ALL",
universe=allID[,2],pvalueCutoff=1,qvalueCutoff=1)#GO的3个过程都画
p12 <- barplot(enrich_all_up ,showCategory=10,title="GO enrichment",colorBy="p.adjust",font.size = 21,label_format = 20)+
scale_fill_gradient(low="#925E9FFF",high ="#42B540FF" )
p12
p11+p12
以上就是P值窜天高背后内容的探究。接下来,就让我们基于开头提出的四个角度进行回顾,并分析一下作者的测序结果。
首先是第一个角度(样本相关性),3与4的PCA结果与样本热图聚类结果可以看出处理组shCK5与对照组shCon存在显著差异;第二个角度(差异基因数量),5中利用edgeR进行差异分析的差异分析结果表明处理组相较于对照组存在显著差异,竟然有超过8000个差异基因,想想目前人只有20000多个基因;第三个角度(差异基因表达),5中可以看见对照组与处理组P值为零的情况下,两者的基因表达量存在显著区别(某一组绝对的低表达)。最后一个角度(通路富集),我们可以看见KEGG通路富集分析的结果表明神经激活性配受体、蛋白消化与降解以及细胞粘附分子等通路发生显著变化;GO富集分析结果表明趋化因子、血液循环与循环通路过程发生显著变化。
关于最后一个角度富集分析,起初我为了图省事只是展示了一下差异基因的GO与KEGG富集结果,但是整体差异基因富集的结果并不能说明作者欲探究的科学问题。曾老师一看完结果就告诉我应该分别对上下调差异基因进行GO与KEGG富集分析。因此,我进一步对上下调差异基因进行了GO与KEGG富集分析,结果如表格中所示。
从GO富集的结果中可以看出,上调基因主要显著富集于如cell-cell adhesion via plasma membrane等与粘附相关的通路,下调差异基因主要显著富集于如positive regulation of migration等与侵袭迁移相关的通路;KEGG的富集结果表明,下调显著差异基因主要富集于如PI3K-AKT等促侵袭迁移的通路中,而上调差异基因主要显著富集于如Cell adhesion molecues等与粘附相关的通路。上下调差异基因分别的富集结果均表明,敲除CK5抑制肿瘤细胞侵袭,这说明作者的转录组测序结果挺不错的。 |
由于小编的生物学背景不足,KEGG富集的结果小编通过蛋白消化与降解猜测其可能与迁移相关;但是小编这里存在一个疑问,为什么GO富集结果中许多显著差异通路与循环相关,此研究不是主要集中在细胞水平吗,与循环有何关系呢,如果需要解释的话,是否能说癌细胞也能分泌一些与循环相关的因子,在动物水平能够促进血管生成,促进癌症的侵袭与迁移呢?希望有这方面知识的小伙伴可以谈谈自身的观点,感兴趣的小伙伴可以试试进一步挖掘下,看看能不能发现P值窜天高现象背后存在的新规律。
https://mp.weixin.qq.com/s/JOuyKGXttlOIXrG7ozglyw