ixxmu / mp_duty

抓取网络文章到github issues保存
https://archives.duty-machine.now.sh/
106 stars 30 forks source link

代谢基因和lncRNA表达量矩阵的相关性 #2784

Closed ixxmu closed 1 year ago

ixxmu commented 1 year ago

https://mp.weixin.qq.com/s/Oa-Adfbgw-K59jeAMz1Abw

ixxmu commented 1 year ago

代谢基因和lncRNA表达量矩阵的相关性 by 生信菜鸟团

上两讲中,我们拿到了长非编码基因的基因集和表达量矩阵,以及TCGA数据挖掘常见基因集合,见:把转录组测序表达量矩阵拆分成为编码与非编码TCGA的XENA的转录组测序表达量矩阵预处理,这次我们用lncRNA和代谢相关基因的表达量矩阵做相关性计算,并解析一下它的结果。

1. 获得表达矩阵

从LAML表达量矩阵拿出lncRNA和代谢相关基因的表达量矩阵:

#lncRNA表达矩阵
load( file = 'output/rdata/0.lnc_protein_expression_data.Rdata')
lnc_dat[1:4, 1:4]
# TCGA-AB-2949-03 TCGA-AB-2918-03 TCGA-AB-2943-03 TCGA-AB-2851-03
# ZSWIM8-AS1 15 10 10 14
# ZSCAN16-AS1 277 282 216 163
# ZRANB2-AS1 36 42 23 26
# ZNRD1-AS1 1880 1561 1744 929

#metabolism代谢表达矩阵
genes_target <- read.table('input/metabolism-genes-list.txt')[,1]
kp <- intersect(rownames(dat), genes_target)
target_dat <- dat[kp,]
target_dat[1:4, 1:4]
# TCGA-AB-2949-03 TCGA-AB-2918-03 TCGA-AB-2943-03 TCGA-AB-2851-03
# ZACN 104 103 128 117
# XYLT2 919 1633 839 1305
# XDH 1 3 1 1
# VKORC1L1 1578 1712 1436 954

save(lnc_dat,
genes_target,
file = 'output/rdata/0.lnc_metabolism_expression_data.Rdata')

2. 计算相关性

load( file = 'output/rdata/0.lnc_metabolism_expression_data.Rdata')
identical(colnames(lnc_dat), colnames(target_dat))
#[1] TRUE

m1=target_dat #代谢
m2=lnc_dat

#定义计算相关性的函数
cor_2_matrix <- function(m1,m2){
apply(m2 , 1, function(x){
# x = m2[2,]
unlist(apply(m1, 1,function(y){
# y = m1[1,]
cor(as.numeric(x), #cor
as.numeric(y))
}))
})
}

rdf[1:4,1:4]
# ZSWIM8-AS1 ZSCAN16-AS1 ZRANB2-AS1 ZNRD1-AS1
#ZACN 0.006748374 0.02089440 0.09556742 0.31465553
#XYLT2 0.215282382 0.27156587 -0.01498975 0.19231130
#XDH -0.062246960 -0.04867204 0.05621079 -0.02622855
#VKORC1L1 -0.272115650 0.15826855 0.24478046 0.35947612

if(F){ #运行时间比较久,下面保存结果,避免重复运行
rdf=cor_2_matrix( m1 , m2 )
}

#得到相关性p值的矩阵
corP_2_matrix <- function(m1,m2){
apply(m2 , 1, function(x){
# x = m2[2,]
unlist(apply(m1, 1,function(y){
# y = m1[1,]
cor.test(as.numeric(x), #cor.test
as.numeric(y))$p.value #cor.test()$p.value
}))
})
}

if(F){
pdf=corP_2_matrix( m1 , m2 )
}

pdf[1:4,1:4]
# ZSWIM8-AS1 ZSCAN16-AS1 ZRANB2-AS1 ZNRD1-AS1
#ZACN 0.9344580531 0.798994257 0.243102469 8.325045e-05
#XYLT2 0.0079389954 0.000743695 0.855051633 1.799850e-02
#XDH 0.4476827269 0.552864022 0.493008150 7.492120e-01
#VKORC1L1 0.0007247521 0.052267187 0.002452533 5.813088e-06

if(F){
save(rdf,pdf,file = 'output/rdata/step1_cor_results1.Rdata')
}
设置阈值
cor_thred <- 0.7            
pvalue_thred <- 0.01

library(reshape2)
pdfm = melt(pdf) #将数据宽边长
head(pdfm)
# Var1 Var2 value
#1 ZACN ZSWIM8-AS1 0.9344580531
#2 XYLT2 ZSWIM8-AS1 0.0079389954
#3 XDH ZSWIM8-AS1 0.4476827269
#4 VKORC1L1 ZSWIM8-AS1 0.0007247521
#5 VDAC2 ZSWIM8-AS1 0.3495754229
#6 VDAC1 ZSWIM8-AS1 0.0120853025

rdfm = melt(rdf)
head(rdfm)
# Var1 Var2 value
#1 ZACN ZSWIM8-AS1 0.006748374
#2 XYLT2 ZSWIM8-AS1 0.215282382
#3 XDH ZSWIM8-AS1 -0.062246960
#4 VKORC1L1 ZSWIM8-AS1 -0.272115650
#5 VDAC2 ZSWIM8-AS1 0.076647718
#6 VDAC1 ZSWIM8-AS1 -0.203777611

table(pdfm$value < pvalue_thred )
# FALSE TRUE
# 5875071 1128795

table(abs(rdfm$value) > cor_thred )
# FALSE TRUE
# 7001719 2147

#设置筛选条件
kp = (pdfm$value < pvalue_thred ) & (abs(rdfm$value) > cor_thred )
table(kp)
# FALSE TRUE
# 7001719 2147

cor_df = rdfm[kp,]
head(cor_df)
# Var1 Var2 value
#15333 SLC38A4 ZNF385D-AS2 0.7822774
#29392 EPHX2 ZIM2-AS1 0.7123947
#29443 CYP4F12 ZIM2-AS1 0.7473137
#29500 CDO1 ZIM2-AS1 0.8504875
#29666 ABCB11 ZIM2-AS1 0.7221485
#38112 HS3ST2 ZBTB20-AS3 0.7256150

colnames(cor_df)[1:2]=c('target','lncRNA' )
cor_df$Regulation = ifelse(cor_df$value > 0 ,'pos','neg')
table(cor_df$Regulation ) #此时相关性全是pos正相关
# pos
#2147

#save data
save(cor_df,file = 'output/rdata/step1_cor_results.Rdata')
length(unique(cor_df$lncRNA))

3. 可视化

library(ggalluvial)
library(ggplot2)
library(ggsci)
library(RColorBrewer)

data <- cor_df
length(unique(data$target))
#[1] 271
length(unique(data$lncRNA))
#[1] 775
head(data)
# target lncRNA value Regulation
#15333 SLC38A4 ZNF385D-AS2 0.7822774 pos
#29392 EPHX2 ZIM2-AS1 0.7123947 pos
#29443 CYP4F12 ZIM2-AS1 0.7473137 pos
#29500 CDO1 ZIM2-AS1 0.8504875 pos
#29666 ABCB11 ZIM2-AS1 0.7221485 pos
#38112 HS3ST2 ZBTB20-AS3 0.7256150 pos

colourCount <- length(unique(data$target))
getPalette <- colorRampPalette(brewer.pal(12, "Paired"))
3.1 ggalluvium
ggplot(data, aes(axis1 = lncRNA, axis2 = target,
axis3 = Regulation,
y = value)) +
scale_x_discrete(limits = c("lncRNA", "target", "Regulation"), expand = c(.2, .05))+
xlab("CorGraph") +
geom_alluvium(aes(fill = Regulation),width = 1/12, alpha = 1, knot.pos = 0.1, reverse = FALSE)+
scale_fill_aaas()+
geom_stratum(width= 1/12, reverse = FALSE)+
# geom_text(stat = "stratum", aes(label = after_stat(stratum)))+
theme_bw() +
theme(axis.line = element_blank(),
axis.ticks = element_blank(),
axis.text.y = element_blank(),
panel.grid = element_blank(),
panel.border = element_blank())+
ggtitle("Correlation between lncRNAs and target genes",
"stratified by regulation")
ggsave( filename = 'output/plot/step1.doCor_ggalluvium.pdf'),
width = 10, height = 6)

可以看到在我们选定的阈值下,相关性都为正相关。

3.2 barplot

查看的是代谢基因(横坐标)所对应的lncRNA的数量(纵坐标)。

ggplot(data) + 
geom_bar(aes(factor(target), fill=factor(target))) +
scale_fill_manual(values = getPalette(colourCount))+
xlab("") + ylab("")+
theme_bw()+
theme(axis.text.x = element_text(angle = 45, vjust = 0.6))

ggsave( filename = 'output/plot/step1.doCor_barplot.pdf',
width = 10, height = 8)

colnames(data)
ggplot(data) +
geom_bar(aes(factor(target), fill=factor(target))) +
scale_fill_manual(values = getPalette(colourCount))+
xlab("") + ylab("")+
theme_bw()+
theme(axis.text.x = element_text(angle = 45, vjust = 0.6)) + facet_wrap(~Regulation,nrow=2) #加上正负相关性的分组,不过这里只有正相关

ggsave(filename = 'output/plot/step1.doCor_barplot_split.pdf',
width = 10, height = 8)


所有基因的基因名图例非常多,下面基因名也看不清了。

挑选相关性前一百个基因看一下。但其实对应的lncRNA会不全,没有什么意义。

#相关性前100个基因(有重复)
data1 <- data[order(data$value,decreasing = T),]
data1 = data[1:100,]

colourCount <- length(unique(data1$target))

ggplot(data1) +
geom_bar(aes(factor(target), fill=factor(target))) +
scale_fill_manual(values = getPalette(colourCount))+
xlab("") + ylab("")+
theme_bw()+
theme(axis.text.x = element_text(angle = 45, vjust = 0.6)) + facet_wrap(~Regulation,nrow=2)
ggsave(filename = 'output/plot/step1.100doCor_barplot_split.png'),
width = 20, height = 12)


那看一下对应lncRNA数量前50的基因:

a=names(sort(table(data$target),decreasing = T)[1:50])
data2=data[data$target%in%a,]

colourCount <- length(unique(data2$target))

ggplot(data2) +
geom_bar(aes(factor(target), fill=factor(target))) +
scale_fill_manual(values = getPalette(colourCount))+
xlab("") + ylab("")+
theme_bw()+
theme(axis.text.x = element_text(angle = 45, vjust = 0.6))
ggsave(filename = paste0(pro,'_','output/plot/step1.50doCor_barplot_split.png'),
width = 20, height = 12)