ixxmu / mp_duty

抓取网络文章到github issues保存
https://archives.duty-machine.now.sh/
122 stars 30 forks source link

探索TCGA的临床特征分组——做差异分析前你有没有忘记它 #2828

Closed ixxmu closed 2 years ago

ixxmu commented 2 years ago

https://mp.weixin.qq.com/s/klTM3-pUWTUhcDC9LRqjQQ

ixxmu commented 2 years ago

探索TCGA的临床特征分组——做差异分析前你有没有忘记它 by 生信菜鸟团

上次我们说到把代谢基因做差异分析,由于TCGA中上传整理的并不是严格的tumor-normal实验设计,我们前期一直探索的LAML数据,就是没有normal样本的,那么就得选取别的分组做差异分析。而在差异分析前不能忘记的就是——再次强调表达量矩阵分析一定要三张图,根据老师的要求先尝试质控三张图的pca图,我们最关心的生存结局,在这个时候就是没有显著差异的——这时我们会很自然地想到用其他临床特征来分组。

数据准备

导入我们在 TCGA的XENA的转录组测序表达量矩阵预处理 中,id转换之后的LAML表达量矩阵:

load(file = 'output/rdata/0.expr.all.Rdata')
n_t_exp = dat
dim(n_t_exp)
#[1] 38953 151
n_t_exp[1:4,1:4]#没有运行colnames(n_t_exp) = substring(colnames(n_t_exp),1,15),n_t_exp列名还是16位,我们后面的临床特征矩阵样本名也是16位
# TCGA-AB-2949-03B TCGA-AB-2918-03A TCGA-AB-2943-03A TCGA-AB-2851-03A
#ZZZ3 3290 2830 4059 2087
#ZZEF1 7002 7297 6663 7688
#ZYX 10026 15461 10270 27210
#ZYG11B 1810 1645 1919 1718

dat=log(edgeR::cpm(n_t_exp)+1)

lncRNA和代谢基因表达矩阵:

load( file = 'output/rdata/0.lncRNA_target_expression_data.Rdata') 
lnc_dat[1:4, 1:4]
# TCGA-AB-2949-03 TCGA-AB-2851-03 TCGA-AB-2822-03 TCGA-AB-2939-03
#ZSWIM8-AS1 15 14 21 11
#ZSCAN16-AS1 277 163 214 170
#ZRANB2-AS1 36 26 100 15
#ZNRD1-AS1 1880 929 1909 1663

target_dat[1:4, 1:4] #代谢基因
# TCGA-AB-2949-03 TCGA-AB-2851-03 TCGA-AB-2822-03 TCGA-AB-2939-03
#ZACN 104 117 137 93
#XYLT2 919 1305 1525 1214
#XDH 1 1 1 5
#VKORC1L1 1578 954 1818 1473

lnc_g = rownames(lnc_dat)
prg_g = rownames(target_dat)
dat <- t(dat[prg_g,]) #取的是基因不是样本

分组准备

导入生存信息和其他临床信息:

load( file = 'output/rdata/0.survival.Rdata') #之前处理好的生存信息
head(survdata)
# event time
#TCGA-AB-2802-03 1 1.0000000
#TCGA-AB-2803-03 1 2.1698630
#TCGA-AB-2804-03 0 7.0054795
#TCGA-AB-2805-03 1 1.5808219
#TCGA-AB-2806-03 1 2.5890411
#TCGA-AB-2807-03 1 0.4958904

library(stringr)
library(data.table)
n1 <- fread('input/TCGA-LAML.GDC_phenotype.tsv.gz', #其他临床信息
data.table = F)
dim(n1) #很多冗余信息
#[1] 697 98

head(colnames(n1),15)
# [1] "submitter_id.samples" "acute_myeloid_leukemia_calgb_cytogenetics_risk_category"
# [3] "age_at_initial_pathologic_diagnosis" "atra_exposure" # [5] "batch_number" "bcr"
# [7] "submitter_id" "cumulative_agent_total_dose"
# [9] "cytogenetic_abnormality_other" "cytogenetic_analysis_performed_ind"
# [11] "day_of_dcc_upload" "days_to_initial_pathologic_diagnosis"
# [13] "disease_detection_molecular_analysis_method_type" "file_uuid"
# [15] "fish_evaluation_performed_ind"

见曾老师的基础教学:对表型数据框进行去冗余

rownames(n1) = n1[,1]
n2 = n1[colnames(n_t_exp), ] #取临床特征矩阵和表达量矩阵的交集
n2=n2[,apply(n2,2,function(x) length(unique(x)) > 1 )] #去掉在所有样本里记录都一致的列
n2=n2[,apply(n2,2,function(x) length(unique(x)) < 5 )] #但是分组不要超过5个,这是自定的

dim(n2) #这时候就只剩下23个特征了
#[1] 151 23
head(colnames(n2))
# [1] "acute_myeloid_leukemia_calgb_cytogenetics_risk_category"
# [2] "atra_exposure"
# [3] "cytogenetic_analysis_performed_ind"
# [4] "disease_detection_molecular_analysis_method_type"
# [5] "fish_evaluation_performed_ind"
# [6] "fluorescence_in_situ_hybridization_abnormal_result_indicator"

开始画图

生存结局的分组pca图:

gp = as.factor(survdata$event) 
table(gp)

p1=fviz_pca_ind(dat.pca,
geom.ind = "point", # show points only (nbut not "text")
col.ind = gp, # color by groups
# alette = c("#00AFBB", "#E7B800"),
addEllipses = TRUE, # Concentration ellipses
legend.title = "Tissue Type",
#title = 'Paired Samples',
ggtheme = theme_minimal()
)
p1
ggsave(p1,filename = paste0(pro,'_','output/plot/step1.pca-surv.pdf'))

用两个临床特征的分组画pca看看:

#1---- prior_malignancy.diagnoses
table(n2$prior_malignancy.diagnoses)
# no yes
#138 13
gp = n2$prior_malignancy.diagnoses

library("FactoMineR")
library("factoextra")

dat.pca <- PCA(dat , graph = FALSE)
p1=fviz_pca_ind(dat.pca, #看看样本和分组是否对应
geom.ind = "point", # show points only (nbut not "text")
col.ind = gp, # color by groups
# alette = c("#00AFBB", "#E7B800"),
addEllipses = TRUE, # Concentration ellipses
legend.title = "prior_malignancy.diagnoses",
#title = 'Paired Samples',
ggtheme = theme_minimal()
)
p1
ggsave(p1,filename = 'output/plot/step1.pca-prior_malignancy.diagnoses.pdf')
#2---- fish_evaluation_performed_ind
table(n2$fish_evaluation_performed_ind)
# NO YES
# 3 26 122
gp = n2$fish_evaluation_performed_ind

library("FactoMineR")
library("factoextra")

dat.pca <- PCA(dat , graph = FALSE)
p1=fviz_pca_ind(dat.pca, #看看样本和分组是否对应
geom.ind = "point", # show points only (nbut not "text")
col.ind = gp, # color by groups
# alette = c("#00AFBB", "#E7B800"),
addEllipses = TRUE, # Concentration ellipses
legend.title = "fish_evaluation_performed_ind",
#title = 'Paired Samples',
ggtheme = theme_minimal()
)
p1
ggsave(p1,filename = 'output/plot/step1.pca-fish_evaluation_performed_ind.pdf',)

小洁老师说过,当一个代码需要复制粘贴三次,就应该写成函数或使用循环:

#尝试写成循环
ftp = as.data.frame(colnames(n2))
n2 = n2[,-11] #第11列是na
groupl = colnames(n2)
dat.pca <- PCA(dat , graph = FALSE)

library("FactoMineR")
library("factoextra")

pca_list <- lapply(groupl, function(i){
#i = groupl[3]
gp = n2[,i]
gp = as.factor(gp)

p1=fviz_pca_ind(dat.pca,
geom.ind = "point", # show points only (nbut not "text")
col.ind = gp, # color by groups
# alette = c("#00AFBB", "#E7B800"),
addEllipses = TRUE, # Concentration ellipses
legend.title = 'group',
title = i,
ggtheme = theme_minimal()
)})

#拼图
library(patchwork)
p2 = wrap_plots(pca_list,nrow=5)
p2
ggsave(p2,filename = 'output/plot/step1.pca.png',
width = 20,height = 10,
limitsize = FALSE)

可以看到稍微能分开的是vital_number,分组是A-冷冻样本,B-石蜡包埋,那么肯定是不能当作差异分析的分组了,但是也许向我们展示了技术处理造成的误差,现在确实有很多做TCGA技术处理的研究。