探索TCGA的临床特征分组——做差异分析前你有没有忘记它

探索TCGA的临床特征分组——做差异分析前你有没有忘记它 by 生信菜鸟团

上次我们说到把代谢基因做差异分析，由于TCGA中上传整理的并不是严格的tumor-normal实验设计，我们前期一直探索的LAML数据，就是没有normal样本的，那么就得选取别的分组做差异分析。而在差异分析前不能忘记的就是——再次强调表达量矩阵分析一定要三张图，根据老师的要求先尝试质控三张图的pca图，我们最关心的生存结局，在这个时候就是没有显著差异的——这时我们会很自然地想到用其他临床特征来分组。

数据准备

导入我们在 TCGA的XENA的转录组测序表达量矩阵预处理中，id转换之后的LAML表达量矩阵：

load(file = 'output/rdata/0.expr.all.Rdata')
n_t_exp = dat
dim(n_t_exp)
#[1] 38953   151
n_t_exp[1:4,1:4]#没有运行colnames(n_t_exp) = substring(colnames(n_t_exp),1,15)，n_t_exp列名还是16位，我们后面的临床特征矩阵样本名也是16位
#       TCGA-AB-2949-03B TCGA-AB-2918-03A TCGA-AB-2943-03A TCGA-AB-2851-03A
#ZZZ3               3290             2830             4059             2087
#ZZEF1              7002             7297             6663             7688
#ZYX               10026            15461            10270            27210
#ZYG11B             1810             1645             1919             1718

dat=log(edgeR::cpm(n_t_exp)+1)

lncRNA和代谢基因表达矩阵：

load( file = 'output/rdata/0.lncRNA_target_expression_data.Rdata') 
lnc_dat[1:4, 1:4] 
#            TCGA-AB-2949-03 TCGA-AB-2851-03 TCGA-AB-2822-03 TCGA-AB-2939-03
#ZSWIM8-AS1               15              14              21              11
#ZSCAN16-AS1             277             163             214             170
#ZRANB2-AS1               36              26             100              15
#ZNRD1-AS1              1880             929            1909            1663

target_dat[1:4, 1:4] #代谢基因
#         TCGA-AB-2949-03 TCGA-AB-2851-03 TCGA-AB-2822-03 TCGA-AB-2939-03
#ZACN                 104             117             137              93
#XYLT2                919            1305            1525            1214
#XDH                    1               1               1               5
#VKORC1L1            1578             954            1818            1473

lnc_g = rownames(lnc_dat)
prg_g = rownames(target_dat)
dat <- t(dat[prg_g,])   #取的是基因不是样本

分组准备

导入生存信息和其他临床信息：

load( file = 'output/rdata/0.survival.Rdata') #之前处理好的生存信息
head(survdata)
#                event      time
#TCGA-AB-2802-03     1 1.0000000
#TCGA-AB-2803-03     1 2.1698630
#TCGA-AB-2804-03     0 7.0054795
#TCGA-AB-2805-03     1 1.5808219
#TCGA-AB-2806-03     1 2.5890411
#TCGA-AB-2807-03     1 0.4958904

library(stringr)  
library(data.table) 
n1 <- fread('input/TCGA-LAML.GDC_phenotype.tsv.gz', #其他临床信息
            data.table = F) 
dim(n1)  #很多冗余信息
#[1] 697  98

head(colnames(n1),15)
# [1] "submitter_id.samples"         "acute_myeloid_leukemia_calgb_cytogenetics_risk_category"
# [3] "age_at_initial_pathologic_diagnosis"                     "atra_exposure"          # [5] "batch_number"                                            "bcr"                   
# [7] "submitter_id"         "cumulative_agent_total_dose"
# [9] "cytogenetic_abnormality_other"         "cytogenetic_analysis_performed_ind"    
# [11] "day_of_dcc_upload"         "days_to_initial_pathologic_diagnosis"               
# [13] "disease_detection_molecular_analysis_method_type"        "file_uuid"           
# [15] "fish_evaluation_performed_ind"

见曾老师的基础教学：对表型数据框进行去冗余

rownames(n1) = n1[,1]
n2 = n1[colnames(n_t_exp), ] #取临床特征矩阵和表达量矩阵的交集
n2=n2[,apply(n2,2,function(x) length(unique(x)) > 1 )] #去掉在所有样本里记录都一致的列
n2=n2[,apply(n2,2,function(x) length(unique(x)) < 5 )] #但是分组不要超过5个，这是自定的
             
dim(n2) #这时候就只剩下23个特征了
#[1] 151  23
head(colnames(n2))
# [1] "acute_myeloid_leukemia_calgb_cytogenetics_risk_category"               
# [2] "atra_exposure"                                                         
# [3] "cytogenetic_analysis_performed_ind"                                    
# [4] "disease_detection_molecular_analysis_method_type"                      
# [5] "fish_evaluation_performed_ind"                                         
# [6] "fluorescence_in_situ_hybridization_abnormal_result_indicator"

开始画图

生存结局的分组pca图：

gp = as.factor(survdata$event) 
table(gp)  

p1=fviz_pca_ind(dat.pca,
                geom.ind = "point", # show points only (nbut not "text")
                col.ind = gp, # color by groups
                # alette = c("#00AFBB", "#E7B800"),
                addEllipses = TRUE, # Concentration ellipses
                legend.title = "Tissue Type",
                #title = 'Paired Samples',
                ggtheme = theme_minimal()
)
p1  
ggsave(p1,filename = paste0(pro,'_','output/plot/step1.pca-surv.pdf'))

用两个临床特征的分组画pca看看：

#1---- prior_malignancy.diagnoses
table(n2$prior_malignancy.diagnoses)
# no yes 
#138  13 
gp = n2$prior_malignancy.diagnoses

library("FactoMineR")
library("factoextra") 

dat.pca <- PCA(dat , graph = FALSE)
p1=fviz_pca_ind(dat.pca,  #看看样本和分组是否对应
                geom.ind = "point", # show points only (nbut not "text")
                col.ind = gp, # color by groups
                # alette = c("#00AFBB", "#E7B800"),
                addEllipses = TRUE, # Concentration ellipses
                legend.title = "prior_malignancy.diagnoses",
                #title = 'Paired Samples',
                ggtheme = theme_minimal()
)
p1  
ggsave(p1,filename = 'output/plot/step1.pca-prior_malignancy.diagnoses.pdf')

#2---- fish_evaluation_performed_ind
table(n2$fish_evaluation_performed_ind)
#     NO YES 
#  3  26 122
gp = n2$fish_evaluation_performed_ind

library("FactoMineR")
library("factoextra") 

dat.pca <- PCA(dat , graph = FALSE)
p1=fviz_pca_ind(dat.pca,  #看看样本和分组是否对应
                geom.ind = "point", # show points only (nbut not "text")
                col.ind = gp, # color by groups
                # alette = c("#00AFBB", "#E7B800"),
                addEllipses = TRUE, # Concentration ellipses
                legend.title = "fish_evaluation_performed_ind",
                #title = 'Paired Samples',
                ggtheme = theme_minimal()
)
p1  
ggsave(p1,filename = 'output/plot/step1.pca-fish_evaluation_performed_ind.pdf',)

小洁老师说过，当一个代码需要复制粘贴三次，就应该写成函数或使用循环：

#尝试写成循环
ftp = as.data.frame(colnames(n2))
n2 = n2[,-11] #第11列是na
groupl = colnames(n2)
dat.pca <- PCA(dat , graph = FALSE)

library("FactoMineR")
library("factoextra") 

pca_list <- lapply(groupl, function(i){
  #i = groupl[3]
  gp = n2[,i]
  gp = as.factor(gp)
  
  p1=fviz_pca_ind(dat.pca, 
                  geom.ind = "point", # show points only (nbut not "text")
                  col.ind = gp, # color by groups
                  # alette = c("#00AFBB", "#E7B800"),
                  addEllipses = TRUE, # Concentration ellipses
                  legend.title = 'group',
                  title = i,
                  ggtheme = theme_minimal()
  )}) 

#拼图
library(patchwork)
p2 = wrap_plots(pca_list,nrow=5)
p2
ggsave(p2,filename = 'output/plot/step1.pca.png',
       width = 20,height = 10,
       limitsize = FALSE)

可以看到稍微能分开的是vital_number，分组是A-冷冻样本，B-石蜡包埋，那么肯定是不能当作差异分析的分组了，但是也许向我们展示了技术处理造成的误差，现在确实有很多做TCGA技术处理的研究。

ixxmu / mp_duty

探索TCGA的临床特征分组——做差异分析前你有没有忘记它 #2828

探索TCGA的临床特征分组——做差异分析前你有没有忘记它 by 生信菜鸟团

数据准备

分组准备

开始画图