Open w28924461701 opened 1 year ago
options("repos"= c(CRAN="https://mirrors.tuna.tsinghua.edu.cn/CRAN/")) options(BioC_mirror="http://mirrors.tuna.tsinghua.edu.cn/bioconductor/") BiocManager::install("ChAMP") BiocManager::install("ChAMPdata") BiocManager::install("ComplexHeatmap") install.packages("ggpubr", "randomcoloR")
library(ggplot2) library(ChAMP) library(data.table) library(randomcoloR) library(ggpubr) library(GSVA) library(clusterProfiler)
data("probe.features") # 读取甲基化450k数据的探针注释 Sys.setenv(LANGUAGE = "en") #显示英文报错信息 options(stringsAsFactors = FALSE) #禁止chr转成factor
tumors <- c("BLCA","BRCA","CESC","CHOL","COAD", "ESCA","GBM","HNSC","KICH","KIRC", "KIRP","LIHC","LUAD","LUSC","PAAD", "PRAD","READ","STAD","THCA","UCEC")
frg <- c("CDC20B")
rawAnno <- read.delim("~/public_data/merged_sample_quality_annotations.tsv",sep = "\t",row.names = NULL,check.names = F,stringsAsFactors = F,header = T) # 数据来自PanCanAtlas rawAnno$simple_barcode <- substr(rawAnno$aliquot_barcode,1,15) samAnno <- rawAnno[!duplicated(rawAnno$simple_barcode),c("cancer type", "simple_barcode")] samAnno <- samAnno[which(samAnno$cancer type != ""),] write.table(samAnno,"simple_sample_annotation.txt",sep = "\t",row.names = F,col.names = T,quote = F)
cancer type
promoter <- probe.features[which(probe.features$feature %in% c("TSS1500","TSS200")),] # 根据注释文件筛选启动子探针,也可以分析其他探针比如增强子Enhancer等等 promoter <- promoter[which(promoter$gene %in% frg),] # 取出感兴趣基因在感兴趣位点的探针 write.table(promoter, "promoter_annotation_for_interested_genes.txt",sep = "\t",row.names = T,col.names = NA,quote = F)
for (i in tumors) { # 比较慢请耐心 message("--",i,"...") load(file.path("~/public_data/tcga_methy450",paste0("TCGA-",i,"_methy450.Rdata"))) # 加载甲基化RData文件 methy450 <- as.data.frame(methy450) # 转数据框 rownames(methy450) <- methy450[,1] # 行名改为第一列 methy450 <- methy450[,-1] # 去掉第一列探针名 dimname <- dimnames(methy450) # 获取行名和列明 methy450 <- sapply(methy450, as.numeric) # 将数据全部转为数值以防报错 dimnames(methy450) <- dimname # 重新赋值行名和列名 methy450 <- as.data.frame(methy450) # 转数据框
compb <- intersect(rownames(methy450),rownames(promoter)) # 获取甲基化数据和感兴趣探针的交集 methy450 <- methy450[compb,] # 取出子集 methy450$gene <- promoter[compb,"gene"] # 将探针名和基因名进行映射
methy450 <- apply(methy450[,setdiff(colnames(methy450), "gene")], 2, function(x) tapply(x, INDEX=factor(methy450$gene), FUN=median, na.rm=TRUE)) methy450 <- as.data.frame(methy450) write.table(methy450, paste0("TCGA_",i,"_methy450_subset.txt"),sep = "\t",row.names = T,col.names = NA,quote = F) rm(methy450); gc() # 释放内存 }
deltaMeth <- NULL for (i in tumors) { message("--",i,"...")
methsubset <- read.table(paste0("TCGA",i,"_methy450_subset.txt"),sep = "\t",row.names = 1,check.names = F,stringsAsFactors = F,header = T) meth_subset<-t(meth_subset) meth_subset<-as.data.frame(meth_subset) colnames(meth_subset) <- substr(colnames(meth_subset),1,15)
tumsam <- colnames(meth_subset)[substr(colnames(meth_subset),14,14) == "0"] # 获取肿瘤样本 norsam <- colnames(meth_subset)[substr(colnames(meth_subset),14,14) == "1"] # 获取正常样本
outTab <- NULL for (j in rownames(meth_subset)) { if(i == "KICH") { # KICH没有正常的甲基化样本 delta <- 0 # 如果在分析KICH则delta设置为0 wt <- 1 # 如果在分析KICH则设置pvalue为1 outTab <- rbind.data.frame(outTab, data.frame(gene = j, tumor = i, Delta = delta, Pvalue = wt, stringsAsFactors = F), stringsAsFactors = F) } else { tmp1 <- as.numeric(meth_subset[j,tumsam]) # 获取肿瘤样本的beta值 tmp2 <- as.numeric(meth_subset[j,norsam]) # 获取正常样本的beta值
wt <- wilcox.test(tmp1,tmp2)$p.value # wilcox检验 avgt <- mean(tmp1) # 肿瘤样本的平均beta值 avgn <- mean(tmp2) # 正常样本的平均beta值 delta <- avgt - avgn # delta值由肿瘤样本减去正常样本计算得到 outTab <- rbind.data.frame(outTab, data.frame(gene = j, tumor = i, Delta = delta, Pvalue = wt, stringsAsFactors = F), stringsAsFactors = F) }
} deltaMeth <- rbind.data.frame(deltaMeth, outTab, stringsAsFactors = F) }
blue <- "#4577FF" red <- "#C2151A" orange <- "#E45737" green <- "#6F8B35" darkblue <- "#303B7F" darkred <- "#D51113" yellow <- "#EECA1F"
# 产生泡泡图 deltaMeth$gene <- factor(deltaMeth$gene, levels = rev(c("CDC20B"))) my_palette <- colorRampPalette(c(blue,"white",orange), alpha=TRUE)(n=128) ggplot(deltaMeth, aes(x=tumor,y=gene)) + geom_point(aes(size=-log10(Pvalue),color=Delta)) + scale_color_gradientn('Delta(T-N)', colors=my_palette) + theme_bw() + theme(#panel.grid.minor = element_blank(), #panel.grid.major = element_blank(), axis.text.x = element_text(angle = 45, size = 12, hjust = 0.3, vjust = 0.5, color = "black"), axis.text.y = element_text(size = 12, color = rep(c(red,blue),c(14,10))), axis.title = element_blank(), panel.border = element_rect(size = 0.7, linetype = "solid", colour = "black"), legend.position = "right", plot.margin = unit(c(1,1,1,1), "lines"))
ggsave("Figure 1E correlation between promoter methylation and expression of interested genes in pancancer.pdf", width = 8,height = 6)
options("repos"= c(CRAN="https://mirrors.tuna.tsinghua.edu.cn/CRAN/")) options(BioC_mirror="http://mirrors.tuna.tsinghua.edu.cn/bioconductor/") BiocManager::install("ChAMP") BiocManager::install("ChAMPdata") BiocManager::install("ComplexHeatmap") install.packages("ggpubr", "randomcoloR")
library(ggplot2) library(ChAMP) library(data.table) library(randomcoloR) library(ggpubr) library(GSVA) library(clusterProfiler)
data("probe.features") # 读取甲基化450k数据的探针注释 Sys.setenv(LANGUAGE = "en") #显示英文报错信息 options(stringsAsFactors = FALSE) #禁止chr转成factor
获得同时有肿瘤和正常样本的肿瘤名
tumors <- c("BLCA","BRCA","CESC","CHOL","COAD", "ESCA","GBM","HNSC","KICH","KIRC", "KIRP","LIHC","LUAD","LUSC","PAAD", "PRAD","READ","STAD","THCA","UCEC")
获得感兴趣的基因集(TTC35是EMC2的同名)
frg <- c("CDC20B")
修正TCGA名称
https://gdc.cancer.gov/about-data/publications/pancanatlas
rawAnno <- read.delim("~/public_data/merged_sample_quality_annotations.tsv",sep = "\t",row.names = NULL,check.names = F,stringsAsFactors = F,header = T) # 数据来自PanCanAtlas rawAnno$simple_barcode <- substr(rawAnno$aliquot_barcode,1,15) samAnno <- rawAnno[!duplicated(rawAnno$simple_barcode),c("cancer type", "simple_barcode")] samAnno <- samAnno[which(samAnno$
cancer type
!= ""),] write.table(samAnno,"simple_sample_annotation.txt",sep = "\t",row.names = F,col.names = T,quote = F)提取匹配到FRG基因的启动子探针
promoter <- probe.features[which(probe.features$feature %in% c("TSS1500","TSS200")),] # 根据注释文件筛选启动子探针,也可以分析其他探针比如增强子Enhancer等等 promoter <- promoter[which(promoter$gene %in% frg),] # 取出感兴趣基因在感兴趣位点的探针 write.table(promoter, "promoter_annotation_for_interested_genes.txt",sep = "\t",row.names = T,col.names = NA,quote = F)
获取只有感兴趣探针/基因的甲基化子集
for (i in tumors) { # 比较慢请耐心 message("--",i,"...") load(file.path("~/public_data/tcga_methy450",paste0("TCGA-",i,"_methy450.Rdata"))) # 加载甲基化RData文件 methy450 <- as.data.frame(methy450) # 转数据框 rownames(methy450) <- methy450[,1] # 行名改为第一列 methy450 <- methy450[,-1] # 去掉第一列探针名 dimname <- dimnames(methy450) # 获取行名和列明 methy450 <- sapply(methy450, as.numeric) # 将数据全部转为数值以防报错 dimnames(methy450) <- dimname # 重新赋值行名和列名 methy450 <- as.data.frame(methy450) # 转数据框
compb <- intersect(rownames(methy450),rownames(promoter)) # 获取甲基化数据和感兴趣探针的交集 methy450 <- methy450[compb,] # 取出子集 methy450$gene <- promoter[compb,"gene"] # 将探针名和基因名进行映射
如果基因匹配多个探针,则取中位数beta值
methy450 <- apply(methy450[,setdiff(colnames(methy450), "gene")], 2, function(x) tapply(x, INDEX=factor(methy450$gene), FUN=median, na.rm=TRUE)) methy450 <- as.data.frame(methy450) write.table(methy450, paste0("TCGA_",i,"_methy450_subset.txt"),sep = "\t",row.names = T,col.names = NA,quote = F) rm(methy450); gc() # 释放内存 }
deltaMeth <- NULL for (i in tumors) { message("--",i,"...")
获取只包含感兴趣探针/基因的甲基化数据
methsubset <- read.table(paste0("TCGA",i,"_methy450_subset.txt"),sep = "\t",row.names = 1,check.names = F,stringsAsFactors = F,header = T) meth_subset<-t(meth_subset) meth_subset<-as.data.frame(meth_subset) colnames(meth_subset) <- substr(colnames(meth_subset),1,15)
tumsam <- colnames(meth_subset)[substr(colnames(meth_subset),14,14) == "0"] # 获取肿瘤样本 norsam <- colnames(meth_subset)[substr(colnames(meth_subset),14,14) == "1"] # 获取正常样本
outTab <- NULL for (j in rownames(meth_subset)) { if(i == "KICH") { # KICH没有正常的甲基化样本 delta <- 0 # 如果在分析KICH则delta设置为0 wt <- 1 # 如果在分析KICH则设置pvalue为1 outTab <- rbind.data.frame(outTab, data.frame(gene = j, tumor = i, Delta = delta, Pvalue = wt, stringsAsFactors = F), stringsAsFactors = F) } else { tmp1 <- as.numeric(meth_subset[j,tumsam]) # 获取肿瘤样本的beta值 tmp2 <- as.numeric(meth_subset[j,norsam]) # 获取正常样本的beta值
} deltaMeth <- rbind.data.frame(deltaMeth, outTab, stringsAsFactors = F) }
设置颜色
blue <- "#4577FF" red <- "#C2151A" orange <- "#E45737" green <- "#6F8B35" darkblue <- "#303B7F" darkred <- "#D51113" yellow <- "#EECA1F"