w28924461701 / cdc20b

0 stars 0 forks source link

tumor and normal promoter methylation of interested genes in pancancer.r #7

Open w28924461701 opened 1 year ago

w28924461701 commented 1 year ago

options("repos"= c(CRAN="https://mirrors.tuna.tsinghua.edu.cn/CRAN/")) options(BioC_mirror="http://mirrors.tuna.tsinghua.edu.cn/bioconductor/") BiocManager::install("ChAMP") BiocManager::install("ChAMPdata") BiocManager::install("ComplexHeatmap") install.packages("ggpubr", "randomcoloR")

library(ggplot2) library(ChAMP) library(data.table) library(randomcoloR) library(ggpubr) library(GSVA) library(clusterProfiler)

data("probe.features") # 读取甲基化450k数据的探针注释 Sys.setenv(LANGUAGE = "en") #显示英文报错信息 options(stringsAsFactors = FALSE) #禁止chr转成factor

获得同时有肿瘤和正常样本的肿瘤名

tumors <- c("BLCA","BRCA","CESC","CHOL","COAD", "ESCA","GBM","HNSC","KICH","KIRC", "KIRP","LIHC","LUAD","LUSC","PAAD", "PRAD","READ","STAD","THCA","UCEC")

获得感兴趣的基因集(TTC35是EMC2的同名)

frg <- c("CDC20B")

修正TCGA名称

https://gdc.cancer.gov/about-data/publications/pancanatlas

rawAnno <- read.delim("~/public_data/merged_sample_quality_annotations.tsv",sep = "\t",row.names = NULL,check.names = F,stringsAsFactors = F,header = T) # 数据来自PanCanAtlas rawAnno$simple_barcode <- substr(rawAnno$aliquot_barcode,1,15) samAnno <- rawAnno[!duplicated(rawAnno$simple_barcode),c("cancer type", "simple_barcode")] samAnno <- samAnno[which(samAnno$cancer type != ""),] write.table(samAnno,"simple_sample_annotation.txt",sep = "\t",row.names = F,col.names = T,quote = F)

提取匹配到FRG基因的启动子探针

promoter <- probe.features[which(probe.features$feature %in% c("TSS1500","TSS200")),] # 根据注释文件筛选启动子探针,也可以分析其他探针比如增强子Enhancer等等 promoter <- promoter[which(promoter$gene %in% frg),] # 取出感兴趣基因在感兴趣位点的探针 write.table(promoter, "promoter_annotation_for_interested_genes.txt",sep = "\t",row.names = T,col.names = NA,quote = F)

获取只有感兴趣探针/基因的甲基化子集

for (i in tumors) { # 比较慢请耐心 message("--",i,"...") load(file.path("~/public_data/tcga_methy450",paste0("TCGA-",i,"_methy450.Rdata"))) # 加载甲基化RData文件 methy450 <- as.data.frame(methy450) # 转数据框 rownames(methy450) <- methy450[,1] # 行名改为第一列 methy450 <- methy450[,-1] # 去掉第一列探针名 dimname <- dimnames(methy450) # 获取行名和列明 methy450 <- sapply(methy450, as.numeric) # 将数据全部转为数值以防报错 dimnames(methy450) <- dimname # 重新赋值行名和列名 methy450 <- as.data.frame(methy450) # 转数据框

compb <- intersect(rownames(methy450),rownames(promoter)) # 获取甲基化数据和感兴趣探针的交集 methy450 <- methy450[compb,] # 取出子集 methy450$gene <- promoter[compb,"gene"] # 将探针名和基因名进行映射

如果基因匹配多个探针,则取中位数beta值

methy450 <- apply(methy450[,setdiff(colnames(methy450), "gene")], 2, function(x) tapply(x, INDEX=factor(methy450$gene), FUN=median, na.rm=TRUE)) methy450 <- as.data.frame(methy450) write.table(methy450, paste0("TCGA_",i,"_methy450_subset.txt"),sep = "\t",row.names = T,col.names = NA,quote = F) rm(methy450); gc() # 释放内存 }

deltaMeth <- NULL for (i in tumors) { message("--",i,"...")

获取只包含感兴趣探针/基因的甲基化数据

methsubset <- read.table(paste0("TCGA",i,"_methy450_subset.txt"),sep = "\t",row.names = 1,check.names = F,stringsAsFactors = F,header = T) meth_subset<-t(meth_subset) meth_subset<-as.data.frame(meth_subset) colnames(meth_subset) <- substr(colnames(meth_subset),1,15)

tumsam <- colnames(meth_subset)[substr(colnames(meth_subset),14,14) == "0"] # 获取肿瘤样本 norsam <- colnames(meth_subset)[substr(colnames(meth_subset),14,14) == "1"] # 获取正常样本

outTab <- NULL for (j in rownames(meth_subset)) { if(i == "KICH") { # KICH没有正常的甲基化样本 delta <- 0 # 如果在分析KICH则delta设置为0 wt <- 1 # 如果在分析KICH则设置pvalue为1 outTab <- rbind.data.frame(outTab, data.frame(gene = j, tumor = i, Delta = delta, Pvalue = wt, stringsAsFactors = F), stringsAsFactors = F) } else { tmp1 <- as.numeric(meth_subset[j,tumsam]) # 获取肿瘤样本的beta值 tmp2 <- as.numeric(meth_subset[j,norsam]) # 获取正常样本的beta值

  wt <- wilcox.test(tmp1,tmp2)$p.value # wilcox检验
  avgt <- mean(tmp1) # 肿瘤样本的平均beta值
  avgn <- mean(tmp2) # 正常样本的平均beta值
  delta <- avgt - avgn # delta值由肿瘤样本减去正常样本计算得到

  outTab <- rbind.data.frame(outTab,
                             data.frame(gene = j,
                                        tumor = i,
                                        Delta = delta,
                                        Pvalue = wt,
                                        stringsAsFactors = F),
                             stringsAsFactors = F)
}

} deltaMeth <- rbind.data.frame(deltaMeth, outTab, stringsAsFactors = F) }

设置颜色

blue <- "#4577FF" red <- "#C2151A" orange <- "#E45737" green <- "#6F8B35" darkblue <- "#303B7F" darkred <- "#D51113" yellow <- "#EECA1F"

        # 产生泡泡图
        deltaMeth$gene <- factor(deltaMeth$gene,
                                 levels = rev(c("CDC20B")))

        my_palette <- colorRampPalette(c(blue,"white",orange), alpha=TRUE)(n=128)
        ggplot(deltaMeth, aes(x=tumor,y=gene)) +
          geom_point(aes(size=-log10(Pvalue),color=Delta)) +
          scale_color_gradientn('Delta(T-N)', 
                                colors=my_palette) + 
          theme_bw() +
          theme(#panel.grid.minor = element_blank(), 
            #panel.grid.major = element_blank(),
            axis.text.x = element_text(angle = 45, size = 12, hjust = 0.3, vjust = 0.5, color = "black"),
            axis.text.y = element_text(size = 12, color = rep(c(red,blue),c(14,10))),
            axis.title = element_blank(),
            panel.border = element_rect(size = 0.7, linetype = "solid", colour = "black"),
            legend.position = "right",
            plot.margin = unit(c(1,1,1,1), "lines"))
w28924461701 commented 1 year ago

ggsave("Figure 1E correlation between promoter methylation and expression of interested genes in pancancer.pdf", width = 8,height = 6)