Closed ixxmu closed 2 hours ago
导入包
library(grid)
library(ggpubr)
library(cowplot)
library(ggforce)
library(tidyverse)
导入自定义函数,这三个函数是自己封装的,可以到知识星球获取。
也可以自己封装改写 ggforce
包中的 geom_shape
对象,基本也可以实现类似的功能。
source('code/geom-round-rect.R')
source('code/geom-round-bar.R')
source('code/geom-round-col.R')
获取数据
下载 TCGA-Pancancer[1] 研究中提供的补充文件
将文件放入 data
文件夹中,然后就可以读取数据了
#| warning: false
#| message: false
CDR <- readxl::read_excel('data/TCGA-CDR-SupplementalTableS1.xlsx') %>%
dplyr::select(-1)
导入之前处理好的 TCGA-LIHC
表达数据
load('data/TCGA-LIHC.exprs.rda')
从 UCSC xena
下载 GDC LIHC
的突变数据
# https://gdc-hub.s3.us-east-1.amazonaws.com/download/TCGA-LIHC.somaticmutation_wxs.tsv.gz
lihc.mut <- read_tsv('data/TCGA-LIHC.somaticmutation_wxs.tsv.gz',
show_col_types = FALSE) %>%
dplyr::select(c('sample', 'gene', 'effect'))
定义图例的圆角矩形边框
#' @export
#' @rdname element
element_round_rect <- function(fill = NULL, colour = NULL, linewidth = NULL,
linetype = NULL, color = NULL, inherit.blank = FALSE, radius = NULL) {
if (!is.null(color)) colour <- color
structure(
list(fill = fill, colour = colour, linewidth = linewidth,
linetype = linetype, radius = radius,
inherit.blank = inherit.blank),
class = c("element_round_rect", "element_rect", "element")
)
}
element_grob.element_round_rect <- function(
element, x = 0.5, y = 0.5, width = 1, height = 1,
radius = NULL,
fill = NULL, colour = NULL, linewidth = NULL, linetype = NULL,
...) {
# The gp settings can override element_gp
gp <- gpar(
lwd = linewidth %||% element$linewidth,
col = colour %||% element$colour,
fill = fill %||% element$fill,
lty = linetype %||% element$linetype)
# radius
r <- radius %||% element$radius
roundrectGrob(x, y, width, height, r = r, gp = gp, ...)
}
定义颜色和主题
my_colors <- list(
two = c('#a1d76a', '#e9a3c9'),
three = c('#0099e5', '#ff4c4c', '#34bf49'),
four = c('#ed6ca4', '#fbb05b', '#acd372', '#7bc4e2'),
five = c('#f29c98', '#f5b697', '#f5e797', '#a2e4f5', '#009dd3'),
gradient = c('#009392', '#39b185', '#9ccb86', '#e9e29c',
'#eeb479', '#e88471', '#cf597e')
)
my_theme <- theme_minimal() +
theme(panel.grid = element_blank(),
axis.title = element_text(size = 14, face = 'bold'),
axis.line.y.left = element_line(colour = 'black'),
axis.line.x.bottom = element_line(colour = 'black'),
legend.background = element_round_rect(
colour = '#a5aa99', radius = unit(10, 'pt'), linewidth = 1.5))
可视化 TCGA 不同癌症类型中的患者数量分布
#| label: plot-x
ggplot(CDR) +
geom_round_bar(aes(type, fill = type), show.legend = FALSE) +
labs(x = NULL) +
scale_y_continuous(expand = expansion(mult = c(0, 0.1))) +
my_theme +
theme(axis.text.x.bottom = element_text(angle = 90))
#| label: plot-y
ggplot(CDR, aes(type)) +
geom_round_bar(aes(fill = type), show.legend = FALSE) +
geom_text(aes(label = after_stat(count)), stat = "count",
hjust = 0, size = 3) +
scale_y_continuous(expand = expansion(mult = c(0, 0.1))) +
coord_flip() +
labs(x = NULL) +
my_theme +
theme(axis.text.x.bottom = element_text(angle = 90))
数据太多,选择其中几个癌症类型来展示,看看每种癌症治愈比例有多少。
先去掉缺失数据
data <- dplyr::filter(CDR,
type %in% c('BRCA', 'GBM', 'LIHC', 'PAAD', 'STAD'),
tumor_status %in% c('WITH TUMOR', 'TUMOR FREE'))
#| label: plot-stack
ggplot(data, aes(type, group = tumor_status)) +
geom_round_bar(aes(fill = tumor_status), radius = unit(5, 'pt'),
colour = '#f7f7f7', lwd = 1) +
geom_text(aes(label = after_stat(count)), stat = 'count',
position = position_stack(vjust = 0.5)) +
scale_fill_manual(values = my_colors$two) +
labs(x = NULL) +
my_theme +
theme(axis.line.y.left = element_blank(),
axis.text.y.left = element_blank())
胶质瘤确实很致命,乳腺癌还是比较乐观的
#| label: plot-fill
ggplot(data, aes(type, group = tumor_status)) +
geom_round_bar(aes(fill = tumor_status), position = 'fill',
radius = unit(5, 'pt'), colour = '#f7f7f7') +
scale_fill_manual(values = my_colors$two) +
labs(x = NULL, y = 'Percent') +
my_theme +
scale_y_continuous(expand = expansion(0, 0))
#| label: plot-parallel
ggplot(data, aes(type, group = tumor_status)) +
geom_round_bar(aes(fill = tumor_status),
position = position_dodge2(width = 0.5),
radius = unit(5, 'pt'),
colour = '#f7f7f7') +
geom_text(aes(label = after_stat(count)), stat = 'count',
position = position_dodge2(width = 0.9),
vjust = 0, hjust = 0.5) +
scale_fill_manual(values = my_colors$two) +
labs(x = NULL) +
my_theme +
theme(axis.line.y.left = element_blank(),
axis.text.y.left = element_blank())
#| label: plot-error-bar
data.age <- dplyr::select(data, type, age_at_initial_pathologic_diagnosis) %>%
rename_with(~ 'age', .cols = 2) %>%
dplyr::filter(!is.na(age))
data.summary <- data.age %>%
group_by(type) %>%
summarise(
mean_age= mean(age),
.groups = 'drop',
se = sqrt(var(age) / length(age))
) %>%
mutate(lower = mean_age - se, upper = mean_age + se)
data.summary %>%
ggplot(aes(type, mean_age)) +
geom_round_col(aes(fill = type), width = 0.7) +
geom_errorbar(aes(ymin = lower, ymax = upper), width = 0.3) +
scale_fill_manual(name = 'Cancer Type', values = my_colors$five) +
labs(x = 'Cancer Type', y = 'Age') +
my_theme
诊断年龄倒是差别不大
提取 TCGA-LIHC
肿瘤分级信息
lihc.data <- dplyr::filter(CDR, type == 'LIHC') %>%
dplyr::select(c('bcr_patient_barcode', 'histological_grade',
'ajcc_pathologic_tumor_stage')) %>%
rename_with(~c('sample', 'grade', 'stage')) %>%
mutate(stage = case_when(
startsWith(stage, 'Stage IV') ~ 'Stage IV',
startsWith(stage, 'Stage III') ~ 'Stage III',
startsWith(stage, 'Stage II') ~ 'Stage II',
startsWith(stage, 'Stage I') ~ 'Stage I',
.default = NA),
grade = ifelse(startsWith(grade, 'G'), grade, NA)) %>%
dplyr::filter(!is.na(stage), stage != 'Stage IV', !is.na(grade))
不同分期数量
table(lihc.data$stage)
# Stage I Stage II Stage III
# 174 86 86
合并 TP53
基因表达数据
#| message: false
tp53.data <- lihc.tumor[, 'TP53', drop = FALSE] %>%
tibble::rownames_to_column('sample') %>%
inner_join(lihc.data)
绘制带散点的柱状图
#| label: plot-jitter
tp53.summary <- group_by(tp53.data, grade) %>%
summarise(mu = mean(TP53))
ggplot(tp53.summary, aes(grade, mu, fill = grade)) +
geom_round_col(width = 0.6, alpha = 0.8, radius = unit(15, 'pt')) +
geom_jitter(data = tp53.data, aes(grade, TP53, fill = grade),
width = 0.1, size = 3, shape = 21) +
scale_fill_manual(name = NULL, values = my_colors$four) +
labs(x = NULL, y = 'TP53 Expression level') +
my_theme
#| label: plot-diff
tp53.summary <- group_by(tp53.data, stage) %>%
summarise(mu = mean(TP53))
ggplot(tp53.data, aes(stage, TP53)) +
geom_round_col(aes(stage, mu, fill = stage), data = tp53.summary,
width = 0.6, alpha = 0.5) +
geom_jitter(aes(fill = stage), width = 0.1, size = 3, shape = 21,
alpha = 0.6) +
stat_compare_means(aes(label = paste0("p = ", after_stat(p.signif))),
comparisons = list(c('Stage I', 'Stage II'),
c('Stage II', 'Stage III'),
c('Stage I', 'Stage III')),
method = "t.test",
bracket.size = 0.6) +
scale_fill_manual(name = NULL, values = my_colors$three) +
labs(x = NULL, y = 'TP53 Expression level') +
my_theme
可以展示基因在肿瘤或正常中的表达均值
#| label: plot-pyramid
df <- tibble(
gene = factor(paste0("gene_", rep(1:16, 2)),
levels = paste0("gene_", 16:1)),
stat = c(seq(-1, -10, -1), seq(-9, -4, 1),
seq(1, 10, 1), seq(9, 4, -1)),
direct = rep(c("down", "up"), each=16)
)
ggplot(df, aes(gene, stat, fill = direct)) +
geom_round_col(radius = unit(5, 'pt')) +
coord_flip() +
scale_y_continuous(breaks = seq(-10, 10, 2),
labels = c(seq(10, 0, -2), seq(2, 10, 2))) +
scale_fill_manual(values = my_colors$two) +
my_theme
可以搭配前面分析过的差异基因或通路富集的结果来展示。
#| label: plot-deviation
df <- tibble(
gene = factor(paste0('gene_', 1:20), levels = paste0('gene_', 20:1)),
log2FC = c(seq(10, 1, -1), seq(-1, -10, -1)),
direct = factor(rep(c('up', 'down'), each=10), levels = c('up', 'down')),
sign = ifelse(direct == 'up', 1, 0)
)
ggplot(df, aes(gene, log2FC, fill = direct)) +
geom_round_col(radius = unit(5, 'pt')) +
geom_text(aes(y = 0, label = gene, hjust = sign)) +
coord_flip() +
scale_fill_manual(values = my_colors$two) +
my_theme +
theme(axis.line.y.left = element_blank(),
axis.text.y.left = element_blank(),
axis.title.y.left = element_blank())
环状图就是换成了极坐标系,基本上和正常使用没有区别,只有一个要注意的点
当发现图片中数据有丢失的情况,应该将半径(radius)调小,太小的形状无法做出较大的弧度弯曲
#| label: plot-nightingale-rose
ggplot(data, aes(type, fill = type)) +
geom_round_bar(radius = unit(5, 'pt'), colour = '#f7f7f7') +
scale_fill_manual(values = my_colors$five) +
coord_polar() +
labs(x = NULL) +
theme_minimal() +
theme(axis.line.y.left = element_blank(),
axis.text.y.left = element_blank())
可以用于展示不同癌型(多分组)中某一指标(如,样本量、突变频率或某一基因)的分布情况
#| label: plot-radial-bar
ggplot(CDR) +
geom_round_bar(aes(type, fill = type), show.legend = FALSE,
radius = unit(1.5, 'pt')) +
coord_polar() +
scale_fill_manual(values = colorRampPalette(my_colors$gradient)(33)) +
labs(x = NULL, y = NULL) +
ylim(c(-500, 1100)) +
theme_minimal() +
theme(axis.line.y.left = element_blank(),
axis.text.y.left = element_blank())
手动构建了一份基因突变数据,来展示不同突变类型中高频突变的基因
lihc.mut %>%
group_by(effect, gene) %>%
summarise(count = n()) %>%
group_by(effect) %>%
arrange(-count) %>%
top_n(3)
# 设置空白柱子的个数
empty_bar = 2
# 自定义突变类型
mut_type <- c("Ins", "Del", "Mismatch", "Silent")
# 构造数据
custom.data <- tibble(
gene=paste( "Gene ", seq(1,60), sep=""),
group=c(rep('Ins', 10), rep('Mismatch', 30), rep('Del', 14), rep('Silent', 6)) ,
value=sample(seq(10,100), 60, replace=T)
) %>%
# 添加 NA 数据,用于在分组之间绘制空白柱形
add_row(tibble(
gene = rep(NA, empty_bar * length(mut_type)),
group = rep(mut_type, empty_bar),
value = gene
)) %>%
mutate(group = factor(group, levels = mut_type)) %>%
# 排序,为了让统一分组绘制在一起
arrange(group)
# 构造唯一标识,用作 x 轴,并按该顺序绘制
custom.data$id = 1:nrow(custom.data)
# 添加显示文本的角度
angle <- 90 - 360 * (custom.data$id - 0.5) / nrow(custom.data)
# 添加内圈注释
base_anno <- group_by(custom.data, group) %>%
summarise(start = min(id), end = max(id) - empty_bar) %>%
mutate(mid = (start + end) / 2)
绘制图像
#| label: plot_mutation
#| warning: false
ggplot(custom.data, aes(id, value, fill = group)) +
geom_round_col(position = position_dodge2(), radius = unit(2, 'pt')) +
# 基因名称
geom_text(aes(y = value + 18, label = gene,
hjust = ifelse(angle > -90, 0.3, 0.7)),
size = 2.5, alpha = 0.6,
angle = ifelse(angle < -90, angle + 180, angle)) +
# 内圈线段
geom_segment(data = base_anno,
aes(x = start, y = -5, xend = end, yend = -5),
colour = "grey40") +
# 分组标签
geom_text(data = base_anno, aes(x = mid, y = -18, label = group),
angle = c(-26, -100, -50, 26), colour = "grey40") +
coord_polar() +
scale_fill_manual(values = my_colors$four) +
ylim(-100,120) +
theme(
panel.background = element_blank(),
axis.text = element_blank(),
axis.ticks = element_blank(),
axis.title = element_blank(),
panel.grid = element_blank(),
)
下面介绍两个使用 geom_round_rect
函数的例子
set.seed(123)
df <- tibble(
gene = rep(LETTERS[1:5], 4),
status = rep(c("alpha", "beta", "gamma", "delta"), each = 5),
value = sample(1:100, 20),
percent = rep(c(10, 30, 30, 20, 10), 4)
)
df %>%
group_by(status) %>%
mutate(xmax = cumsum(percent), xmin = xmax - percent) %>%
group_by(gene) %>%
mutate(ytmp = value * 100 / sum(value),
ymax = cumsum(ytmp), ymin = ymax - ytmp) %>%
mutate(x = (xmin + xmax) / 2, y = (ymin + ymax) / 2) %>%
ggplot() +
geom_round_rect(aes(xmin = xmin, ymin = ymin,
xmax = xmax, ymax = ymax, fill = status),
colour = "#eef3ef", radius = unit(10, 'pt')) +
geom_text(aes(x, y, label = paste0(round(ytmp, 2), "%"))) +
geom_text(aes(x = x, y = 103, label = gene)) +
scale_fill_manual(values = my_colors$four) +
theme(panel.background = element_blank())
整理突变数据,突变类型有点多,挑选一些频率较高的突变类型来说明。具体分析需要根据自己的需求进行调整
#| message: false
mut_type <- c('missense_variant', 'frameshift_variant', 'intron_variant',
'stop_gained', 'synonymous_variant')
mut.data <- lihc.mut %>% select(sample, gene, effect) %>%
distinct() %>%
filter(effect %in% mut_type)
genes <- count(mut.data, gene) %>%
top_n(n = 20, wt = n) %>%
mutate(percent = round(n * 100 / sum(n), 1)) %>%
arrange(desc(n))
samples <- subset(mut.data, gene %in% genes$gene) %>%
count(sample) %>% arrange(desc(n)) %>%
rename(num = n) %>%
dplyr::filter(num >= 5)
plot.data <- inner_join(mut.data, genes) %>%
inner_join(samples) %>%
mutate(gene = factor(gene, levels = rev(genes$gene)),
sample = factor(sample, levels = samples$sample),
x = as.integer(sample),
y = as.integer(gene))
head(plot.data)
首先,绘制样本的突变基因数目条形图
#| label: sample_mut
p1 <- ggplot(plot.data) +
geom_round_bar(aes(x = sample, fill = effect), radius = unit(3, 'pt')) +
# 使用 expand 删除数据与轴之间的空隙
scale_y_continuous(breaks = seq(0, 25, 5), limits = c(0, 25),
expand = expansion(mult = 0, add = 0)) +
scale_fill_manual(values = my_colors$five) +
theme(
legend.position = "none",
panel.background = element_blank(),
axis.title = element_blank(),
axis.ticks.x = element_blank(),
axis.title.x = element_blank(),
axis.text.x = element_blank(),
axis.ticks.length.y.left = unit(.25, "cm"),
axis.line.y.left = element_line(colour = "black"),
)
p1
绘制中间的基因在样本中的突变图谱
#| label: gene_mut_in_samples
width <- 0.5
height <- 0.5
p2 <- ggplot(plot.data) +
# geom_tile(aes(x = sample, y = gene, fill = effect)) +
geom_round_rect(aes(xmin = x - width, xmax = x + width,
ymin = y - height, ymax = y + height,
fill = effect), radius = unit(2, 'pt')) +
# 图例行数的调整放到 fill,单独用 guides 无效
scale_fill_manual(values = my_colors$five,
guide = guide_legend(nrow = 2)) +
scale_x_continuous(breaks = seq(width, max(plot.data$x) + width, 2),
expand = expansion(0, 0)) +
scale_y_continuous(position = "right",
breaks = seq(height, max(plot.data$y) + height, 1),
labels = c('', rev(genes$gene)),
expand = expansion(0, 0)) +
theme(
axis.text.x = element_blank(),
axis.text.y = element_text(size = 6, vjust = 1.4),
axis.ticks = element_blank(),
axis.title = element_blank(),
panel.grid.minor = element_blank(),
legend.position = "bottom",
# panel.background = element_blank()
)
p2
绘制 TOP
基因的突变频数条形图
#| label: genes_mut
p3 <- ggplot(plot.data) +
geom_round_bar(aes(y = gene, fill = effect), radius = unit(2, 'pt')) +
scale_x_continuous(position = "top", breaks = seq(0, 65, 20),
limits = c(0, 65),
expand = expansion(mult = 0, add = 0)) +
scale_fill_manual(values = my_colors$five) +
theme(
legend.position = "none",
panel.background = element_blank(),
axis.title = element_blank(),
axis.ticks.y = element_blank(),
axis.title.y = element_blank(),
axis.text.y = element_blank(),
axis.ticks.length.x.top = unit(.25, "cm"),
axis.line.x.top = element_line(colour = "black")
)
p3
合并图像
p12 <- plot_grid(p1, p2, nrow = 2, align = "v",
rel_heights = c(2, 5))
pr <- plot_grid(NULL, p3, NULL, nrow = 3, align = "v",
rel_heights = c(2, 5, 1.8))
plot_grid(p12, pr, ncol = 2, align = "h",
rel_widths = c(5, 1))
所有数据、代码(包含注释)以及原始文档都上传到了知识星球,任何代码的问题都可以提问,自愿加入。
TCGA-Pancancer: https://gdc.cancer.gov/about-data/publications/pancanatlas
[2]TCGA-Clinical Data Resource (CDR) Outcome: https://api.gdc.cancer.gov/data/1b5f413e-a8d1-4d10-92eb-7c4ae739ed81
https://mp.weixin.qq.com/s/LYeqTuB_WAQzXQou4QsV3g