复现单细胞结合常规转录组的Nat Med文章数据挖掘部分

复现单细胞结合常规转录组的Nat Med文章数据挖掘部分 by 生信菜鸟团

英文标题：High systemic and tumor-associated IL-8 correlates with reduced clinical benefit of PD-L1 blockade

期刊：Nat Med. 2020 May;26(5):693-698.
影响因子：1区，82.9
DOI: 10.1038/s41591-020-0860-1
研究领域：PD-L1抑制剂；IL-8

IMvigor210CoreBiologies这个R包中有一些免疫治疗相关的数据。看看这个试验的设计和结果。

首先安装这个包

#IMvigor210CoreBiologies安装，先下载到本地，使用本地安装方式
#http://research-pub.gene.com/IMvigor210CoreBiologies/packageVersions/
install.packages("./IMvigor210CoreBiologies_1.0.0.tar.gz", repos = NULL, type = "source")

会发现装不上，不要着急，先装一点依赖包

### 设置镜像
options("repos"=c(CRAN="https://mirrors.tuna.tsinghua.edu.cn/CRAN/"))
options(BioC_mirror="https://mirrors.tuna.tsinghua.edu.cn/bioconductor")
### 安装缺失的R包-CRAN
if(!requireNamespace("dplyr",quietly = TRUE)) install.packages("dplyr")
if(!requireNamespace("DT",quietly = TRUE)) install.packages("DT")
if(!requireNamespace("ggplot2",quietly = TRUE)) install.packages("ggplot2")
if(!requireNamespace("reshape2",quietly = TRUE)) install.packages("reshape2")
if(!requireNamespace("plyr",quietly = TRUE)) install.packages("plyr")
if(!requireNamespace("survival",quietly = TRUE)) install.packages("survival")
if(!requireNamespace("circlize",quietly = TRUE)) install.packages("circlize")
if(!requireNamespace("lsmeans",quietly = TRUE)) install.packages("lsmeans")
if(!requireNamespace("spatstat",quietly = TRUE)) install.packages("spatstat")
if(!requireNamespace("corrplot",quietly = TRUE)) install.packages("corrplot")
### 安装缺失的R包-Bioconductor
if(!requireNamespace("ComplexHeatmap",quietly = TRUE)) BiocManager::install("ComplexHeatmap")
if(!requireNamespace("biomaRt",quietly = TRUE)) BiocManager::install("biomaRt")
if(!requireNamespace("DESeq2",quietly = TRUE)) BiocManager::install("DESeq2")
if(!requireNamespace("edgeR",quietly = TRUE)) BiocManager::install("edgeR")
if(!requireNamespace("limma",quietly = TRUE)) BiocManager::install("limma")

再装一下Deseq包，直接装装不上，在这个链接中下载老版本。https://www.bioconductor.org/packages/3.11/bioc/html/DESeq.html

下载到本地

if(!requireNamespace("genefilter",quietly = TRUE)) BiocManager::install("genefilter")
library(genefilter)
install.packages("DESeq_1.39.0.tar.gz", repos = NULL, type = "source")

安装完成后再来一遍

install.packages("IMvigor210CoreBiologies_1.0.0.tar.gz", repos = NULL, type = "source")

这样就好了。

library(IMvigor210CoreBiologies)

data(cds)
expreSet <- as.data.frame(counts(cds))
annoData <- fData(cds)
phenoData <- pData(cds)

1首先是Plasma IL-8 and clinical outcomes in muC and mRCC

首先看看原图，想要在这个数据集中根据pIL-8的表达划分高低组，方法中写的是根据median，可能还做了什么处理，导致我的数据和文中有一些偏差。不过没关系，大概的趋势是正确的，IL-8表达高带来生存获益较差，这里的获益是在患者接受atezo治疗后。

先来第一个图，普通的生存分析

#IL-8的基因为CXCL8
rm(list = ls())
library(pacman)
library(IMvigor210CoreBiologies)
library(dplyr)

#2.数据预处理
##2.1表达矩阵
data(cds)
expMatrix <- counts(cds)
eff_length2 <- fData(cds)[,c("entrez_id","length","symbol")]
rownames(eff_length2) <- eff_length2$entrez_id
head(eff_length2)
feature_ids <- rownames(expMatrix)
expMatrix <- expMatrix[feature_ids %in% rownames(eff_length2),]
nrow (expMatrix)
ncol (expMatrix)
mm <- match(rownames(expMatrix),rownames(eff_length2))
eff_length2 <- eff_length2[mm,]

x <- expMatrix/eff_length2$length
eset <- t(t(x)/colSums(x))*1e6
summary(duplicated(rownames(eset)))

if(!requireNamespace("remotes", quietly =TRUE)) install("remotes")
if(!requireNamespace("IOBR", quietly =TRUE))remotes::install_github("IOBR/IOBR",ref="master")

eset <- IOBR::anno_eset(eset = eset,
                        annotation = eff_length2,
                        symbol = "symbol",
                        probe = "entrez_id",
                        method = "mean")
#BLCA Bladder Urothelial Carcinoma 膀胱尿路上皮癌
tumor_type <- "BLCA"
if(max(eset)>100) eset <- log2(eset+1)


##2.2 表型文件
library(tibble)
pdata <- pData(cds)
colnames(pdata) <- gsub(colnames(pdata),pattern = " ",replacement = "_")
pdata<-pdata[!is.na(pdata$binaryResponse),]
table(pdata$binaryResponse)
pdata$group<-ifelse(pdata$binaryResponse=="CR/PR","R","NR")
table(pdata$group)
table(pdata$Best_Confirmed_Overall_Response)
save(expMatrix,eset,pdata,file = "expcli_IMvigor210.Rdata")


rm(list = ls())
load('expcli_IMvigor210.Rdata')
exp = eset[,colnames(eset) %in% rownames(pdata)]

p = identical(colnames(exp),rownames(pdata));p

CXCL8_exp = exp['CXCL8',]
a = as.data.frame(t(CXCL8_exp)) 
p = identical(rownames(a),rownames(pdata));p
pdata$CXCL8 = a$CXCL8
pdata$CXCL8_group <- ifelse(pdata$CXCL8 >= median(pdata$CXCL8) ,"pIL-8 High","pIL-8 Low")
table(pdata$CXCL8_group)

library(survival)
library(ggplot2)
library(survminer)
colnames(pdata)
fit <- survfit(Surv(os, censOS) ~ CXCL8_group, data = pdata)

# 拟合Cox比例风险模型
pdata$CXCL8_group = factor(pdata$CXCL8_group,levels = c("pIL-8 Low","pIL-8 High"))
pdata$CXCL8_group
cox_fit <- coxph(Surv(os, censOS) ~ CXCL8_group, data = pdata)
hr <- exp(coef(cox_fit))
hr

g <- ggsurvplot(fit, data = pdata, risk.table = TRUE, pval = TRUE, surv.median.line = "hv",
                legend.title = " ", 
                legend.labs = c("pIL-8 High", "pIL-8 Low"),
                title = "pIL-8 cohort2 OS IMvigor210")
g

p_value <- summary(cox_fit)$coefficients[, "Pr(>|z|)"]
g$table <- g$table +
  theme(legend.position = "none") +
  labs(title = "OS(months)", x = "OS(months)", y = "风险比") +
  theme(plot.title = element_text(hjust = 0.5))

g$plot <- g$plot + theme(plot.title = element_text(hjust = 0.5))+
        annotate("text", x = 1.57, y = 0.12, label = paste("HR =", round(hr, 2)), size = 4.75)
print(g)

这里有一个点需要注意到，因为中间有一步计算了HR值，按照正常情况来讲，低pIL-8表达组比高pIL-8表达组降低了风险，最开始得到的HR是0.6，降低了40%。但是文中很明显是反过来比的，说高pIL-8表达组比低pIL-8表达组增加了风险，所以我们这里也反过来。

注意的就是生存曲线中的线，如果在得到fit前就设置factor，后面命名图例，很可能颜色会反。

rm(list = ls())
load('expcli_IMvigor210.Rdata')
exp = eset[,colnames(eset) %in% rownames(pdata)]

p = identical(colnames(exp),rownames(pdata));p
voomD <- filterNvoom(exp,
                     minSamples=ncol(exp)/10, # 最小样本数
                     minCpm=0.25, # 最小cpm值
                     DESeq=FALSE) #  edgeR normFactors will used for normalization(默认),

names(voomD)
m <- voomD$E
dim(m) 
m[1:3,1:3]

#归一化：scale
#对数据进行处理，可以将大范围变化数据大范围变化落入一个小的特定区间，如：[0, 1]或[-1, 1]
# 所谓数据的归一化是指中心化之后的数据在除以数据集的标准差，即数据集中的各项数据减去数据集的均值再除以数据集的标准差。
m <- t(scale( t( m ),
              center=TRUE, # 中心化
              scale=TRUE) # 标准化
)
dim(m)
m[1:3,1:3]
m = as.data.frame(m)
CXCL8_exp = m['CXCL8',]
a = as.data.frame(t(CXCL8_exp)) 
p = identical(rownames(a),rownames(pdata));p

pdata$CXCL8 = a$CXCL8
pdata$CXCL8_group <- ifelse(pdata$CXCL8 >= median(pdata$CXCL8) ,"pIL-8 High","pIL-8 Low")

table(pdata$CXCL8_group)

clinical=pdata
colnames(clinical)

#基因集计算
data(human_gene_signatures)
ind_genes <- human_gene_signatures
goi <- names(ind_genes)
goi # 20个基因集

for (sig in goi) {
  clinical[, sig] <- NA # 在pData新增一列列名为sig，内容NA
  genes <- ind_genes[[sig]] # 在基因集的基因
  genes <- genes[genes %in% rownames(m)] # 与原矩阵基因求交集
  tmp <- m[genes, , drop=FALSE] # 交集基因的表达量
  clinical[, sig] <- gsScore(tmp) # gsScore计算分数，并写入该对应列中
}

dim(clinical)
colnames(clinical)


BOR <- "Best_Confirmed_Overall_Response"
table(clinical[, BOR])
tmpDat <- clinical[ clinical[, BOR] != "NE", ]
table(tmpDat[, BOR])
tmpDat[, BOR] <- droplevels(tmpDat[, BOR]) # 将level=NE删除掉
table(tmpDat[, BOR])

#Fisher's exact test( 费希尔精确检验)
colnames(clinical)
IL8 <- table(tmpDat$CXCL8_group, tmpDat[, "binaryResponse"])
IL8

pval <- signif(fisher.test(IL8)$p.value, 2) # fisher.test检验，p值取两位有效数字
print(paste("Fisher P for IL8 by binary response:", pval))
#0.21,ORR之前没啥差异

##堆积柱状图
library(tidyr)
library(reshape2)

colnames(tmpDat)
tb=table(tmpDat$CXCL8_group,
         tmpDat[, BOR])
head(tb)
library (gplots) 
balloonplot(tb)
bar_data <- as.data.frame(tb)

bar_per <- bar_data %>% 
  group_by(Var1) %>%
  mutate(sum(Freq)) %>%
  mutate(percent = Freq / `sum(Freq)`)
head(bar_per) 
#write.csv(bar_per,file = "celltype_by_group_percent.csv")
col =c("#3176B7","#F78000","#3FA116","#CE2820","#9265C1",
                "#885649","#DD76C5","#7F7F7F","#BBBE00","#41BED1")
                
ggplot(bar_per, aes(x = percent, y = Var1)) +
  geom_bar(aes(fill = Var2) , stat = "identity") + coord_flip() +
  theme(axis.ticks = element_line(linetype = "blank"),
        legend.position = "top",
        panel.grid.minor = element_line(colour = NA,linetype = "blank"), 
        panel.background = element_rect(fill = NA),
        plot.background = element_rect(colour = NA)) +
  labs(y = " ", fill = NULL)+labs(x = 'Fraction of patients')+
  scale_fill_manual(values=col)+
  labs(title = "pIL-8 cohort2 ORR IMvigor210")+
  theme_classic()+
  theme(plot.title = element_text(size=12,hjust=0.5))

虽然ORR之间的p值是没有差异的，但是有一些趋势。

pIL-8组更多的CR患者，PR患者也多了3个（文章中明显多了，开始怀疑这个数据集到底是不是cohort2，刚读取进来的时候是348例，但cohort2应该只有310例患者，cohort1有119例患者）。但是ORR竟然拉不开差异，没有得到文章中的结果

好了这个IL-8竟然都不是plasma IL-8，做之前一定要仔细检查数据。

接下来是同时Teff和IL-8，看看高高，高低，低高，低低对生存的影响。Teff主要存在于CD8+ T细胞中，那么就可以直接使用gsScore 函数计算20个基因集的分数中的CD8 T effector评分。当然，也可以自己计算，通过文中给的基因，计算四个基因的平均值。

###上一步计算了Teff
tmpDat$Teff_group <- ifelse(tmpDat$`CD 8 T effector` >= median(tmpDat$`CD 8 T effector`) ,"Teff High","Teff Low")
table(tmpDat$Teff_group)
table(tmpDat$CXCL8_group)
tmpDat$group <- ifelse(tmpDat$Teff_group %in% c("Teff High") & tmpDat$CXCL8_group %in% c("pIL-8 High"),"Teff High,IL8 High",
                ifelse(tmpDat$Teff_group %in% c("Teff High") & tmpDat$CXCL8_group %in% c("pIL-8 Low"),"Teff High,IL8 Low",
                       ifelse(tmpDat$Teff_group %in% c("Teff Low") & tmpDat$CXCL8_group %in% c("pIL-8 High"),"Teff Low,IL8 High", "Teff Low,IL8 Low")))

table(tmpDat$group)

library(survival)
library(ggplot2)
library(survminer)
colnames(tmpDat)
fit <- survfit(Surv(os, censOS) ~ group, data = tmpDat)

# 拟合Cox比例风险模型
tmpDat$group = factor(tmpDat$group,levels = c("pIL-8 Low","pIL-8 High"))
tmpDat$group
cox_fit <- coxph(Surv(os, censOS) ~ group, data = tmpDat)

# 提取风险比的估计值
hr <- exp(coef(cox_fit))
hr

g <- ggsurvplot(fit, data = pdata, risk.table = TRUE, pval = TRUE, surv.median.line = "hv",
                legend.title = " ", 
                xlab = "OS(months)", 
                legend = c(0.8,0.75), 
                legend.labs = c("Teff High,IL8 High", "Teff High,IL8 Low","Teff Low,IL8 High", "Teff Low,IL8 Low"),
                title = "pIL-8 Teff high/low cohort2 OS IMvigor210")
g
p_value <- summary(cox_fit)$coefficients[, "Pr(>|z|)"]

g$table <- g$table +
  theme(legend.position = "none") +
  labs(title = " ", x = "OS(months)", y = " ") +
  theme(plot.title = element_text(hjust = 0.5))

g$plot <- g$plot + theme(plot.title = element_text(hjust = 0.5))+
  annotate("text", x = 11.4, y = 0, label = '11.4', size = 3.6,colour = "#E87B70")+
  annotate("text", x = 7.2, y = 0, label = '7.2', size = 3.6,colour = "#53BBC1")+
  annotate("text", x = 10.8, y = 0, label = '10.8', size = 3.6,colour = "#BC7FF8")

print(g)

还是按照我们自己的HR顺序吧，可以很明显的看出，Teff高IL8低降低风险最多，这一组生存较长。

看一下中位生存时间，标上去

趋势符合逻辑，Teff低IL8高表达，相对而言，生存最差。

2Poor clinical outcome and lower expression of antigen-presentation genes associated with IL8-high myeloid subsets in PBMCs

首先我们看看这个原图，不管是髓系还是淋巴细胞中，Nonresponders表达IL-8更多。对髓系细胞的IL8高和低表达进行差异分析后发现，IL8高的髓系细胞中髓系炎症反应基因(橙色)表达丰富，而IL8低的髓系细胞中抗原呈递基因(蓝色)表达较高。PBMCs中IL8基因的高表达与mUC IMvigor210中较差的OS显著相关。

rm(list=ls())
library(Seurat)
library(ggplot2)
library(clustree)
library(cowplot)
library(dplyr)
getwd()

dir.create("2-harmony")
getwd()
setwd("2-harmony")
sce=readRDS("../1-QC/sce.all_qc.rds")
sce
sce <- NormalizeData(sce, 
                   normalization.method = "LogNormalize",
                    scale.factor = 1e4) 
sce <- FindVariableFeatures(sce)
sce <- ScaleData(sce)
sce <- RunPCA(sce, features = VariableFeatures(object = sce))

library(harmony)
table(sce$orig.ident)
seuratObj <- RunHarmony(sce, "orig.ident")
names(seuratObj@reductions)
seuratObj <- RunUMAP(seuratObj,  dims = 1:15, 
                     reduction = "harmony")
DimPlot(seuratObj,reduction = "umap",label=T ) 


sce=seuratObj
#奇怪的报错，下面是解决方式
# remove.packages("Matrix")
# install.packages("https://cran.r-project.org/src/contrib/Archive/Matrix/Matrix_1.5-1.tar.gz")
# #之后重启
# #先library(Matrix)，再调用其他包

sce <- FindNeighbors(sce, reduction = "harmony",
                     dims = 1:15) 
sce.all=sce
#设置不同的分辨率，观察分群效果(选择哪一个？)
for (res in c(0.01, 0.05, 0.1, 0.2, 0.3, 0.5,0.8,1)) {
  sce.all=FindClusters(sce.all, #graph.name = "CCA_snn", 
                       resolution = res, algorithm = 1)
}
colnames(sce.all@meta.data)
apply(sce.all@meta.data[,grep("RNA_snn",colnames(sce.all@meta.data))],2,table)

p2_tree=clustree(sce.all@meta.data, prefix = "RNA_snn_res.")
#ggsave(plot=p2_tree, filename="Tree_diff_resolution.pdf")

#按照分辨率为1进行 
sel.clust = "RNA_snn_res.1"
sce.all <- SetIdent(sce.all, value = sel.clust)
table(sce.all@active.ident) 
saveRDS(sce.all, "sce.all_int.rds")

依旧是常规流程。

rm(list=ls())
setwd('../')
dir.create("3-cell")
setwd("3-cell") 
sce.all=readRDS("../2-harmony/sce.all_int.rds")
colnames(sce.all@meta.data)
p1 = DimPlot(sce.all, reduction = "umap", group.by = "sample",label = T,raster=FALSE,cols = c('red','green')) 
#ggsave('umap_by_patient.pdf',width = 9,height = 7)
col = c('#DF4298', '#7C388C', '#D58EBE' ,  '#746FB4','#3A53A4','#FEC026' ,'#F57F20' ,'#CB952B' , '#A1CC59','#186533')
p2 = DimPlot(sce.all, reduction = "umap", group.by = "orig.ident",raster=FALSE,cols = col) 
#ggsave('umap_by_RNA_snn_res.0.3.pdf',width = 9,height = 7)
library(patchwork)
p1+p2

library(ggplot2) 
genes_to_check = c('ITGAX', 'CD11b', 'CD86', 'CD1C', 'CD209', 'CLEC9A','CD1E',  #DC
                    'CD11a',  'IRF8', 'CCR5', 'HLA-DRB1', #DC-like
                   'CD14',  'CD64','CCR2', 'S100A8', 'S100A9', #mono
                   'FCGR3A', #CD16 Monocytes
                   'PTPRC', 'CD3D', 'CD3E', 'CD4','CD8A','CD8B', 
                   'CD8',  'CD45RA', 'CD197', 'CCR7', #CD8T
                   'IL7R', 'BCL2', #CD8Tcm
                   'GZMB', 'PRF1', 'KLRG1','TBX21',#CD8Tem
                    'CD27', #CD4T
                   'FOXP3', 'IL2RA', 'CTLA4', 'IKZF2', 'LRRC32', #Treg
                   'CD45RO','CD62L', 'CD28', #CD4Tcm
                   'CD57','CD127', #CD4Tem
                   'CD19', 'CD79A', 'MS4A1' ,'BLK', #B
                   'KLRB1','NCR1', 'NKG7', 'KLRD1','CD56','NKG2D', # NK 
                   'FGF7','MME', 'ACTA2', ## fibo 
                   'CD41', 'CD61', 'CD42b', 'CD34', 'CD235a','MPL', 'PF4', 'ITGA2B', 'ITGB3', 'GP1BB',#Megakaryocytes
                   'EPCAM' , 'KRT19', 'PROM1', 'ALDH1A1' )
library(stringr)  
p_all_markers <- DotPlot(sce.all, features = genes_to_check,
                         assay='RNA'  )  + coord_flip()

p_all_markers

p_umap=DimPlot(sce.all, reduction = "umap",
               group.by = "RNA_snn_res.1",label = T,raster=FALSE) 
p_umap
library(patchwork)
p_all_markers+p_umap

rm(list=ls())
options(stringsAsFactors = F)
library(Seurat)
library(ggplot2)
library(clustree)
library(cowplot)
library(dplyr)
getwd()
setwd('3-cell/')
sce.all=readRDS( "../2-harmony/sce.all_int.rds")
sce.all

celltype=data.frame(ClusterID=0:16,
                    celltype= 0:16) 

celltype[celltype$ClusterID %in% c( 6),2]='DC' 
celltype[celltype$ClusterID %in% c(1,3,9,15 ),2]='Monocytes' 
celltype[celltype$ClusterID %in% c( 8 ),2]='CD16 monoccytes'
celltype[celltype$ClusterID %in% c(12),2]='DC-like' 
celltype[celltype$ClusterID %in% c( 9),2]='Megakaryocytes'  
celltype[celltype$ClusterID %in% c(11  ),2]='CD8 Tcm' 
celltype[celltype$ClusterID %in% c( 16 ),2]='CD8 Tem' 
celltype[celltype$ClusterID %in% c( 0 ),2]='CD4 T' 
celltype[celltype$ClusterID %in% c(2 ),2]='CD4 Tcm' 
celltype[celltype$ClusterID %in% c( 14),2]='CD4 Tem' 
celltype[celltype$ClusterID %in% c( 13 ),2]='Treg'
celltype[celltype$ClusterID %in% c(4,5,10),2]='NK' 
celltype[celltype$ClusterID %in% c( 7),2]='B'  

head(celltype)
celltype
table(celltype$celltype)
sce.all@meta.data$celltype = "NA"
for(i in 1:nrow(celltype)){
  sce.all@meta.data[which(sce.all@meta.data$RNA_snn_res.1 == celltype$ClusterID[i]),'celltype'] <- celltype$celltype[i]}
table(sce.all@meta.data$celltype)

table(sce.all@meta.data$celltype,sce.all@meta.data$RNA_snn_res.1) 

mycolors <-c('#E5D2DD', '#53A85F', '#F1BB72', '#F3B1A0', '#D6E7A3', '#57C3F3',
                      '#E95C59', '#E59CC4', '#AB3282',  '#BD956A', 
                      '#9FA3A8', '#E0D4CA',  '#C5DEBA', '#F7F398',
                      '#C1E6F3', '#6778AE', '#91D0BE', '#B53E2B',
                      '#712820', '#DCC1DD', '#CCE0F5',  '#CCC9E6', '#625D9E', '#68A180', '#3A6963',
                      '#968175')
                                      
library(patchwork)
p_umap=DimPlot(sce.all, reduction = "umap", group.by = "celltype",label = T,
                     label.box=T,raster=FALSE,cols = mycolors)
p_all_markers+p_umap

library(patchwork)
p_all_markers=DotPlot(sce.all, features = genes_to_check,
                    assay='RNA' ,group.by = 'celltype' )  + coord_flip()+th
p_umap=DimPlot(sce.all, reduction = "umap", group.by = "celltype",label = T,
                    label.box=T,raster=FALSE,cols = mycolors)
p_all_markers+p_umap

p_sample=DimPlot(sce.all, reduction = "umap", group.by = "celltype",label = T,raster=FALSE,cols = mycolors,split.by = 'sample')
p_sample

明显看出在Responder中单核细胞和巨核细胞小了一圈，其他细胞亚群看起来或多或少有所增加。

####Featureplot
sce = sce.all
cell_marker <- c("CD3D","NKG7","CD79A","CD14","IL1B","CXCL8")
FeaturePlot(sce,features = cell_marker,cols = c("lightgrey" ,"#DE1F1F"),ncol=3)

FeaturePlot(sce,features = 'CXCL8',cols = c("lightgrey" ,"#DE1F1F"),split.by = 'sample')

可以明显看到CXCL8在不响应的人群中表达更高。

##小提琴图
library(ggplot2)
library(gghalves)
library(tidyverse)

### 提取某一基因的表达值到metadata中
expr <- sce@assays$RNA@scale.data
gene_name <- c("CXCL8")
gene_expression <- expr %>% 
  .[gene_name,] %>% 
  #t() %>% 
  as.data.frame()
colnames(gene_expression) <- paste0(gene_name)
identical(colnames(sce),row.names(gene_expression))
sce$CXCL8 <- gene_expression[,paste0(gene_name)]
identical(sce@meta.data[,paste0(gene_name)],gene_expression[,paste0(gene_name)])
meta <- sce@meta.data
colnames(meta)
table(meta$sample)
R <- meta %>% filter(sample=="Responder")

NR <- meta %>% filter(sample=="NonResponder")

library(ggsignif)
library(ggpubr)
ggplot() +
  geom_half_violin(data = R, aes(x = celltype, y = CXCL8),
                   position = position_dodge(width = 1),
                   scale = 'width',
                   fill = "#47BB0C",
                   side = "l") +
  geom_half_violin(data = NR, aes(x = celltype, y = CXCL8),
                   position = position_dodge(width = 1),
                   scale = 'width',
                   fill = "#DC291A",
                   side = "r") +
  theme_classic() +
  xlab("") +
  ylab("CXCL8 expression") +
  stat_compare_means(data = meta, aes(x = celltype, y = CXCL8, group = sample),
                     symnum.args = list(cutpoints = c(0, 0.001, 0.01, 0.05, 1),
                                        symbols = c("***", "**", "*", "ns")),
                     label = "p.signif")

和原文趋势一样，无响应者的CXCL8几乎在所有细胞亚型中分布都较高。

source('VolcanoPlot.R')
table(sce$celltype)
myeloid <- subset(sce, celltype %in% c("CD16 monoccytes","DC","DC-like","Megakaryocytes","Monocytes"))
myeloid$CXCL8_group <- ifelse(myeloid$CXCL8 >= median(myeloid$CXCL8) ,"IL-8 High","IL-8 Low")
diff_myeloid <- FindMarkers(myeloid, min.pct = 0.25, 
                           logfc.threshold = 0.25,
                           group.by = "CXCL8_group",
                           ident.1 ="IL-8 High",
                           ident.2="IL-8 Low")       
#avg_logFC：两组之间平均表达的对数折叠通道。正值表示该特征在第一组中的表达更高。

dif=data.frame(
  symbol=rownames(diff_myeloid),
  log2FoldChange=diff_myeloid$avg_log2FC,
  padj=diff_myeloid$p_val_adj
)

# 可以指定要标记的DEG数量，选出FC最大和最小的基因标记
VolcanoPlot(dif, padj=0.05, title="IL-8 High vs IL-8 Low", label.max = 50, cols=c("blue", "red"))

可以看到CXCL8在IL-8 High中高表达

###细胞亚群比例
##堆积柱状图
library(tidyr)
library(reshape2)

colnames(meta)
tb=table(meta$celltype,
         meta$sample)
head(tb)
library (gplots) 
balloonplot(tb)
bar_data <- as.data.frame(tb)

bar_per <- bar_data %>% 
  group_by(Var1) %>%
  mutate(sum(Freq)) %>%
  mutate(percent = Freq / `sum(Freq)`)
head(bar_per) 
col =c("#3176B7","#F78000","#3FA116","#CE2820","#9265C1",
                "#885649","#DD76C5","#7F7F7F","#BBBE00","#41BED1")
                
library(ggthemes)
g1 = ggplot(bar_per, aes(x = percent, y = Var1)) +
  geom_bar(aes(fill = Var2) , stat = "identity") + 
  theme(axis.ticks = element_line(linetype = "blank"),
        legend.position = "top",
        panel.grid.minor = element_line(colour = NA,linetype = "blank"), 
        panel.background = element_rect(fill = NA),
        plot.background = element_rect(colour = NA)) +
  labs(y = "cell type", fill = NULL)+labs(x = 'Fraction of cells')+
  scale_fill_manual(values=col)+
  labs(title = " ")+
  theme_tufte()+
  theme(plot.title = element_text(size=12,hjust=0.5))


###2
##堆积柱状图
library(tidyr)
library(reshape2)

colnames(meta)
tb=table(meta$celltype,
         meta$orig.ident)
head(tb)
library (gplots) 
balloonplot(tb)
bar_data <- as.data.frame(tb)

bar_per <- bar_data %>% 
  group_by(Var1) %>%
  mutate(sum(Freq)) %>%
  mutate(percent = Freq / `sum(Freq)`)
head(bar_per) 
#write.csv(bar_per,file = "celltype_by_group_percent.csv")
col =c("#3176B7","#F78000","#3FA116","#CE2820","#9265C1",
                "#885649","#DD76C5","#7F7F7F","#BBBE00","#41BED1")
                
library(ggthemes)
g2 = ggplot(bar_per, aes(x = percent, y = Var1)) +
  geom_bar(aes(fill = Var2) , stat = "identity") + 
  theme(axis.ticks = element_line(linetype = "blank"),
        legend.position = "top",
        panel.grid.minor = element_line(colour = NA,linetype = "blank"), 
        panel.background = element_rect(fill = NA),
        plot.background = element_rect(colour = NA)) +
  labs(y = "cell type", fill = NULL)+labs(x = 'Fraction of cells')+
  scale_fill_manual(values=col)+
  labs(title = " ")+
  theme_tufte()+
  theme(plot.title = element_text(size=12,hjust=0.5))


library(patchwork)
g1+g2

这里比上面的dotplot图更能清晰的看出，单核细胞和聚合细胞以及DC-like在NonResponder中更多一点。

3scRNA-seq analysis of IL8 gene expression in immune subsets from matched intratumoral and peripheral blood leukocytes from patients with RCC and association of tumor IL8 gene expression with clinical outcomes in muC and mRCC

接下来是另一个数据集RCC对于上面结论的进一步佐证，当然，方法和上面一样。连图形都是再来一遍。

###### step5:检查常见分群情况  ######
rm(list=ls())
setwd('../')
dir.create("3-cell")
setwd("3-cell") 
sce.all=readRDS("../2-harmony/sce.all_int.rds")
colnames(sce.all@meta.data)
table(sce.all$orig.ident)
sce.all$orig.ident=str_split(sce.all$orig.ident ,'[_]',simplify = T)[,2]
table(sce.all$orig.ident)
p1 = DimPlot(sce.all, reduction = "umap", group.by = "sample",raster=FALSE,cols = c('red','green')) 
#ggsave('umap_by_patient.pdf',width = 9,height = 7)
col = c('#DF4298', '#7C388C', '#D58EBE' ,  '#746FB4','#3A53A4','#FEC026' ,'#F57F20' ,'#CB952B' , '#A1CC59','#186533')
p2 = DimPlot(sce.all, reduction = "umap", group.by = "orig.ident",raster=FALSE,cols = col) 
#ggsave('umap_by_RNA_snn_res.0.3.pdf',width = 9,height = 7)
library(patchwork)
p1+p2

Human Primary Cell Atlas (HPCA): 基因表达集合(GEO数据集)，包含713个微阵列样本，分类为38种主要细胞类型，进一步注释为169个子类型 (Mabbott et al. 2013)。大多数标签指的是血液亚群，但细胞类型从其他组织也可用。
Blueprint/ENCODE 参考由 Blueprint (Martens and Stunnenberg 2013)和 ENCODE 项目(ENCODE Project Consortium 2012)产生的纯基质和免疫细胞的大量 RNA-seq 数据组成。
DICE参考由来自同名项目(Schmiedel等人，2018年)的排序细胞群的大量RNA-seq样本组成。
Novershtern参考资料(以前称为分化图)由GSE24759中排序的造血细胞群的微阵列数据集组成(Novershtern et al. 2011)。
Monaco参考由来自GSE107011的免疫细胞群的大量RNA-seq样本组成(Monaco et al. 2019)。

rm(list=ls())
options(stringsAsFactors = F)
library(Seurat)
library(ggplot2)
library(clustree)
library(cowplot)
library(dplyr)
library(SingleR)
library(celldex)
library(singleseqgset)
library(devtools)
getwd()
setwd('3-cell/')
sce=readRDS( "../2-harmony/sce.all_int.rds")
sce

# singleR注释
#hpca.se <- HumanPrimaryCellAtlasData()
#save(hpca.se,file = 'hpca.RData')
load('../UC_IMvigor210/2-harmony/hpca.RData')
#bpe.se <- BlueprintEncodeData()
#save(bpe.se,file = 'bpe.RData')
load('../UC_IMvigor210/2-harmony/bpe.RData')
unique(hpca.se$label.main)
unique(hpca.se$label.fine)
unique(bpe.se$label.main)
unique(bpe.se$label.fine)
##这个运行太太太慢了
DICE <- DatabaseImmuneCellExpressionData() 
#save(DICE,file = 'DICE.RData')
NHD <- NovershternHematopoieticData() 
#save(NHD,file = 'NHD.RData')
MID <- MonacoImmuneData()
#save(MID,file = 'MID.RData')
unique(NHD$label.main)
unique(NHD$label.fine)
unique(MID$label.main)
unique(MID$label.fine)
anno <- SingleR(sce@assays$RNA@data,
                ref = list(BP=bpe.se,HPCA=hpca.se),
                labels = list(bpe.se$label.fine,hpca.se$label.fine),
                clusters = sce@meta.data$seurat_clusters
)
##我开始怀疑了，每次更新什么东西，matrix这个包就会更新版本，然后，需要就会持续报错
#上一次遇到这个情况还是在FindNeighbors

plotScoreHeatmap(anno,clusters = anno@rownames,show_colnames = T)
table(anno$labels)

celltype = data.frame(ClusterID=rownames(anno), 
                      celltype=anno$labels, 
                      stringsAsFactors = F) 

sce@meta.data$singleR = celltype[match(sce@meta.data$seurat_clusters,celltype$ClusterID),'celltype']
P <- DimPlot(sce, reduction = "umap", group.by = "singleR")
P

和文中的细胞亚群相比，相差并不大。

后面的分析流程也是一样，差异分析火山图，细胞比例，只是复制粘贴再来一遍，这里就不再赘述了。

提问：作者是怎么挑到这个基因的？

凭借文中的只言片语，较难理解为什么IL-8可能成为一个预测性的生物标志物，还得结合文中的一些生存分析曲线，但是由于临床试验的数据很多并不公开，我也没找到另外两个数据集，所以并没有进行复现，

文章的结论是在mUC和mRCC患者中，血浆、pbmc和肿瘤中高水平的IL8与atezolizumab疗效降低相关，即使在典型的CD8+ T细胞炎症的肿瘤中也是如此。mUC患者低基线pIL8与atezolizumab和化疗反应增加相关。在接受atezolizumab治疗而非化疗的情况下，经历了治疗期pIL8下降的mUC患者表现出了改善的总生存期。scRNAseq显示，IL8主要表达于循环细胞和瘤内髓系细胞中，IL8的高表达与抗原递呈机制的下调有关。能够逆转IL8介导的髓系炎症影响的疗法对于改善接受免疫检查点抑制剂治疗的患者的预后至关重要。

ixxmu / mp_duty