文献复现-单细胞揭示新辅助治疗后NSCLC的免疫微环境变化 by 单细胞天地

好久没做文献复现了…
上一次好像还是在去年的3月份左右，详见：【生信文献200篇】专栏也落下帷幕，生信菜鸟团的发展需要你。。。。
所以距离这个领域我已经脱节一年了….别骂了别骂了在学了在学了😧。
如有不对，请指出，下次一定更正！！！

文章在这：Tumor microenvironment remodeling after neoadjuvant immunotherapy in non-small cell lung cancer revealed by single-cell RNA sequencing
方法：来自3名治疗前和12名接受新辅助PD-1阻断联合化疗的非小细胞肺癌（NSCLC）患者的~92,000个单细胞的转录组。根据病理反应将12个治疗后样本分为两组：MPR（n = 4）和非MPR（n = 8）。

主要病理反应(MPR)被定义为治疗后苏木精和伊红（H&E）染色的残余活肿瘤细胞不超过10%

结果和结论：不同的治疗诱导的癌细胞转录组与临床反应有关。MPR患者中：癌细胞通过MHC-II表现出活化抗原呈递的特征。aged CCL3+中性粒细胞减少，FCRL4+FCRL5+记忆B细胞和CD16+CX3CR1+单核细胞富集，并且是免疫治疗反应的预测因子。NMPR患者中：癌细胞表现出雌激素代谢酶的过表达和血清雌二醇升高。在所有患者中，治疗促进了细胞毒性T细胞和CD16+NK细胞的扩增和活化，减少了免疫抑制性Treg，并将记忆CD8+T细胞激活为效应表型。组织驻留巨噬细胞在治疗后扩增，肿瘤相关巨噬细胞（TAM）被重塑为中性。

接下来是对文中的图进行的复现…

纯手工，自己写代码，并不是文章的代码哦。。。。

1.总分群

文章原图

rm(list=ls())
options(stringsAsFactors = F) 
source('scrip/lib.R')
library(readxl)

###### step1:导入数据 ######   
ct=fread( 'GSE207422_RAW/GSE207422_NSCLC_scRNAseq_UMI_matrix.txt.gz',data.table = F)
ct[1:4,1:4]
ct[nrow(ct),1:4]
rownames(ct)=ct[,1]
ct=ct[,-1]
ct[1:4,1:4]
ct1 = ct
s <- colnames(ct1)
s[1:10]
s1 = substr(s, 1, 11)
s1[1:10]
table(s1)
colnames(ct1) = s1
meta <- read_excel("GSE207422_RAW/GSE207422_NSCLC_scRNAseq_metadata.xlsx")

sce.all = CreateSeuratObject(counts =  ct1 , 
                             min.cells = 5,
                             min.features = 500)
as.data.frame(sce.all@assays$RNA@counts[1:10, 1:2])
head(sce.all@meta.data, 10)
table(sce.all@meta.data$orig.ident) 

s = colnames(sce.all)
s1 = substr(s, 1, 11)
s1[1:10]
table(s1)
s1 = gsub(c('BD_immune01|BD_immune05|BD_immune08'),"Pre-treatment",s1)
s2 = gsub("BD_immune.*$", "Post-treatment", s1) #第一个空格直达结尾替换成b
table(s2)

phe=str_split(  colnames(sce.all) ,'_',simplify = T)
head(phe)
table(phe[,1])
table(phe[,2])  
table(phe[,2],phe[,1])  
sce.all$orig.ident = s2
table(sce.all$orig.ident )
sce.all$group=sce.all$orig.ident
sce.all$orig.ident=paste(phe[,2],phe[,1])  
head(sce.all@meta.data) 
table(sce.all@meta.data$group)
table(sce.all@meta.data$orig.ident)

###### step2:QC质控 ######
dir.create("./1-QC")
setwd("./1-QC")
source('../scrip/qc.R')
sce.all.filt = basic_qc(sce.all)
print(dim(sce.all))
print(dim(sce.all.filt))
setwd('../')
sp='human'

###### step3: harmony整合多个单细胞样品 ######
dir.create("2-harmony")
getwd()
setwd("2-harmony")
source('../scrip/harmony.R')
# 默认 ScaleData 没有添加"nCount_RNA", "nFeature_RNA"
# 默认的
sce.all.int = run_harmony(sce.all.filt)
setwd('../')

#######step4: 检查常见分群情况
rm(list=ls())
library(ggplot2)
dir.create("3-cell")
setwd("3-cell")  
getwd()
sce.all.int=readRDS("../2-harmony/sce.all_int.rds")
table(Idents(sce.all.int))
table(sce.all.int$seurat_clusters)
table(sce.all.int$RNA_snn_res.0.1) 
table(sce.all.int$RNA_snn_res.0.3) 

sel.clust = "RNA_snn_res.0.3"
sce.all <- SetIdent(sce.all.int, value = sel.clust)
table(sce.all@active.ident) 

meta <- read_excel("../GSE207422_RAW/GSE207422_NSCLC_scRNAseq_metadata.xlsx")
s = colnames(sce.all)
s[1:10]
s1 = substr(s, 1, 11)
s1[1:10]
table(s1)
s2 = gsub("BD_immune", "P", s1) 
table(s2)

sce.all$patient = s2
table(sce.all$patient )

##RECIST
colnames(meta)
meta$RECIST
s_RECIST = gsub(c('BD_immune01|BD_immune07|BD_immune08|BD_immune11|BD_immune12|BD_immune15'),"PR",s1)
s_RECIST = gsub("BD_immune.*$", "SD", s_RECIST) 
table(s_RECIST)
sce.all$RECIST = s_RECIST

##病理反应
##后面是按照TN和治疗后手术样本来分类的
colnames(meta)
meta$`Pathologic Response`
s_Pathologic = gsub(c('BD_immune03|BD_immune06|BD_immune11|BD_immune14'),"MPR",s1)
s_Pathologic = gsub("BD_immune.*$", "NMPR", s_Pathologic) 
table(s_Pathologic)
sce.all$Pathologic = s_Pathologic
phe = sce.all@meta.data

phe[phe$orig.ident == 'Pre-treatment', "Pathologic"] = 'TN'
table(phe$Pathologic)
sce.all@meta.data = phe
table(sce.all$Pathologic)

##药物治疗
colnames(meta)
meta = meta[1:15,]
meta$medicine = paste(meta$`PD1 Antibody`,meta$Chemotherapy,sep="+")
s1
table(meta$medicine)
match_vec <- match(s1, unique(s1))
s_medicine <- meta$medicine[match_vec]
##检验一下是不是对的...
s1[23456]
s_medicine[23456]

s1[45678]
s_medicine[45678]

table(s_medicine)
sce.all$medicine = s_medicine

saveRDS(sce.all, "sce.all_meta.rds")

#marker
genes_to_check = c('PTPRC', 'CD3D', 'CD3E', 'CD4','CD8A',  ## T 
                   'CD19', 'CD79A', 'MS4A1' ,  ## B 
                   'IGHG1', 'MZB1', 'SDC1', ## Plasma 
                   'CD68', 'CD163', 'CD14', 
                   'VCAN', 'FCN1', 
                   'TPSAB1' , 'TPSB2',  # mast cells,
                   'RCVRN','FPR1' , 'ITGAM' ,
                   'C1QA',  'C1QB',  # mac
                   'S100A9', 'S100A8', 'MMP19',# monocyte
                   'MKI67' , 'TOP2A','LYZ', ##myloid
                   'LAMP3', 'IDO1','IDO2',## DC3 
                   'CD1E','CD1C', # DC2
                   'CLEC10A', 'CLEC12A','LILRA4', ##pDC
                   'FGFBP2','NCR1','KLRK1','KLRC1','NCAM1','FCGR3A', # NK 
                   'KIT', ##MAST
                   'CSF3R','ITGAM','ITGB2','FUT4','FCGR3B','FCGR2A', ##Neutrophil
                   'FGF7','MME', 'ACTA2','VWF','COL1A1', ## human  fibo 
                   'DCN', 'LUM',  'GSN' , ## mouse PDAC fibo 
                   'PECAM1', 'VWF',  ## endo 
                   'EPCAM' , 'KRT19','KRT7' # epi 
)


library(stringr)  
genes_to_check=str_to_upper(genes_to_check)
genes_to_check
p <- DotPlot(sce.all, features = unique(genes_to_check),
             assay='RNA'  )  + coord_flip()

p 

mycolors <-c('#E5D2DD', '#53A85F', '#F1BB72', '#F3B1A0', '#D6E7A3', '#57C3F3',
                        '#E95C59', '#E59CC4', '#AB3282',  '#BD956A', 
                        '#9FA3A8', '#E0D4CA',  '#C5DEBA', '#F7F398',
                         '#C1E6F3', '#6778AE', '#91D0BE', '#B53E2B',
                        '#712820', '#DCC1DD', '#CCE0F5',  '#CCC9E6', '#625D9E', '#68A180', '#3A6963',
                        '#968175')

col =c("#3176B7","#F78000","#3FA116","#CE2820","#9265C1",
                "#885649","#DD76C5","#7F7F7F","#BBBE00","#41BED1")


p_umap=DimPlot(sce.all, reduction = "umap",cols = mycolors,pt.size = 0.4,
               group.by = "RNA_snn_res.0.3",label = T,label.box = T) 
p_umap
library(patchwork)
p+p_umap
ggsave('markers_umap_2.pdf',width = 13,height = 8)

#####step5: 细胞生物学命名
rm(list=ls())
options(stringsAsFactors = F)
library(Seurat)
library(ggplot2)
library(clustree)
library(cowplot)
library(dplyr)
getwd()
sce.all=readRDS( "../3-cell/sce.all_meta.rds")
sce.all

celltype=data.frame(ClusterID=0:17,
                    celltype= 0:17) 

#定义细胞亚群 
celltype[celltype$ClusterID %in% c(0 ,12 ),2]='Tcells'
celltype[celltype$ClusterID %in% c( 4 ),2]='Bcells'
celltype[celltype$ClusterID %in% c( 5),2]='plasma'
celltype[celltype$ClusterID %in% c( 13 ),2]='mast'
celltype[celltype$ClusterID %in% c( 14 ),2]='pDC'
celltype[celltype$ClusterID %in% c(  11 ),2]='fibroblast'
celltype[celltype$ClusterID %in% c( 1,2,15  ),2]='myeloids'
celltype[celltype$ClusterID %in% c( 6,8,9,10,17 ),2]='epithelium'
celltype[celltype$ClusterID %in% c( 3,16 ),2]='neutrophil'
celltype[celltype$ClusterID %in% c( 7 ),2]='NK'


head(celltype)
celltype
table(celltype$celltype)

sce.all@meta.data$celltype = "NA"
for(i in 1:nrow(celltype)){
  sce.all@meta.data[which(sce.all@meta.data$RNA_snn_res.0.3 == celltype$ClusterID[i]),'celltype'] <- celltype$celltype[i]}
table(sce.all@meta.data$celltype)


th=theme(axis.text.x = element_text(angle = 45, 
                                    vjust = 0.5, hjust=0.5)) 
library(patchwork)
p_umap=DimPlot(sce.all, reduction = "umap",cols = col,pt.size = 0.4,
                               group.by = "celltype",label = T) 
p+p_umap
ggsave('markers_umap_by_celltype_2.pdf',width = 15,height = 7)
saveRDS(sce.all, "sce.all_celltype.rds")
phe=sce.all@meta.data
save(phe,file = 'phe-by-markers.Rdata')

colnames(phe)
table(sce.all$Pathologic)

p = DimPlot(sce.all, reduction = "umap",split.by = 'Pathologic',
        group.by = "celltype",label = T) 
install.packages('tidydr')
library(ggplot2)
library(tidydr)

p <- p + theme_dr()
p

p <- p + theme(panel.grid=element_blank())
p


load('sce-by-celltype.Rdata')
features = c('PTPRC','KRT19','COL1A1','VWF','CD3E','CD79A','MS4A1',
             'IGHG1','CSF3R','LYZ','FGFBP2','KIT','LILRA4','MKI67')
FeaturePlot(sce.all,features = features,ncol = 5 )

上面的代码里面的有一些source步骤，是依赖于外部的script文件夹里面的几个R代码文件。。。详见：小鼠的5个样品的10x技术单细胞转录组上游定量（文末赠送全套代码）

###### step6: 单细胞亚群比例差异  ######
## 每种细胞类型中，分组所占比例 ----
rm(list=ls())
options(stringsAsFactors = F)
library(Seurat)
library(ggplot2)
library(clustree)
library(cowplot)
library(dplyr)
getwd()
sce.all=readRDS( "3-cell/sce.all_celltype.rds")
sce.all
phe=sce.all@meta.data
library(tidyr)# 使用的gather & spread
library(reshape2) # 使用的函数 melt & dcast 

colnames(phe)
tb=table(phe$celltype,
         phe$patient)
head(tb)
library (gplots) 
balloonplot(tb)
bar_data <- as.data.frame(tb)

bar_per <- bar_data %>% 
  group_by(Var2) %>%
  mutate(sum(Freq)) %>%
  mutate(percent = Freq / `sum(Freq)`)
head(bar_per) 
write.csv(bar_per,file = "celltype_by_group_percent.csv")
col =c("#3176B7","#F78000","#3FA116","#CE2820","#9265C1",
                "#885649","#DD76C5","#7F7F7F","#BBBE00","#41BED1")
ggplot(bar_per, aes(x = percent, y = Var2)) +
  geom_bar(aes(fill = Var1) , stat = "identity") + coord_flip() +
  theme(axis.ticks = element_line(linetype = "blank"),
        legend.position = "top",
        panel.grid.minor = element_line(colour = NA,linetype = "blank"), 
        panel.background = element_rect(fill = NA),
        plot.background = element_rect(colour = NA)) +
  labs(y = "% Relative cell source", fill = NULL)+labs(x = NULL)+
  scale_fill_manual(values=col)

ggsave("celltype_by_group_percent.pdf",
       width = 8,height = 4)

再就是不同病理结局中细胞亚群的比例

###Figure1D 分为未治疗组TN，和治疗组（治疗组又分为MPR和NMPR）
table(phe$celltype)
table(phe$patient)
phe$patientoutcone =  paste(phe$patient,phe$Pathologic,sep = '_')

phe_immune = phe[phe$celltype %in% c('Bcells','myeloids','neutrophil','NK','Tcells'),]

tb=table(phe_immune$celltype,
         phe_immune$patientoutcone)

head(tb)
balloonplot(tb)
bar_data <- as.data.frame(tb)


bar_per <- bar_data %>% 
  group_by(Var2) %>%
  mutate(sum(Freq)) %>%
  mutate(percent = Freq / `sum(Freq)`)
head(bar_per) 
bar_per$Var2
group = ifelse(str_detect(bar_per$Var2,"TN"),"TN",
               ifelse(str_detect(bar_per$Var2,"NMPR"),"NMPR","MPR"))
bar_per$group = group

bar_per$group = factor(bar_per$group,levels =c("TN","MPR","NMPR"))
table(bar_per$Var1)
bar_per$Var1 = factor(bar_per$Var1,levels =c("Tcells","Bcells","NK",'myeloids','neutrophil'))

###拼接五个箱线图
library(gridExtra)
p <- list()

for(i in 1:5){
  # 选择当前细胞类型的数据
  cell <- bar_per[bar_per$Var1 == unique(bar_per$Var1)[i],]

  p[[i]] <- ggplot(cell, aes(x = group, y = percent, fill = group)) +
    geom_boxplot(size = 0.5, fill = "white", outlier.fill = "white", outlier.color = "white", color = col) +
    geom_jitter(aes(fill = group), width = 0.2, shape = 21, size = 2) +
    scale_fill_manual(values = col) +
    ggtitle(paste0(unique(bar_per$Var1)[i], " ")) +
    theme_bw() +
    theme(legend.position = "none",
          axis.text.x = element_text(colour = "black", family = "Times", size = 14),
          axis.text.y = element_text(family = "Times", size = 14, face = "plain"),
          axis.title.y = element_text(family = "Times", size = 14, face = "plain"),
          axis.title.x = element_text(family = "Times", size = 14, face = "plain"),
          plot.title = element_text(family = "Times", size = 15, face = "bold", hjust = 0.5),
          panel.grid.major = element_blank(),
          panel.grid.minor = element_blank()) +
    ylab("Fraction (%) of immune cells") + xlab(" ")
  # 移除y轴标题和刻度标签
  p[[i]] <- p[[i]] + theme(axis.title.y = element_blank())
}

combined_plot <- grid.arrange(grobs = p, ncol = 5, left = "Fraction (%) of immune cells") # 设置共同的纵轴标题和拼图列数

print(combined_plot)

和文中比例变化大多是一致的，药物治疗后MPR中T细胞和B细胞比例增加（文中还有NK细胞），中性粒细胞和髓系降低。

2.上皮细胞

####Epithelial cell重聚类####
rm(list=ls())
options(stringsAsFactors = F)
library(Seurat)
library(ggplot2)
library(clustree)
library(cowplot)
library(dplyr)
dir.create("4-subEpi")
setwd("4-subEpi") 
getwd()
sce.all=readRDS( "../3-cell/sce.all_celltype.rds")
sce = sce.all
DefaultAssay(sce) <- "RNA"
table(sce$orig.ident)

table(sce$celltype)
sce2 = sce[, sce$celltype %in% c( 'epithelium' )]
table(sce2$celltype)
sce = sce2
table(sce$patient,sce$celltype)
table(sce$medicine,sce$celltype)
table(sce$orig.ident)
Idents(sce) = sce$patient
sce=CreateSeuratObject(counts = sce@assays$RNA@counts,
                       meta.data = sce@meta.data) 


sce <- NormalizeData(sce, normalization.method =  "LogNormalize",  
                     scale.factor = 1e4)
GetAssay(sce,assay = "RNA")
sce <- FindVariableFeatures(sce, 
                            selection.method = "vst", nfeatures = 2000)  
sce <- ScaleData(sce) 
sce <- RunPCA(object = sce, pc.genes = VariableFeatures(sce)) 
DimHeatmap(sce, dims = 1:12, cells = 100, balanced = TRUE)
ElbowPlot(sce) 

sce <- FindNeighbors(sce, dims = 1:15)
sce <- FindClusters(sce, resolution = 0.1)
table(sce@meta.data$RNA_snn_res.0.1)  

set.seed(123)
sce <- RunTSNE(object = sce, dims = 1:15, do.fast = TRUE)
saveRDS(sce,sce.all, "sce_epi.rds")


sce <- RunUMAP(object = sce, dims = 1:5, do.fast = TRUE)
DimPlot(sce,reduction = "umap",label=T) 
head(sce@meta.data)
DimPlot(sce,reduction = "umap",label=T,group.by = 'orig.ident')

mycolors <-c('#E5D2DD', '#53A85F', '#F1BB72', '#F3B1A0', '#D6E7A3', '#57C3F3',
                      '#E95C59', '#E59CC4', '#AB3282',  '#BD956A', 
                      '#9FA3A8', '#E0D4CA',  '#C5DEBA', '#F7F398',
                      '#C1E6F3', '#6778AE', '#91D0BE', '#B53E2B',
                      '#712820', '#DCC1DD', '#CCE0F5',  '#CCC9E6', '#625D9E', '#68A180', '#3A6963',
                      '#968175')
library(harmony)
table(sce$orig.ident)
seuratObj <- RunHarmony(sce, "orig.ident")
names(seuratObj@reductions)
seuratObj <- RunUMAP(seuratObj,  dims = 1:8, 
                     reduction = "harmony")
DimPlot(seuratObj,reduction = "umap",label=T )  
library(patchwork)
library(RColorBrewer)
library(scales)
col4<-colorRampPalette(brewer.pal(8,'Set2'))(12)
DimPlot(seuratObj, reduction = 'umap',pt.size = 2,cols = mycolors,label = T,group.by = "RNA_snn_res.0.1",label.size = 6,raster=FALSE) 


sce=seuratObj
sce <- FindNeighbors(sce, reduction = "harmony",
                     dims = 1:5)
sce <- FindClusters(sce, resolution = 0.5)
table(sce@meta.data$seurat_clusters)
DimPlot(sce,reduction = "umap",label=T) 
##居然发现还是原始配色好看点，我哭死
DimPlot(sce, reduction = 'umap',pt.size = 1,label = T,group.by = "RNA_snn_res.0.5",label.size = 6,raster=FALSE) 
ggsave(filename = 'harmony_umap_recluster_by_0.5.pdf',width = 7,height = 7) 

col4<-colorRampPalette(brewer.pal(8,'Set2'))(15)
sce$orig.ident
DimPlot(sce, reduction = 'umap',pt.size = 1,cols = col4,label = T,group.by = "patient",label.size = 6,raster=FALSE) 
ggsave(filename = 'harmony_umap_sce_recluster_by_patient.pdf',width = 7,height = 7) 

#####患者之间的差异
DimPlot(sce, reduction = 'umap',pt.size = 1,cols = col4,label = T,split.by = 'orig.ident',label.size = 6,raster=FALSE) 
ggsave(filename = 'umap_sce_recluster_treatment.pdf',width = 10,height = 6.5) 

###配色什么的，不重要啦，喜欢什么颜色就放什么颜色
saveRDS(sce,file = "sce_epi.rds")



###copykat区分肿瘤细胞和正常细胞
devtools::install_github("navinlabcode/copykat")
library("copykat") 

library(Seurat)
library(copykat)
library(tidyverse)
scRNA <- sce
counts <- as.matrix(scRNA@assays$RNA@counts)
table(scRNA$celltype)
cnv <- copykat(rawmat=counts, ngene.chr=5, sam.name="test", n.cores=8)
# ngene.chr参数是过滤细胞的一个标准，它要求被用来做CNV预测的细胞，一个染色体上至少有5个基因。
# sam.name定义样本名称 (sample name)，会给出来的文件加前缀
saveRDS(cnv, "cnv.rds")
table(rownames(cnv$CNAmat))
a = cnv$prediction$copykat.pred
table(a)
scRNA$CopyKAT = a
p1 <- DimPlot(scRNA, group.by = "celltype", label = T)
p2 <- DimPlot(scRNA, group.by = "CopyKAT") + scale_color_manual(values = c("#F8766D",'#02BFC4', "gray"))
pc <- p1 + p2
pc
ggsave("pred_mallignant.pdf", pc, width = 12, height = 5)
###虽然这里得到结果了，但是我觉得上面选5数值太大了，也不想重新跑一遍几小时了....


cols = c("gray", "coral2")
plot <- FeaturePlot(scRNA, features = c('AGER','SFTPA2','SCGB1A1','TPPP3','KRT17'),cols = cols, pt.size = 1)+  
  theme(panel.border = element_rect(fill=NA,color="black", size=1, linetype="solid"))#加边框 
plot_grid(p2,plot)


#####注释亚群
#marker
#1. DST：参与细胞骨架的形成和维护，细胞附着和迁移的调节，肌肉成熟和神经元发育等过程。
#2. KRT17：编码角质形成细胞中的一种角蛋白，参与细胞骨架的形成和角质形成。
#3. S100P：与钙离子结合的蛋白质，参与细胞增殖、迁移和侵袭等生物学过程，也可能与多种癌症的发生和发展有关。
#4. PCNA：是细胞周期中S期的细胞核蛋白，在DNA复制和细胞增殖过程中起重要作用。
#5. TOP2A：编码一种DNA拓扑异构酶，参与DNA的超螺旋松弛和重连，影响DNA复制和细胞分裂。
#6. SFTPA2：编码一种肺表面活性物质蛋白，参与肺泡的防御和免疫功能。
#7. SPARCL1：编码一种分泌蛋白，参与肿瘤抑制、细胞迁移和基质重塑等过程。
#8. SERPINB9：编码一种丝氨酸蛋白酶抑制剂，参与炎症调节和细胞凋亡等过程。
#9. SCGB3A1：编码一种分泌蛋白，可能与肺部基底细胞的分化和肺免疫功能有关。
#10. TPPP3：编码一种调节微管动力学的蛋白，可能参与细胞骨架的重塑和细胞运动等生物学过程。
genes_to_check = c('DST', 'AKR1B10', 'NDUFA4L2', 
                   'KRT17' , 'CEACAM5', 'LY6G6C', 'KLK10',
                   'S100P', 'S100A9', 'MT2A',
                   'PCNA', 'TYMS', 'HIST1H2BJ', 'RRM2','DUT',
                   'TOP2A', 'TPX2', 'CENPF','ARL6IP1',
                   'SFTPA2', 'SFTPA1', 'SFTPB',
                   'SPARCL1' ,'IGKC','AL365357.1',
                   'SERPINB9', 'MYH11' ,'GJB6',
                   'SCGB3A1','MSMB', 'SCGB1A1', 'SLPI',
                   'TPPP3','CAPS', 'C9orf24', 'RSPH1')

library(stringr)  
genes_to_check=str_to_upper(genes_to_check)
genes_to_check
p <- DotPlot(scRNA, features = unique(genes_to_check),
             assay='RNA'  )  + coord_flip()

mycolors <-c('#E5D2DD', '#53A85F', '#F1BB72', '#F3B1A0', '#D6E7A3', '#57C3F3',
                      '#E95C59', '#E59CC4', '#AB3282',  '#BD956A', 
                      '#9FA3A8', '#E0D4CA',  '#C5DEBA', '#F7F398',
                      '#C1E6F3', '#6778AE', '#91D0BE', '#B53E2B',
                      '#712820', '#DCC1DD', '#CCE0F5',  '#CCC9E6', '#625D9E', '#68A180', '#3A6963',
                      '#968175')

p_umap=DimPlot(scRNA, reduction = "umap",cols = mycolors,pt.size = 0.4,
               group.by = "RNA_snn_res.0.5",label = T,label.box = T) 
p_umap
library(patchwork)
p+p_umap
ggsave('markers_umap_2.pdf',width = 13,height = 8)

#####细胞生物学命名
celltype=data.frame(ClusterID=0:13,
                    celltype= 0:13) 

#定义细胞亚群 
celltype[celltype$ClusterID %in% c( 0 ),2]='E0_DST'
celltype[celltype$ClusterID %in% c( 9 ),2]='E1_KRT17'
celltype[celltype$ClusterID %in% c(4,6,10,13 ),2]='E2_S100P'
celltype[celltype$ClusterID %in% c(2  ),2]='E3_PCNA'
celltype[celltype$ClusterID %in% c( 7 ),2]='E4_TOP2A'
celltype[celltype$ClusterID %in% c( 5,12 ),2]='E5_SFTPA2'
celltype[celltype$ClusterID %in% c( 3 ),2]='E6_SPARCL1'
celltype[celltype$ClusterID %in% c( 8 ),2]='E7_SERPINB9'
celltype[celltype$ClusterID %in% c(1 ),2]='E8_SCGB3A1'
celltype[celltype$ClusterID %in% c( 11),2]='E9_TPPP3'


head(celltype)
celltype
table(celltype$celltype)

scRNA@meta.data$celltype = "NA"
for(i in 1:nrow(celltype)){
  scRNA@meta.data[which(scRNA@meta.data$RNA_snn_res.0.5 == celltype$ClusterID[i]),'celltype'] <- celltype$celltype[i]}
table(scRNA@meta.data$celltype)


th=theme(axis.text.x = element_text(angle = 45, 
                                    vjust = 0.5, hjust=0.5)) 
library(patchwork)
p_umap=DimPlot(scRNA, reduction = "umap",cols = mycolors,pt.size = 0.4,
               group.by = "celltype",label = T) 
p+p_umap
ggsave('markers_umap_by_celltype_2.pdf',width = 15,height = 7)
saveRDS(scRNA, "sce.epi_celltype.rds")

可以看出E2，5，8，9亚群恶性细胞群较少或没有

######基因在不同病理结局的患者中表达情况
rm(list=ls())
scRNA=readRDS( "sce.epi_celltype.rds")
scRNA
table(scRNA$CopyKAT)
##在这里我查看了加入无法分类的细胞和未加入无法分类的细胞，差别还挺大的
#加入：CX3CL1趋势不对，AKR1C3趋势正确
#不加入：CX3CL1趋势正确，AKR1C3趋势不对


sce_obj = scRNA[, scRNA$CopyKAT %in% c( 'aneuploid')]
sce_obj$patientoutcone =  paste(sce_obj$patient,sce_obj$Pathologic,sep = '_')


# 通过Seurat的AverageExpression函数计算每个患者基因的平均表达量
avg <- AverageExpression(object = sce_obj, assay = "RNA", group.by = "patientoutcone")
a = avg$RNA


C1 = c('CX3CL1','CD74','HLA-DRA')
D1 = c('AKR1C1','AKR1C2','AKR1C3')
#这六个基因的生物学意义如下：
#1. CX3CL1是一种细胞因子，也被称为神经内皮炎性因子1（N-TAC），它是一种趋化因子，在炎症和免疫反应中起着重要作用。它与免疫细胞的迁移和炎症反应相关，同时也参与神经发育和神经保护。
#2. CD74是一种细胞膜蛋白，也被称为类别II转运分子细胞膜关联蛋白。CD74与抗原呈递、免疫调节和细胞信号传导等免疫功能密切相关。它在抗原处理和递呈的过程中起重要作用，并在免疫细胞识别、炎症和免疫反应中发挥关键作用。
#3. HLA-DRA（Human Leukocyte Antigen-DR α chain）：HLA-DRA是人白细胞抗原-DR（HLA-DR）的α链。HLA-DR是一种MHC类II复合物，负责呈递外源性抗原给T细胞，从而触发免疫反应。HLA-DRA与免疫系统的抗原递呈、T细胞的激活和调节密切相关，在免疫途径中发挥重要作用。
#4. AKR1C1属于醛酮还原酶家族的一员。它是一种脱氢酶，对多种内源性和外源性化合物具有还原活性。AKR1C1参与睾酮、孕酮等激素的代谢转化，同时还参与抗氧化和抗炎过程。
#5. AKR1C2也是醛酮还原酶家族的成员，与AKR1C1具有相似的酶活性。它在多种生物学过程中发挥重要作用，包括激素代谢、抗氧化、抗炎和神经保护。
#6. AKR1C3是醛酮还原酶家族的一员，与AKR1C1和AKR1C2具有相似的酶活性。它参与睾酮、雄激素和孕激素的代谢转化，同时还具有抗氧化和抗炎活性。在某些癌症中，AKR1C3的过度表达与抗药性和肿瘤的恶性进展有关。
gene_boxplot <- function(gene){
  b = as.data.frame(a[gene,])
  colnames(b) = gene
  phe = b
  group = ifelse(str_detect(rownames(phe),"TN"),"TN",
                 ifelse(str_detect(rownames(phe),"NMPR"),"NMPR","MPR"))
  phe$group = group

  phe$group = factor(phe$group,levels =c("TN","MPR","NMPR"))
  table(phe$group)
  colnames(phe)

  col = c("#508E42", "#DF5E38","#7B7DC4")
  P_T <- ggplot(phe,aes(x=group,y=!!rlang::sym(gene),fill=group)) +
    geom_boxplot(size=0.5,fill="white",outlier.fill="white",outlier.color="white",color=col) +
    geom_jitter(aes(fill=group),width =0.2,shape = 21,size=2) +
    scale_fill_manual(values = col)+
    ggtitle(gene) +
    theme_bw() +
    theme(panel.grid = element_blank(), panel.background = element_blank(),
          axis.line = element_line(color = 'black')) +
    ylab("Average signature score") +xlab(" ")+
    theme(legend.position = "none",
          axis.text.x = element_text(colour = "black", family = "Times", size = 14),
          axis.text.y = element_text(family = "Times", size = 14, face = "plain"),
          axis.title.y = element_text(family = "Times", size = 14, face = "plain"),
          axis.title.x = element_text(family = "Times", size = 14, face = "plain"),
          plot.title = element_text(family = "Times", size = 17, face = "bold", hjust = 0.5),
          panel.grid.major = element_blank(),
          panel.grid.minor = element_blank(),
          panel.border = element_blank())

  P_T
  compaired = list(c("TN", "MPR"),
                   c("TN", "NMPR"))
  P_T = P_T+geom_signif(comparisons = compaired,step_increase = 0.1,map_signif_level = T,test = t.test,tip_length = 0)
  P_T

}

P_C1 = gene_boxplot(C1[1])+gene_boxplot(C1[2])+gene_boxplot(C1[3])
P_D1 = gene_boxplot(D1[1])+gene_boxplot(D1[2])+gene_boxplot(D1[3])

P_C1/P_D1
###文中主要是看了两个基因的变化，CX3CL1在MPR的患者中上调，AKR1C3在NMPR的患者中上调
ggsave('gene_outcome.pdf',width = 13,height = 8)
saveRDS(sce_obj, "sce.cancer_celltype.rds")

###GSVA分析
rm(list=ls())
sce.all=readRDS( "sce.cancer_celltype.rds")

scRNA <- sce.all
library(msigdbr)
library(GSVA)
library(Seurat)
#选择基因集
Idents(scRNA) = scRNA$patientoutcone
#Idents设置按什么来取组的表达均值（计算case control之间的均值也可以）
expr <- AverageExpression(scRNA, assays = "RNA", slot = "data")[[1]]
expr <- expr[rowSums(expr)>0,]  #选取非零基因
expr <- as.matrix(expr)
library(GSEABase)

gmt <- 'msigdb_v2023.1.Hs_files_to_download_locally/msigdb_v2023.1.Hs_files_to_download_locally/msigdb_v2023.1.Hs_GMTs/h.all.v2023.1.Hs.symbols.gmt'
geneSet <- getGmt(gmt)
# gsva(若为count矩阵，kcdf = 'Poisson'，若为其他标准化后的数据，使用默认参数即可)
gsva_scores <- gsva(expr,geneSet,method = 'gsva',min.sz=5,max.sz=500,kcdf = 'Poisson')
head(gsva_scores)
gsva_scores = as.data.frame(gsva_scores)

library(stringr)
library(ggplot2)
library(ggsignif)
b = as.data.frame(t(gsva_scores['HALLMARK_ESTROGEN_RESPONSE_LATE',]))
colnames(b) = 'Estrogen response late'
phe = b
group = ifelse(str_detect(rownames(phe),"TN"),"TN",
               ifelse(str_detect(rownames(phe),"NMPR"),"NMPR","MPR"))
phe$group = group

phe$group = factor(phe$group,levels =c("TN","MPR","NMPR"))
table(phe$group)
colnames(phe)

col = c("#508E42", "#DF5E38","#7B7DC4")
pathway = 'Estrogen response late'
P_T <- ggplot(phe,aes(x=group,y=!!rlang::sym(pathway),fill=group)) +
  geom_boxplot(size=0.5,fill="white",outlier.fill="white",outlier.color="white",color=col) +
  geom_jitter(aes(fill=group),width =0.2,shape = 21,size=2) +
  scale_fill_manual(values = col)+
  ggtitle(pathway) +
  theme_bw() +
  theme(panel.grid = element_blank(), panel.background = element_blank(),
        axis.line = element_line(color = 'black')) +
  ylab("Average signature score") +xlab(" ")+
  theme(legend.position = "none",
        axis.text.x = element_text(colour = "black", family = "Times", size = 14),
        axis.text.y = element_text(family = "Times", size = 14, face = "plain"),
        axis.title.y = element_text(family = "Times", size = 14, face = "plain"),
        axis.title.x = element_text(family = "Times", size = 14, face = "plain"),
        plot.title = element_text(family = "Times", size = 17, face = "bold", hjust = 0.5),
        panel.grid.major = element_blank(),
        panel.grid.minor = element_blank(),
        panel.border = element_blank())

P_T
compaired = list(c("TN", "MPR"),
                 c("TN", "NMPR"))
P_T = P_T+geom_signif(comparisons = compaired,step_increase = 0.1,map_signif_level = T,test = t.test,tip_length = 0)
P_T


gmt <- 'msigdb_v2023.1.Hs_GMTs/c5.go.bp.v2023.1.Hs.symbols.gmt'
geneSet <- getGmt(gmt)
# gsva(若为count矩阵，kcdf = 'Poisson'，若为其他标准化后的数据，使用默认参数即可)
gsva_scores <- gsva(expr,geneSet,method = 'gsva',min.sz=5,max.sz=500,kcdf = 'Poisson')
head(gsva_scores)
gsva_scores = as.data.frame(gsva_scores)


library(stringr)
library(ggplot2)
library(ggsignif)
b = as.data.frame(t(gsva_scores['GOBP_ANTIGEN_PROCESSING_AND_PRESENTATION_OF_PEPTIDE_OR_POLYSACCHARIDE_ANTIGEN_VIA_MHC_CLASS_II',]))
colnames(b) = 'Antigen presentation via MHC-II'
phe = b
group = ifelse(str_detect(rownames(phe),"TN"),"TN",
               ifelse(str_detect(rownames(phe),"NMPR"),"NMPR","MPR"))
phe$group = group

phe$group = factor(phe$group,levels =c("TN","MPR","NMPR"))
table(phe$group)
colnames(phe)

col = c("#508E42", "#DF5E38","#7B7DC4")
pathway = 'Antigen presentation via MHC-II'
P_T <- ggplot(phe,aes(x=group,y=!!rlang::sym(pathway),fill=group)) +
  geom_boxplot(size=0.5,fill="white",outlier.fill="white",outlier.color="white",color=col) +
  geom_jitter(aes(fill=group),width =0.2,shape = 21,size=2) +
  scale_fill_manual(values = col)+
  ggtitle(pathway) +
  theme_bw() +
  theme(panel.grid = element_blank(), panel.background = element_blank(),
        axis.line = element_line(color = 'black')) +
  ylab("Average signature score") +xlab(" ")+
  theme(legend.position = "none",
        axis.text.x = element_text(colour = "black", family = "Times", size = 14),
        axis.text.y = element_text(family = "Times", size = 14, face = "plain"),
        axis.title.y = element_text(family = "Times", size = 14, face = "plain"),
        axis.title.x = element_text(family = "Times", size = 14, face = "plain"),
        plot.title = element_text(family = "Times", size = 17, face = "bold", hjust = 0.5),
        panel.grid.major = element_blank(),
        panel.grid.minor = element_blank(),
        panel.border = element_blank())

P_T
compaired = list(c("TN", "MPR"),
                 c("TN", "NMPR"))
P_T = P_T+geom_signif(comparisons = compaired,step_increase = 0.1,map_signif_level = T,test = t.test,tip_length = 0)
P_T

MHC-II确实是在MPR的患者中上调了，但是这个文章立足之根本的雌激素反应对不上，实在是对不上啊。
（PS: 埋头苦干三个小时了，此时我正在看着楼下的人在小湖边钓鱼，好想去看😶‍🌫️）

3.NK_T细胞

由于图形和上述都是一致的，就只放结果图了，请看：：：

rm(list=ls())
sce.all=readRDS( "sce.NKT_celltype.rds")

seuratObj <- sce.all
avg <- AverageExpression(object = seuratObj, assay = "RNA", group.by = "celltype")
a = avg$RNA
b = as.data.frame(a)
str(b)
genes <- c()

for (i in seq_along(genes_to_check)) {
  genes <- c(genes, unlist(genes_to_check[[i]]))
}
gene_heatmap = b[genes,]

column_ha = HeatmapAnnotation('Cell type' = ifelse(str_detect(colnames(gene_heatmap),"NK"),"NK",
                                                   ifelse(str_detect(colnames(gene_heatmap),"CD8"),"CD8",
                                                          ifelse(str_detect(colnames(gene_heatmap),"CD4"),"CD4",
                                                                 ifelse(str_detect(colnames(gene_heatmap),"Treg"),"Treg","T")))) )

row_ha <- rowAnnotation(
  'Marker' = row)

col_fun = colorRamp2(c(0, 2, 7), c("#4EC0DB", "white", "#E63946"))

gene_list = genes_to_check
row <- character()
for (i in 1:length(gene_list)) {
  row <- c(row, rep(names(gene_list)[i], length(gene_list[[i]])))
}

column = ifelse(str_detect(colnames(gene_heatmap),"NK"),"NK",
                ifelse(str_detect(colnames(gene_heatmap),"CD8"),"CD8",
                       ifelse(str_detect(colnames(gene_heatmap),"CD4"),"CD4",
                              ifelse(str_detect(colnames(gene_heatmap),"Treg"),"Treg","T"))))

Heatmap(gene_heatmap, 
        column_split = column,
        row_split = c('Marker' = row),
        row_title = NULL,
        heatmap_legend_param = list(
          title = "Score gene expression",
          legend_height = unit(4, "cm"),
          title_position = "lefttop-rot"
        ),
        top_annotation = column_ha,
        left_annotation = row_ha,
        col = col_fun,
        cluster_rows = FALSE, cluster_columns = FALSE,column_title = NULL,
        show_row_names = T,show_column_names  = T
)

总觉得配色很扎眼。

###应该是基因的平均表达量
#取出CD8T
####NK和T细胞
table(seuratObj$celltype)
sce_CD8T = seuratObj[, seuratObj$celltype %in% c( 'CD8_GZMB','CD8_GZMK','CD8_HAVCR2','CD8_IL7R','CD8_STMN1' )]
avg <- AverageExpression(object = sce_CD8T, assay = "RNA", group.by = "patientoutcone")
a = avg$RNA
b = as.data.frame(a)
str(b)

gene_boxplot <- function(gene){
  average_expression <- colMeans(submatrix)
  average_expression = as.data.frame(average_expression)
  colnames(average_expression) = gene
  phe = average_expression
  group = ifelse(str_detect(rownames(phe),"TN"),"TN",
                 ifelse(str_detect(rownames(phe),"NMPR"),"NMPR","MPR"))
  phe$group = group

  phe$group = factor(phe$group,levels =c("TN","MPR","NMPR"))
  table(phe$group)
  colnames(phe)

  col = c("#508E42", "#DF5E38","#7B7DC4")
  P_T <- ggplot(phe,aes(x=group,y=!!rlang::sym(gene),fill=group)) +
    geom_boxplot(size=0.5,fill="white",outlier.fill="white",outlier.color="white",color=col) +
    geom_jitter(aes(fill=group),width =0.2,shape = 21,size=2) +
    scale_fill_manual(values = col)+
    ggtitle(gene) +
    theme_bw() +
    theme(panel.grid = element_blank(), panel.background = element_blank(),
          axis.line = element_line(color = 'black')) +
    ylab("Average signature score") +xlab(" ")+
    theme(legend.position = "none",
          axis.text.x = element_text(colour = "black", family = "Times", size = 14),
          axis.text.y = element_text(family = "Times", size = 14, face = "plain"),
          axis.title.y = element_text(family = "Times", size = 14, face = "plain"),
          axis.title.x = element_text(family = "Times", size = 14, face = "plain"),
          plot.title = element_text(family = "Times", size = 17, face = "bold", hjust = 0.5),
          panel.grid.major = element_blank(),
          panel.grid.minor = element_blank(),
          panel.border = element_blank())

  P_T
  compaired = list(c("TN", "MPR"),
                   c("TN", "NMPR"))
  P_T = P_T+geom_signif(comparisons = compaired,step_increase = 0.1,map_signif_level = T,test = t.test,tip_length = 0)
  P_T

}

# Cytotoxic基因名称列表
Cytotoxic_gene <- c("GZMA", "GZMB", "GZMK" ,"GNLY", "IFNG" ,"PRF1" ,"NKG7")

submatrix <- b[Cytotoxic_gene, ]
gene_boxplot('Cytotoxic signature')

# Exhausted基因名称列表
##看，确实有CD39，果然是高耗竭基因啊
gene_list
Exhausted_gene <- c("LAG3" ,  "TIGIT" , "PDCD1" , "HAVCR2", "CTLA4" , "LAYN"  , "ENTPD1")

submatrix <- b[Exhausted_gene, ]
gene_boxplot('Exhausted signature')


#####下面是NK
table(seuratObj$celltype)
sce_NK = seuratObj[, seuratObj$celltype %in% c( 'NK_FCGR3A','NK_KLRD1' )]

avg <- AverageExpression(object = sce_NK, assay = "RNA", group.by = "patientoutcone")
a = avg$RNA
b = as.data.frame(a)
str(b)

# Cytotoxic基因名称列表
submatrix <- b[Cytotoxic_gene, ]
gene_boxplot('Cytotoxic signature')


# Exhausted基因名称列表
submatrix <- b[Exhausted_gene, ]
gene_boxplot('Exhausted signature')



#####下面是Treg
table(seuratObj$celltype)
sce_Treg = seuratObj[, seuratObj$celltype %in% c( 'Treg_CTLA4','Treg_SELL' )]
avg <- AverageExpression(object = sce_Treg, assay = "RNA", group.by = "patientoutcone")
a = avg$RNA
b = as.data.frame(a)
str(b)


# Exhausted基因名称列表
submatrix <- b[Exhausted_gene, ]
gene_boxplot('Exhausted signature')
###在未治疗组Treg这么高？，MPR组抑制性Treg下降了

这是CD8T细胞的细胞毒性基因和耗竭基因集的评分，当时忘了拼在一起给它们加上CD8T的标签了…

下面是NK细胞，这个NK细胞确实有点不太对劲，和第一节里的NK相互印证了。

#####箱线图显示TN ( n = 3)、MPR ( n = 4)和NMPR ( n = 8)患者各T/NK细胞群的细胞比例。
####不同细胞群里亚组比例
phe = seuratObj@meta.data
table(phe$celltype)
table(phe$patient)
phe$patientoutcome =  paste(phe$patient,phe$Pathologic,sep = '_')

tb=table(phe$celltype,
         phe$patientoutcome)

head(tb)
library (gplots) 
balloonplot(tb)
bar_data <- as.data.frame(tb)

bar_per <- bar_data %>% 
  group_by(Var2) %>%
  mutate(sum(Freq)) %>%
  mutate(percent = Freq / `sum(Freq)`)
head(bar_per) 
#write.csv(bar_per,file = "celltype_by_group_percent.csv")
library(ggplot2)
library(stringr)
bar_per$Var2
group = ifelse(str_detect(bar_per$Var2,"TN"),"TN",
               ifelse(str_detect(bar_per$Var2,"NMPR"),"NMPR","MPR"))
bar_per$group = group

bar_per$group = factor(bar_per$group,levels =c("TN","MPR","NMPR"))
table(bar_per$Var1)
bar_per$Var1 = factor(bar_per$Var1,levels =c("B3_IGHD","B0_MS4A1","B1_IGHM",'B2_HSPA1A','B4_FCRL4','B5_CD83','B6_RGS13'))

col = c("#508E42", "#DF5E38","#7B7DC4")
P <- ggplot(bar_per, aes(x = Var1, y = percent, fill = group, color = group)) +
  geom_boxplot(width = 0.6, outlier.shape = NA) +
  geom_jitter(shape = 21, size = 2, position = position_jitterdodge(0.2)) +
  scale_fill_manual(values = c("white", "white", "white")) +
  scale_color_manual(values = col,
                     labels = c("TN(n=3)", "MPR(n=4)", "NMPR(n=8)"),
                     name = "Group") +
  theme_bw() +
  theme(legend.position = "bottom",
        legend.direction = 'horizontal',
        legend.justification = "left",
        legend.box.just = "left",
        legend.title = element_blank(),
        axis.text.x = element_text(colour = "black",angle=45,  hjust = 1, family = "Times", size = 12),
        axis.text.y = element_text(family = "Times", size = 14, face = "plain"),
        axis.title.y = element_text(family = "Times", size = 14, face = "plain"),
        axis.title.x = element_text(family = "Times", size = 14, face = "plain"),
        plot.title = element_text(family = "Times", size = 15, face = "bold", hjust = 0.5),
        panel.grid.major = element_blank(),
        panel.grid.minor = element_blank()) +
  labs(y = "Fraction (%) of immune cells",
       x = "")+
  guides(fill = FALSE)

P

p = P+
  coord_cartesian(clip = 'off',ylim = c(0,1))+ #在非图形区域绘图,且要定好y轴范围
  theme(plot.margin = margin(2.2,0.5,1.5,0.5,'cm'))+ #自定义图片上左下右的边框宽度
  annotate('text',x=2.6,y=1.13,label='CD4 Tconv',size=7,family='serif',color='red')+ 
  annotate('segment',x=0.55,xend=4.2,y=1.1,yend=1.1,color='red',cex=.8)+
  annotate('text',x=6.8,y=1.13,label='CD8 T',size=7,family='serif',color='red')+ 
  annotate('segment',x=4.7,xend=8.9,y=1.1,yend=1.1,color='red',cex=.8)+
  annotate('text',x=10.5,y=1.13,label='NK',size=7,family='serif',color='red')+
  annotate('segment',x=9.5,xend=11.5,y=1.1,yend=1.1,color='red',cex=.8)+
  annotate('text',x=13.6,y=1.13,label='CD4 Treg',size=7,family='serif',color='red')+
  annotate('segment',x=12.8,xend=14.3,y=1.1,yend=1.1,color='red',cex=.8)

p
ggsave('NK_T_por.pdf',width = 14.5,height = 7.5)
####活化的Tregs比naïve Tregs具有更强的免疫抑制功能，且与预后不良相关。
#持续激活Tregs（Treg_CTLA4，表达TNFRSF4和TNFRSF9）仅在MPR患者中降低。
#相对于TN患者，MPR和NMPR患者中naïve Tregs (Treg_SELL，表达SELL和LEF1)的比例均下降。
#MPR患者的Treg耗竭特征始终低于NMPR患者。

#文章中用的monocle2，这里也用2
###### 拟时序分析 ######
sce_4CD8T = seuratObj[, seuratObj$celltype %in% c( 'CD8_GZMB' ,'CD8_GZMK','CD8_HAVCR2','CD8_IL7R')]
library(Seurat)
library(ggplot2)
library(clustree)
library(cowplot)
library(dplyr)
library(monocle)
packageVersion("monocle")
#monocle构建CDS需要3个矩阵：expr.matrix、pd、fd
# 将Seurat中的对象转换为monocle识别的对象
#cds <- importCDS(GetAssayData(seurat.object))
#选择做拟时序的亚群
Mono_tj<-sce_4CD8T

Mono_matrix<-as(as.matrix(GetAssayData(Mono_tj,slot = "counts")), 'sparseMatrix')
#构建featuredata，一般featuredata需要两个col，一个是gene_id,一个是gene_short_name,row对应counts的rownames
feature_ann<-data.frame(gene_id=rownames(Mono_matrix),gene_short_name=rownames(Mono_matrix))
rownames(feature_ann)<-rownames(Mono_matrix)
Mono_fd<-new("AnnotatedDataFrame", data = feature_ann)

#Seurat object中的@meta.data一般会存放表型相关的信息如cluster、sample的来源、group等，所以选择将metadata转换为phenodata
sample_ann<-Mono_tj@meta.data
#rownames(sample_ann)<-colnames(Mono_matrix)

Mono_pd<-new("AnnotatedDataFrame", data =sample_ann)
#build new cell data set
Mono.cds<-newCellDataSet(Mono_matrix,phenoData =Mono_pd,featureData =Mono_fd,expressionFamily=negbinomial.size())


#预处理
Mono.cds <- estimateSizeFactors(Mono.cds)
Mono.cds <- estimateDispersions(Mono.cds)
#筛选基因,这里可以根据自己的需要筛选特定的基因
disp_table <- dispersionTable(Mono.cds)
#unsup_clustering_genes <- subset(disp_table, mean_expression >= 0.1)
unsup_clustering_genes <- subset(disp_table, mean_expression >= 0.1)

Mono.cds <- setOrderingFilter(Mono.cds, unsup_clustering_genes$gene_id)
#用DDRtree 进行降维分析
Mono.cds <- reduceDimension(
  Mono.cds,
  max_components = 2,
  method = 'DDRTree')
#计算psudotime值
Mono.cds <- orderCells(Mono.cds)
head(pData(Mono.cds))

plot_cell_trajectory(Mono.cds,color_by="celltype", size=1,show_backbone=TRUE)
ggsave("拟时序.pdf",width = 7,height = 6)
plot_cell_trajectory(Mono.cds,color_by="patient", size=1,show_backbone=TRUE)
plot_cell_trajectory(Mono.cds,color_by="Pathologic", size=1,show_backbone=TRUE)

p1 <- plot_cell_trajectory(Mono.cds, x = 1, y = 2, color_by = "celltype") + 
  theme(legend.position='none',panel.border = element_blank()) + #去掉第一个的legend
  scale_color_manual(values = mycolors) 
p2 <- plot_complex_cell_trajectory(Mono.cds, x = 1, y = 2,
                                   color_by = "celltype")+
  scale_color_manual(values = mycolors) +
  theme(legend.title = element_blank()) 
p1|p2

trace('project2MST', edit = T, where = asNamespace("monocle"))