PRJNA705464-Seurat数据下载及分析 by 学R时习之

PRJNA705464是肾透明细胞癌的大样本数据集，包含scRNA-seq和TCR数据。组织来源分为Normal、Tumor、PBMC和淋巴结，同时还有药物治疗等，内容是相当丰富。

这么大的数据量咱们怎么用呢？

从上游直接分析？还是找人帮忙分析？

上游分析数据量太大：

找人分析…………一言难尽，还是先自己摸索下吧。

惊奇的发现作者竟然提供了Seurat对象数据，而且生信技能树大神还分享过该数据集《构建seurat对象之初就应该是把基因名字转换好》，简直完美。下面跟着大神的教程认真学习下：

数据分析

首先从SRP308561相关链接下载ccRCC_6pat_Seurat和ccRCC_6pat_cell_annotations.txt文件。(数据有点大)

rm(list = ls())
librarian::shelf(Seurat)
# sce <- readRDS("rawdata/ccRCC_6pat_Seurat")
# sce
# sce <- UpdateSeuratObject(sce)
# sce
# save(sce,file = "sce_0raw.Rdata")

# 基因转换
if(F){
  load("sce_0raw.Rdata")
  DimPlot(sce)
  sce  # 16323 features across 167283 samples within 1 assay


  # 基因转换
  library(org.Hs.eg.db)
  head(rownames(sce))
  ids=AnnotationDbi::select(org.Hs.eg.db,keys = rownames(sce),
                            columns = c('ENSEMBL','SYMBOL'),
                            keytype = 'ENSEMBL')
  head(ids)
  dim(ids) # [1] 16458
  ids=na.omit(ids)
  dim(ids) # [1] 15527
  length(unique(ids$SYMBOL)) # [1] 15516
  # 这里的关系超级乱，互相之间都不是一对一
  # 凡是混乱的ID一律删除即可
  ids=ids[!duplicated(ids$SYMBOL),]
  ids=ids[!duplicated(ids$ENSEMBL),]
  pos=match(ids$ENSEMBL,rownames(sce) )
  sce=sce[pos,]
  sce # 15388 features
  #rownames(sce) = ids$SYMBOL
  head(rownames(sce)); head(ids)

  # https://github.com/satijalab/seurat/issues/2617
  # RenameGenesSeurat  ------------------------------------------------------------------------------------
  RenameGenesSeurat <- function(obj ,
                                newnames ) {
    # Replace gene names in different slots of a Seurat object. Run this before integration. Run this before integration.
    # It only changes obj@assays$RNA@counts, @data and @scale.data.
    print("Run this before integration. It only changes obj@assays$RNA@counts, @data and @scale.data.")
    RNA <- obj@assays$RNA

    if (nrow(RNA) == length(newnames)) {
      if (length(RNA@counts)) RNA@counts@Dimnames[[1]]            <- newnames
      if (length(RNA@data)) RNA@data@Dimnames[[1]]                <- newnames
      if (length(RNA@scale.data)) RNA@scale.data@Dimnames[[1]]    <- newnames
    } else {"Unequal gene sets: nrow(RNA) != nrow(newnames)"}
    obj@assays$RNA <- RNA
    return(obj)
  }
  obj=RenameGenesSeurat(obj = sce,
                        newnames = ids$SYMBOL)
  sce <- obj; rm(obj)
  save(sce,file = 'first_sce.Rdata')
}

# 细胞注释
if(F){
  load("first_sce.Rdata")

  library(ggplot2)
  genes_to_check = c('PTPRC', 'CD3D', 'CD3E', 'CD4','CD8A','CD19', 'CD79A', 'MS4A1' ,
                     'IGHG1', 'MZB1', 'SDC1',
                     'CD68', 'CD163', 'CD14',
                     'TPSAB1' , 'TPSB2',  # mast cells,
                     'RCVRN','FPR1' , 'ITGAM' ,
                     'FGF7','MME', 'ACTA2',
                     'PECAM1', 'VWF',
                     'EPCAM' , 'KRT19', 'PROM1', 'ALDH1A1' )
  library(stringr)
  p_all_markers <- DotPlot(sce , features = genes_to_check,
                           assay='RNA'  )  + coord_flip()

  p_all_markers
  ggsave(plot=p_all_markers, filename="check_all_marker_by_seurat_cluster.pdf",width = 12, height = 10)


  # 需要自行看图，定细胞亚群：
  celltype=data.frame(ClusterID=0:34,
                      celltype='unkown')
  celltype[celltype$ClusterID %in% c( 0,2:4,6:8,10,15,17,17,19,31,32),2]='Tcells'
  celltype[celltype$ClusterID %in% c( 24 ),2]='plasma'
  celltype[celltype$ClusterID %in% c( 22 ),2]='naive-Bcells'
  celltype[celltype$ClusterID %in% c(21),2]='mast'

  celltype[celltype$ClusterID %in% c( 1,5,12:14,18,23,27,33,34),2]='myeloid'
  celltype[celltype$ClusterID %in% c( 9,16,26 ),2]='epithelial'

  celltype[celltype$ClusterID %in% c( 11),2]='endo'
  celltype[celltype$ClusterID %in% c(20),2]='fibo'
  celltype[celltype$ClusterID %in% c(25 ),2]='SMC'

  head(celltype)
  celltype
  table(celltype$celltype)
  sce$celltype <- plyr::mapvalues(sce$cluster,
                                  from = 0:34,
                                  to=celltype$celltype)
  p1 <- DimPlot(sce,group.by = "celltype",label = T)
  ggsave(p1,filename="DimPlot_by_seurat_cluster.pdf",width = 12, height = 10)
  save(sce, file = "first_sce.Rdata")
}

实战1-PMID34804944-Fig5A

接下来看看大佬们的神来之笔，首先是PMID34804944Fig5A。

作者是这样进行分析的：

PRJNA705464, which is a large database containing cells totally, was applied to investigate the role of pyroptosis-related genes in cell–cell interaction in ccRCC TME. Only untreated tumor samples from PRJNA705464 were applied for further analysis.

下面咱们就提取untreated样本进行后续分析，果然可完美复现Figure5A。

# meatdata
library(tidyverse)
metadata <- data.table::fread("rawdata/ccRCC_6pat_cell_annotations.txt",data.table = F) %>%
  column_to_rownames("cell")
metadata <- metadata[sce$cell,]
metadata1 <- metadata[,c("Sample_name","cluster_name")]
sce1 <- AddMetaData(object = sce,
                    metadata = metadata1)
DimPlot(sce1, group.by = "cluster_name",label = T)+ NoLegend()
rm(sce)

#仅提取 untreated 样本
sce_sub <- subset(sce1, (Sample_name=="Untreated 1"| Sample_name=="Untreated 2") & type == "Tumor")
sce_sub #29799 samples

DimPlot(sce_sub, reduction = "tsne") # PMID: 34804944 Figure5A
DimPlot(sce_sub, reduction = "tsne",group.by = "celltype")

实战2-PMID35592316

同样作者在PMID35592316的FiguresS9A-D也对PRJNA705464进行再次分析，有兴趣的大佬可以下载下来进行复习学习。

参考链接：

构建seurat对象之初就应该是把基因名字转换好
Identification and Verification of m7G Modification Patterns and Characterization of Tumor Microenvironment Infiltration via Multi-Omics Analysis in Clear Cell Renal Cell Carcinoma.
Establishment of a prognosis Prediction Model Based on Pyroptosis-Related Signatures Associated With the Immune Microenvironment and Molecular Heterogeneity in Clear Cell Renal Cell Carcinoma.

声明：

本文绝大部分代码均学习整理自生信技能树、单细胞天地等，仅供交流学习，禁止用于商业用途，如有侵权，请联系删除。鉴于本人水平有限，某些内容难免理解不到位，还请各位大佬多多指教。

ixxmu / mp_duty

PRJNA705464-Seurat数据下载及分析 #4996

PRJNA705464-Seurat数据下载及分析 by 学R时习之

数据分析

实战1-PMID34804944-Fig5A

实战2-PMID35592316