ixxmu / mp_duty

抓取网络文章到github issues保存
https://archives.duty-machine.now.sh/
122 stars 30 forks source link

PRJNA705464-Seurat数据下载及分析 #4996

Closed ixxmu closed 5 months ago

ixxmu commented 5 months ago

https://mp.weixin.qq.com/s/GRjOw4TNhLrOiSHdhqX4pQ

ixxmu commented 5 months ago

PRJNA705464-Seurat数据下载及分析 by 学R时习之

PRJNA705464是肾透明细胞癌的大样本数据集,包含scRNA-seq和TCR数据。组织来源分为Normal、Tumor、PBMC和淋巴结,同时还有药物治疗等,内容是相当丰富。

这么大的数据量咱们怎么用呢?

从上游直接分析?还是找人帮忙分析?

上游分析数据量太大:


找人分析…………一言难尽,还是先自己摸索下吧。

惊奇的发现作者竟然提供了Seurat对象数据,而且生信技能树大神还分享过该数据集《构建seurat对象之初就应该是把基因名字转换好》,简直完美。下面跟着大神的教程认真学习下:

数据分析

首先从SRP308561相关链接下载ccRCC_6pat_SeuratccRCC_6pat_cell_annotations.txt文件。(数据有点大)

rm(list = ls())
librarian::shelf(Seurat)
# sce <- readRDS("rawdata/ccRCC_6pat_Seurat")
# sce
# sce <- UpdateSeuratObject(sce)
# sce
# save(sce,file = "sce_0raw.Rdata")

# 基因转换
if(F){
  load("sce_0raw.Rdata")
  DimPlot(sce)
  sce  # 16323 features across 167283 samples within 1 assay


  # 基因转换
  library(org.Hs.eg.db)
  head(rownames(sce))
  ids=AnnotationDbi::select(org.Hs.eg.db,keys = rownames(sce),
                            columns = c('ENSEMBL','SYMBOL'),
                            keytype = 'ENSEMBL')
  head(ids)
  dim(ids) # [1] 16458
  ids=na.omit(ids)
  dim(ids) # [1] 15527
  length(unique(ids$SYMBOL)) # [1] 15516
  # 这里的关系超级乱,互相之间都不是一对一
  # 凡是混乱的ID一律删除即可
  ids=ids[!duplicated(ids$SYMBOL),]
  ids=ids[!duplicated(ids$ENSEMBL),]
  pos=match(ids$ENSEMBL,rownames(sce) )
  sce=sce[pos,]
  sce # 15388 features
  #rownames(sce) = ids$SYMBOL
  head(rownames(sce)); head(ids)

  # https://github.com/satijalab/seurat/issues/2617
  # RenameGenesSeurat  ------------------------------------------------------------------------------------
  RenameGenesSeurat <- function(obj ,
                                newnames ) {
    # Replace gene names in different slots of a Seurat object. Run this before integration. Run this before integration.
    # It only changes obj@assays$RNA@counts, @data and @scale.data.
    print("Run this before integration. It only changes obj@assays$RNA@counts, @data and @scale.data.")
    RNA <- obj@assays$RNA

    if (nrow(RNA) == length(newnames)) {
      if (length(RNA@counts)) RNA@counts@Dimnames[[1]]            <- newnames
      if (length(RNA@data)) RNA@data@Dimnames[[1]]                <- newnames
      if (length(RNA@scale.data)) RNA@scale.data@Dimnames[[1]]    <- newnames
    } else {"Unequal gene sets: nrow(RNA) != nrow(newnames)"}
    obj@assays$RNA <- RNA
    return(obj)
  }
  obj=RenameGenesSeurat(obj = sce,
                        newnames = ids$SYMBOL)
  sce <- obj; rm(obj)
  save(sce,file = 'first_sce.Rdata')
}

# 细胞注释
if(F){
  load("first_sce.Rdata")

  library(ggplot2)
  genes_to_check = c('PTPRC''CD3D''CD3E''CD4','CD8A','CD19''CD79A''MS4A1' ,
                     'IGHG1''MZB1''SDC1',
                     'CD68''CD163''CD14',
                     'TPSAB1' , 'TPSB2',  # mast cells,
                     'RCVRN','FPR1' , 'ITGAM' ,
                     'FGF7','MME''ACTA2',
                     'PECAM1''VWF',
                     'EPCAM' , 'KRT19''PROM1''ALDH1A1' )
  library(stringr)
  p_all_markers <- DotPlot(sce , features = genes_to_check,
                           assay='RNA'  )  + coord_flip()

  p_all_markers
  ggsave(plot=p_all_markers, filename="check_all_marker_by_seurat_cluster.pdf",width = 12, height = 10)


  # 需要自行看图,定细胞亚群:
  celltype=data.frame(ClusterID=0:34,
                      celltype='unkown')
  celltype[celltype$ClusterID %in% c( 0,2:4,6:8,10,15,17,17,19,31,32),2]='Tcells'
  celltype[celltype$ClusterID %in% c( 24 ),2]='plasma'
  celltype[celltype$ClusterID %in% c( 22 ),2]='naive-Bcells'
  celltype[celltype$ClusterID %in% c(21),2]='mast'

  celltype[celltype$ClusterID %in% c( 1,5,12:14,18,23,27,33,34),2]='myeloid'
  celltype[celltype$ClusterID %in% c( 9,16,26 ),2]='epithelial'

  celltype[celltype$ClusterID %in% c( 11),2]='endo'
  celltype[celltype$ClusterID %in% c(20),2]='fibo'
  celltype[celltype$ClusterID %in% c(25 ),2]='SMC'

  head(celltype)
  celltype
  table(celltype$celltype)
  sce$celltype <- plyr::mapvalues(sce$cluster,
                                  from = 0:34,
                                  to=celltype$celltype)
  p1 <- DimPlot(sce,group.by = "celltype",label = T)
  ggsave(p1,filename="DimPlot_by_seurat_cluster.pdf",width = 12, height = 10)
  save(sce, file = "first_sce.Rdata")
}


实战1-PMID34804944-Fig5A

接下来看看大佬们的神来之笔,首先是PMID34804944Fig5A


作者是这样进行分析的:

PRJNA705464, which is a large database containing cells totally, was applied to investigate the role of pyroptosis-related genes in cell–cell interaction in ccRCC TME. Only untreated tumor samples from PRJNA705464 were applied for further analysis.

下面咱们就提取untreated样本进行后续分析,果然可完美复现Figure5A。

# meatdata
library(tidyverse)
metadata <- data.table::fread("rawdata/ccRCC_6pat_cell_annotations.txt",data.table = F) %>%
  column_to_rownames("cell")
metadata <- metadata[sce$cell,]
metadata1 <- metadata[,c("Sample_name","cluster_name")]
sce1 <- AddMetaData(object = sce,
                    metadata = metadata1)
DimPlot(sce1, group.by = "cluster_name",label = T)+ NoLegend()
rm(sce)

#仅提取 untreated 样本
sce_sub <- subset(sce1, (Sample_name=="Untreated 1"| Sample_name=="Untreated 2") & type == "Tumor")
sce_sub #29799 samples

DimPlot(sce_sub, reduction = "tsne"# PMID: 34804944 Figure5A
DimPlot(sce_sub, reduction = "tsne",group.by = "celltype")


实战2-PMID35592316

同样作者在PMID35592316的FiguresS9A-D也对PRJNA705464进行再次分析,有兴趣的大佬可以下载下来进行复习学习。


参考链接:

  • 构建seurat对象之初就应该是把基因名字转换好

  • Identification and Verification of m7G Modification Patterns and Characterization of Tumor Microenvironment Infiltration via Multi-Omics Analysis in Clear Cell Renal Cell Carcinoma.

  • Establishment of a prognosis Prediction Model Based on Pyroptosis-Related Signatures Associated With the Immune Microenvironment and Molecular Heterogeneity in Clear Cell Renal Cell Carcinoma.

声明:

本文绝大部分代码均学习整理自生信技能树、单细胞天地等,仅供交流学习,禁止用于商业用途,如有侵权,请联系删除。鉴于本人水平有限,某些内容难免理解不到位,还请各位大佬多多指教。