Closed ixxmu closed 4 hours ago
miRNA(microRNA) 是一种小的非编码 RNA 分子,通常由 20 到 24 个核苷酸组成。miRNA 主要存在于动植物中,并在基因表达调控中起到关键作用。它们通过与特定的信使 RNA(mRNA)分子结合来抑制基因表达,通常通过抑制翻译或促进 mRNA 的降解。
miRNA 的特征和功能
miRNA 的生物学意义
miRNA 的应用
笔者能力有限,详细的miRNA知识就不再赘述了,感兴趣的小伙伴可以去读一读参考资料中的综述~
miRNA数据库
关于miRNA的数据库有很多,以下是几个比较重要的:
1、miRBase:这个数据库是所有想做miRNA分析的研究者一定绕不开的工具
2、MicroRNAdb
3、miRTarBase
4、除此之外还有很多,比如:psRNATarget,miRWalk,TarBase,miRGator,CoGemiR,PolymiRTS,PicTar等。
正式分析之前还需要获取miRNA的数据,其中最常用的数据库当属TCGA,因此我们先从这个数据库开始。
对于来自TCGA数据库的数据,有一个好用的下载工具就是TCGAbiolinks,它可以帮助我们方便地获取TCGA中的数据。
# Session setup: clear the workspace, load the packages, register a parallel backend.
# Removes every object that ls() lists in the current environment.
rm(list = ls())
# TCGAbiolinks supplies the GDC query/download/prepare functions called below.
library(TCGAbiolinks)
# NOTE(review): no qs function is called in the visible part of this script — confirm it is still needed.
library(qs)
library(BiocParallel)
# Register a MulticoreParam backend with 8 workers and a progress bar.
# NOTE(review): presumably intended to speed up the steps below — confirm they actually use the registered backend.
register(MulticoreParam(workers = 8, progressbar = TRUE))
# List the short names of the 33 TCGA cancer projects
library(TCGAbiolinks)
# Every project id known to GDC
gdc_project_ids <- TCGAbiolinks::getGDCprojects()[["project_id"]]
# Keep only the ids that start with "TCGA"
is_tcga <- grepl("^TCGA", gdc_project_ids, perl = TRUE)
projects <- gdc_project_ids[is_tcga]
projects
# [1] "TCGA-PCPG" "TCGA-THYM" "TCGA-PAAD" "TCGA-STAD" "TCGA-TGCT" "TCGA-SARC" "TCGA-PRAD" "TCGA-READ" "TCGA-UCS" "TCGA-UVM"
# [11] "TCGA-KICH" "TCGA-HNSC" "TCGA-LUAD" "TCGA-LIHC" "TCGA-LUSC" "TCGA-MESO" "TCGA-LAML" "TCGA-LGG" "TCGA-KIRP" "TCGA-KIRC"
# [21] "TCGA-ACC" "TCGA-BLCA" "TCGA-DLBC" "TCGA-CHOL" "TCGA-CESC" "TCGA-COAD" "TCGA-BRCA" "TCGA-ESCA" "TCGA-GBM" "TCGA-OV"
# [31] "TCGA-THCA" "TCGA-SKCM" "TCGA-UCEC"
# Print the GDC summary of the TCGA-HNSC project. The recorded output below shows the
# total file count, the file/case counts per data category, the case count and the file size.
# NOTE(review): `:::` reaches into the package namespace, so this is presumably an
# unexported helper that may change between TCGAbiolinks versions — confirm.
TCGAbiolinks:::getProjectSummary("TCGA-HNSC")
# $file_count
# [1] 29489
#
# $data_categories
# file_count case_count data_category
# 1 8330 528 Simple Nucleotide Variation
# 2 4595 528 Sequencing Reads
# 3 2858 528 Biospecimen
# 4 1103 528 Clinical
# 5 5925 526 Copy Number Variation
# 6 2270 528 Transcriptome Profiling
# 7 1740 528 DNA Methylation
# 8 354 354 Proteome Profiling
# 9 50 24 Somatic Structural Variation
# 10 2264 521 Structural Variation
#
# $case_count
# [1] 528
#
# $file_size
# [1] 3.019863e+14
# Project whose miRNA data is downloaded and analysed in the rest of the script.
proj <- "TCGA-HNSC"
# Download a single project
# Query the miRNA expression quantification files of that project.
query <- GDCquery(
  project = proj,
  data.category = "Transcriptome Profiling",
  data.type = "miRNA Expression Quantification",
  workflow.type = "BCGSC miRNA Profiling"
)
# Download the files matched by the query.
GDCdownload(query)
# Prepare the downloaded files and save the result as "<proj>_miRNA.Rdata".
# The file name must be built from `proj`: no `project` variable exists at this
# level, and the later load() step reads paste0(proj, "_miRNA.Rdata").
GDCprepare(query, save = TRUE, save.filename = paste0(proj, "_miRNA.Rdata"))
# Batch download: run query -> download -> prepare for every entry of `projects`,
# saving one "<project>_miRNA.Rdata" file per project.
# The calls are made only for their side effects, so lapply() wrapped in
# invisible() replaces sapply(): nothing is collected or auto-printed.
invisible(lapply(projects, function(project) {
  # Same query as the single-project download, including the workflow type.
  query <- GDCquery(
    project = project,
    data.category = "Transcriptome Profiling",
    data.type = "miRNA Expression Quantification",
    workflow.type = "BCGSC miRNA Profiling"
  )
  GDCdownload(query)
  GDCprepare(query, save = TRUE, save.filename = paste0(project, "_miRNA.Rdata"))
  # Return NULL so the prepared tables of all projects are not kept in memory.
  NULL
}))
# Reload the prepared table from "<proj>_miRNA.Rdata"; it is restored as `data`.
rdata_path <- paste0(proj, "_miRNA.Rdata")
load(rdata_path)
# Preview the top-left 4 x 4 corner of the table.
data[1:4, 1:4]
# miRNA_ID read_count_TCGA-BA-6871-01A-11R-1872-13 reads_per_million_miRNA_mapped_TCGA-BA-6871-01A-11R-1872-13
# 1 hsa-let-7a-1 39430 8590.708
# 2 hsa-let-7a-2 39178 8535.804
# 3 hsa-let-7a-3 39394 8582.864
# 4 hsa-let-7b 65142 14192.642
# cross-mapped_TCGA-BA-6871-01A-11R-1872-13
# 1 N
# 2 Y
# 3 N
# 4 N
# Extract the raw read-count columns, i.e. those whose name starts with "read_count_".
# Use the miRNA ids as row names; `[[` avoids the partial matching that `$` allows.
rownames(data) <- data[["miRNA_ID"]]
# Match on the prefix so that only the read-count columns are selected.
is_count_col <- startsWith(colnames(data), "read_count_")
# drop = FALSE keeps a data frame even when a single sample column matches.
miRNA_count <- data[, is_count_col, drop = FALSE]
# Strip the prefix so the column names are the bare sample barcodes.
# sub() replaces the first match only; gsub() would replace every match.
colnames(miRNA_count) <- sub("^read_count_", "", colnames(miRNA_count))
# Preview at most the first 4 rows and 4 columns (safe with fewer than 4 samples).
head(miRNA_count, 4)[, seq_len(min(4L, ncol(miRNA_count))), drop = FALSE]
# TCGA-BA-6871-01A-11R-1872-13 TCGA-CN-6024-01A-11R-1685-13 TCGA-IQ-7631-01A-11R-2080-13
# hsa-let-7a-1 39430 37356 78370
# hsa-let-7a-2 39178 37111 78621
# hsa-let-7a-3 39394 37080 79843
# hsa-let-7b 65142 65155 193506
# TCGA-CV-7406-01A-11R-2080-13
# hsa-let-7a-1 43002
# hsa-let-7a-2 42954
# hsa-let-7a-3 43141
# hsa-let-7b 78858
# Save the count matrix as "<proj>_miRNA_count.Rdata".
save(miRNA_count, file = paste0(proj, "_miRNA_count.Rdata"))
致谢:感谢曾老师以及生信技能树团队全体成员。
注:若对内容有疑惑或者有发现明确错误的朋友,请联系后台(欢迎交流)。更多内容可关注公众号:生信方舟
- END -
https://mp.weixin.qq.com/s/l2eOdrqgM64ZVPX77XWmdw