Closed ixxmu closed 3 years ago
RCAS
是一款发表于2017年6月Nucleic Acid Research期刊上的一款主要用于RNA区域注释和分析的R package。因此RCAS
软件的使用范围或者说是高通量数据来源主要来自于用于研究蛋白与RNA互作的CLIP-seq (UV Cross-linking and Immunoprecipitation,紫外交联免疫沉淀,主要用于RNA结合蛋白在体内与众多RNA靶标的结合模式);RNA甲基化修饰的MeRIP-seq(m6A-seq,m6A RNA immunoprecipitation sequencing);CAGE-tag location等,RCAS
软件本身的软件使用说明详细,但是使用hg19数据,链接如下:https://bioconductor.org/packages/release/bioc/vignettes/RCAS/inst/doc/RCAS.vignette.html,整体的R代码会放在文章最后方便学习,本文的数据仅使用苹果相关数据。BiocManager::install("RCAS")
packageVersion("RCAS")
# [1] ‘1.16.0’
library("RCAS")
gtf
以及标准的bed
文件(示例文件未bed6),但是本身软件对这两个文件要求ucsc format,苹果本身的参考基因组以及注释文件中的chromosome name
是Chr00
,我们需要将以上两个软件的输入文件更改为chr0
,这里以gtf
文件为例使用sed
进行修改:sed -e 's/Chr/chr/g' /Users/hejieqiang/Genome/GDDH13_Gonome/gene_models_20170612.gtf > gene_ucsc.gtf
sed -e 's/chr0/chr/g' gene_ucsc.gtf > genes_ucsc.gtf
rm -rf gene_ucsc.gtf
setwd("~/MeRIP/exomePeak2_20210324/RCAS/")
queryRegions <- importBed(filePath = "droMp.bedformat.txt")
gff <- importGtf(filePath = "genes_ucsc.gtf")
# malus
overlaps <- as.data.table(queryGff(queryRegions = queryRegions, gffData = gff))
# hg19 example
gene_biotype
无法得到有效信息,这里截取了结果文件的表头如下:transcript_id
一列也能明显看出,由于苹果注释文件的原因,因此本身也只是区分了mRNA和ncRNA,因此这一部分的数据用文章本身的示例数据展现:library("RCAS")
# sample data
data("queryRegions")
data(gff)
# custom data
queryRegions <- importBed(filePath = <path to BED file>, sampleN = 10000)
gff <- importGtf(filePath = <path to GTF file>)
# genomic annotation features
overlaps <- as.data.table(queryGff(queryRegions = queryRegions, gffData = gff))
biotype_col <- grep('gene_biotype', colnames(overlaps), value = T)
df <- overlaps[, length(unique(queryIndex)), by = biotype_col]
colnames(df) <- c("feature", "count")
df$percent <- round(df$count / length(queryRegions) * 100 , 1) # 1 represent sig digit
?round
df <- df[order(count, decreasing = T)]
ggplot2::ggplot(df,aes(x = reorder(feature, -percent), y = percent)) +
geom_bar(stat = "identity" , aes(fill = feature)) +
geom_label(aes(y = percent + 0.5), label = df$count) +
labs(x = "transcript feature" , y = paste0('percent overlap (n = ',length(queryRegions),')')) +
theme_bw(base_size = 14) +
theme(axis.text.x = element_text(angle = 90))
?paste0
#Extending the annotation feature space
txdbFeatures <- getTxdbFeaturesFromGRanges(gff)
#Plotting overlap counts between query regions and transcript features
summary <- summarizeQueryRegions(queryRegions = queryRegions,
txdbFeatures = txdbFeatures)
df <- data.frame(summary)
df$percent <- round((df$count / length(queryRegions)), 3) * 100
df$feature <- rownames(df)
#绘图
ggplot2::ggplot(df,aes(x = reorder(feature, -percent), y = percent)) +
geom_bar(stat = 'identity' , aes(fill = feature)) +
geom_label(aes(y = percent +3), label = df$count) +
labs(x = 'transcript feature', y = paste0('percent overlap (n = ', length(queryRegions), ')')) +
theme_bw(base_size = 14) +
theme(axis.text.x = element_text(angle = 90))
#Obtaining a table of overlap counts between query regions and genes
dt <- getTargetedGenesTable(queryRegions = queryRegions,
txdbFeatures = txdbFeatures)
dt <- dt[order(transcripts, decreasing = TRUE)]
knitr::kable(dt[1:10,]) # markdown
# only show 5UTR and 3UTR
#Profiling the coverage of query regions across transcript features
cvgF <- getFeatureBoundaryCoverage(queryRegions = queryRegions,
featureCoords = txdbFeatures$transcripts,
flankSize = 1000,
boundaryType = "fiveprime")
cvgT <- getFeatureBoundaryCoverage(queryRegions = queryRegions,
featureCoords = txdbFeatures$transcripts,
flankSize = 1000,
boundaryType = "threeprime")
cvgF$boundary <- "fiveprime"
cvgT$boundary <- "threeprime"
df <- rbind(cvgF, cvgT)
ggplot2::ggplot(df, aes(x = bases, y = meanCoverage)) +
geom_ribbon(fill = 'lightgreen',
aes(ymin = meanCoverage - standardError * 1.96,
ymax = meanCoverage + standardError * 1.96)) +
geom_line(color = 'black') +
facet_grid( ~ boundary) + theme_bw(base_size = 14)
# all transcript features
# Coverage profile of query regions for all transcript features
cvgList <- calculateCoverageProfileList(queryRegions = queryRegions,
targetRegionsList = txdbFeatures)
ggplot2::ggplot(cvgList,aes(x = bins , y = meanCoverage)) +
geom_ribbon(fill = 'lightgreen',
aes(ymin = meanCoverage - standardError * 1.96,
ymax = meanCoverage + standardError * 1.96)) +
geom_line(color = 'black') + theme_bw(base_size = 14) +
facet_wrap( ~ feature, ncol = 3)
其实intron的结果在预料之中,因为RNA甲基化修饰的研究,所以理论上不应该注释到intron和promoter,但是promoter区域的富集可能是因为测序mapping的参数,但是结合这张图来看,因为在feature region的区域划分中,promoter是人为划定的,因此这里promoter区靠近转录起始的地区的富集我认为是合理的。
这个软件也可以支持motif的的研究,但是参数中的genome好像只是支持ucsc本身的下载,因此这里我觉得自己用homer或者meme做然后结合motifstack做一下图就好了,这里就用原先作者提供的原始参数就好。
# Calculating enriched motifs
motifResults <- runMotifDiscovery(queryRegions = queryRegions,
resizeN = 15, sampleN = 10000,
genomeVersion = 'hg19', motifWidth = 6,
motifN = 2, nCores = 1)
?runMotifDiscovery
ggseqlogo::ggseqlogo(motifResults$matches_query)
# motif analysis: getting motif summary statistics
summary <- getMotifSummaryTable(motifResults)
knitr::kable(summary)
这个软件使用完,主要就是genomic feature这一块可以采用,其次是可以对感兴趣的功能区域上下游富集信号进行观察和研究,并且软件本身也提供了
runReport()
的自动化运行,还是很人性化的。本身使用官方的文件示例代码整理在这里:
https://mp.weixin.qq.com/s/HZm379DTDavFe_2ipTA81A