Closed ixxmu closed 1 year ago
org <- "mgi" # or hgnc, or dmel
dbDir <- 'cisTarget_databases/mouse/mouse.mm9/' # RcisTarget databases location
myDatasetTitle <- "SCENIC example on Mouse brain" # choose a name for your analysis
data(defaultDbNames)
dbs <- defaultDbNames[[org]]
scenicOptions <- initializeScenic(org=org, dbDir=dbDir, dbs=dbs, datasetTitle=myDatasetTitle, nCores=10)
scenicOptions@inputDatasetInfo$cellInfo <- "int/cellInfo.Rds"
scenicOptions@inputDatasetInfo$colVars <- "int/colVars.Rds"
saveRDS(scenicOptions, file="int/scenicOptions.Rds") #这样存起来方便调用
B站同步播出:
https://www.bilibili.com/video/BV1Mg4y1c7wU?p=5
通过表达矩阵推断潜在的转录因子作用靶点(GENIE3
/GRNBoost
). 需要将过滤后的矩阵、转录因子对应关系作为输入数据,由runSCENIC_1_coexNetwork2modules()
输出共表达模式.
GENIE3
, GRNBoost
, WGCNA
等都可以用来构建基因共表达网络,这里作者推荐使用GENIE3
,GENIE3终于不是这个作者写的包了:Inferring Regulatory Networks from Expression Data Using Tree-Based Methods,这里作者推荐的理由是GENIE3
可以识别数据集中部分存在的非线性关系,并且可以有效应对网络推断中的DREAM5 challenge(Reprogramming of regulatory network using expression uncovers sex-specific gene regulation in Drosophila),缺点就是会占用计算资源和计算时间(3-5k个细胞可能会算几天)。所以为了解决这个问题,作者又写了个包GRNboost
用于大型数据的计算。
当然你也可以选择从数据中随机提取一些细胞出来参与计算,但这样需要确保抽出的样本能够替代总体,所以这个方法我并不推荐。
为了加快计算速度,并且去除数据噪声,GENIE3/GRNBoost
执行前需要进行一步过滤,去除表达量过低(统计count数,默认删除低于3 X 0.1 X 数据集细胞数量
的基因,即细胞数为200,删除count低于6的基因)或表达比例过低的基因(统计表达该基因的细胞数,默认删除表达比例不足1%
的基因)。这里为了防止一些在特定细胞类型中表达的基因被误删,这里的1%
推荐设置的比数据中最小的细胞群比例更小。在初步过滤后,矩阵中能够被RcisTarget databases收录的基因被保留用于下游计算。
genesKept <- geneFiltering(exprMat, scenicOptions=scenicOptions,
minCountsPerGene=3*.01*ncol(exprMat),
minSamples=ncol(exprMat)*.01)#获得能经过“质检”的基因
## Maximum value in the expression matrix: 421
## Ratio of detected vs non-detected: 0.24
## Number of counts (in the dataset units) per gene:
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0 17.0 48.0 146.8 137.0 5868.0
## Number of cells in which each gene is detected:
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00 10.00 25.00 39.01 60.00 180.00
##
## Number of genes left after applying the following filters (sequential):
## 771 genes with counts per gene > 6
## 770 genes detected in more than 2 cells
## Using the column 'features' as feature index for the ranking database.
## 770 genes available in RcisTarget database
## Gene list saved in int/1.1_genesKept.Rds
exprMat_filtered <- exprMat[genesKept,]#取出过滤后的矩阵
dim(exprMat_filtered)
## [1] 770 200
默认是Spearman correlation
,并且正相关与负相关都会被考虑
runCorrelation(exprMat_filtered, scenicOptions)
GENIE3
GRNBoost
需要在Python中运行,可以通过exportsForGRNBoost
输出GRNBoost
所需数据。GENIE3
的输出数据即上面已被过滤过的exprMat_filtered
、RcisTarget
中提供的转录因子与靶基因的对应关系以及scenicOptions
中存放的其它已设定参数。
由于GENIE3
采取的是随机森林算法(Random Forest approach
),所以即使是相同的数据和程序,运行多次得到的结果也可能是不同的,ntrees
参数越高、结果的变化程度就越低,这里为了保证结果具有可重复性,应当用set.seed()
函数设置随机数种子。runGenie3
在大家运行真实的数据集时,这一步可能会花费数小时甚至数天,因此我不是很推荐在Rstudio-server中运行这一步,我的建议是通过nohup
把命令提交到Linux
后台运行。
mymethod <- 'R'
library(reticulate)
if(mymethod=='R'){
# R语言版:
runGenie3(exprMat_filtered_log, scenicOptions)
}else{
# 调用python参与计算:
arb.algo = import('arboreto.algo')
tf_names = getDbTfs(scenicOptions)
tf_names = Seurat::CaseMatch(
search = tf_names,
match = rownames(exprMat_filtered))
adj = arb.algo$grnboost2(
as.data.frame(t(as.matrix(exprMat_filtered))),
tf_names=tf_names, seed=2023L
)
colnames(adj) = c('TF','Target','weight')
saveRDS(adj,file=getIntName(scenicOptions,
'genie3ll'))
}
## Using 8 TFs as potential regulators...
## Warning in runGenie3(exprMat_filtered_log, scenicOptions): Only 8 (0%) of the 1721 TFs in the database were found in the dataset. Do they use the same gene IDs?
## Running GENIE3 part 1
## Running GENIE3 part 10
## Running GENIE3 part 2
## Running GENIE3 part 3
## Running GENIE3 part 4
## Running GENIE3 part 5
## Running GENIE3 part 6
## Running GENIE3 part 7
## Running GENIE3 part 8
## Running GENIE3 part 9
## Finished running GENIE3.
scenicOptions
的默认参数可以如是调整
library(SCENIC)
scenicOptions <- readRDS("int/scenicOptions.Rds")#读取初始化设置
scenicOptions@settings$verbose <- TRUE#运行过程中是否保留进度条
scenicOptions@settings$nCores <- 10#并行计算时调用的线程
scenicOptions@settings$seed <- 123#涉及随机数种子便于重复结果
scenicOptions@settings$dbs <- scenicOptions@settings$dbs["10kb"] #这里是做示例运行时取子集用于加速,正式运行时应删除
scenicOptions <- runSCENIC_1_coexNetwork2modules(scenicOptions)
## 02:19 Creating TF modules
## Number of links between TFs and targets (weight>=0.001): 6139
## [,1]
## nTFs 8
## nTargets 770
## nGeneSets 47
## nLinks 17299
RcisTarget
获得regulons信息(TF motif analysis)#这一步大家无需运行,我们下面要做拆解
scenicOptions <- runSCENIC_2_createRegulons(scenicOptions, coexMethod=c("top5perTarget")) #coexMethod参数用于加速运行,正式运行时应删除
scenicOptions <- readRDS("int/scenicOptions.Rds")#读取初始化信息
minGenes=20
coexMethod=NULL#正式运行数据时coexMethod不应选择'top5perTarget'
上面的runSCENIC_2_createRegulons
与下面的runSCENIC_3_scoreCells
封装了很多步功能,因此我们需要重点关注加以拆解,从而了解整个流程的步骤与原理,并能够更好的解读分析结果:
vignette("detailedStep_2_createRegulons", package="SCENIC")#这行可以获得runSCENIC_2_createRegulons的详细信息:
这一步是为了通过DNA motif的富集分析从而鉴别TF直接的靶点(regulons),其实上面的共表达分析已经能够初步为基因调控网络提供信息,但是Biomamba在此前的很多推送都强调过,相关性分析并不能代表最终的因果,因此共表达矩阵中可能存在着间接作用的基因(例如直接作用靶基因的下游基因)。那么为了找到真正的regulons,这里利用RcisTarget
数据库收录的信息进行顺势调控元件分析(cis-regulatory motif analysis)
(1)加载共表达模块RcisTarget
数据库
nCores <- getSettings(scenicOptions, "nCores")
nCores
## [1] 8
tfModules_asDF <- loadInt(scenicOptions, "tfModules_asDF")
head(tfModules_asDF)
## Target TF method corr
## 1 Dlx5 Dlx1 w0.001 1
## 2 Gad2 Dlx1 w0.001 1
## 3 Gad1 Dlx1 w0.001 1
## 4 Bmper Dlx1 w0.001 1
## 5 Gria4 Dlx1 w0.001 1
## 6 St8sia4 Dlx1 w0.001 1
if(!is.null(coexMethod)) tfModules_asDF <- tfModules_asDF[which(tfModules_asDF$method %in% coexMethod),]#选择coexMethod
if(nrow(tfModules_asDF)==0) stop("The co-expression modules are empty.")#如果不存在共表达模块,则会报错
为RcisTarget::addMotifAnnotation()设置并行线程数
if("BiocParallel" %in% installed.packages()) library(BiocParallel); register(MulticoreParam(nCores), default=TRUE) #设定并行参数
msg <- paste0(format(Sys.time(), "%H:%M"), "\tStep 2. Identifying regulons")#保存程序运行进度信息
if(getSettings(scenicOptions, "verbose")) message(msg)#若设置需要更新进度,则输出程序运行进度信息
## 02:19 Step 2. Identifying regulons
检查物种并加载数据库
if(is.na(getDatasetInfo(scenicOptions, "org"))) stop('Please provide an organism (scenicOptions@inputDatasetInfo$org).')#检查物种名是否正常
library(AUCell)#加载AUCell用于打分
library(RcisTarget)
motifAnnot <- getDbAnnotations(scenicOptions)#按照初始化参数载入数据库
motifAnnot[1:4,1:4]#收录了数据库中的大量信息
## motif TF directAnnotation inferred_Orthology
## 1: bergman__Abd-B Hoxa9 FALSE FALSE
## 2: bergman__Aef1 Zfp128 FALSE TRUE
## 3: bergman__Cf2 Zfp853 FALSE TRUE
## 4: bergman__EcR_usp Nr1h2 FALSE TRUE
if(is.null(names(getSettings(scenicOptions, "dbs"))))
{
names(scenicOptions@settings$"dbs") <- scenicOptions@settings$"dbs"
tmp <- sapply(strsplit(getSettings(scenicOptions, "dbs"),"-", fixed=T), function(x) x[grep("bp|kb",x)])
if(all(lengths(tmp)>0)) names(scenicOptions@settings$"dbs") <- tmp
}
loadAttempt <- sapply(getDatabases(scenicOptions), dbLoadingAttempt)
if(any(!loadAttempt)) stop("It is not possible to load the following databses: \n",
paste(dbs[which(!loadAttempt)], collapse="\n"))
genesInDb <- unique(unlist(lapply(getDatabases(scenicOptions), function(x)
names(feather::feather_metadata(x)[["types"]]))))#获取数据库中包含的gene symbol
genesInDb[1:10]
## [1] "features" "0610007C21Rik" "0610007L01Rik" "0610007P08Rik"
## [5] "0610007P14Rik" "0610007P22Rik" "0610008F07Rik" "0610009B14Rik"
## [9] "0610009B22Rik" "0610009D07Rik"
(2)过滤共表达矩阵
这里选出与TF表达成正相关关系的共表达模块(可能有激活关系),并将TF加入该模块中,保留至少包含20个基因的共表达模块。同理可以选出与TF表达成正相关关系的共表达模块(可能存在抑制关系),但在测试数据中这类模块数量较少。
移除RcisTarget数据库中没有收录的基因
#将TF和Target都转换成字符串
tfModules_asDF$TF <- as.character(tfModules_asDF$TF)
tfModules_asDF$Target <- as.character(tfModules_asDF$Target)
allTFs <- getDbTfs(scenicOptions)##获取数据库中收录的转录因子
allTFs[1:10]
## [1] "1700009N14Rik" "1700080O16Rik" "1810024B03Rik" "2010315B03Rik"
## [5] "2310011J03Rik" "2410141K09Rik" "2610044O15Rik8" "2810403A07Rik"
## [9] "3300002I08Rik" "3830417A13Rik"
tfModules_asDF <- tfModules_asDF[which(tfModules_asDF$TF %in% allTFs),]
geneInDb <- tfModules_asDF$Target %in% genesInDb#获取共表达木块中的基因是否数据库中收录的布尔值
geneInDb[1:10]
## [1] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
missingGene <- sort(unique(tfModules_asDF[which(!geneInDb),"Target"]))#共表达模块中存在的,但数据库中没有收录的基因,这里不存在这类基因
missingGene
## character(0)
if(length(missingGene)>0)
warning(paste0("Genes in co-expression modules not available in RcisTargetDatabases: ",
paste(missingGene, collapse=", ")))#若missingGene存在则数据warning信息
tfModules_asDF <- tfModules_asDF[which(geneInDb),]#保留包含在数据库中的基因共表达模块
取出正向的共表达模块并处理
tfModules_Selected <- tfModules_asDF[which(tfModules_asDF$corr==1),]#tfModules_asDF$corr,是个三元矩阵
# Add a column with the geneSet name (TF_method)
tfModules_Selected <- cbind(tfModules_Selected, geneSetName=paste(tfModules_Selected$TF, tfModules_Selected$method, sep="_"))
head(tfModules_Selected)
## Target TF method corr geneSetName
## 1 Dlx5 Dlx1 w0.001 1 Dlx1_w0.001
## 2 Gad2 Dlx1 w0.001 1 Dlx1_w0.001
## 3 Gad1 Dlx1 w0.001 1 Dlx1_w0.001
## 4 Bmper Dlx1 w0.001 1 Dlx1_w0.001
## 5 Gria4 Dlx1 w0.001 1 Dlx1_w0.001
## 6 St8sia4 Dlx1 w0.001 1 Dlx1_w0.001
tfModules_Selected$geneSetName <- factor(as.character(tfModules_Selected$geneSetName))
allGenes <- unique(tfModules_Selected$Target)
利用不同的methods
拆分并加入TF
tfModules <- split(tfModules_Selected$Target, tfModules_Selected$geneSetName)
# Add TF to the gene set (used in the following steps, careful if editing)
tfModules <- setNames(lapply(names(tfModules), function(gsn) {
tf <- strsplit(gsn, "_")[[1]][1]#_前的字符即为TF名称
unique(c(tf, tfModules[[gsn]]))
}), names(tfModules))
保留基因数量大于minGenes
的模块
tfModules <- tfModules[which(lengths(tfModules)>=minGenes)]
saveRDS(tfModules, file=getIntName(scenicOptions, "tfModules_forEnrichment"))
print(getIntName(scenicOptions, "tfModules_forEnrichment"))
## [1] "int/2.1_tfModules_forMotifEnrichmet.Rds"
#所以这里保存的就是转录因子及其对应模块包含的基因,其实就相当于做基因集评分时的geneset
if(getSettings(scenicOptions, "verbose")) {
tfModulesSummary <- t(sapply(strsplit(names(tfModules), "_"), function(x) x[1:2]))
message("tfModulesSummary:")
print(sort(table(tfModulesSummary[,2])))#输出每种算法下包含的TF
}
## tfModulesSummary:
##
## top3sd top1sd top50 top5perTarget w0.001
## 5 8 8 8 8
## w0.005
## 8
接下来将用到RcisTarget
,通过Motif富集分析在共表达网络种识别直接作用靶基因,套娃真的套不动了,更详细的教程可以参考:
vignette("RcisTarget")#这里我就不运行了
## Warning: vignette 'RcisTarget' not found
Run RcisTarget
(Motif enrichment) 简单来说,就是根据RcisTarget
中的motif-TF数据库找到TF对应的motif(在TSS附近检索,与基因组上其它基因做对比),并对来源于同一个TF的motif进行富集分析。
(1).找出高表达的motif: 这一步SCENIC会利用一个包含TSS周围(上游的500bp或上下游10kb,设置一个窗口进行扫描评分)motif评分的数据库。Normalized Enrichment Score (NES, 先求出motif的AUCell score,再对其进行normalization) > 3.0 (这样能够保证假阳性率在3%到9%)的motif会被认为是改TF模块中显著富集的motif (并不是基因集中的所有基因都拥有相同富集的motif,所以保留motif分值高者即可,高NES意味着这个motif可以覆盖大量的输入基因),并通过addMotifAnnotation()
存在motifEnrichment$TFinDB中(显著的以标记)。
得到的结果中对于motif的注释有高可信度(一般是被直接记载)与低可信度(一般是由motif的相似度预测得来, 例如sufix _extended**就代表这是一种低可信度的motif)。
#通过RcisTarget计算富集的motif
msg <- paste0(format(Sys.time(), "%H:%M"), "\tRcisTarget: Calculating AUC")
if(getSettings(scenicOptions, "verbose")) message(msg)#输出进度
## 02:20 RcisTarget: Calculating AUC
motifs_AUC <- lapply(getDatabases(scenicOptions), function(rnkName) {
ranking <- importRankings(rnkName, columns=allGenes)
message("Scoring database: ", ranking@description)
RcisTarget::calcAUC(tfModules, ranking, aucMaxRank=0.03*getNumColsInDB(ranking), nCores=nCores, verbose=FALSE)})
## Using the column 'features' as feature index for the ranking database.
## Scoring database: [Source file: mm9-tss-centered-10kb-7species.mc9nr.feather]
saveRDS(motifs_AUC, file=getIntName(scenicOptions, "motifs_AUC"))#int/2.2_motifs_AUC.Rds里存的是motif的AUCell值
#以NES>3过滤,并注释motif的置信度
# motifs_AUC <- loadInt(scenicOptions, "motifs_AUC") #上一步保存的int/2.2_motifs_AUC.Rds
msg <- paste0(format(Sys.time(), "%H:%M"), "\tRcisTarget: Adding motif annotation")
message(msg)#报进度
## 02:20 RcisTarget: Adding motif annotation
motifEnrichment <- lapply(motifs_AUC, function(aucOutput)
{
# Extract the TF of the gene-set name (i.e. MITF_w001):
tf <- sapply(setNames(strsplit(rownames(aucOutput), "_"), rownames(aucOutput)), function(x) x[[1]])
# Calculate NES and add motif annotation (provide tf in 'highlightTFs'):
addMotifAnnotation(aucOutput, #AUCell score
nesThreshold=3, #NES的阈值,这里是3
digits=3, #保留三位小数
motifAnnot=motifAnnot,#加入数据库中的注释
motifAnnot_highConfCat=c("directAnnotation", "inferredBy_Orthology"),
motifAnnot_lowConfCat=c("inferredBy_MotifSimilarity",
"inferredBy_MotifSimilarity_n_Orthology"),
highlightTFs=tf#将结果返回在motifEnrichment$tf中
)
})
## Using BiocParallel...
#合并各tf的列表,并加入一列'motifDb'
motifEnrichment <- do.call(rbind, lapply(names(motifEnrichment), function(dbName){
cbind(motifDb=dbName, motifEnrichment[[dbName]])
}))
head(motifEnrichment)#这是个长格式的数据,head也很难看出门道
## motifDb geneSet motif NES AUC
## 1: 10kb Dlx1_top1sd cisbp__M1205 5.24 0.133
## 2: 10kb Dlx1_top1sd taipale_cyt_meth__HOXB1_STAATTAN_eDBD 4.96 0.128
## 3: 10kb Dlx1_top1sd transfac_pro__M01388 4.72 0.124
## 4: 10kb Dlx1_top1sd taipale_cyt_meth__HOXB1_GTAATTAN_eDBD_meth 4.52 0.121
## 5: 10kb Dlx1_top1sd taipale_cyt_meth__PDX1_YTAATTAN_eDBD 4.52 0.121
## 6: 10kb Dlx1_top1sd stark__RATTAMY 4.47 0.120
## highlightedTFs TFinDB TF_highConf
## 1: Dlx1
## 2: Dlx1 Hoxb1 (inferredBy_Orthology).
## 3: Dlx1 Dlx5 (directAnnotation).
## 4: Dlx1 Hoxb1 (inferredBy_Orthology).
## 5: Dlx1 Pdx1 (inferredBy_Orthology).
## 6: Dlx1 Hoxa4; Hoxb4; Hoxc4; Hoxd4 (inferredBy_Orthology).
## TF_lowConf
## 1: Evx1; Hoxa2; Hoxa3; Hoxd3; Pdx1; Vax1; Vax2 (inferredBy_MotifSimilarity). Emx1; Emx2; Evx2; Gsx1; Gsx2; Hoxa6; Hoxa7; Hoxb2; Hoxb6; Hoxb7; Hoxc6; Lbx1; Lhx2; Lhx9; Noto (inferredBy_MotifSimilarity_n_Orthology).
## 2: Hoxa1; Hoxd3; Meox2 (inferredBy_MotifSimilarity). Hoxa5; Hoxb5; Hoxb7; Hoxc5; Hoxd1; Hoxd4; Lhx4; Meox1; Prop1 (inferredBy_MotifSimilarity_n_Orthology).
## 3: Hoxd3 (inferredBy_MotifSimilarity). Nkx1-1; Nkx1-2; Vsx1 (inferredBy_MotifSimilarity_n_Orthology).
## 4: Hoxa2; Pdx1 (inferredBy_MotifSimilarity). Emx1; Evx1; Evx2; Gm28308; Hoxa1; Hoxa5; Hoxb2; Hoxb5; Hoxd1; Hoxd4; Lhx4; Meox1; Meox2; Mnx1; Nkx1-1; Nkx1-2; Phox2a; Phox2b; Vax1; Vax2 (inferredBy_MotifSimilarity_n_Orthology).
## 5: Emx2; Evx1; Gsx2; Hoxa2; Hoxa3; Hoxa5; Hoxb3; Hoxd3; Vax1; Vax2 (inferredBy_MotifSimilarity). Emx1; En1; Evx2; Gm28308; Gsx1; Hoxa6; Hoxa7; Hoxb2; Hoxb5; Hoxb6; Hoxb7; Hoxc4; Hoxc5; Hoxc6; Hoxd4; Lhx2; Lhx6; Lhx8; Lhx9; Lmx1a; Mnx1; Rax; Vsx1; Vsx2 (inferredBy_MotifSimilarity_n_Orthology).
## 6:
saveRDS(motifEnrichment, file=getIntName(scenicOptions, "motifEnrichment_full"))#int/2.3_motifEnrichment.Rds中存放的既是motif富集信息
msg <- paste0("Number of motifs in the initial enrichment: ", nrow(motifEnrichment))
if(getSettings(scenicOptions, "verbose")) message(msg)#报告motif总数
## Number of motifs in the initial enrichment: 6605
# motifEnrichment <- loadInt(scenicOptions, "motifEnrichment_full")#除了readrds,还可以这样读回来
#保留能与现存TF对应的motif
motifEnrichment_selfMotifs <- motifEnrichment[which(motifEnrichment$TFinDB != ""),, drop=FALSE]
msg <- paste0("Number of motifs annotated to the corresponding TF: ", nrow(motifEnrichment_selfMotifs))
if(getSettings(scenicOptions, "verbose")) message(msg)
## Number of motifs annotated to the corresponding TF: 696
if(nrow(motifEnrichment_selfMotifs)==0)
stop("None of the co-expression modules present enrichment of the TF motif: There are no regulons.")
构建regulon RcisTarget
利用类似于GSEA的原理通过addSignificantGenes()
为每个motif找到高匹配度的基因,将motif、TF、靶基因构建为一个regulon, 这一步的运行速度与基因集的大小有关,不受细胞数量影响。
#Prune targets
msg <- paste0(format(Sys.time(), "%H:%M"), "\tRcisTarget: Prunning targets")#汇报能够注释到对应TF的motif数量
if(getSettings(scenicOptions, "verbose")) message(msg)
## 02:20 RcisTarget: Prunning targets
dbNames <- getDatabases(scenicOptions)
dbNames#这里是测试流程,所以只用到了TSS周围"10kb"的数据文件"mm9-tss-centered-10kb-7species.mc9nr.feather"
## 10kb
## "cisTarget_databases/mouse/mouse.mm9//mm9-tss-centered-10kb-7species.mc9nr.feather"
motifEnrichment_selfMotifs_wGenes <- lapply(names(dbNames), function(motifDbName){
ranking <- importRankings(dbNames[motifDbName], columns=allGenes)
addSignificantGenes(resultsTable=motifEnrichment_selfMotifs[motifEnrichment_selfMotifs$motifDb==motifDbName,],
geneSets=tfModules,#从tfmodule中找匹配基因
rankings=ranking,
maxRank=5000, method="aprox", nCores=nCores)
})
## Using the column 'features' as feature index for the ranking database.
## [1] 22058
suppressPackageStartupMessages(library(data.table))
motifEnrichment_selfMotifs_wGenes <- rbindlist(motifEnrichment_selfMotifs_wGenes)#将list中的元素纵向合并
saveRDS(motifEnrichment_selfMotifs_wGenes, file=getIntName(scenicOptions, "motifEnrichment_selfMotifs_wGenes"))
#那么"int/2.4_motifEnrichment_selfMotifs_wGenes.Rds"中存的就是motif与其对应的富集靶基因
if(getSettings(scenicOptions, "verbose"))
{
# TODO messages/print
message(format(Sys.time(), "%H:%M"), "\tNumber of motifs that support the regulons: ", nrow(motifEnrichment_selfMotifs_wGenes))
motifEnrichment_selfMotifs_wGenes[order(motifEnrichment_selfMotifs_wGenes$NES,decreasing=TRUE),][1:5,(1:ncol(motifEnrichment_selfMotifs_wGenes)-1), with=F]
}
## 02:26 Number of motifs that support the regulons: 696
## motifDb geneSet motif NES AUC
## 1: 10kb Tef_top1sd taipale_cyt_meth__TEF_NRTTAYGTAAYN_eDBD_meth 9.60 0.219
## 2: 10kb Tef_top1sd taipale__TEF_DBD_NRTTACRTAAYN 9.40 0.216
## 3: 10kb Tef_top1sd transfac_pro__M07688 9.17 0.212
## 4: 10kb Tef_top1sd cisbp__M5909 9.01 0.209
## 5: 10kb Tef_top1sd transfac_pro__M07439 8.85 0.206
## highlightedTFs TFinDB TF_highConf
## 1: Tef ** Tef (inferredBy_Orthology).
## 2: Tef ** Tef (inferredBy_Orthology).
## 3: Tef ** Dbp; Hlf; Tef (inferredBy_Orthology).
## 4: Tef ** Tef (inferredBy_Orthology).
## 5: Tef ** Tef (directAnnotation).
## TF_lowConf
## 1: Atf2; Cebpb; Dbp; Hlf; Nfil3 (inferredBy_MotifSimilarity). Cebpd; Cebpe; Cebpg (inferredBy_MotifSimilarity_n_Orthology).
## 2: Atf2; Cebpa; Cebpb; Dbp; Hlf; Nfil3 (inferredBy_MotifSimilarity). Cebpd; Cebpe; Cebpg (inferredBy_MotifSimilarity_n_Orthology).
## 3: Atf2; Cebpa; Cebpb; Nfil3 (inferredBy_MotifSimilarity).
## 4: Atf2; Cebpa; Cebpb; Dbp; Hlf; Nfil3 (inferredBy_MotifSimilarity). Cebpd; Cebpe; Cebpg (inferredBy_MotifSimilarity_n_Orthology).
## 5: Atf2; Cebpa; Cebpb; Dbp; Hlf (inferredBy_MotifSimilarity). Crebl2; Nfil3 (inferredBy_MotifSimilarity_n_Orthology).
## nEnrGenes rankAtMax
## 1: 65 920
## 2: 73 1156
## 3: 79 1274
## 4: 68 904
## 5: 80 1249
#Save motif enrichment results as text and HTML (optional):
motif富集基因结果输出(即初步保存regulon相关信息)
# motifEnrichment_selfMotifs_wGenes <- loadInt(scenicOptions, "motifEnrichment_selfMotifs_wGenes")#可以这样读入上一步数据
#以文本文件输出
if(!file.exists("output")) dir.create("output")#output文件夹在这一步生成
write.table(motifEnrichment_selfMotifs_wGenes, file=getOutName(scenicOptions, "s2_motifEnrichment"),
sep="\t", quote=FALSE, row.names=FALSE)#以文本格式存在"output/Step2_MotifEnrichment.tsv"中
#以HTML文件输出
if("DT" %in% installed.packages() && nrow(motifEnrichment_selfMotifs_wGenes)>0)
{
nvm <- tryCatch({
colsToShow <- c("motifDb", "logo", "NES", "geneSet", "TF_highConf", "TF_lowConf")
motifEnrichment_2html <- viewMotifs(motifEnrichment_selfMotifs_wGenes, colsToShow=colsToShow, options=list(pageLength=100))
fileName <- getOutName(scenicOptions, "s2_motifEnrichmentHtml")#存于"output/Step2_MotifEnrichment_preview.html"中
dirName <- dirname(fileName)
fileName <- basename(fileName)
suppressWarnings(DT::saveWidget(motifEnrichment_2html, fileName))#需要通过DT编译组件
file.rename(fileName, file.path(dirName, fileName))
if(getSettings(scenicOptions, "verbose")) message("Preview of motif enrichment saved as: ", file.path(dirName, fileName))
}, error = function(e) print(e$message))
}
## Preview of motif enrichment saved as: output/Step2_MotifEnrichment_preview.html
motifEnrichment.asIncidList <- apply(motifEnrichment_selfMotifs_wGenes, 1, function(oneMotifRow) {
genes <- strsplit(oneMotifRow["enrichedGenes"], ";")[[1]]
oneMotifRow <- data.frame(rbind(oneMotifRow), stringsAsFactors=FALSE)
data.frame(oneMotifRow[rep(1, length(genes)),c("NES", "motif", "highlightedTFs", "TFinDB")], genes, stringsAsFactors = FALSE)
})
class(motifEnrichment.asIncidList)
## [1] "list"
motifEnrichment.asIncidList <- rbindlist(motifEnrichment.asIncidList)#转换为数据框的结构,rbind会使其变成与类似于长格式的数据
head(motifEnrichment.asIncidList)
## NES motif highlightedTFs TFinDB
## 1: 4.34 taipale_cyt_meth__HOXA5_NYAATTAN_FL_meth Dlx1 *
## 2: 4.34 taipale_cyt_meth__HOXA5_NYAATTAN_FL_meth Dlx1 *
## 3: 4.34 taipale_cyt_meth__HOXA5_NYAATTAN_FL_meth Dlx1 *
## 4: 4.34 taipale_cyt_meth__HOXA5_NYAATTAN_FL_meth Dlx1 *
## 5: 4.34 taipale_cyt_meth__HOXA5_NYAATTAN_FL_meth Dlx1 *
## 6: 4.34 taipale_cyt_meth__HOXA5_NYAATTAN_FL_meth Dlx1 *
## genes
## 1: Adcyap1r1
## 2: Cntnap2
## 3: Crabp1
## 4: Dlx1
## 5: Frem1
## 6: Fzd1
colnames(motifEnrichment.asIncidList) <- c("NES", "motif", "TF", "annot", "gene")
motifEnrichment.asIncidList <- data.frame(motifEnrichment.asIncidList, stringsAsFactors = FALSE)
按照TF整理靶基因,并且保留motif富集信息
regulonTargetsInfo <- lapply(split(motifEnrichment.asIncidList, motifEnrichment.asIncidList$TF),#按照tf拆分表格,依次做如下处理
function(tfTargets){
print(unique(tfTargets$TF))#输出当前正在处理的TF
tfTable <- as.data.frame(do.call(rbind, lapply(split(tfTargets, tfTargets$gene), function(enrOneGene){#按照基因拆分表格,依次做如下处理
highConfAnnot <- "**" %in% enrOneGene$annot#取出高置信度的motif
enrOneGeneByAnnot <- enrOneGene
if(highConfAnnot) enrOneGeneByAnnot <- enrOneGeneByAnnot[which(enrOneGene$annot == "**"),]#取出包含高置信度的基因
bestMotif <- which.max(enrOneGeneByAnnot$NES)#取出每个基因当中富集分数NES最高的
cbind(TF=unique(enrOneGene$TF), gene=unique(enrOneGene$gene), nMotifs=nrow(enrOneGene),
bestMotif=as.character(enrOneGeneByAnnot[bestMotif,"motif"]), NES=as.numeric(enrOneGeneByAnnot[bestMotif,"NES"]),
highConfAnnot=highConfAnnot)#将以上数据合并
})), stringsAsFactors=FALSE)
tfTable[order(tfTable$NES, decreasing = TRUE),]
})
## [1] "Dlx1"
## [1] "Dlx5"
## [1] "Irf1"
## [1] "Sox10"
## [1] "Sox8"
## [1] "Sox9"
## [1] "Stat6"
## [1] "Tef"
regulonTargetsInfo <- rbindlist(regulonTargetsInfo)#将循环的list转换为数据框
colnames(regulonTargetsInfo) <- c("TF", "gene", "nMotifs", "bestMotif", "NES", "highConfAnnot")
head(regulonTargetsInfo)#即将每个module中的最佳motif取出,与靶基因、motif形成一个regulon
## TF gene nMotifs bestMotif NES
## 1: Dlx1 Scml4 24 hocomoco__GBX2_HUMAN.H11MO.0.D 4.2
## 2: Dlx1 Arl4c 24 taipale_cyt_meth__HOXA5_NYAATTAN_eDBD_meth 4.16
## 3: Dlx1 Ddr2 15 taipale_cyt_meth__HOXA5_NYAATTAN_eDBD_meth 4.16
## 4: Dlx1 Fstl4 18 taipale_cyt_meth__HOXA5_NYAATTAN_eDBD_meth 4.16
## 5: Dlx1 Gm5607 9 taipale_cyt_meth__HOXA5_NYAATTAN_eDBD_meth 4.16
## 6: Dlx1 Ldhb 23 taipale_cyt_meth__HOXA5_NYAATTAN_eDBD_meth 4.16
## highConfAnnot
## 1: FALSE
## 2: FALSE
## 3: FALSE
## 4: FALSE
## 5: FALSE
## 6: FALSE
#下面这步可选,目的是加入GENIE3 score(并不是用来构建regulons的),如果你在上面运行的是GRNBOOST而非GENIE3,则可忽略这一步
linkList <- loadInt(scenicOptions, "genie3ll", ifNotExists="null")
if(!is.null(linkList) & ("weight" %in% colnames(linkList)))
{
if(is.data.table(linkList)) linkList <- as.data.frame(linkList)
uniquePairs <- nrow(unique(linkList[,c("TF", "Target")]))
if(uniquePairs == nrow(linkList)) {
linkList <- linkList[which(linkList$weight>=getSettings(scenicOptions, "modules/weightThreshold")),]
rownames(linkList) <- paste(linkList$TF, linkList$Target,sep="__")
regulonTargetsInfo <- cbind(regulonTargetsInfo, Genie3Weight=linkList[paste(regulonTargetsInfo$TF, regulonTargetsInfo$gene,sep="__"),"weight"])
}else {
warning("There are duplicated regulator-target (gene id/name) pairs in the co-expression link list.",
"\nThe co-expression weight was not added to the regulonTargetsInfo table.")
}
}else warning("It was not possible to add the weight to the regulonTargetsInfo table.")
saveRDS(regulonTargetsInfo, file=getIntName(scenicOptions, "regulonTargetsInfo"))#即最终的regulon信息被存在"int/2.5_regulonTargetsInfo.Rds"
write.table(regulonTargetsInfo, file=getOutName(scenicOptions, "s2_regulonTargetsInfo"),
sep="\t", col.names=TRUE, row.names=FALSE, quote=FALSE)#以文本格式输出最终的regulon信息于"output/Step2_regulonTargetsInfo.tsv"
根据motif注释划分regulons
regulonTargetsInfo_splitByAnnot <- split(regulonTargetsInfo, regulonTargetsInfo$highConfAnnot)#按照注释的置信度划分
regulons <- NULL
if(!is.null(regulonTargetsInfo_splitByAnnot[["TRUE"]]))
{
regulons <- lapply(split(regulonTargetsInfo_splitByAnnot[["TRUE"]], regulonTargetsInfo_splitByAnnot[["TRUE"]][,"TF"]), function(x) sort(as.character(unlist(x[,"gene"]))))
}#执行度搞的保留TF名称
regulons_extended <- NULL
if(!is.null(regulonTargetsInfo_splitByAnnot[["FALSE"]]))
{
regulons_extended <- lapply(split(regulonTargetsInfo_splitByAnnot[["FALSE"]],regulonTargetsInfo_splitByAnnot[["FALSE"]][,"TF"]), function(x) unname(unlist(x[,"gene"])))
regulons_extended <- setNames(lapply(names(regulons_extended), function(tf) sort(unique(c(regulons[[tf]], unlist(regulons_extended[[tf]]))))), names(regulons_extended))
names(regulons_extended) <- paste(names(regulons_extended), "_extended", sep="")
}#置信度低的重命名为TF名称_extended
regulons <- c(regulons, regulons_extended)
saveRDS(regulons, file=getIntName(scenicOptions, "regulons"))#将regulons信息(TF与其对应基因)以geneset的形式存在"int/2.6_regulons_asGeneSet.Rds"
incidList <- reshape2::melt(regulons)#转换为长格式数据
head(incidList)
## value L1
## 1 Ablim1 Dlx1
## 2 Ache Dlx1
## 3 Adarb2 Dlx1
## 4 Adcyap1r1 Dlx1
## 5 AI854703 Dlx1
## 6 Alk Dlx1
incidMat <- table(incidList[,2], incidList[,1])
saveRDS(incidMat, file=getIntName(scenicOptions, "regulons_incidMat"))
if(getSettings(scenicOptions, "verbose"))
{
# Number of regulons and summary of sizes:
length(regulons)
summary(lengths(regulons))
}
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 41.0 105.2 128.5 162.0 165.2 448.0
scenicOptions@status$current <- 2
#如果程序中断了,可以这样查看运行阶段,返回的数字为已运行完的step:
scenicOptions@status$current
## [1] 2
[1]Aibar S, González-Blas CB, Moerman T, Huynh-Thu VA, Imrichova H, Hulselmans G, Rambow F, Marine JC, Geurts P, Aerts J, van den Oord J, Atak ZK, Wouters J, Aerts S. SCENIC: single-cell regulatory network inference and clustering. Nat Methods. 2017 Nov;14(11):1083-1086. doi: 10.1038/nmeth.4463. Epub 2017 Oct 9.
[2]https://github.com/aertslab/SCENIC
单细胞系列教程目录
如何联系我们
https://mp.weixin.qq.com/s/5rZ5yLXaLDVAtQbtGHAHoQ