GreenleafLab / ArchR

ArchR : Analysis of Regulatory Chromatin in R (www.ArchRProject.com)
MIT License
382 stars 137 forks source link

Add gChromvar for GWAS researchers? #221

Closed ParthaZain closed 3 years ago

ParthaZain commented 4 years ago

Hi all, thank you for an amazing, easy to use and well-documented package that allows relative bioinformatics newbies to start doing quite sophisticated analyses.

We are a GWAS lab and would be interested in looking at cell type enrichment for our association signals. We routinely do this with bulk ATAC-Seq datasets but would be interested in reproducing the kind of analysis in figure 6 of this paper (https://www.nature.com/articles/s41588-019-0362-6) but for much larger scATAC-Seq datasets (eg the hematopoiesis dataset produced by your lab in Granja et al 2019).

Are there any plans to add g-chromVar on top of your existing chromVar implementation? Would it be straightforward to just do this on my own without you having to add a specific function to the package?

jgranja24 commented 3 years ago

Sorry @ParthaZain for the long delay... This has always been functional if you supply the GWAS based matches file to ArchR. However, here is a walkthrough for simplicity showing ArchR = gchromVAR


#New Release Branch
devtools::install_github("GreenleafLab/ArchR", ref="release_1.0.0", repos = BiocManager::repositories())

#######
devtools::install_github("caleblareau/gchromVAR")
library(chromVAR)
library(gchromVAR)
library(SummarizedExperiment)
library(data.table)
library(BiocParallel)
library(BSgenome.Hsapiens.UCSC.hg19)
set.seed(123)
register(MulticoreParam(2))

#######################
# New Result
#######################
library(ArchR)
proj <- getTestProject()

#Get Peak Matrix
se <- getMatrixFromProject(proj, "PeakMatrix")
rownames(se) <- paste0(seqnames(se), "_", start(se), "_",end(se))
peaks <- getPeakSet(proj)
names(peaks) <- paste0(seqnames(peaks), "_", start(peaks), "_",end(peaks))
mcols(se)$bias <- peaks[rownames(se)]$GC

#Get Background Peaks
proj <- addBgdPeaks(proj)
bgdPeaks <- getBgdPeaks(proj)
rownames(bgdPeaks) <- paste0(seqnames(bgdPeaks), "_", start(bgdPeaks), "_",end(bgdPeaks))
bgdPeaks <- bgdPeaks[rownames(se),]

#Get Bed GWAS using gChromVAR
files <- list.files(system.file('extdata/paper/PP001',package='gchromVAR'),
                    full.names = TRUE, pattern = "*.bed$")
ukbb <- gchromVAR::importBedScore(rowRanges(se), files, colidx = 5)

################
# gChromVAR
################

#Weighted Deviations
names(assays(se)) <- "counts"
ukbb_wDEV <- computeWeightedDeviations(se, ukbb, background_peaks = assay(bgdPeaks))
zdf <- reshape2::melt(t(assays(ukbb_wDEV)[["z"]]))
zdf[,2] <- gsub("_PP001", "", zdf[,2])
colnames(zdf) <- c("ct", "tr", "Zscore")
head(zdf, 25)

#                              ct         tr      Zscore
# 1  PBMCSmall#TATCTGTAGACAGCTG-1 BASO_COUNT -0.30860295
# 2  PBMCSmall#ATGGATCCAGGCAAGT-1 BASO_COUNT -0.39312881
# 3  PBMCSmall#CACATGAAGGCCTAAG-1 BASO_COUNT -0.30409017
# 4  PBMCSmall#CAAAGCTGTCTGATTG-1 BASO_COUNT -0.13592679
# 5  PBMCSmall#CTCATGCAGCAGTAGC-1 BASO_COUNT -0.54970951
# 6  PBMCSmall#CTGAATGAGCAGAATT-1 BASO_COUNT -0.41639709
# 7  PBMCSmall#TCAGGTAAGAGCAGCT-1 BASO_COUNT -0.66608152
# 8  PBMCSmall#CCCAGAGTCTTTATCG-1 BASO_COUNT -0.43582579
# 9  PBMCSmall#AGGCCCAAGTCTGCTA-1 BASO_COUNT  0.25856140
# 10 PBMCSmall#ATTTGTCCAATGCCAT-1 BASO_COUNT -0.22555038
# 11 PBMCSmall#GTCACGGAGCTCGGCT-1 BASO_COUNT -0.26471028
# 12 PBMCSmall#CGCAGGTAGGAGTCTG-1 BASO_COUNT -0.17128822
# 13 PBMCSmall#CCTGCTACAATGGCAG-1 BASO_COUNT -0.27680395
# 14 PBMCSmall#TTACGGAAGAGGTCCA-1 BASO_COUNT -0.47225230
# 15 PBMCSmall#GCACGGTCAACACGGA-1 BASO_COUNT -0.68415552
# 16 PBMCSmall#GTCACTCAGCTCCATA-1 BASO_COUNT  0.15649160
# 17 PBMCSmall#AGTTTGGAGCAATGTA-1 BASO_COUNT -0.37026848
# 18 PBMCSmall#ACGTTAGGTGCACTTA-1 BASO_COUNT -0.27937833
# 19 PBMCSmall#TAATCGGGTTCTCGAA-1 BASO_COUNT -0.34514003
# 20 PBMCSmall#TTGTTGTTCGACTATG-1 BASO_COUNT -0.12945685
# 21 PBMCSmall#AAATGAGCAGATGGCA-1 BASO_COUNT -0.26072312
# 22 PBMCSmall#GGTAGGAAGTAGTGTA-1 BASO_COUNT -0.04414560
# 23 PBMCSmall#TTACGGACATCGCCTT-1 BASO_COUNT -0.04297001
# 24 PBMCSmall#CACATGATCCTGGAAT-1 BASO_COUNT -0.45827856
# 25 PBMCSmall#CATGCCTCATTTGTTC-1 BASO_COUNT -0.29474687

################
# ArchR
################

#Compute With ArchR
proj <- addDeviationsMatrix(proj, matches = ukbb, matrixName = "UKBBMatrix")
seUK <- getMatrixFromProject(proj, "UKBBMatrix")
zdf2 <- reshape2::melt(t(as.matrix(assays(seUK)[["z"]])))
zdf2[,2] <- gsub("_PP001", "", zdf2[,2])
colnames(zdf2) <- c("ct", "tr", "Zscore")
head(zdf2, 25)

#                              ct         tr      Zscore
# 1  PBMCSmall#TATCTGTAGACAGCTG-1 BASO_COUNT -0.30860295
# 2  PBMCSmall#ATGGATCCAGGCAAGT-1 BASO_COUNT -0.39312881
# 3  PBMCSmall#CACATGAAGGCCTAAG-1 BASO_COUNT -0.30409017
# 4  PBMCSmall#CAAAGCTGTCTGATTG-1 BASO_COUNT -0.13592679
# 5  PBMCSmall#CTCATGCAGCAGTAGC-1 BASO_COUNT -0.54970951
# 6  PBMCSmall#CTGAATGAGCAGAATT-1 BASO_COUNT -0.41639709
# 7  PBMCSmall#TCAGGTAAGAGCAGCT-1 BASO_COUNT -0.66608152
# 8  PBMCSmall#CCCAGAGTCTTTATCG-1 BASO_COUNT -0.43582579
# 9  PBMCSmall#AGGCCCAAGTCTGCTA-1 BASO_COUNT  0.25856140
# 10 PBMCSmall#ATTTGTCCAATGCCAT-1 BASO_COUNT -0.22555038
# 11 PBMCSmall#GTCACGGAGCTCGGCT-1 BASO_COUNT -0.26471028
# 12 PBMCSmall#CGCAGGTAGGAGTCTG-1 BASO_COUNT -0.17128822
# 13 PBMCSmall#CCTGCTACAATGGCAG-1 BASO_COUNT -0.27680395
# 14 PBMCSmall#TTACGGAAGAGGTCCA-1 BASO_COUNT -0.47225230
# 15 PBMCSmall#GCACGGTCAACACGGA-1 BASO_COUNT -0.68415552
# 16 PBMCSmall#GTCACTCAGCTCCATA-1 BASO_COUNT  0.15649160
# 17 PBMCSmall#AGTTTGGAGCAATGTA-1 BASO_COUNT -0.37026848
# 18 PBMCSmall#ACGTTAGGTGCACTTA-1 BASO_COUNT -0.27937833
# 19 PBMCSmall#TAATCGGGTTCTCGAA-1 BASO_COUNT -0.34514003
# 20 PBMCSmall#TTGTTGTTCGACTATG-1 BASO_COUNT -0.12945685
# 21 PBMCSmall#AAATGAGCAGATGGCA-1 BASO_COUNT -0.26072312
# 22 PBMCSmall#GGTAGGAAGTAGTGTA-1 BASO_COUNT -0.04414560
# 23 PBMCSmall#TTACGGACATCGCCTT-1 BASO_COUNT -0.04297001
# 24 PBMCSmall#CACATGATCCTGGAAT-1 BASO_COUNT -0.45827856
# 25 PBMCSmall#CATGCCTCATTTGTTC-1 BASO_COUNT -0.29474687

################
# Test Equivalent
################

identical(round(zdf2[,3], 5), round(zdf[,3], 5))
#[1] TRUE

Best

Jeff

jgranja24 commented 3 years ago

I am closing this issue. Feel free to open new ones!