scanning peaks for transcription binding sites

Hi @rojinsafavi, thanks for your question. Please try to use proper labels as this is a documentation sort of question and not a bug. The question you are asking can be solved below. I don't think subsetting by fragment positions is really necessary when you have sequenced deep. The reason for the larger difference is because I downsampled this dataset extremely small and removed lots of cells.


library(ArchR)

proj <- getTestProject()
positions <- getPositions(proj)
# > positions
# GRangesList object of length 870:
# $TFAP2B_1
# GRanges object with 2929 ranges and 1 metadata column:
#          seqnames              ranges strand |     score
#             <Rle>           <IRanges>  <Rle> | <numeric>
#      [1]     chr1       860792-860803      - |   8.14926
#      [2]     chr1       873916-873927      + |   8.07230
#      [3]     chr1       873916-873927      - |   8.07230
#      [4]     chr1       875505-875516      + |  10.03060
#      [5]     chr1       875505-875516      - |   8.86681
#      ...      ...                 ...    ... .       ...
#   [2925]     chr2 242809493-242809504      + |   7.87604
#   [2926]     chr2 242809450-242809461      - |   7.92003
#   [2927]     chr2 242813376-242813387      + |   7.85617
#   [2928]     chr2 242944266-242944277      + |   8.58860
#   [2929]     chr2 242944266-242944277      - |   8.26546
#   -------
#   seqinfo: 2 sequences from an unspecified genome; no seqlengths

# ...
# <869 more elements>

allFragments <- getFragmentsFromProject(proj)
# > allFragments
# List of length 1
# names(1): PBMCSmall

#Unlist
allFragments <- unlist(allFragments, use.names=FALSE)
# > allFragments
# GRanges object with 1798429 ranges and 1 metadata column:
#             seqnames              ranges strand |                     RG
#                <Rle>           <IRanges>  <Rle> |                  <Rle>
#         [1]     chr1         17171-17524      * | PBMCSmall#CCGTGAGAGA..
#         [2]     chr1       138313-138531      * | PBMCSmall#CCGTGAGAGA..
#         [3]     chr1       531909-532086      * | PBMCSmall#CCGTGAGAGA..
#         [4]     chr1       538911-538943      * | PBMCSmall#CCGTGAGAGA..
#         [5]     chr1       568247-568376      * | PBMCSmall#CCGTGAGAGA..
#         ...      ...                 ...    ... .                    ...
#   [1798425]     chr2 170590266-170590506      * | PBMCSmall#GTTGGTACAC..
#   [1798426]     chr2 174829442-174829503      * | PBMCSmall#GTTGGTACAC..
#   [1798427]     chr2 175117762-175117952      * | PBMCSmall#GTTGGTACAC..
#   [1798428]     chr2 179343258-179343345      * | PBMCSmall#GTTGGTACAC..
#   [1798429]     chr2 187350772-187350807      * | PBMCSmall#GTTGGTACAC..

#Subset By those Overlapping Fragment Regions ### SLOW
positions2 <- parallel::mclapply(seq_along(positions), function(x){
    message(x)
    subsetByOverlaps(positions[[x]], allFragments, ignore.strand=TRUE)
}, mc.cores = 16, mc.preschedule=FALSE)
names(positions2) <- names(positions)
positions2 <- as(positions2, "GRangesList")
# > positions2
# GRangesList object of length 870:
# $TFAP2B_1
# GRanges object with 2734 ranges and 1 metadata column:
#          seqnames              ranges strand |     score
#             <Rle>           <IRanges>  <Rle> | <numeric>
#      [1]     chr1       860792-860803      - |   8.14926
#      [2]     chr1       873916-873927      + |   8.07230
#      [3]     chr1       873916-873927      - |   8.07230
#      [4]     chr1       875505-875516      + |  10.03060
#      [5]     chr1       875505-875516      - |   8.86681
#      ...      ...                 ...    ... .       ...
#   [2730]     chr2 242809493-242809504      + |   7.87604
#   [2731]     chr2 242809450-242809461      - |   7.92003
#   [2732]     chr2 242813376-242813387      + |   7.85617
#   [2733]     chr2 242944266-242944277      + |   8.58860
#   [2734]     chr2 242944266-242944277      - |   8.26546
#   -------
#   seqinfo: 2 sequences from an unspecified genome; no seqlengths

# ...
# <869 more elements>

#How Different
cbPos <- cbind(unlist(lapply(positions, length)), unlist(lapply(positions2, length)))

#See How Correlated
cor(log10(cbPos[,1]), log10(cbPos[,2]))
# [1] 0.9961276

#Difference Range %
median(abs(cbPos[,1] - cbPos[,2]) / rowMeans(cbPos))
# [1] 0.1383671

GreenleafLab / ArchR

scanning peaks for transcription binding sites #598