GreenleafLab / ArchR

ArchR : Analysis of Regulatory Chromatin in R (www.ArchRProject.com)
MIT License
384 stars 137 forks source link

ERROR Found in .addGeneScoreMat TmpGS for PBMC #2158

Open zcaiwei opened 5 months ago

zcaiwei commented 5 months ago

Describe the bug

Here's a bug I encountered creating ArrowFiles, the following is my code :

library(ArchR)
addArchRGenome("hg38")

if (!requireNamespace("BiocManager", quietly = TRUE))
install.packages("BiocManager")
BiocManager::install("BSgenome.Hsapiens.UCSC.hg38")
library(BSgenome.Hsapiens.UCSC.hg38)
genomeAnnotation <- createGenomeAnnotation(genome = BSgenome.Hsapiens.UCSC.hg38)
genomeAnnotation
BiocManager::install("TxDb.Hsapiens.UCSC.hg38.knownGene")
BiocManager::install("org.Hs.eg.db")
library(TxDb.Hsapiens.UCSC.hg38.knownGene)
library(org.Hs.eg.db)
geneAnnotation <- createGeneAnnotation(TxDb = TxDb.Hsapiens.UCSC.hg38.knownGene, OrgDb = org.Hs.eg.db)
genomeAnnotation

ATACFiles <- "PBMC_normalized_scATAC_fragment_filter.tsv.gz"
ATACNames <- c("PBMC")
ArrowFiles <- createArrowFiles(
inputFiles = ATACFiles,
sampleNames = ATACNames,
filterTSS = 0.5, #Dont set this too high because you can always increase later
filterFrags = 1000,
addTileMat = TRUE,
addGeneScoreMat = TRUE
)

And met the error that,

2024-04-20 16:06:21.96756 : ERROR Found in .addGeneScoreMat for (PBMC : 1 of 1)
LogFile = ArchRLogs/ArchR-createArrows-23b777aa4ca9-Date-2024-04-20_Time-16-00-33.055021.log

<simpleError in .safelapply(seq_along(geneRegions), function(z) { totalGSz <- tryCatch({ .logDiffTime(sprintf("Creating Temp GeneScoreMatrix for %s, Chr (%s of %s)!", sampleName, z, length(geneRegions)), tstart, verbose = FALSE, logFile = logFile) geneRegionz <- geneRegions[[z]] geneRegionz <- geneRegionz[order(geneRegionz$idx)] chrz <- paste0(unique(seqnames(geneRegionz))) frag <- .getFragsFromArrow(ArrowFile, chr = chrz, out = "IRanges", cellNames = cellNames) fragSt <- trunc(start(frag)/tileSize) * tileSize fragEd <- trunc(end(frag)/tileSize) * tileSize fragBC <- rep(S4Vectors::match(mcols(frag)$RG, cellNames), 2) rm(frag) gc() uniqIns <- sort(unique(c(fragSt, fragEd))) matGS <- Matrix::sparseMatrix(i = match(c(fragSt, fragEd), uniqIns), j = as.vector(fragBC), x = rep(1, 2 * length(fragSt)), dims = c(length(uniqIns), length(cellNames))) if (!is.null(ceiling)) { matGS@x[matGS@x > ceiling] <- ceiling } uniqueTiles <- IRanges(start = uniqIns, width = tileSize) rm(uniqIns, fragSt, fragEd, fragBC) gc() if (useGeneBoundaries) { geneStartz <- start(GenomicRanges::resize(geneRegionz, 1, "start")) geneEndz <- start(GenomicRanges::resize(geneRegionz, 1, "end")) pminGene <- pmin(geneStartz, geneEndz) pmaxGene <- pmax(geneStartz, geneEndz) idxMinus <- BiocGenerics::which(strand(geneRegionz) != "-") pReverse <- rep(max(extendDownstream), length(pminGene)) pReverse[idxMinus] <- rep(max(extendUpstream), length(idxMinus)) pReverseMin <- rep(min(extendDownstream), length(pminGene)) pReverseMin[idxMinus] <- rep(min(extendUpstream), length(idxMinus)) pForward <- rep(max(extendUpstream), length(pminGene)) pForward[idxMinus] <- rep(max(extendDownstream), length(idxMinus)) pForwardMin <- rep(min(extendUpstream), length(pminGene)) pForwardMin[idxMinus] <- rep(min(extendDownstream), length(idxMinus)) s <- pmax(c(1, pmaxGene[-length(pmaxGene)] + tileSize), pminGene - pReverse) s <- pmin(pminGene - pReverseMin, s) e <- pmin(c(pminGene[-1] - tileSize, pmaxGene[length(pmaxGene)] + pForward[length(pmaxGene)]), pmaxGene + pForward) e <- pmax(pmaxGene + pForwardMin, e) extendedGeneRegion <- IRanges(start = s, end = e) idx1 <- which(pminGene - pReverseMin < start(extendedGeneRegion)) if (length(idx1) > 0) { stop("Error in gene boundaries minError") } idx2 <- which(pmaxGene + pForwardMin > end(extendedGeneRegion)) if (length(idx2) > 0) { stop("Error in gene boundaries maxError") } rm(s, e, pReverse, pReverseMin, pForward, pForwardMin, geneStartz, geneEndz, pminGene, pmaxGene) } else { extendedGeneRegion <- ranges(suppressWarnings(extendGR(geneRegionz, upstream = max(extendUpstream), downstream = max(extendDownstream)))) } tmp <- suppressWarnings(findOverlaps(extendedGeneRegion, uniqueTiles)) x <- distance(ranges(geneRegionz)[queryHits(tmp)], uniqueTiles[subjectHits(tmp)]) isMinus <- BiocGenerics::which(strand(geneRegionz) == "-") signDist <- sign(start(uniqueTiles)[subjectHits(tmp)] - start(GenomicRanges::resize(geneRegionz, 1, "start"))[queryHits(tmp)]) signDist[isMinus] <- signDist[isMinus] * -1 x <- x * signDist x <- eval(parse(text = geneModel)) x <- x * mcols(geneRegionz)$geneWeight[queryHits(tmp)] if (!is.null(blacklist)) { if (length(blacklist) > 0) { blacklistz <- blacklist[[chrz]] if (!is.null(blacklistz) | length(blacklistz) > 0) { tilesBlacklist <- 1 * (!overlapsAny(uniqueTiles, ranges(blacklistz))) if (sum(tilesBlacklist == 0) > 0) { x <- x * tilesBlacklist[subjectHits(tmp)] } } } } tmp <- Matrix::sparseMatrix(i = queryHits(tmp), j = subjectHits(tmp), x = x, dims = c(length(geneRegionz), nrow(matGS))) matGS <- tmp %*% matGS colnames(matGS) <- cellNames totalGSz <- Matrix::colSums(matGS) .safeSaveRDS(matGS, file = paste0(tmpFile, "-", chrz, ".rds"), compress = FALSE) rm(isMinus, signDist, extendedGeneRegion, uniqueTiles) rm(matGS, tmp) gc() totalGSz }, error = function(e) { errorList <- list(ArrowFile = ArrowFile, geneRegions = geneRegions, blacklist = blacklist, chr = chrz, totalGSz = if (exists("totalGSz", inherits = FALSE)) totalGSz else "totalGSz", matGS = if (exists("matGS", inherits = FALSE)) matGS else "matGS") .logError(e, fn = ".addGeneScoreMat TmpGS", info = sampleName, errorList = errorList, logFile = logFile) }) totalGSz}, threads = subThreads):
Error Found Iteration 11 :
[1] "Error in .logError(e, fn = ".addGeneScoreMat TmpGS", info = sampleName, : \n Exiting See Error Above\n"
<simpleError in .logError(e, fn = ".addGeneScoreMat TmpGS", info = sampleName, errorList = errorList, logFile = logFile): Exiting See Error Above>
Error Found Iteration 19 :
[1] "Error in .logError(e, fn = ".addGeneScoreMat TmpGS", info = sampleName, : \n Exiting See Error Above\n"
<simpleError in .logError(e, fn = ".addGeneScoreMat TmpGS", info = sampleName, errorList = errorList, logFile = logFile): Exiting See Error Above>
Error Found Iteration 23 :
[1] "Error in .logError(e, fn = ".addGeneScoreMat TmpGS", info = sampleName, : \n Exiting See Error Above\n"
<simpleError in .logError(e, fn = ".addGeneScoreMat TmpGS", info = sampleName, errorList = errorList, logFile = logFile): Exiting See Error Above>

Attach your log file ArchR-createArrows-23b777aa4ca9-Date-2024-04-20_Time-16-00-33.055021.log

To Reproduce When I used the tutorial hematopoiesis dataset, I ran successfully.

I don't know what's going wrong.

Thank you for your kind help!!

rcorces commented 5 months ago

Hi @zcaiwei! Thanks for using ArchR! Lately, it has been very challenging for me to keep up with maintenance of this package and all of my other responsibilities as a PI. I have not been responding to issue posts and I have not been pushing updates to the software. We are actively searching to hire a computational biologist to continue to develop and maintain ArchR and related tools. If you know someone who might be a good fit, please let us know! In the meantime, your issue will likely go without a reply. Most issues with ArchR right not relate to compatibility. Try reverting to R 4.1 and Bioconductor 3.15. Newer versions of Seurat and Matrix also are causing issues. Sorry for not being able to provide active support for this package at this time.

256wangliu commented 2 months ago

<simpleError in Matrix::sparseMatrix(i = queryHits(tmp), j = subjectHits(tmp), x = x, dims = c(length(geneRegionz), nrow(matGS))): length(x) must not exceed length(i)> It seems there may be an issue with your fragments due to insufficient reads. To address this, try setting minTSS = 0.