GreenleafLab / ArchR

ArchR : Analysis of Regulatory Chromatin in R (www.ArchRProject.com)
MIT License
384 stars 137 forks source link

module scores (eg from genescorematrix, like seurat addModuleScore) #308

Closed juliabelk closed 2 years ago

juliabelk commented 4 years ago

Would it be possible to add a function to combine multiple gene scores? Such as the addModuleScore function in Seurat?

https://rdrr.io/github/satijalab/seurat/man/AddModuleScore.html

Thanks!

rcorces commented 4 years ago

+1 I like this idea

jgranja24 commented 4 years ago

Hi @juliabelk,

Sorry for the delays. I am now adding an addModuleScore to the new release. To use this you can use the following little tutorial below-


#Load Hidden Helper Functions
fn <- unclass(lsf.str(envir = asNamespace("ArchR"), all = TRUE))
for (i in seq_along(fn)) {
    tryCatch({
        eval(parse(text = paste0(fn[i], "<-ArchR:::", fn[i])))
    }, error = function(x) {
    })
}

#New Function
addModuleScore <- function(
    ArchRProj = NULL,
    useMatrix = NULL,
    features = NULL,
    groupBy = "Clusters",
    nBin = 25,
    nBgd = 100,
    name = "Module",
    seed = 1,
    threads = getArchRThreads(),
    logFile = createLogFile("addModuleScore")
    ){

    if(!is.null(seed)) set.seed(seed)

    #Get Feature DF
    featureDF <- ArchR:::.getFeatureDF(head(getArrowFiles(ArchRProj),2), subGroup=useMatrix)
    rownames(featureDF) <- paste0(featureDF$seqnames, ":", featureDF$idx)
    featureDF$Match <- seq_len(nrow(featureDF))

    if(useMatrix %ni% getAvailableMatrices(ArchRProj)){
        stop("useMatrix not in available matrices! See getAvailableMatrices!")
    }

    matrixClass <- h5read(getArrowFiles(ArchRProj)[1], paste0(useMatrix, "/Info/Class"))

    if(matrixClass == "Sparse.Assays.Matrix"){
        if(!all(unlist(lapply(unlist(features), function(x) grepl(":",x))))){
          .logMessage("When accessing features from a matrix of class Sparse.Assays.Matrix it requires seqnames\n(denoted by seqnames:name) specifying to which assay to pull the feature from.\nIf confused, try getFeatures(ArchRProj, useMatrix) to list out available formats for input!", logFile = logFile)
          stop("When accessing features from a matrix of class Sparse.Assays.Matrix it requires seqnames\n(denoted by seqnames:name) specifying to which assay to pull the feature from.\nIf confused, try getFeatures(ArchRProj, useMatrix) to list out available formats for input!")
        }
    }

    if(grepl(":",unlist(features)[1])){

        sname <- stringr::str_split(unlist(features),pattern=":",simplify=TRUE)[,1]
        name <- stringr::str_split(unlist(features),pattern=":",simplify=TRUE)[,2]

        idx <- lapply(seq_along(name), function(x){
          ix <- intersect(which(tolower(name[x]) == tolower(featureDF$name)), BiocGenerics::which(tolower(sname[x]) == tolower(featureDF$seqnames)))
          if(length(ix)==0){
            .logStop(sprintf("FeatureName (%s) does not exist! See getFeatures", name[x]), logFile = logFile)
          }
          ix
        }) %>% unlist

    }else{

        idx <- lapply(seq_along(unlist(features)), function(x){
          ix <- which(tolower(unlist(features)[x]) == tolower(featureDF$name))[1]
          if(length(ix)==0){
            .logStop(sprintf("FeatureName (%s) does not exist! See getFeatures", unlist(features)[x]), logFile = logFile)
          }
          ix
        }) %>% unlist

    }

    if(is.null(names(features))){
        names(features) <- paste0(name, seq_along(features))
    }else{
        names(features) <- paste0(name, ".", names(features))
    }

    featuresUse <- featureDF[idx,]
    featuresUse$Module <- Rle(stack(features)[,2])

    #Get Averages
    rS <- ArchR:::.getRowSums(ArrowFiles = getArrowFiles(ArchRProj), useMatrix = useMatrix)
    rS <- rS[order(rS[,3]), ]
    rS$Bins <- Rle(ggplot2::cut_number(x = rS[,3] + rnorm(length(rS[,3]))/1e30, n = nBin, labels = FALSE, right = FALSE))
    rS$Match <- match(paste0(rS$seqnames, ":", rS$idx), rownames(featureDF))

    if(nBgd > min(rS$Bins@lengths)){
        stop("nBgd must be lower than ", min(rS$Bins@lengths), "!")
    }

    idxMatch <- match(paste0(featuresUse$seqnames, ":", featuresUse$idx), paste0(rS$seqnames, ":", rS$idx))
    featuresUse$Bins <- as.vector(rS$Bins[idxMatch])

    #MakeLists
    featureList <- split(featuresUse$Match, featuresUse$Module)
    moduleList <- split(featuresUse$Bins, featuresUse$Module)
    binList <- split(rS$Match, rS$Bins)

    dfM <- lapply(seq_along(featureList), function(x){
        message("Computing Module ",x, " of ", length(featureList))
        binx <- binList[moduleList[[x]]]
        idxFgd <- featureList[[x]]
        idxBgd <- unlist(lapply(binx, function(x) sample(x, nBgd)), use.names=FALSE)
        m <- ArchR:::.getPartialMatrix(
            ArrowFiles = getArrowFiles(ArchRProj),
            featureDF = featureDF[c(idxFgd, idxBgd), ],
            useMatrix = useMatrix,
            cellNames = ArchRProj$cellNames,
            threads = threads,
            verbose = FALSE,
            doSampleCells = FALSE
        )
        Matrix::colMeans(m[seq_along(idxFgd), ]) - Matrix::colMeans(m[-seq_along(idxFgd), ])
    }) %>% Reduce("cbind", .)

    for(x in seq_len(ncol(dfM))){
        ArchRProj <- addCellColData(ArchRProj, data = dfM[,x], name=names(featureList)[x], cells=rownames(dfM), force = TRUE)
    }

    ArchRProj

}

#Test Function
projHeme3 <- addImputeWeights(projHeme3)
features <- list(
    BScore = c("MS4A1", "CD79A", "CD74"),
    TScore = c("CD3D", "CD8A", "GZMB", "CCR7", "LEF1")
)
projHeme3 <- addModuleScore(projHeme3, features = features, useMatrix = "GeneScoreMatrix")
p1 <- plotEmbedding(projHeme3, name="Module.TScore", imputeWeights = getImputeWeights(projHeme3))
p2 <- plotEmbedding(projHeme3, name="Module.BScore", imputeWeights = getImputeWeights(projHeme3))
plotPDF(ggAlignPlots(p1,p2,draw=F,type="h"))
Screen Shot 2020-10-14 at 7 35 26 PM

Hope this helps

Jeff

juliabelk commented 3 years ago

Just a follow up to this - is it possible to do a ModuleScore of peaks? I have been trying but

getFeatures(proj,useMatrix = "PeakMatrix")

seems to return NULL

rcorces commented 3 years ago

getFeatures() returns NULL because the PeakMatrix features dont have a column called name. Not sure if this is by design. @jgranja24 ?

dagarfield commented 3 years ago

Hey all -- I think I encountered an issue with this function in cases where you have only one set of genes. That is to say, this works as a feature set: features <- list( earlyPMC = earlyPMC, germMarkers = germMarkers )

But this doesn't: features <- list( earlyPMC = earlyPMC)

e.g

ArchRProj <- addModuleScore(ArchRProj, features = features, useMatrix = "GeneScoreMatrix")

Computing Module 1 of 1

Warning message in seq_len(ncol(dfM)): “first element used of 'length.out' argument” Error in seq_len(ncol(dfM)): argument must be coercible to non-negative integer Traceback:

  1. addModuleScore(projSpurp_doubletRemoved_filtered2, features = features, . useMatrix = "GeneScoreMatrix")

The issue seems to come from the very last part of the function for (x in seq_len(ncol(dfM))) { ArchRProj <- addCellColData(ArchRProj, data = dfM[, x], name = names(featureList)[x], cells = rownames(dfM), force = TRUE) }

This is because you get back a vector for dfM rather than a matrix/dataframe if your input list for features contains only a single set of genes. It is easy to fix by also adding a dummy feature set, but is not intuitive to the user.

rcorces commented 3 years ago

Thanks @dagarfield - @jgranja24 will take a look.

Also, you can use permalinks to point us to the exact line of code you're referring to. Like this:

https://github.com/GreenleafLab/ArchR/blob/2f022a448d8248a0f9afb33419bcbaeafe7731c0/R/ModuleScore.R#L97

NoemieL commented 3 years ago

Hi. This is very nice and it works well with GeneScoreMatrix. I wanted to do the same with GeneIntegrationMatrix but it is not adapted and returns this error:

Error in addModuleScore(proj_all_3, features = c("TNFRSF17", "SDC1", "CD19", : nBgd must be lower than 1!

Do you think it is possible to manage it? Thanks

rcorces commented 3 years ago

Looking at the code, it looks like addModuleScore() may not have been coded to handle gene expression? But i agree that this is an important feature to have. @jgranja24 ?

j-andrews7 commented 3 years ago

Note that the documentation for this function appears to be for the addImputeWeights() function currently: https://www.archrproject.com/reference/addModuleScore.html

Would also really like to see this implemented for gene expression.

rcorces commented 3 years ago

@j-andrews7 - thanks for pointing that out. I fixed the documentation via https://github.com/GreenleafLab/ArchR/commit/9366af8b6022f4d7f7d6ff3f65b16725c32fdde4 in release_1.0.2 branch

Still looking to carve out time to implement this for gene expression.

NoemieL commented 2 years ago

Hi, I get the following error when running addModuleScore while I did not get it before, do you have any idea what the problem is?

features <- list( BScore = Bcell, PlasmaScore = Plasmacell ) head(Bcell) [1] "AICDA" "AID" "ARNTL2" "BACH2" "BATF" "BATF3"

class(Bcell) [1] "character"

head(Plasmacell) [1] "AKT1" "ATF4" "ATF6" "BATF2" "BHLHA15" "BLIMP1"

class(Plasmacell) [1] "character"

ArchR_proj <- addModuleScore(MM_PreVsPost_KL_peak, features = features, useMatrix = "GeneScoreMatrix", name="Module_GeneScore") Error in .validInput(input = features, name = "features", valid = c("character")) : Input value for 'features' is not a character, (features = list) please supply valid input!

Thanks in advance

NoemieL commented 2 years ago

After reinstalling the last release, it looks like it is fine. devtools::install_github("GreenleafLab/ArchR", ref="release_1.0.2", repos = BiocManager::repositories())

rcorces commented 2 years ago

addModuleScore() has been updated and should work with all data types. The following changes were made on the dev branch and will be incorporated into release_1.0.3

https://github.com/GreenleafLab/ArchR/commit/44a0d27840d4e11a6ee04bd9f26f01761b1ff017#diff-b31996f658c3e45aeb4f21aa5f5ddc7cc150d4383da40f3a1a3955351ce32887 https://github.com/GreenleafLab/ArchR/commit/f49141f6c2a9c25dc27784d91af40135c8f38ee2#diff-b31996f658c3e45aeb4f21aa5f5ddc7cc150d4383da40f3a1a3955351ce32887 https://github.com/GreenleafLab/ArchR/commit/9183c43f237c8d5cfa7d8db0cb99421482a857a3#diff-b31996f658c3e45aeb4f21aa5f5ddc7cc150d4383da40f3a1a3955351ce32887

Ping-lin14 commented 2 years ago

addModuleScore() has been updated and should work with all data types. The following changes were made on the dev branch and will be incorporated into release_1.0.3

44a0d27 #diff-b31996f658c3e45aeb4f21aa5f5ddc7cc150d4383da40f3a1a3955351ce32887 f49141f #diff-b31996f658c3e45aeb4f21aa5f5ddc7cc150d4383da40f3a1a3955351ce32887 9183c43 #diff-b31996f658c3e45aeb4f21aa5f5ddc7cc150d4383da40f3a1a3955351ce32887

Hi @rcorces I've updated to the latest version using devtools::install_github("GreenleafLab/ArchR", ref="release_1.0.3", repos = BiocManager::repositories()), but addModuleScore still doesn't seem to work with GeneExpressionMatrix.

proj2 <- addModuleScore(proj2, features = features, useMatrix = "GeneExpressionMatrix")

Error in addModuleScore(proj2, features = features, useMatrix = "GeneExpressionMatrix") : 
  nBgd must be lower than 1!
rcorces commented 2 years ago

@Ping-lin14 - I'm unable to recapitulate your error using my own dataset.

> features <- list(
  BScore = c("MS4A1", "CD79A", "CD74"),
  TScore = c("CD3D", "CD8A", "GZMB", "CCR7", "LEF1")
)
> projMulti3 <- addModuleScore(projMulti3,
+                             useMatrix = "GeneExpressionMatrix",
+                             name = "Module",
+                             features = features)
Computing Module 1 of 2
Computing Module 2 of 2
Overriding previous entry for Module.BScore
Overriding previous entry for Module.TScore

> plotEmbedding(projMulti3,
+                     embedding = "UMAP_Combined",
+                     colorBy = "cellColData",
+                     name="Module.BScore",
+                     imputeWeights = getImputeWeights(projMulti3))
Getting ImputeWeights
ArchR logging to : ArchRLogs/ArchR-plotEmbedding-ffb154258efcc-Date-2022-09-13_Time-16-36-24.log
If there is an issue, please report to github with logFile!
Imputing Matrix
Plotting Embedding
1 
ArchR logging successful to : ArchRLogs/ArchR-plotEmbedding-ffb154258efcc-Date-2022-09-13_Time-16-36-24.log
> 

image

A-legac45 commented 10 months ago

Hello, I am not able to make it work

projMulti3 <- addModuleScore(project_Peaks_MACS2_RES0.5, useMatrix = "GeneExpressionMatrix",name = "Module",features = features) Error in addModuleScore(project_Peaks_MACS2_RES0.5, useMatrix = "GeneExpressionMatrix", : nBgd must be lower than 1!

A-legac45 commented 10 months ago

My code is the following,

features <- list( BScore = c("MS4A1", "CD79A", "CD74"), TScore = c("CD3D", "CD8A", "GZMB", "CCR7", "LEF1") ) projMulti3 <- addModuleScore(project_Peaks_MACS2_RES0.5, useMatrix = "GeneExpressionMatrix",name = "Module",features = features)

dgagler commented 1 month ago

Chiming in to say that I am also unable to get this to work using an integrated GEX matrix with the dev branch. Gene score is fine though.

My best guess is that it has something to do with line 121 in ModuleScore.R (44a0d27 #diff-b31996f658c3e45aeb4f21aa5f5ddc7cc150d4383da40f3a1a3955351ce32887 f49141f #diff-) which calls on the variable nBin which is seemingly undefined, leading to bin values of 1. Not sure why this would be exclusive to GEX matrices tho.