kstreet13 / scry

Collection of analysis methods for small count data generated by rafalab members (the methods, not the data).
19 stars 2 forks source link

No genes names when computing devianceFeatureSelection() with batch #25

Open Alexis-Varin opened 3 months ago

Alexis-Varin commented 3 months ago

Hello, I am encountering a problem when setting the batch parameter in devianceFeatureSelection(), I do not have the gene names associated with the deviance.

When doing

sce = devianceFeatureSelection(sds, nkeep = 5000)
deviance.genes = names(rowData(sce)$binomial_deviance)

I do obtain the genes names ordered by deviance, but when I would like to correct batch effect by adding factor of orig.ident :

sce = devianceFeatureSelection(sds, nkeep = 5000, batch = factor(colData(sds)$orig.ident))
deviance.genes = names(rowData(sce)$binomial_deviance)

The vector is empty, because rowData(sce)$binomial_deviance isn't a named numeric vector, just a numeric vector. I guess it comes from the fact that in your code, after computing deviance on each factor level, you rowSums() the results, but without renaming afterwards.

edit : I fixed it for me using the following modification of .compute_deviance_batch() :

    stopifnot(length(batch)==ncol(m))
    #each row is a gene, each column is deviance within a batch.
    #res<-matrix(0.0,nrow=nrow(m),ncol=nlevels(batch))
    tmp = list()
    for(i in seq_along(levels(batch))){
      b<-levels(batch)[i]
      #res[,i]<-.compute_deviance(m[,batch==b],fam=fam)
      tmp[[i]]<-.compute_deviance(m[,batch==b],fam=fam)
    }
    tmp = lapply(tmp, function(x) x[order(match(names(x), names(tmp[[1]])))])
    res = do.call(cbind,tmp)
    #deviance is additive across subsets of observations
    return(rowSums(res))
kstreet13 commented 3 months ago

Hi @Alexis-Varin,

Thanks for the notification about this! I also appreciate the suggested fix, but unfortunately, I wasn't able to verify that it gives the same answers. I implemented your version, but when I ran it on an example object (from the SingleCellExperiment documentation), it gave an odd error message:

ncells <- 100
u <- matrix(rpois(20000, 5), ncol=ncells)
v <- log2(u + 1)

pca <- matrix(runif(ncells*5), ncells)
tsne <- matrix(rnorm(ncells*2), ncells)

sce <- SingleCellExperiment(assays=list(counts=u, logcounts=v),
    reducedDims=SimpleList(PCA=pca, tSNE=tsne))
sce$batch <- sample(1:2, 100, replace = TRUE)

x1 <- devianceFeatureSelection(sce, batch = factor(sce$batch))
Error in recycleSingleBracketReplacementValue(value, x) : 
  replacement has length zero

Best, Kelly

Alexis-Varin commented 3 months ago

Hi @kstreet13 , That's strange indeed, I used your example fine, only adding rownames(v) <- rownames(u) <- paste0("gene", 1:200) to verify my code is working, and obtain with names(rowData(x1)$binomial_deviance) the vector of gene names as expected

[1] "gene1" "gene2" "gene3" "gene4" "gene5" "gene6" "gene7" "gene8" "gene9"
[10] "gene10" "gene11" "gene12" "gene13" "gene14" "gene15" "gene16" "gene17" "gene18" [19] "gene19" "gene20" "gene21" "gene22" "gene23" "gene24" "gene25" "gene26" "gene27" [28] "gene28" "gene29" "gene30" "gene31" "gene32" "gene33" "gene34" "gene35" "gene36" [37] "gene37" "gene38" "gene39" "gene40" "gene41" "gene42" "gene43" "gene44" "gene45" [46] "gene46" "gene47" "gene48" "gene49" "gene50" "gene51" "gene52" "gene53" "gene54" [55] "gene55" "gene56" "gene57" "gene58" "gene59" "gene60" "gene61" "gene62" "gene63" [64] "gene64" "gene65" "gene66" "gene67" "gene68" "gene69" "gene70" "gene71" "gene72" [73] "gene73" "gene74" "gene75" "gene76" "gene77" "gene78" "gene79" "gene80" "gene81" [82] "gene82" "gene83" "gene84" "gene85" "gene86" "gene87" "gene88" "gene89" "gene90" [91] "gene91" "gene92" "gene93" "gene94" "gene95" "gene96" "gene97" "gene98" "gene99" [100] "gene100" "gene101" "gene102" "gene103" "gene104" "gene105" "gene106" "gene107" "gene108" [109] "gene109" "gene110" "gene111" "gene112" "gene113" "gene114" "gene115" "gene116" "gene117" [118] "gene118" "gene119" "gene120" "gene121" "gene122" "gene123" "gene124" "gene125" "gene126" [127] "gene127" "gene128" "gene129" "gene130" "gene131" "gene132" "gene133" "gene134" "gene135" [136] "gene136" "gene137" "gene138" "gene139" "gene140" "gene141" "gene142" "gene143" "gene144" [145] "gene145" "gene146" "gene147" "gene148" "gene149" "gene150" "gene151" "gene152" "gene153" [154] "gene154" "gene155" "gene156" "gene157" "gene158" "gene159" "gene160" "gene161" "gene162" [163] "gene163" "gene164" "gene165" "gene166" "gene167" "gene168" "gene169" "gene170" "gene171" [172] "gene172" "gene173" "gene174" "gene175" "gene176" "gene177" "gene178" "gene179" "gene180" [181] "gene181" "gene182" "gene183" "gene184" "gene185" "gene186" "gene187" "gene188" "gene189" [190] "gene190" "gene191" "gene192" "gene193" "gene194" "gene195" "gene196" "gene197" "gene198" [199] "gene199" "gene200"

The only thing I did was copy your featureSelection.R entire code and only change the .compute_deviance_batch(), as well as not use any method and only a simple function for devianceFeatureSelection() just to be sure here is the entire code I am using without previous code as comments (the res<-matrix(0.0,nrow=nrow(m),ncol=nlevels(batch)) and the res[,i]<-.compute_deviance(m[,batch==b],fam=fam) which I removed) :

sparseBinomialDeviance<-function(X,sz){
  #X has features in cols, observations in rows
  #assume X is a sparseMatrix object
  X<-as(X,"CsparseMatrix")
  LP<-L1P<-Matrix::Diagonal(x = 1/sz) %*% X #recycling
  LP@x<-log(LP@x) #log transform nonzero elements only
  L1P@x<-log1p(-L1P@x) #rare case: -Inf if only a single gene nonzero in a cell
  ll_sat<-Matrix::colSums(X*(LP-L1P)+Matrix::Diagonal(x = sz) %*% L1P, na.rm=TRUE)
  sz_sum<-sum(sz)
  feature_sums<-Matrix::colSums(X)
  p<-feature_sums/sz_sum
  l1p<-log1p(-p)
  ll_null<-feature_sums*(log(p)-l1p)+sz_sum*l1p
  2*(ll_sat-ll_null)
}

denseBinomialDeviance<-function(X,sz){
  #X has features in cols, observations in rows
  P<-X/sz
  L1P<-log1p(-P)
  ll_sat<-DelayedArray::colSums(X*(log(P)-L1P)+sz*L1P, na.rm=TRUE)
  sz_sum<-sum(sz)
  feature_sums<-DelayedArray::colSums(X)
  p<-feature_sums/sz_sum
  l1p<-log1p(-p)
  ll_null<-feature_sums*(log(p)-l1p)+sz_sum*l1p
  2*(ll_sat-ll_null)
}

sparsePoissonDeviance<-function(X,sz){
  #X has features in cols, observations in rows
  X<-as(X,"CsparseMatrix")
  LP<-Matrix::Diagonal(x = 1/sz) %*% X #recycling
  LP@x<-log(LP@x) #log transform nonzero elements only
  ll_sat<-Matrix::colSums(X*LP, na.rm=TRUE)
  feature_sums<-Matrix::colSums(X)
  ll_null<-feature_sums*log(feature_sums/sum(sz))
  2*(ll_sat-ll_null)
}

densePoissonDeviance<-function(X,sz){
  #X has features in cols, observations in rows
  ll_sat<-DelayedArray::colSums(X*log(X/sz), na.rm=TRUE)
  feature_sums<-DelayedArray::colSums(X)
  ll_null<-feature_sums*log(feature_sums/sum(sz))
  2*(ll_sat-ll_null)
}

.compute_deviance<-function(m,fam=c("binomial","poisson")){
  #m is either a Matrix or matrix object (later: support DelayedArrays)
  #m a data matrix with genes=rows
  fam <- match.arg(fam)
  sz <- colSums(m)
  if(fam=="poisson"){
    lsz<-log(sz)
    #make geometric mean of sz be 1 for poisson
    sz <- exp(lsz-mean(lsz))
  }
  m<-t(m) #column slicing faster than row slicing for matrix in R.
  #note: genes are now in the COLUMNS of m
  if(is(m,"sparseMatrix")){
    if(fam=="binomial"){
      out <- sparseBinomialDeviance(m,sz)
    } else { #fam=="poisson"
      out <- sparsePoissonDeviance(m,sz)
    }
  } else { #m is either 1) an ordinary dense array or matrix
    # 2) a non-sparse Matrix object like dgeMatrix
    # 3) a dense object like HDF5Array (on disk) or DelayedArray (in memory)
    if(fam=="binomial"){
      out <- denseBinomialDeviance(m,sz)
    } else { #fam=="poisson"
      out <- densePoissonDeviance(m,sz)
    }
  }
  out[which(is.na(out))] <- 0
  return(out)
}

.compute_deviance_batch<-function(object,
                                  fam=c("binomial","poisson"),
                                  batch=NULL){
  #deviance but with batch indicator (batch=a factor)
  m<-object; rm(object)
  fam<-match.arg(fam)
  stopifnot(is.null(batch) || is(batch,"factor"))
  if(is.null(batch)){
    return(.compute_deviance(m,fam=fam))
  } else { #case where there is more than one batch
    stopifnot(length(batch)==ncol(m))
    #each row is a gene, each column is deviance within a batch.
    tmp = list()
    for(i in seq_along(levels(batch))){
      b<-levels(batch)[i]
      tmp[[i]]<-.compute_deviance(m[,batch==b],fam=fam)
    }
    tmp = lapply(tmp, function(x) x[order(match(names(x), names(tmp[[1]])))])
    res = do.call(cbind,tmp)
    #deviance is additive across subsets of observations
    return(rowSums(res))
  }
}

devianceFeatureSelection2 = function(object, assay = "counts",
                                     fam = c("binomial", "poisson"), batch = NULL,
                                     nkeep = NULL, sorted = FALSE){
  fam<-match.arg(fam)
  m <- assay(object, assay)
  dev<-.compute_deviance_batch(m, fam, batch)
  name<-paste(fam, "deviance", sep="_")
  rd<-rowData(object)
  rd[name]<-dev
  rowData(object)<-rd
  if(!is.null(nkeep) && nkeep>=length(dev)){
    nkeep<-NULL
  } #user wants to keep all features
  if(!is.null(nkeep)){ sorted<-TRUE } #force sorting if we are
  # taking a subset of rows
  if(sorted){
    o<-order(dev,decreasing=TRUE)
    object<-object[o,]
    if(is.null(nkeep)){ #sorting but no subsetting
      return(object)
    } else { #sorting and subsetting
      return(object[seq_len(nkeep),])
    }
  } else { #no sorting, no subsetting
    return(object)
  }
}