qingnanl / gsdensity

6 stars 3 forks source link

The selected method can only be used for equal-sized groups #9

Open Changwuuu opened 2 months ago

Changwuuu commented 2 months ago

Hello, I tried to run the sample code in "Quick start", but the following code gave an error. It seems that the method was wrong. Is there a good solution?

res <- compute.kld(coembed = ce,

qingnanl commented 2 months ago

Thanks. Can you please try to downgrade the package 'anticlust' to an older version (e.g., 0.6.1)? This may solve the issue.

Jorges1000 commented 3 days ago

I have the same problem, even with anticlust 0.6.1

Jorges1000 commented 3 days ago

Got it to work with this hack:

computekldFix <- function (coembed, genes.use, n.grids = 100, gene.set.list, gene.set.cutoff = 3, 
                          n.times = 100) 
{
  coembed.scale <- scale(coembed[genes.use, ])
  # grid.co <- compute.grid.coords(coembed = coembed, genes.use = genes.use, 
  # n.grids = n.grids)
  ###
  coembed <- scale(coembed[genes.use, ])
  # cl <- anticlust:::balanced_clustering(coembed, K = n.grids)
  ####
  # function (x=coembed, K = n.grids, method = "centroid", solver = NULL) 
  # {
  x=coembed
  K = n.grids
  anticlust:::input_validation_anticlustering(
    x, K,objective="distance", method = "centroid", 
    preclustering=TRUE, categories= NULL,repetitions= NULL)
  # anticlust:::validate_input_optimal_anticlustering(x, K, objective = "diversity", 
  #                                       solver=NULL)
  #####
  solver=NULL
  objective = "diversity"
  anticlust:::validate_data_matrix(x)
  anticlust:::validate_input(K, "K", objmode = "numeric", must_be_integer = TRUE, 
                             not_na = TRUE, not_function = TRUE)
  N <- nrow(as.matrix(x))
  if (!(length(K) == 1 || sum(K) == N)) {
    stop("Argument `K` is misspecified.")
  }
  # if (objective != "dispersion") {
  #   if (!all(K == K[1]) || length(K) == 1 && N%%K != 0) {
  #     stop("The selected method can only be used for equal-sized groups")
  #   }
  # }
  anticlust:::validate_input(objective, "objective", objmode = "character", 
                             len = 1, input_set = c("variance", "diversity", "kplus", 
                                                    "dispersion"), not_na = TRUE, not_function = TRUE)
  if (anticlust:::argument_exists(solver)) {
    validate_input(solver, "solver", objmode = "character", 
                   len = 1, input_set = c("glpk", "symphony"), not_na = TRUE, 
                   not_function = TRUE)
  }
  is_distance_matrix <- function(m) {
    m <- as.matrix(m)
    if (nrow(m) != ncol(m)) {
      return(FALSE)
    }}
  if (!is_distance_matrix(x)) {
    data <- as.matrix(x)}
  else{data <- x}
  cl <- anticlust:::nn_centroid_clustering(data, NROW(data)/K)
  # }
  ####
  grid.co <- aggregate(coembed, list(cl), mean)[, -1]
  # return(centroid.coords)
  ###
  dist.to.grid <- gsdensity:::vectorized_pdist(A = as.matrix(coembed.scale), 
                                               B = as.matrix(grid.co))
  bandwidth <- median(apply(dist.to.grid, 1, min))
  dist.to.grid.norm <- dist.to.grid/bandwidth
  density.contributions <- exp(-dist.to.grid.norm * dist.to.grid.norm/2)
  Q <- gsdensity:::compute.db(density.df = density.contributions)
  gene.set.names <- rep(NA, length(gene.set.list))
  klds <- rep(0, length(gene.set.list))
  len.gl <- rep(0, length(gene.set.list))
  rklds.avg <- rep(0, length(gene.set.list))
  rklds.sd <- rep(0, length(gene.set.list))
  pvalues <- rep(0, length(gene.set.list))
  for (i in 1:length(gene.set.list)) {
    gene.set.name <- names(gene.set.list)[i]
    gene.set <- intersect(genes.use, gene.set.list[[i]])
    len.gene.set <- length(gene.set)
    if (len.gene.set < gene.set.cutoff) {
      next
    }
    gene.set.names[i] <- gene.set.name
    len.gl[i] <- len.gene.set
    P <- gsdensity:::compute.db(density.df = density.contributions[gene.set, 
    ])
    kld <- sum(P * log(P/Q))
    klds[i] <- log(kld)
    rkld <- sapply(1:n.times, function(x) gsdensity:::sample.kld(
      density.df = density.contributions, 
      ref = Q, len.gene.set = len.gene.set))
    rkld.avg <- mean(log(rkld))
    rkld.sd <- sd(log(rkld))
    rklds.avg[i] <- rkld.avg
    rklds.sd[i] <- rkld.sd
    pvalue <- pnorm(log(kld), rkld.avg, rkld.sd, lower.tail = FALSE)
    pvalues[i] <- pvalue
  }
  out <- as.data.frame(cbind(gene.set.names, klds, len.gl, 
                             rklds.avg, rklds.sd, pvalues))
  out <- out[complete.cases(out), ]
  colnames(out) <- c("gene.set", "kld", "gene.set.length", 
                     "rkld.mean", "rkld.sd", "p.value")
  out$p.adj <- p.adjust(p = out$p.value, method = "fdr")
  return(out)
}