SONGDONGYUAN1994 / scDesign3

scDesign3 generates realistic in silico data for multimodal single-cell and spatial omics
https://songdongyuan1994.github.io/scDesign3/docs/index.html
MIT License
87 stars 25 forks source link

Error when simulating spatial transcriptomics data for benchmarking DE genes #13

Closed ttgump closed 10 months ago

ttgump commented 1 year ago

Hi, I want to simulate a spatial dataset based on Human DLPFC dataset to benchmark clustering and DE analysis. Here is my code:

  example_sce <- getRDS("2020_maynard_prefrontal-cortex", "151507")
  # print(example_sce)

  set.seed(101)
  dec <- scran::modelGeneVar(example_sce)
  top <- scran::getTopHVGs(dec, n = 2000)

  example_sce <- example_sce[top, !is.na(example_sce@colData$layer_guess_reordered)]

  mt_idx<- grep("mt-",rownames(example_sce))
  if(length(mt_idx)!=0){
    example_sce   <- example_sce[-mt_idx,]
  }

  set.seed(1)
  example_data <- construct_data(
    sce = example_sce,
    assay_use = "counts",
    celltype = "layer_guess_reordered",
    pseudotime = NULL,
    spatial = c("row", "col"),
    other_covariates = NULL,
    corr_by = "1"
  )

  example_marginal <- fit_marginal(
    data = example_data,
    predictor = "gene",
    mu_formula = "layer_guess_reordered",
    sigma_formula = "1",
    family_use = "nb",
    n_cores = 10,
    usebam = FALSE
  )

  set.seed(1)
  example_copula <- fit_copula(
    sce = example_sce,
    assay_use = "counts",
    marginal_list = example_marginal,
    family_use = "nb",
    copula = "gaussian",
    n_cores = 10,
    new_covariate = NULL,
    input_data = example_data$dat
  )

  example_para <- extract_para(
    sce = example_sce,
    marginal_list = example_marginal,
    n_cores = 10,
    family_use = "nb",
    new_covariate = NULL,
    data = example_data$dat
  )

  diff <- apply(example_para$mean_mat, 2, function(x){max(x)-min(x)})
  diff_ordered <- order(diff, decreasing = TRUE)
  diff <- diff[diff_ordered]
  num_de <- 1000
  de_idx <- names(diff[1:num_de])
  non_de_idx <- names(diff[-(1:num_de)])
  non_de_mat <- apply(example_para$mean_mat[,non_de_idx], 2, function(x){
    avg <- (max(x)+min(x))/2
    new_mean <- rep(avg, length(x))
    return(new_mean)
  })
  example_para$mean_mat[,non_de_idx] <- non_de_mat

  set.seed(1)
  example_newcount <- simu_new(
    sce = example_sce,
    mean_mat = example_para$mean_mat,
    sigma_mat = example_para$sigma_mat,
    zero_mat = example_para$zero_mat,
    quantile_mat = NULL,
    copula_list = example_copula$copula_list,
    n_cores = 10,
    family_use = "nb",
    input_data = example_data$dat,
    new_covariate = example_data$newCovariate,
    important_feature = example_copula$important_feature
  )

The code will generate this error:

Use Copula to sample a multivariate quantile matrix
Sample Copula group 1 starts
Error in dimnames(x) <- dn : 
  length of 'dimnames' [2] not equal to array extent

If I don't manually set DE and no DE genes, that is to say if I delete this part, then the simulation works well.

  diff <- apply(example_para$mean_mat, 2, function(x){max(x)-min(x)})
  diff_ordered <- order(diff, decreasing = TRUE)
  diff <- diff[diff_ordered]
  num_de <- 1000
  de_idx <- names(diff[1:num_de])
  non_de_idx <- names(diff[-(1:num_de)])
  non_de_mat <- apply(example_para$mean_mat[,non_de_idx], 2, function(x){
    avg <- (max(x)+min(x))/2
    new_mean <- rep(avg, length(x))
    return(new_mean)
  })
  example_para$mean_mat[,non_de_idx] <- non_de_mat

Do you have any suggestions? Thanks. Best, Tian

SONGDONGYUAN1994 commented 1 year ago

@ttgump Hi Tian, I'm sorry for not getting back to you sooner. Your problem is caused by some NA values in your mean_mat. We have now updated our package by avoiding NA in mean_mat and now it should work for you. Please let me know if things go well!

Best, Dongyuan