SONGDONGYUAN1994 / scDesign3

scDesign3 generates realistic in silico data for multimodal single-cell and spatial omics
https://songdongyuan1994.github.io/scDesign3/docs/index.html
MIT License
74 stars 23 forks source link

Questions of gaussian family #56

Open TBW108 opened 1 month ago

TBW108 commented 1 month ago

Hi, Dongyuan! I have a question about generating scRNA-seq data using gaussian family. When I generate scRNA-seq data with gaussian family like the code below

example_data <- construct_data(
  sce = example_sce,
  assay_use = "normcounts",
  celltype = "cell_type",
  pseudotime = NULL,
  spatial = NULL,
  other_covariates = NULL,
  corr_by = "1",
  ncell = 2000,
)

example_marginal <- fit_marginal(
  data = example_data,
  predictor = "gene",
  mu_formula = "cell_type",
  sigma_formula = "1",
  family_use = "gaussian",
  n_cores = 50,
  usebam = FALSE
)

set.seed(123)
example_copula <- fit_copula(
  sce = example_sce,
  assay_use = "counts",
  marginal_list = example_marginal,
  family_use = "gaussian",
  copula = "gaussian",
  n_cores = 50,
  input_data = example_data$dat,
  parallelization='pbmcmapply'
)

example_para <- extract_para(
  sce = example_sce,
  marginal_list = example_marginal,
  n_cores = 50,
  family_use = "gaussian",
  new_covariate = example_data$newCovariate,
  data = example_data$dat
)

it reports errors like this image

How to solve it? How to generate lognorm scRNA-seq data with gaussian family? Thanks.

SONGDONGYUAN1994 commented 1 month ago

@TBW108 Hi TBW, in your fit_copula() function, you also need to specify the assay = 'normcounts'. Please let me know if it works.

Best, Dongyuan

TBW108 commented 1 month ago

Thanks! I have added assay_use = "normcounts" in fit_copula(). The code has been changed like this:

set.seed(123)
example_data <- construct_data(
  sce = example_sce,
  assay_use = "normcounts",
  celltype = "cell_type",
  pseudotime = NULL,
  spatial = NULL,
  other_covariates = NULL,
  corr_by = "1",
  ncell = 2000,
)

example_marginal <- fit_marginal(
  data = example_data,
  predictor = "gene",
  mu_formula = "cell_type",
  sigma_formula = "1",
  family_use = "gaussian",
  n_cores = 50,
  usebam = FALSE
)

set.seed(123)
example_copula <- fit_copula(
  sce = example_sce,
  assay_use = "normcounts",
  marginal_list = example_marginal,
  family_use = "gaussian",
  copula = "gaussian",
  n_cores = 50,
  input_data = example_data$dat,
)

example_para <- extract_para(
  sce = example_sce,
  assay_use = "normcounts",
  marginal_list = example_marginal,
  n_cores = 50,
  family_use = "gaussian",
  new_covariate = example_data$newCovariate,
  data = example_data$dat
)

However, R reports the error as below. image

Is there any problem with my setting? Or the parameter `corr_by' should be set as "ind"? Thanks!

SONGDONGYUAN1994 commented 1 month ago

Hi, Could you provide a reproducible dataset for the testing? I would be happy to check that.

Best, Dongyuan

TBW108 commented 1 month ago

Hi, I just use the default dataset in scDesign3, but I filter out some genes that are not in the H1 pathways. The code is provided as below

library(scDesign3,verbose =FALSE)
library(SingleCellExperiment)
library(ggplot2)
library(DuoClustering2018)
library(tidyverse)
library(msigdbr)
library(zellkonverter)
library(reticulate)
library(scuttle)
library(scran)

# get data
Zhengmix4eq_sce <- get("sce_filteredExpr10_Zhengmix4eq")(metadata = FALSE)
logNormCounts(Zhengmix4eq_sce)

# replace the row name with gene
rownames(Zhengmix4eq_sce)=rowData(Zhengmix4eq_sce)$symbol
example_sce=Zhengmix4eq_sce

# normcounts(example_sce)
# logcounts(example_sce)

# extract genes
genesets=msigdbr("Homo sapiens", "H")
all_filter_genes=c()
groups=list()
for(geneset in unique(genesets$gs_name)){
    genes=genesets$gene_symbol[genesets$gs_name==geneset]
    filter_genes=rowData(example_sce)$symbol[rowData(example_sce)$symbol %in% genes] # nolint
    if(length(filter_genes)>5){
        all_filter_genes<-union(all_filter_genes, filter_genes)
        groups[[geneset]]<-filter_genes
    }
}
all_filter_genes
example_sce=example_sce[rowData(example_sce)$symbol %in% all_filter_genes,]
example_sce

# extract cells and divide them into two types
# selected_cells <- which(colData(example_sce)$phenoid %in% c("b.cells","regulatory.t"))
selected_cells <- which(colData(example_sce)$phenoid == c("regulatory.t"))
example_sce <- example_sce[,selected_cells]
cell_id=seq(1,dim(example_sce)[2])
sampled_cell_id <- sample(cell_id, size = length(cell_id)/2, replace = FALSE)
cell_type=rep(0,dim(example_sce)[2])
cell_type[sampled_cell_id]=1

colData(example_sce)$cell_type <- as.factor(cell_type)
example_sce

# get parameters
set.seed(123)
example_data <- construct_data(
  sce = example_sce,
  assay_use = "normcounts",
  celltype = "cell_type",
  pseudotime = NULL,
  spatial = NULL,
  other_covariates = NULL,
  corr_by = "1",
  ncell = 2000,
)

example_marginal <- fit_marginal(
  data = example_data,
  predictor = "gene",
  mu_formula = "cell_type",
  sigma_formula = "1",
  family_use = "gaussian",
  n_cores = 50,
  usebam = FALSE
)

set.seed(123)
example_copula <- fit_copula(
  sce = example_sce,
  assay_use = "normcounts",
  marginal_list = example_marginal,
  family_use = "gaussian",
  copula = "gaussian",
  n_cores = 50,
  input_data = example_data$dat,
)

example_para <- extract_para(
  sce = example_sce,
  assay_use = "normcounts",
  marginal_list = example_marginal,
  n_cores = 50,
  family_use = "gaussian",
  new_covariate = example_data$newCovariate,
  data = example_data$dat
)

image