SONGDONGYUAN1994 / scDesign3

scDesign3 generates realistic in silico data for multimodal single-cell and spatial omics
https://songdongyuan1994.github.io/scDesign3/docs/index.html
MIT License
87 stars 24 forks source link

Generate new data given labels #53

Open feng-bao-ucsf opened 4 months ago

feng-bao-ucsf commented 4 months ago

Hi Dongyuan,

Thanks for sharing the work. I have a question about using the model: can I first train model on one dataset and learn the relation between cell type and omics parameters; then I have a new list of cell types, can I use trained model to generate the gene data based on the new cell type list? I did not find this function from examples, maybe I missed that.

Thank you!

Feng

SONGDONGYUAN1994 commented 4 months ago

Hi Feng, Sure, we have this function. Please check https://songdongyuan1994.github.io/scDesign3/docs/articles/scDesign3-introduction-vignette.html

It is not written clearly (I apologize); basically, in step-by-step mode, the final simu_new function has a parameter: new_covariate. Just input your new labels here. Please let me know if it works for you.

feng-bao-ucsf commented 4 months ago

Thank you so much for the quick reply, I tried to make a toy model for this, where I have a sce_train for model fitting and a sce_test for just inference. I only used sce_test in the last step. It seems there was a mismatching shape issue. Can you help me check this?

set.seed(123)
con_data_train <- construct_data(
    sce = sce_train,
    assay_use = "counts",
    celltype = "label",
    pseudotime = NULL,
    spatial = NULL,
    other_covariates = NULL,
    corr_by = "1"
)

con_data_test <- construct_data(
    sce = sce_test,
    assay_use = "counts",
    celltype = "label",
    pseudotime = NULL,
    spatial = NULL,
    other_covariates = NULL,
    corr_by = "1"
)

MOBSC_marginal <- fit_marginal(
    data = con_data_train,
    predictor = "gene",
    mu_formula = "label",
    sigma_formula = "label",
    family_use = "nb",
    n_cores = 2,
    usebam = FALSE,
    parallelization = "pbmcmapply"
)

MOBSC_copula <- fit_copula(
    sce = sce_train,
    assay_use = "counts",
    marginal_list = MOBSC_marginal,
    family_use = "nb",
    copula = "gaussian",
    n_cores = 2,
    input_data = con_data_train$dat
)

MOBSC_para <- extract_para(
    sce = sce_train,
    marginal_list = MOBSC_marginal,
    n_cores = 2,
    family_use = "nb",
    new_covariate = con_data_train$newCovariate,
    data = con_data_train$dat
)

MOBSC_newcount <- simu_new(
    sce = sce_train,
    mean_mat = MOBSC_para$mean_mat,
    sigma_mat = MOBSC_para$sigma_mat,
    zero_mat = MOBSC_para$zero_mat,
    quantile_mat = NULL,
    copula_list = MOBSC_copula$copula_list,
    n_cores = 2,
    family_use = "nb",
    input_data = con_data_test$dat,
    new_covariate = con_data_test$newCovariate,
    filtered_gene = con_data_test$filtered_gene
)
SONGDONGYUAN1994 commented 4 months ago

Oh I forgot this. extract_para also has the parameter new_covariate. Please also replace it with your test data. Thanks! filtered_gene in the last step should still use training data.

feng-bao-ucsf commented 4 months ago

Oh I forgot this. extract_para also has the parameter new_covariate. Please also replace it with your test data. Thanks! filtered_gene in the last step should still use training data.

Thank you for your information. I tested as suggested.


MOBSC_copula <- fit_copula(
    sce = sce_train,
    assay_use = "counts",
    marginal_list = MOBSC_marginal,
    family_use = "nb",
    copula = "gaussian",
    n_cores = 1,
    input_data = con_data_train$dat
)

MOBSC_para <- extract_para(
    sce = sce_train,
    marginal_list = MOBSC_marginal,
    n_cores = 1,
    family_use = "nb",
    new_covariate = con_data_test$newCovariate,
    data = con_data_train$dat
)

MOBSC_newcount <- simu_new(
    sce = sce_train,
    mean_mat = MOBSC_para$mean_mat,
    sigma_mat = MOBSC_para$sigma_mat,
    zero_mat = MOBSC_para$zero_mat,
    quantile_mat = NULL,
    copula_list = MOBSC_copula$copula_list,
    n_cores = 2,
    family_use = "nb",
    input_data = con_data_train$dat,
    new_covariate = con_data_test$newCovariate,
    filtered_gene = con_data_train$filtered_gene
)

The code stopped atextract_para. The error message is:

error in eval(predvars, data, env): object 'label' not found
Traceback:

1. extract_para(sce = sce_train, marginal_list = MOBSC_marginal, 
 .     n_cores = 1, family_use = "nb", new_covariate = con_data_test$newCovariate, 
 .     data = con_data_train$dat)
2. suppressMessages(paraFunc(mat_function, x = seq_len(dim(sce)[1])[qc_gene_idx], 
 .     y = family_use, SIMPLIFY = FALSE, mc.cores = n_cores))
3. withCallingHandlers(expr, message = function(c) if (inherits(c, 
 .     classes)) tryInvokeRestart("muffleMessage"))
4. paraFunc(mat_function, x = seq_len(dim(sce)[1])[qc_gene_idx], 
 .     y = family_use, SIMPLIFY = FALSE, mc.cores = n_cores)
5. mclapply(seq_len(n), do_one, mc.preschedule = mc.preschedule, 
 .     mc.set.seed = mc.set.seed, mc.silent = mc.silent, mc.cores = mc.cores, 
 .     mc.cleanup = mc.cleanup, affinity.list = affinity.list)
6. lapply(X = X, FUN = FUN, ...)
7. FUN(X[[i]], ...)
8. .mapply(FUN, dots, MoreArgs)
9. (function (x, y) 
 . {

But I checked con_data_test$newCovariate, and confirmed there is a column called label. I also tested if I replaced con_data_test$newCovariate with con_data_train$newCovariate, extract_para worked fine, but the code will stop at simu_new.

feng-bao-ucsf commented 4 months ago

I made a minimal test script here. Really appreciate your help!

library(scDesign3)
library(SingleCellExperiment)
library(dplyr)

example_sce <- readRDS((url("https://figshare.com/ndownloader/files/40582019")))

sce_train <- example_sce[1:10,1:50 ]
sce_test <- example_sce[1:10,51:80]

set.seed(123)
con_data_train <- construct_data(
    sce = sce_train,
    assay_use = "counts",
    celltype = "cell_type",
    pseudotime = NULL,
    spatial = NULL,
    other_covariates = NULL,
    corr_by = "1"
)

con_data_test <- construct_data(
    sce = sce_test,
    assay_use = "counts",
    celltype = "cell_type",
    pseudotime = NULL,
    spatial = NULL,
    other_covariates = NULL,
    corr_by = "1"
)

MOBSC_marginal <- fit_marginal(
    data = con_data_train,
    predictor = "gene",
    mu_formula = "cell_type",
    sigma_formula = "cell_type",
    family_use = "nb",
    n_cores = 2,
    usebam = FALSE,
    parallelization = "pbmcmapply"
)

MOBSC_copula <- fit_copula(
    sce = sce_train,
    assay_use = "counts",
    marginal_list = MOBSC_marginal,
    family_use = "nb",
    copula = "gaussian",
    n_cores = 2,
    input_data = con_data_train$dat
)

MOBSC_para <- extract_para(
    sce = sce_train,
    marginal_list = MOBSC_marginal,
    n_cores = 1,
    family_use = "nb",
    new_covariate = con_data_test$newCovariate,
    data = con_data_train$dat
)

MOBSC_newcount <- simu_new(
    sce = sce_train,
    mean_mat = MOBSC_para$mean_mat,
    sigma_mat = MOBSC_para$sigma_mat,
    zero_mat = MOBSC_para$zero_mat,
    quantile_mat = NULL,
    copula_list = MOBSC_copula$copula_list,
    n_cores = 2,
    family_use = "nb",
    input_data = con_data_train$dat,
    new_covariate = con_data_test$newCovariate,
    filtered_gene = con_data_train$filtered_gene
)
SONGDONGYUAN1994 commented 4 months ago

Thanks for this demo! I found a bug and we will update the package as soon as possible. I will @ you once we update the package.

feng-bao-ucsf commented 4 months ago

Thanks for this demo! I found a bug and we will update the package as soon as possible. I will @ you once we update the package.

Thank you and look forward to your update.

qw130 commented 4 months ago

Hi Feng, @feng-bao-ucsf

I just updated the code. The newest code on GitHub should work.

When I was running the demo code you provided, I noticed that one parameter (new_covariate) in simu_new was not set properly, which will cause an error when you run simu_new. Please use the following code to run simu_new.

MOBSC_newcount <- simu_new(
    sce = sce_train,
    mean_mat = MOBSC_para$mean_mat,
    sigma_mat = MOBSC_para$sigma_mat,
    zero_mat = MOBSC_para$zero_mat,
    quantile_mat = NULL,
    copula_list = MOBSC_copula$copula_list,
    n_cores = 2,
    family_use = "nb",
    input_data = con_data_train$dat,
    new_covariate = con_data_test$dat,
    filtered_gene = con_data_train$filtered_gene
)

The parameter new_covariate in simu_new needs to be a dataframe which contains the columns for your specified covariates and a column named corr_group. con_data_test$dat will contain all the columns needed, but con_data_test$newCovariate will not contain the column corr_group if you don't modify the ncell parameter in construct_data.

Thanks again for your interest in scDesign3. Please let us know if you have any further questions!

feng-bao-ucsf commented 4 months ago
MOBSC_newcount <- simu_new(
    sce = sce_train,
    mean_mat = MOBSC_para$mean_mat,
    sigma_mat = MOBSC_para$sigma_mat,
    zero_mat = MOBSC_para$zero_mat,
    quantile_mat = NULL,
    copula_list = MOBSC_copula$copula_list,
    n_cores = 2,
    family_use = "nb",
    input_data = con_data_train$dat,
    new_covariate = con_data_test$dat,

    filtered_gene = con_data_train$filtered_gene
)

Thank you so much for the quick response. I tested the code using the data from figshare, and it ran perfectly. However, when I tried to run with my own data, I found there might be some issues for the extract_para step. I tried multiple datasets and the error information was the same.

Here is my code to test the data and the data:

meta_intestine.csv

data_intestine.csv

library(scDesign3)
library(SingleCellExperiment)
library(ggplot2)
library(dplyr)

data <- read.csv("data_intestine.csv", row.names = 1)
meta <- read.csv("meta_intestine.csv", row.names = 1)

meta <- meta[, c("x", "y", "label")]

colnames(meta) <- c("x", "y", "cell_type")
# Add 1 to each element in the 'label' column
meta$label <- meta$cell_type + 1

sce <- SingleCellExperiment(
    assays = list(counts = t(data)),
    colData = meta
)
sce_train <- sce[1:10, 1:50]
sce_test <- sce[1:10, 51:80]

set.seed(123)
con_data_train <- construct_data(
    sce = sce_train,
    assay_use = "counts",
    celltype = "cell_type",
    pseudotime = NULL,
    spatial = NULL,
    other_covariates = NULL,
    corr_by = "1"
)

con_data_test <- construct_data(
    sce = sce_test,
    assay_use = "counts",
    celltype = "cell_type",
    pseudotime = NULL,
    spatial = NULL,
    other_covariates = NULL,
    corr_by = "1"
)

MOBSC_marginal <- fit_marginal(
    data = con_data_train,
    predictor = "gene",
    mu_formula = "cell_type",
    sigma_formula = "cell_type",
    family_use = "nb",
    n_cores = 2,
    usebam = FALSE,
    parallelization = "pbmcmapply"
)

MOBSC_copula <- fit_copula(
    sce = sce_train,
    assay_use = "counts",
    marginal_list = MOBSC_marginal,
    family_use = "nb",
    copula = "gaussian",
    n_cores = 2,
    input_data = con_data_train$dat
)

MOBSC_para <- extract_para(
    sce = sce_train,
    marginal_list = MOBSC_marginal,
    n_cores = 1,
    family_use = "nb",
    new_covariate = con_data_test$newCovariate,
    data = con_data_train$dat
)

MOBSC_newcount <- simu_new(
    sce = sce_train,
    mean_mat = MOBSC_para$mean_mat,
    sigma_mat = MOBSC_para$sigma_mat,
    zero_mat = MOBSC_para$zero_mat,
    quantile_mat = NULL,
    copula_list = MOBSC_copula$copula_list,
    n_cores = 2,
    family_use = "nb",
    input_data = con_data_train$dat,
    new_covariate = con_data_test$dat,
    filtered_gene = con_data_train$filtered_gene
)

The error info is:

Error in X[, pstart[i] - 1 + 1:object$nsdf[i]] <- Xp: number of items to replace is not a multiple of replacement length
Traceback:

1. extract_para(sce = sce_train, marginal_list = MOBSC_marginal, 
 .     n_cores = 1, family_use = "nb", new_covariate = con_data_test$newCovariate, 
 .     data = con_data_train$dat)
2. suppressMessages(paraFunc(mat_function, x = seq_len(dim(sce)[1])[qc_gene_idx], 
 .     y = family_use, SIMPLIFY = FALSE, mc.cores = n_cores))
3. withCallingHandlers(expr, message = function(c) if (inherits(c, 
 .     classes)) tryInvokeRestart("muffleMessage"))
4. paraFunc(mat_function, x = seq_len(dim(sce)[1])[qc_gene_idx], 
 .     y = family_use, SIMPLIFY = FALSE, mc.cores = n_cores)
5. mclapply(seq_len(n), do_one, mc.preschedule = mc.preschedule, 
 .     mc.set.seed = mc.set.seed, mc.silent = mc.silent, mc.cores = mc.cores, 
 .     mc.cleanup = mc.cleanup, affinity.list = affinity.list)
6. lapply(X = X, FUN = FUN, ...)
7. FUN(X[[i]], ...)
8. .mapply(FUN, dots, MoreArgs)
9. (function (x, y) 
 . {
 .     fit <- marginal_list[[x]]
feng-bao-ucsf commented 4 months ago

Dear Dongyuan and Qingyang, I also tried other example datasets from the scDesign3. It seems the same error message also happened for those. Can you help check that? Really appreciate it.

library(scDesign3)
library(SingleCellExperiment)
library(dplyr)

example_sce <- readRDS((url("https://figshare.com/ndownloader/files/40581980")))
# I also tried this dataset
# example_sce <- readRDS((url("https://figshare.com/ndownloader/files/40581965")))

sce_train <- example_sce[1:10, 1:50]
sce_test <- example_sce[1:10, 51:80]

set.seed(123)
con_data_train <- construct_data(
    sce = sce_train,
    assay_use = "counts",
    celltype = "cell_type",
    pseudotime = NULL,
    spatial = NULL,
    other_covariates = NULL,
    corr_by = "1"
)

con_data_test <- construct_data(
    sce = sce_test,
    assay_use = "counts",
    celltype = "cell_type",
    pseudotime = NULL,
    spatial = NULL,
    other_covariates = NULL,
    corr_by = "1"
)

MOBSC_marginal <- fit_marginal(
    data = con_data_train,
    predictor = "gene",
    mu_formula = "cell_type",
    sigma_formula = "cell_type",
    family_use = "nb",
    n_cores = 2,
    usebam = FALSE,
    parallelization = "pbmcmapply"
)

MOBSC_copula <- fit_copula(
    sce = sce_train,
    assay_use = "counts",
    marginal_list = MOBSC_marginal,
    family_use = "nb",
    copula = "gaussian",
    n_cores = 2,
    input_data = con_data_train$dat
)
MOBSC_para <- extract_para(
    sce = sce_train,
    marginal_list = MOBSC_marginal,
    n_cores = 1,
    family_use = "nb",
    new_covariate = con_data_test$newCovariate,
    data = con_data_train$dat
)

MOBSC_newcount <- simu_new(
    sce = sce_train,
    mean_mat = MOBSC_para$mean_mat,
    sigma_mat = MOBSC_para$sigma_mat,
    zero_mat = MOBSC_para$zero_mat,
    quantile_mat = NULL,
    copula_list = MOBSC_copula$copula_list,
    n_cores = 2,
    family_use = "nb",
    input_data = con_data_train$dat,
    new_covariate = con_data_test$dat,
    filtered_gene = con_data_train$filtered_gene
)
qw130 commented 3 months ago

Hi Feng,

I just uploaded the code. The error is due to the incorrect data format that results when selecting only one column from a data frame.

I tested the newest code with your script above. The data with the link (https://figshare.com/ndownloader/files/40581980) ran with no error. The data with the link (https://figshare.com/ndownloader/files/40581965) produces an error. However, it causes an error because of the training and testing data. The training data does not contain CD14+ monocyte cells, while the testing data does; thus, the GAMLSS model can't predict the parameters for an unseen cell type. Once I removed the three CD14+ monocyte cells from the testing data, the data with the link (https://figshare.com/ndownloader/files/40581965) also ran with no error.

Thanks for letting us know about this bug, and please let us know if you have further questions.

feng-bao-ucsf commented 3 months ago

Thank you so much for the update. It works perfectly on the test datasets. I am working to extend the test to additional datasets and will close this slot shortly.

feng-bao-ucsf commented 3 months ago

I did another test on prediction of new samples but based on spatial positions. Here is my test code:

library(scDesign3)
library(SingleCellExperiment)
library(dplyr)

example_sce <- readRDS((url("https://figshare.com/ndownloader/files/40582019")))
# example_sce <- readRDS((url("https://figshare.com/ndownloader/files/40581977")))
sce_train <- example_sce[1:10, 1:50]
sce_test <- example_sce[1:10, 51:80]

set.seed(123)
con_data_train <- construct_data(
    sce = sce_train,
    assay_use = "counts",
    celltype = "cell_type",
    pseudotime = NULL,
    spatial = c("spatial1", "spatial2"),
    other_covariates = NULL,
    corr_by = "1"
)

con_data_test <- construct_data(
    sce = sce_test,
    assay_use = "counts",
    celltype = "cell_type",
    pseudotime = NULL,
    spatial = c("spatial1", "spatial2"),
    other_covariates = NULL,
    corr_by = "1"
)

MOBSC_marginal <- fit_marginal(
    data = con_data_train,
    predictor = "gene",
    mu_formula = "s(spatial1, spatial2, bs = 'gp', k= 400)",
    sigma_formula =1,
    family_use = "nb",
    n_cores = 2,
    usebam = FALSE,
    parallelization = "pbmcmapply"
)

MOBSC_copula <- fit_copula(
    sce = sce_train,
    assay_use = "counts",
    marginal_list = MOBSC_marginal,
    family_use = "nb",
    copula = "gaussian",
    n_cores = 2,
    input_data = con_data_train$dat
)

MOBSC_para <- extract_para(
    sce = sce_train,
    marginal_list = MOBSC_marginal,
    n_cores = 1,
    family_use = "nb",
    new_covariate = con_data_test$newCovariate,
    data = con_data_train$dat
)

MOBSC_newcount <- simu_new(
    sce = sce_train,
    mean_mat = MOBSC_para$mean_mat,
    sigma_mat = MOBSC_para$sigma_mat,
    zero_mat = MOBSC_para$zero_mat,
    quantile_mat = NULL,
    copula_list = MOBSC_copula$copula_list,
    n_cores = 2,
    family_use = "nb",
    input_data = con_data_train$dat,
    new_covariate = con_data_test$dat,
    filtered_gene = con_data_train$filtered_gene
)

The code stops at fit_copula, with error messages:

Error in `colnames<-`(`*tmp*`, value = rownames(sce)): attempt to set 'colnames' on an object with less than two dimensions
Traceback:

1. fit_copula(sce = sce_train, assay_use = "counts", marginal_list = MOBSC_marginal, 
 .     family_use = "nb", copula = "gaussian", n_cores = 2, input_data = con_data_train$dat)
2. convert_n(sce = sce[qc_gene_idx, ], assay_use = assay_use, marginal_list = marginal_list[qc_gene_idx], 
 .     data = input_data, DT = DT, pseudo_obs = pseudo_obs, n_cores = n_cores, 
 .     family_use = family_use, epsilon = epsilon, parallelization = parallelization, 
 .     BPPARAM = BPPARAM)
3. `colnames<-`(`*tmp*`, value = rownames(sce))
4. stop("attempt to set 'colnames' on an object with less than two dimensions")

Any ideas on this error? Really appreciate your help!