Open feng-bao-ucsf opened 4 months ago
Hi Feng, Sure, we have this function. Please check https://songdongyuan1994.github.io/scDesign3/docs/articles/scDesign3-introduction-vignette.html
It is not written clearly (I apologize); basically, in step-by-step mode, the final simu_new
function has a parameter: new_covariate
. Just input your new labels here. Please let me know if it works for you.
Thank you so much for the quick reply, I tried to make a toy model for this, where I have a sce_train for model fitting and a sce_test for just inference. I only used sce_test in the last step. It seems there was a mismatching shape issue. Can you help me check this?
set.seed(123)
con_data_train <- construct_data(
sce = sce_train,
assay_use = "counts",
celltype = "label",
pseudotime = NULL,
spatial = NULL,
other_covariates = NULL,
corr_by = "1"
)
con_data_test <- construct_data(
sce = sce_test,
assay_use = "counts",
celltype = "label",
pseudotime = NULL,
spatial = NULL,
other_covariates = NULL,
corr_by = "1"
)
MOBSC_marginal <- fit_marginal(
data = con_data_train,
predictor = "gene",
mu_formula = "label",
sigma_formula = "label",
family_use = "nb",
n_cores = 2,
usebam = FALSE,
parallelization = "pbmcmapply"
)
MOBSC_copula <- fit_copula(
sce = sce_train,
assay_use = "counts",
marginal_list = MOBSC_marginal,
family_use = "nb",
copula = "gaussian",
n_cores = 2,
input_data = con_data_train$dat
)
MOBSC_para <- extract_para(
sce = sce_train,
marginal_list = MOBSC_marginal,
n_cores = 2,
family_use = "nb",
new_covariate = con_data_train$newCovariate,
data = con_data_train$dat
)
MOBSC_newcount <- simu_new(
sce = sce_train,
mean_mat = MOBSC_para$mean_mat,
sigma_mat = MOBSC_para$sigma_mat,
zero_mat = MOBSC_para$zero_mat,
quantile_mat = NULL,
copula_list = MOBSC_copula$copula_list,
n_cores = 2,
family_use = "nb",
input_data = con_data_test$dat,
new_covariate = con_data_test$newCovariate,
filtered_gene = con_data_test$filtered_gene
)
Oh I forgot this. extract_para
also has the parameter new_covariate
. Please also replace it with your test data. Thanks! filtered_gene
in the last step should still use training data.
Oh I forgot this.
extract_para
also has the parameternew_covariate
. Please also replace it with your test data. Thanks!filtered_gene
in the last step should still use training data.
Thank you for your information. I tested as suggested.
MOBSC_copula <- fit_copula(
sce = sce_train,
assay_use = "counts",
marginal_list = MOBSC_marginal,
family_use = "nb",
copula = "gaussian",
n_cores = 1,
input_data = con_data_train$dat
)
MOBSC_para <- extract_para(
sce = sce_train,
marginal_list = MOBSC_marginal,
n_cores = 1,
family_use = "nb",
new_covariate = con_data_test$newCovariate,
data = con_data_train$dat
)
MOBSC_newcount <- simu_new(
sce = sce_train,
mean_mat = MOBSC_para$mean_mat,
sigma_mat = MOBSC_para$sigma_mat,
zero_mat = MOBSC_para$zero_mat,
quantile_mat = NULL,
copula_list = MOBSC_copula$copula_list,
n_cores = 2,
family_use = "nb",
input_data = con_data_train$dat,
new_covariate = con_data_test$newCovariate,
filtered_gene = con_data_train$filtered_gene
)
The code stopped atextract_para
. The error message is:
error in eval(predvars, data, env): object 'label' not found
Traceback:
1. extract_para(sce = sce_train, marginal_list = MOBSC_marginal,
. n_cores = 1, family_use = "nb", new_covariate = con_data_test$newCovariate,
. data = con_data_train$dat)
2. suppressMessages(paraFunc(mat_function, x = seq_len(dim(sce)[1])[qc_gene_idx],
. y = family_use, SIMPLIFY = FALSE, mc.cores = n_cores))
3. withCallingHandlers(expr, message = function(c) if (inherits(c,
. classes)) tryInvokeRestart("muffleMessage"))
4. paraFunc(mat_function, x = seq_len(dim(sce)[1])[qc_gene_idx],
. y = family_use, SIMPLIFY = FALSE, mc.cores = n_cores)
5. mclapply(seq_len(n), do_one, mc.preschedule = mc.preschedule,
. mc.set.seed = mc.set.seed, mc.silent = mc.silent, mc.cores = mc.cores,
. mc.cleanup = mc.cleanup, affinity.list = affinity.list)
6. lapply(X = X, FUN = FUN, ...)
7. FUN(X[[i]], ...)
8. .mapply(FUN, dots, MoreArgs)
9. (function (x, y)
. {
But I checked con_data_test$newCovariate
, and confirmed there is a column called label
.
I also tested if I replaced con_data_test$newCovariate
with con_data_train$newCovariate
, extract_para
worked fine, but the code will stop at simu_new
.
I made a minimal test script here. Really appreciate your help!
library(scDesign3)
library(SingleCellExperiment)
library(dplyr)
example_sce <- readRDS((url("https://figshare.com/ndownloader/files/40582019")))
sce_train <- example_sce[1:10,1:50 ]
sce_test <- example_sce[1:10,51:80]
set.seed(123)
con_data_train <- construct_data(
sce = sce_train,
assay_use = "counts",
celltype = "cell_type",
pseudotime = NULL,
spatial = NULL,
other_covariates = NULL,
corr_by = "1"
)
con_data_test <- construct_data(
sce = sce_test,
assay_use = "counts",
celltype = "cell_type",
pseudotime = NULL,
spatial = NULL,
other_covariates = NULL,
corr_by = "1"
)
MOBSC_marginal <- fit_marginal(
data = con_data_train,
predictor = "gene",
mu_formula = "cell_type",
sigma_formula = "cell_type",
family_use = "nb",
n_cores = 2,
usebam = FALSE,
parallelization = "pbmcmapply"
)
MOBSC_copula <- fit_copula(
sce = sce_train,
assay_use = "counts",
marginal_list = MOBSC_marginal,
family_use = "nb",
copula = "gaussian",
n_cores = 2,
input_data = con_data_train$dat
)
MOBSC_para <- extract_para(
sce = sce_train,
marginal_list = MOBSC_marginal,
n_cores = 1,
family_use = "nb",
new_covariate = con_data_test$newCovariate,
data = con_data_train$dat
)
MOBSC_newcount <- simu_new(
sce = sce_train,
mean_mat = MOBSC_para$mean_mat,
sigma_mat = MOBSC_para$sigma_mat,
zero_mat = MOBSC_para$zero_mat,
quantile_mat = NULL,
copula_list = MOBSC_copula$copula_list,
n_cores = 2,
family_use = "nb",
input_data = con_data_train$dat,
new_covariate = con_data_test$newCovariate,
filtered_gene = con_data_train$filtered_gene
)
Thanks for this demo! I found a bug and we will update the package as soon as possible. I will @ you once we update the package.
Thanks for this demo! I found a bug and we will update the package as soon as possible. I will @ you once we update the package.
Thank you and look forward to your update.
Hi Feng, @feng-bao-ucsf
I just updated the code. The newest code on GitHub should work.
When I was running the demo code you provided, I noticed that one parameter (new_covariate
) in simu_new
was not set properly, which will cause an error when you run simu_new
. Please use the following code to run simu_new
.
MOBSC_newcount <- simu_new(
sce = sce_train,
mean_mat = MOBSC_para$mean_mat,
sigma_mat = MOBSC_para$sigma_mat,
zero_mat = MOBSC_para$zero_mat,
quantile_mat = NULL,
copula_list = MOBSC_copula$copula_list,
n_cores = 2,
family_use = "nb",
input_data = con_data_train$dat,
new_covariate = con_data_test$dat,
filtered_gene = con_data_train$filtered_gene
)
The parameter new_covariate
in simu_new
needs to be a dataframe
which contains the columns for your specified covariates and a column named corr_group
. con_data_test$dat
will contain all the columns needed, but con_data_test$newCovariate
will not contain the column corr_group
if you don't modify the ncell
parameter in construct_data
.
Thanks again for your interest in scDesign3. Please let us know if you have any further questions!
MOBSC_newcount <- simu_new( sce = sce_train, mean_mat = MOBSC_para$mean_mat, sigma_mat = MOBSC_para$sigma_mat, zero_mat = MOBSC_para$zero_mat, quantile_mat = NULL, copula_list = MOBSC_copula$copula_list, n_cores = 2, family_use = "nb", input_data = con_data_train$dat, new_covariate = con_data_test$dat, filtered_gene = con_data_train$filtered_gene )
Thank you so much for the quick response. I tested the code using the data from figshare, and it ran perfectly. However, when I tried to run with my own data, I found there might be some issues for the extract_para
step. I tried multiple datasets and the error information was the same.
Here is my code to test the data and the data:
library(scDesign3)
library(SingleCellExperiment)
library(ggplot2)
library(dplyr)
data <- read.csv("data_intestine.csv", row.names = 1)
meta <- read.csv("meta_intestine.csv", row.names = 1)
meta <- meta[, c("x", "y", "label")]
colnames(meta) <- c("x", "y", "cell_type")
# Add 1 to each element in the 'label' column
meta$label <- meta$cell_type + 1
sce <- SingleCellExperiment(
assays = list(counts = t(data)),
colData = meta
)
sce_train <- sce[1:10, 1:50]
sce_test <- sce[1:10, 51:80]
set.seed(123)
con_data_train <- construct_data(
sce = sce_train,
assay_use = "counts",
celltype = "cell_type",
pseudotime = NULL,
spatial = NULL,
other_covariates = NULL,
corr_by = "1"
)
con_data_test <- construct_data(
sce = sce_test,
assay_use = "counts",
celltype = "cell_type",
pseudotime = NULL,
spatial = NULL,
other_covariates = NULL,
corr_by = "1"
)
MOBSC_marginal <- fit_marginal(
data = con_data_train,
predictor = "gene",
mu_formula = "cell_type",
sigma_formula = "cell_type",
family_use = "nb",
n_cores = 2,
usebam = FALSE,
parallelization = "pbmcmapply"
)
MOBSC_copula <- fit_copula(
sce = sce_train,
assay_use = "counts",
marginal_list = MOBSC_marginal,
family_use = "nb",
copula = "gaussian",
n_cores = 2,
input_data = con_data_train$dat
)
MOBSC_para <- extract_para(
sce = sce_train,
marginal_list = MOBSC_marginal,
n_cores = 1,
family_use = "nb",
new_covariate = con_data_test$newCovariate,
data = con_data_train$dat
)
MOBSC_newcount <- simu_new(
sce = sce_train,
mean_mat = MOBSC_para$mean_mat,
sigma_mat = MOBSC_para$sigma_mat,
zero_mat = MOBSC_para$zero_mat,
quantile_mat = NULL,
copula_list = MOBSC_copula$copula_list,
n_cores = 2,
family_use = "nb",
input_data = con_data_train$dat,
new_covariate = con_data_test$dat,
filtered_gene = con_data_train$filtered_gene
)
The error info is:
Error in X[, pstart[i] - 1 + 1:object$nsdf[i]] <- Xp: number of items to replace is not a multiple of replacement length
Traceback:
1. extract_para(sce = sce_train, marginal_list = MOBSC_marginal,
. n_cores = 1, family_use = "nb", new_covariate = con_data_test$newCovariate,
. data = con_data_train$dat)
2. suppressMessages(paraFunc(mat_function, x = seq_len(dim(sce)[1])[qc_gene_idx],
. y = family_use, SIMPLIFY = FALSE, mc.cores = n_cores))
3. withCallingHandlers(expr, message = function(c) if (inherits(c,
. classes)) tryInvokeRestart("muffleMessage"))
4. paraFunc(mat_function, x = seq_len(dim(sce)[1])[qc_gene_idx],
. y = family_use, SIMPLIFY = FALSE, mc.cores = n_cores)
5. mclapply(seq_len(n), do_one, mc.preschedule = mc.preschedule,
. mc.set.seed = mc.set.seed, mc.silent = mc.silent, mc.cores = mc.cores,
. mc.cleanup = mc.cleanup, affinity.list = affinity.list)
6. lapply(X = X, FUN = FUN, ...)
7. FUN(X[[i]], ...)
8. .mapply(FUN, dots, MoreArgs)
9. (function (x, y)
. {
. fit <- marginal_list[[x]]
Dear Dongyuan and Qingyang, I also tried other example datasets from the scDesign3. It seems the same error message also happened for those. Can you help check that? Really appreciate it.
library(scDesign3)
library(SingleCellExperiment)
library(dplyr)
example_sce <- readRDS((url("https://figshare.com/ndownloader/files/40581980")))
# I also tried this dataset
# example_sce <- readRDS((url("https://figshare.com/ndownloader/files/40581965")))
sce_train <- example_sce[1:10, 1:50]
sce_test <- example_sce[1:10, 51:80]
set.seed(123)
con_data_train <- construct_data(
sce = sce_train,
assay_use = "counts",
celltype = "cell_type",
pseudotime = NULL,
spatial = NULL,
other_covariates = NULL,
corr_by = "1"
)
con_data_test <- construct_data(
sce = sce_test,
assay_use = "counts",
celltype = "cell_type",
pseudotime = NULL,
spatial = NULL,
other_covariates = NULL,
corr_by = "1"
)
MOBSC_marginal <- fit_marginal(
data = con_data_train,
predictor = "gene",
mu_formula = "cell_type",
sigma_formula = "cell_type",
family_use = "nb",
n_cores = 2,
usebam = FALSE,
parallelization = "pbmcmapply"
)
MOBSC_copula <- fit_copula(
sce = sce_train,
assay_use = "counts",
marginal_list = MOBSC_marginal,
family_use = "nb",
copula = "gaussian",
n_cores = 2,
input_data = con_data_train$dat
)
MOBSC_para <- extract_para(
sce = sce_train,
marginal_list = MOBSC_marginal,
n_cores = 1,
family_use = "nb",
new_covariate = con_data_test$newCovariate,
data = con_data_train$dat
)
MOBSC_newcount <- simu_new(
sce = sce_train,
mean_mat = MOBSC_para$mean_mat,
sigma_mat = MOBSC_para$sigma_mat,
zero_mat = MOBSC_para$zero_mat,
quantile_mat = NULL,
copula_list = MOBSC_copula$copula_list,
n_cores = 2,
family_use = "nb",
input_data = con_data_train$dat,
new_covariate = con_data_test$dat,
filtered_gene = con_data_train$filtered_gene
)
Hi Feng,
I just uploaded the code. The error is due to the incorrect data format that results when selecting only one column from a data frame.
I tested the newest code with your script above. The data with the link (https://figshare.com/ndownloader/files/40581980) ran with no error. The data with the link (https://figshare.com/ndownloader/files/40581965) produces an error. However, it causes an error because of the training and testing data. The training data does not contain CD14+ monocyte cells, while the testing data does; thus, the GAMLSS model can't predict the parameters for an unseen cell type. Once I removed the three CD14+ monocyte cells from the testing data, the data with the link (https://figshare.com/ndownloader/files/40581965) also ran with no error.
Thanks for letting us know about this bug, and please let us know if you have further questions.
Thank you so much for the update. It works perfectly on the test datasets. I am working to extend the test to additional datasets and will close this slot shortly.
I did another test on prediction of new samples but based on spatial positions. Here is my test code:
library(scDesign3)
library(SingleCellExperiment)
library(dplyr)
example_sce <- readRDS((url("https://figshare.com/ndownloader/files/40582019")))
# example_sce <- readRDS((url("https://figshare.com/ndownloader/files/40581977")))
sce_train <- example_sce[1:10, 1:50]
sce_test <- example_sce[1:10, 51:80]
set.seed(123)
con_data_train <- construct_data(
sce = sce_train,
assay_use = "counts",
celltype = "cell_type",
pseudotime = NULL,
spatial = c("spatial1", "spatial2"),
other_covariates = NULL,
corr_by = "1"
)
con_data_test <- construct_data(
sce = sce_test,
assay_use = "counts",
celltype = "cell_type",
pseudotime = NULL,
spatial = c("spatial1", "spatial2"),
other_covariates = NULL,
corr_by = "1"
)
MOBSC_marginal <- fit_marginal(
data = con_data_train,
predictor = "gene",
mu_formula = "s(spatial1, spatial2, bs = 'gp', k= 400)",
sigma_formula =1,
family_use = "nb",
n_cores = 2,
usebam = FALSE,
parallelization = "pbmcmapply"
)
MOBSC_copula <- fit_copula(
sce = sce_train,
assay_use = "counts",
marginal_list = MOBSC_marginal,
family_use = "nb",
copula = "gaussian",
n_cores = 2,
input_data = con_data_train$dat
)
MOBSC_para <- extract_para(
sce = sce_train,
marginal_list = MOBSC_marginal,
n_cores = 1,
family_use = "nb",
new_covariate = con_data_test$newCovariate,
data = con_data_train$dat
)
MOBSC_newcount <- simu_new(
sce = sce_train,
mean_mat = MOBSC_para$mean_mat,
sigma_mat = MOBSC_para$sigma_mat,
zero_mat = MOBSC_para$zero_mat,
quantile_mat = NULL,
copula_list = MOBSC_copula$copula_list,
n_cores = 2,
family_use = "nb",
input_data = con_data_train$dat,
new_covariate = con_data_test$dat,
filtered_gene = con_data_train$filtered_gene
)
The code stops at fit_copula
, with error messages:
Error in `colnames<-`(`*tmp*`, value = rownames(sce)): attempt to set 'colnames' on an object with less than two dimensions
Traceback:
1. fit_copula(sce = sce_train, assay_use = "counts", marginal_list = MOBSC_marginal,
. family_use = "nb", copula = "gaussian", n_cores = 2, input_data = con_data_train$dat)
2. convert_n(sce = sce[qc_gene_idx, ], assay_use = assay_use, marginal_list = marginal_list[qc_gene_idx],
. data = input_data, DT = DT, pseudo_obs = pseudo_obs, n_cores = n_cores,
. family_use = family_use, epsilon = epsilon, parallelization = parallelization,
. BPPARAM = BPPARAM)
3. `colnames<-`(`*tmp*`, value = rownames(sce))
4. stop("attempt to set 'colnames' on an object with less than two dimensions")
Any ideas on this error? Really appreciate your help!
Hi Dongyuan,
Thanks for sharing the work. I have a question about using the model: can I first train model on one dataset and learn the relation between cell type and omics parameters; then I have a new list of cell types, can I use trained model to generate the gene data based on the new cell type list? I did not find this function from examples, maybe I missed that.
Thank you!
Feng