Describe the bug
I ran dsb normalization with muon.prot.pp.dsb() and also with the original implementation in R, with the function DSBNormalizeProtein(). The results are not the same.
One possible issue is the float overflow problem that I mentioned in my other issue #130. But even when using float64 to solve that problem, still the results are different. I am not sure what is the cause.
I applied only step I of dsb normalization, without applying step II (without denoising and isotype controls).
In the following code, I very slightly modified the muon.prot.pp.dsb() function in order to 1) store the means and stds in the AnnData object and 2) to be able to choose between mean subtracting and standardization (mean subtracting and dividing by std). Then, I tried running dsb normalization with float32 (default) and with float64.
import muon
import scanpy as sc
import numpy as np
from typing import Optional, Iterable, Tuple, Union
from numbers import Integral, Real
from warnings import warn
import numpy as np
import pandas as pd
from scipy.sparse import issparse, csc_matrix, csr_matrix
from sklearn.mixture import GaussianMixture
from sklearn.decomposition import PCA
from sklearn.linear_model import LinearRegression
from anndata import AnnData
from mudata import MuData
def dsb(
data: Union[AnnData, MuData],
data_raw: Optional[Union[AnnData, MuData]] = None,
pseudocount: Integral = 10,
denoise_counts: bool = True,
isotype_controls: Optional[Iterable[str]] = None,
empty_counts_range: Optional[Tuple[Real, Real]] = None,
cell_counts_range: Optional[Tuple[Real, Real]] = None,
add_layer: bool = False,
scale_factor: str = "standardize",
mean_name = "mean_empty",
std_name = "std_empty",
random_state: Optional[Union[int, np.random.RandomState, None]] = None,
) -> Union[None, MuData]:
toreturn = None
if data_raw is None:
if empty_counts_range is None or cell_counts_range is None:
raise ValueError(
"data_raw is None, assuming data is the unfiltered object, but no count ranges provided"
)
if max(*empty_counts_range) > min(*cell_counts_range):
raise ValueError("overlapping count ranges")
if not isinstance(data, MuData) or "prot" not in data.mod or "rna" not in data.mod:
raise TypeError(
"No data_raw given, assuming data is the unfiltered object, but data is not MuData"
" or does not contain 'prot' and 'rna' modalities"
)
if data.mod["rna"].n_obs != data.mod["prot"].n_obs:
raise ValueError("different numbers of cells in 'rna' and 'prot' modalities.")
log10umi = np.log10(np.asarray(data.mod["rna"].X.sum(axis=1)).squeeze() + 1)
empty_idx = np.where(
(log10umi >= min(*empty_counts_range)) & (log10umi < max(*empty_counts_range))
)[0]
cell_idx = np.where(
(log10umi >= min(*cell_counts_range)) & (log10umi < max(*cell_counts_range))
)[0]
cellidx = data.mod["prot"].obs_names[cell_idx]
empty = data.mod["prot"][empty_idx, :]
data = data[cellidx, :].copy()
cells = data.mod["prot"]
toreturn = data
elif isinstance(data_raw, AnnData):
empty = data_raw
elif isinstance(data_raw, MuData) and "prot" in data_raw.mod:
empty = data_raw["prot"]
else:
raise TypeError("data_raw must be an AnnData or a MuData object with 'prot' modality")
if isinstance(data, AnnData):
cells = data
elif isinstance(data, MuData) and "prot" in data.mod:
cells = data["prot"]
else:
raise TypeError("data must be an AnnData or a MuData object with 'prot' modality")
if pseudocount < 0:
raise ValueError("pseudocount cannot be negative")
if cells.shape[1] != empty.shape[1]: # this should only be possible if mudata_raw != None
raise ValueError("data and data_raw have different numbers of proteins")
if empty_counts_range is None: # mudata_raw != None
warn(
"empty_counts_range values are not provided, treating all the non-cells as empty droplets"
)
empty = empty[~empty.obs_names.isin(cells.obs_names)]
else:
warn(
f"empty_counts_range will be deprecated in the future versions",
DeprecationWarning,
stacklevel=2,
)
if data_raw is not None:
if not isinstance(data_raw, MuData) or "rna" not in data_raw.mod:
warn(
"data_raw must be a MuData object with 'rna' modality, ignoring empty_counts_range and treating all the non-cells as empty droplets"
)
empty = empty[~empty.obs_names.isin(cells.obs_names)]
else:
# data_raw is a MuData with 'rna' modality and empty_counts_range values are provided
log10umi = np.log10(np.asarray(data_raw.mod["rna"].X.sum(axis=1)).squeeze() + 1)
bc_umis = pd.DataFrame({"log10umi": log10umi}, index=data_raw.mod["rna"].obs_names)
empty_droplets = bc_umis.query(
f"log10umi >= {min(*empty_counts_range)} & log10umi < {max(*empty_counts_range)}"
).index.values
empty_len_orig = len(empty_droplets)
empty_droplets = np.array([i for i in empty_droplets if i not in cells.obs_names])
empty_len = len(empty_droplets)
if empty_len != empty_len_orig:
warn(
f"Dropping {empty_len_orig - empty_len} empty droplets as they are already defined as cells"
)
empty = empty[empty_droplets].copy()
if data_raw is not None and cell_counts_range is not None:
warn("cell_counts_range values are ignored since cells are provided in data")
empty_scaled = (
np.log(empty.X + pseudocount)
if not issparse(empty.X)
else np.log(empty.X.toarray() + pseudocount)
)
cells_scaled = (
np.log(cells.X + pseudocount)
if not issparse(cells.X)
else np.log(cells.X.toarray() + pseudocount)
)
if scale_factor == "standardize":
cells_scaled = (cells_scaled - empty_scaled.mean(axis=0)) / empty_scaled.std(axis=0)
elif scale_factor == "mean_subtract":
cells_scaled = cells_scaled - empty_scaled.mean(axis=0)
else:
raise ValueError("Unknown scale factor")
if denoise_counts:
bgmeans = np.empty(cells_scaled.shape[0], np.float32)
# init_params needs to be random, otherwise fitted variance for one of the n_components
# sometimes goes to 0
sharedvar = GaussianMixture(
n_components=2, covariance_type="tied", init_params="random", random_state=random_state
)
separatevar = GaussianMixture(
n_components=2, covariance_type="full", init_params="random", random_state=random_state
)
for c in range(cells_scaled.shape[0]):
sharedvar.fit(cells_scaled[c, :, np.newaxis])
separatevar.fit(cells_scaled[c, :, np.newaxis])
if sharedvar.bic(cells_scaled[c, :, np.newaxis]) < separatevar.bic(
cells_scaled[c, :, np.newaxis]
):
bgmeans[c] = np.min(sharedvar.means_)
else:
bgmeans[c] = np.min(separatevar.means_)
if isotype_controls is not None:
pca = PCA(n_components=1, whiten=True)
ctrl_idx = np.where(cells.var_names.isin(set(isotype_controls)))[0]
if len(ctrl_idx) < len(isotype_controls):
warn("Some isotype controls are not present in the data.")
covar = pca.fit_transform(
np.hstack((cells_scaled[:, ctrl_idx], bgmeans.reshape(-1, 1)))
)
else:
covar = bgmeans[:, np.newaxis]
reg = LinearRegression(fit_intercept=True, copy_X=False)
reg.fit(covar, cells_scaled)
cells_scaled -= reg.predict(covar) - reg.intercept_
if add_layer:
cells.layers["dsb"] = cells_scaled
else:
cells.X = cells_scaled
cells.uns[mean_name] = empty_scaled.mean(axis=0)
cells.uns[std_name] = empty_scaled.std(axis=0)
return toreturn
raw = sc.read_10x_h5("/home/marchena/Downloads/5k_pbmc_protein_v3_nextgem_raw_feature_bc_matrix.h5", gex_only=False)
filtered = sc.read_10x_h5("/home/marchena/Downloads/5k_pbmc_protein_v3_nextgem_filtered_feature_bc_matrix.h5", gex_only=False)
# Add cellranger background as .obs.
cellranger_background = [barcode not in filtered.obs_names for barcode in raw.obs_names]
raw.obs["cellranger_background"] = np.array(cellranger_background).astype(str)
# Separate into RNA and protein.
raw_rna = raw[:, raw.var.feature_types == "Gene Expression"].copy()
raw_protein = raw[:, raw.var.feature_types == "Antibody Capture"].copy()
raw_protein.X = raw_protein.X.astype(np.float64)
# Filter to protein of real cells (CellRanger defined cells) and background (empty droplets)·
filtered_protein = raw_protein[filtered.obs_names, :].copy()
# filtered_protein = filtered_protein[:, sorted(filtered_protein.var_names)].copy() # Sort protein names alphabetically.
cellranger_background_protein = raw_protein[raw_protein.obs["cellranger_background"] == True].copy()
# Performing DSB normalization
print("performing DSB normalization")
# DSB normalization with CellRanger background.
dsb(data=filtered_protein, data_raw=raw_protein, add_layer=True, denoise_counts=False, scale_factor="standardize")
filtered_protein.layers["dsb_cellranger_background_standardized"] = filtered_protein.layers["dsb"]
# DSB normalization with CellRanger background.
dsb(data=filtered_protein, data_raw=raw_protein, add_layer=True, denoise_counts=False, scale_factor="mean_subtract")
filtered_protein.layers["dsb_cellranger_background_mean_subtracted"] = filtered_protein.layers["dsb"]
dsb(data=filtered_protein, data_raw=raw_protein, add_layer=True, denoise_counts=False)
print(filtered_protein["AAACCCACAGGCTTGC-1", :].layers["dsb_cellranger_background_standardized"])
print(filtered_protein["AAACCCACAGGCTTGC-1", :].layers["dsb_cellranger_background_mean_subtracted"])
Expected behaviour
I expect resulting values from dsb normalization to be the same when using the muon.prot.pp.dsb() function and when using the DSBNormalizeProtein() function.
At least the results are the same in muon and DSBNormalizeProtein in the dsb-normalized mean-subtracted data. But they differ quite a lot for the dsb-normalized standardized data.
System
OS: Debian 12
Python version 3.10.9, R version 4.3.2
Versions of libraries involved:
muon 0.1.3
numpy 1.23.5
dsb (in R) 1.0.3
Seurat (in R) 5.0.1
Describe the bug I ran dsb normalization with muon.prot.pp.dsb() and also with the original implementation in R, with the function DSBNormalizeProtein(). The results are not the same.
One possible issue is the float overflow problem that I mentioned in my other issue #130. But even when using float64 to solve that problem, still the results are different. I am not sure what is the cause.
I applied only step I of dsb normalization, without applying step II (without denoising and isotype controls).
To Reproduce You can download public 10X data used for reproduction here: https://support.10xgenomics.com/single-cell-gene-expression/datasets/3.0.2/5k_pbmc_protein_v3_nextgem
Code in Python:
In the following code, I very slightly modified the muon.prot.pp.dsb() function in order to 1) store the means and stds in the AnnData object and 2) to be able to choose between mean subtracting and standardization (mean subtracting and dividing by std). Then, I tried running dsb normalization with float32 (default) and with float64.
Code in R:
I chose one random cell, and I printed the results with muon and with DSBNormalizeProtein.
Results:
Variable names (I made sure they are in the same order in Python and in R):
['CD3_TotalSeqB', 'CD4_TotalSeqB', 'CD8a_TotalSeqB', 'CD11b_TotalSeqB', 'CD14_TotalSeqB', 'CD15_TotalSeqB', 'CD16_TotalSeqB', 'CD19_TotalSeqB', 'CD20_TotalSeqB', 'CD25_TotalSeqB', 'CD27_TotalSeqB', 'CD28_TotalSeqB', 'CD34_TotalSeqB', 'CD45RA_TotalSeqB', 'CD45RO_TotalSeqB', 'CD56_TotalSeqB', 'CD62L_TotalSeqB', 'CD69_TotalSeqB', 'CD80_TotalSeqB', 'CD86_TotalSeqB', 'CD127_TotalSeqB', 'CD137_TotalSeqB', 'CD197_TotalSeqB', 'CD274_TotalSeqB', 'CD278_TotalSeqB', 'CD335_TotalSeqB', 'PD-1_TotalSeqB', 'HLA-DR_TotalSeqB', 'TIGIT_TotalSeqB', 'IgG1_control_TotalSeqB', 'IgG2a_control_TotalSeqB', 'IgG2b_control_TotalSeqB']
Results with muon (standardized):
filtered_protein["AAACCCACAGGCTTGC-1", :].layers["dsb_cellranger_background_standardized"] ArrayView([[ 2.24209817e+01, 7.97605222e+01, 3.27524343e+01, 1.01896238e+02, 6.72075433e+01, 2.54922213e+01, 6.30766888e+00, 2.69828327e+01, 6.59583746e+00, 2.09951417e+01, 9.61899233e+00, 1.32811409e+01, -7.53716050e-03, 8.59540199e+00, 1.37866711e+02, 1.77650023e+01, 4.33477639e+01, 3.35010410e+01, 1.16825349e+01, 1.64375925e+02, 1.50746341e+01, 2.98116927e+01, 2.02711318e+01, 1.33769954e+01, 2.27085405e+01, 4.91354419e+01, 2.07959619e+01, 4.14388818e+01, 3.06126357e+01, 3.43748698e+01, 1.80149515e+01, 2.04173610e+01]])
Results with DSBNormalizeProtein(standardized):
print(as.numeric(gsub("[^0-9.-]", " ", protein_norm_cellranger_background_standardized[,"AAACCCACAGGCTTGC-1"]))) [1] 8.43003277 31.01810317 16.03528261 56.81191511 47.10295457 19.91736582 [7] 3.24141469 12.13492974 3.98687791 13.57313381 5.63090276 7.08948603 [13] -0.01155433 4.62074321 55.41078980 9.01906626 24.64123381 28.86757574 [19] 8.85428064 57.70327936 7.93175744 22.99841186 17.26202583 11.19919058 [25] 9.72820548 29.96541035 12.77329873 25.89788880 17.45682287 28.27723582 [31] 14.04324470 17.78213660
Results with muon(mean-subtracted):
filtered_protein["AAACCCACAGGCTTGC-1", :].layers["dsb_cellranger_background_mean_subtracted"] ArrayView([[ 7.85233208e-01, 3.36662951e+00, 6.40288604e-01, 5.74106719e+00, 4.32003687e+00, 6.39655516e-01, 9.41907551e-02, 4.04341785e-01, 1.79907069e-01, 3.35326355e-01, 5.25338134e-01, 4.66695892e-01, -6.05242030e-06, 4.00781733e-01, 2.92156372e+00, 2.61271244e-01, 1.09694506e+00, 1.52165582e+00, 9.48765079e-02, 2.85510166e+00, 3.34572583e-01, 3.35747133e-01, 6.38931880e-01, 1.81421583e-01, 4.68342352e-01, 5.86978133e-01, 3.35253473e-01, 2.40091659e+00, 3.35813684e-01, 4.04691723e-01, 1.81708783e-01, 3.35230944e-01]])
Results with DSBNormalizeProtein(mean-subtracted):
print(as.numeric(gsub("[^0-9.-]", " ", protein_norm_cellranger_background_mean_subtracted[,"AAACCCACAGGCTTGC-1"]))) [1] 0.78316953 3.36412786 0.63969682 5.73934891 4.31871258 0.63915635 [7] 0.09385562 0.40392249 0.17925452 0.33492792 0.52356371 0.46534380 [13] -1.74122050506931e-05 0.39906340 2.92043190 0.26088380 1.09611706 1.52100442 [19] 0.09474775 2.85426741 0.33377382 0.33555682 0.63843736 0.18123752 [25] 0.46732050 0.58668348 0.33482056 2.39930670 0.33554836 0.40453866 [31] 0.18155219 0.33503109
Expected behaviour I expect resulting values from dsb normalization to be the same when using the muon.prot.pp.dsb() function and when using the DSBNormalizeProtein() function.
At least the results are the same in muon and DSBNormalizeProtein in the dsb-normalized mean-subtracted data. But they differ quite a lot for the dsb-normalized standardized data.
System