saezlab / decoupler-py

Python package to perform enrichment analysis from omics data.
https://decoupler-py.readthedocs.io/
GNU General Public License v3.0
154 stars 23 forks source link

Failure with .get_pseudobulk() when grouping/sample columns are categorical #147

Closed dnjst closed 3 days ago

dnjst commented 1 month ago

Describe the bug

I am making a pseudobulk version of my anndata object, which I have run through .strings_to_categoricals() as a normal cleaning operation (anndata seems to want this).

However, I get the error

'Categorical' with dtype category does not support reduction 'all'

when I try to make the pseudobulk object. I can fix this by running:

adata.obs["sample_column"] = adata.obs["sample_column"].astype(str)
adata.obs["groups_column"] = adata.obs["groups_column"].astype(str)

which leads me to believe the problem has to do with the obs columns being categorical.

Is there a way decoupler could be upgraded to handle this scenario internally?

Additional context

Cell In[17], line 10, in build_pdata(adata, sample_col, groups_col)
      8 def build_pdata(adata, sample_col, groups_col):
---> 10     pdata = decoupler.get_pseudobulk(adata,
     11                              sample_col=sample_col,
     12                              groups_col=groups_col,
     13                              mode='sum',
     14                              min_cells=0,
     15                              min_counts=0,
     16                             )
     17             # convert to CPM
     18     pdata.layers["boolean"] = binarize(pdata.layers["psbulk_props"], threshold=0.1) # bool as >10% = ON

File /~/conda_envs/notebook_env/lib/python3.11/site-packages/decoupler/utils_anndata.py:360, in get_pseudobulk(adata, sample_col, groups_col, obs, layer, use_raw, mode, min_cells, min_counts, dtype, skip_checks, min_prop, min_smpls, remove_empty)
    357 check_X(X, mode=mode, skip_checks=skip_checks)
    359 # Format inputs
--> 360 obs, groups_col, smples, groups, n_rows = format_psbulk_inputs(sample_col, groups_col, obs)
    361 n_cols = adata.shape[1]
    362 new_obs = pd.DataFrame(columns=obs.columns)

File /~/conda_envs/notebook_env/lib/python3.11/site-packages/decoupler/utils_anndata.py:182, in format_psbulk_inputs(sample_col, groups_col, obs)
    178     groups_col = joined_cols
    180 # Filter extra columns in obs
    181 cols = obs.groupby([sample_col, groups_col],
--> 182                    observed=True).apply(lambda x: x.apply(lambda y: len(y.unique()) == 1)).all(0)
    183 obs = obs.loc[:, cols]
    185 # Get unique samples and groups

File /~/conda_envs/notebook_env/lib/python3.11/site-packages/pandas/core/frame.py:11628, in DataFrame.all(self, axis, bool_only, skipna, **kwargs)
  11620 @doc(make_doc("all", ndim=2))
  11621 def all(
  11622     self,
   (...)
  11626     **kwargs,
  11627 ) -> Series | bool:
> 11628     result = self._logical_func(
  11629         "all", nanops.nanall, axis, bool_only, skipna, **kwargs
  11630     )
  11631     if isinstance(result, Series):
  11632         result = result.__finalize__(self, method="all")

File /~/conda_envs/notebook_env/lib/python3.11/site-packages/pandas/core/generic.py:12208, in NDFrame._logical_func(self, name, func, axis, bool_only, skipna, **kwargs)
  12205         obj = self._get_bool_data()
  12206     return obj._reduce_axis1(name, func, skipna=skipna)
> 12208 return self._reduce(
  12209     func,
  12210     name=name,
  12211     axis=axis,
  12212     skipna=skipna,
  12213     numeric_only=bool_only,
  12214     filter_type="bool",
  12215 )

File /~/conda_envs/notebook_env/lib/python3.11/site-packages/pandas/core/frame.py:11562, in DataFrame._reduce(self, op, name, axis, skipna, numeric_only, filter_type, **kwds)
  11558     df = df.T
  11560 # After possibly _get_data and transposing, we are now in the
  11561 #  simple case where we can use BlockManager.reduce
> 11562 res = df._mgr.reduce(blk_func)
  11563 out = df._constructor_from_mgr(res, axes=res.axes).iloc[0]
  11564 if out_dtype is not None and out.dtype != "boolean":

File /~/conda_envs/notebook_env/lib/python3.11/site-packages/pandas/core/internals/managers.py:1500, in BlockManager.reduce(self, func)
   1498 res_blocks: list[Block] = []
   1499 for blk in self.blocks:
-> 1500     nbs = blk.reduce(func)
   1501     res_blocks.extend(nbs)
   1503 index = Index([None])  # placeholder

File /~/conda_envs/notebook_env/lib/python3.11/site-packages/pandas/core/internals/blocks.py:404, in Block.reduce(self, func)
    398 @final
    399 def reduce(self, func) -> list[Block]:
    400     # We will apply the function and reshape the result into a single-row
    401     #  Block with the same mgr_locs; squeezing will be done at a higher level
    402     assert self.ndim == 2
--> 404     result = func(self.values)
    406     if self.values.ndim == 1:
    407         res_values = result

File /~/conda_envs/notebook_env/lib/python3.11/site-packages/pandas/core/frame.py:11470, in DataFrame._reduce.<locals>.blk_func(values, axis)
  11468     dtype_has_keepdims[values.dtype] = has_keepdims
  11469 if has_keepdims:
> 11470     return values._reduce(name, skipna=skipna, keepdims=True, **kwds)
  11471 else:
  11472     warnings.warn(
  11473         f"{type(values)}._reduce will require a `keepdims` parameter "
  11474         "in the future",
  11475         FutureWarning,
  11476         stacklevel=find_stack_level(),
  11477     )

File /~/conda_envs/notebook_env/lib/python3.11/site-packages/pandas/core/arrays/categorical.py:2359, in Categorical._reduce(self, name, skipna, keepdims, **kwargs)
   2356 def _reduce(
   2357     self, name: str, *, skipna: bool = True, keepdims: bool = False, **kwargs
   2358 ):
-> 2359     result = super()._reduce(name, skipna=skipna, keepdims=keepdims, **kwargs)
   2360     if name in ["argmax", "argmin"]:
   2361         # don't wrap in Categorical!
   2362         return result

File /~/conda_envs/notebook_env/lib/python3.11/site-packages/pandas/core/arrays/base.py:1954, in ExtensionArray._reduce(self, name, skipna, keepdims, **kwargs)
   1952 meth = getattr(self, name, None)
   1953 if meth is None:
-> 1954     raise TypeError(
   1955         f"'{type(self).__name__}' with dtype {self.dtype} "
   1956         f"does not support reduction '{name}'"
   1957     )
   1958 result = meth(skipna=skipna, **kwargs)
   1959 if keepdims:

TypeError: 'Categorical' with dtype category does not support reduction 'all'
PauBadiaM commented 1 month ago

Hi @dnjst ,

I've made a new release of decoupler (1.8.0) that should fix this issue, you can install it with:

pip install decoupler==1.8.0

let me know how it goes.