scverse / scvi-tools

Deep probabilistic analysis of single-cell and spatial omics data
http://scvi-tools.org/
BSD 3-Clause "New" or "Revised" License
1.25k stars 355 forks source link

TotalVI duplicate label on axis error while running differential_expression after model training #3052

Closed HChungLab closed 10 hours ago

HChungLab commented 14 hours ago

We trained a TotalVI model on CITE-seq data. The model converged and training completed successfully, but we get an error when calling the differential_expression function:

de_df = model.differential_expression(
    groupby="rna_subset:leiden_totalVI", delta=0.5, batch_correction=False
)
de_df.head(5)
DE...:   0%|          | 0/12 [00:00<?, ?it/s]
/gpfs/gibbs/project/chung/ah2636/envs/TotalVI_env4/lib/python3.12/site-packages/scvi/model/_totalvi.py:433: UserWarning: Make sure the registered protein expression in anndata contains unnormalized count data.
  adata = self._validate_anndata(adata)
---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
Cell In[94], line 1
----> 1 de_df = model.differential_expression(
      2     groupby="rna_subset:leiden_totalVI", delta=0.5, batch_correction=False
      3 )
      4 de_df.head(5)

File /gpfs/gibbs/project/chung/ah2636/envs/TotalVI_env4/lib/python3.12/site-packages/scvi/model/_totalvi.py:774, in TOTALVI.differential_expression(self, adata, groupby, group1, group2, idx1, idx2, mode, delta, batch_size, all_stats, batch_correction, batchid1, batchid2, fdr_target, silent, protein_prior_count, scale_protein, sample_protein_mixing, include_protein_background, **kwargs)
    760 model_fn = partial(
    761     self._expression_for_de,
    762     scale_protein=scale_protein,
   (...)
    766     batch_size=batch_size,
    767 )
    768 col_names = np.concatenate(
    769     [
    770         np.asarray(_get_var_names_from_manager(adata_manager)),
    771         self.protein_state_registry.column_names,
    772     ]
    773 )
--> 774 result = _de_core(
    775     adata_manager,
    776     model_fn,
    777     None,
    778     groupby,
    779     group1,
    780     group2,
    781     idx1,
    782     idx2,
    783     all_stats,
    784     cite_seq_raw_counts_properties,
    785     col_names,
    786     mode,
    787     batchid1,
    788     batchid2,
    789     delta,
    790     batch_correction,
    791     fdr_target,
    792     silent,
    793     **kwargs,
    794 )
    796 return result

File /gpfs/gibbs/project/chung/ah2636/envs/TotalVI_env4/lib/python3.12/site-packages/scvi/model/base/_de_core.py:133, in _de_core(adata_manager, model_fn, representation_fn, groupby, group1, group2, idx1, idx2, all_stats, all_stats_fn, col_names, mode, batchid1, batchid2, delta, batch_correction, fdr, silent, **kwargs)
    131 res = res.sort_values(by=sort_key, ascending=False)
    132 if mode == "change":
--> 133     res[f"is_de_fdr_{fdr}"] = _fdr_de_prediction(res["proba_de"], fdr=fdr)
    134 if idx1 is None:
    135     g2 = "Rest" if group2 is None else group2

File /gpfs/gibbs/project/chung/ah2636/envs/TotalVI_env4/lib/python3.12/site-packages/pandas/core/frame.py:4311, in DataFrame.__setitem__(self, key, value)
   4308     self._setitem_array([key], value)
   4309 else:
   4310     # set column
-> 4311     self._set_item(key, value)

File /gpfs/gibbs/project/chung/ah2636/envs/TotalVI_env4/lib/python3.12/site-packages/pandas/core/frame.py:4524, in DataFrame._set_item(self, key, value)
   4514 def _set_item(self, key, value) -> None:
   4515     """
   4516     Add series to DataFrame in specified column.
   4517 
   (...)
   4522     ensure homogeneity.
   4523     """
-> 4524     value, refs = self._sanitize_column(value)
   4526     if (
   4527         key in self.columns
   4528         and value.ndim == 1
   4529         and not isinstance(value.dtype, ExtensionDtype)
   4530     ):
   4531         # broadcast across multiple columns if necessary
   4532         if not self.columns.is_unique or isinstance(self.columns, MultiIndex):

File /gpfs/gibbs/project/chung/ah2636/envs/TotalVI_env4/lib/python3.12/site-packages/pandas/core/frame.py:5263, in DataFrame._sanitize_column(self, value)
   5261     if not isinstance(value, Series):
   5262         value = Series(value)
-> 5263     return _reindex_for_setitem(value, self.index)
   5265 if is_list_like(value):
   5266     com.require_length_match(value, self.index)

File /gpfs/gibbs/project/chung/ah2636/envs/TotalVI_env4/lib/python3.12/site-packages/pandas/core/frame.py:12692, in _reindex_for_setitem(value, index)
  12688 except ValueError as err:
  12689     # raised in MultiIndex.from_tuples, see test_insert_error_msmgs
  12690     if not value.index.is_unique:
  12691         # duplicate axis
> 12692         raise err
  12694     raise TypeError(
  12695         "incompatible index of inserted column with frame index"
  12696     ) from err
  12697 return reindexed_value, None

File /gpfs/gibbs/project/chung/ah2636/envs/TotalVI_env4/lib/python3.12/site-packages/pandas/core/frame.py:12687, in _reindex_for_setitem(value, index)
  12685 # GH#4107
  12686 try:
> 12687     reindexed_value = value.reindex(index)._values
  12688 except ValueError as err:
  12689     # raised in MultiIndex.from_tuples, see test_insert_error_msmgs
  12690     if not value.index.is_unique:
  12691         # duplicate axis

File /gpfs/gibbs/project/chung/ah2636/envs/TotalVI_env4/lib/python3.12/site-packages/pandas/core/series.py:5153, in Series.reindex(self, index, axis, method, copy, level, fill_value, limit, tolerance)
   5136 @doc(
   5137     NDFrame.reindex,  # type: ignore[has-type]
   5138     klass=_shared_doc_kwargs["klass"],
   (...)
   5151     tolerance=None,
   5152 ) -> Series:
-> 5153     return super().reindex(
   5154         index=index,
   5155         method=method,
   5156         copy=copy,
   5157         level=level,
   5158         fill_value=fill_value,
   5159         limit=limit,
   5160         tolerance=tolerance,
   5161     )

File /gpfs/gibbs/project/chung/ah2636/envs/TotalVI_env4/lib/python3.12/site-packages/pandas/core/generic.py:5610, in NDFrame.reindex(self, labels, index, columns, axis, method, copy, level, fill_value, limit, tolerance)
   5607     return self._reindex_multi(axes, copy, fill_value)
   5609 # perform the reindex on the axes
-> 5610 return self._reindex_axes(
   5611     axes, level, limit, tolerance, method, fill_value, copy
   5612 ).__finalize__(self, method="reindex")

File /gpfs/gibbs/project/chung/ah2636/envs/TotalVI_env4/lib/python3.12/site-packages/pandas/core/generic.py:5633, in NDFrame._reindex_axes(self, axes, level, limit, tolerance, method, fill_value, copy)
   5630     continue
   5632 ax = self._get_axis(a)
-> 5633 new_index, indexer = ax.reindex(
   5634     labels, level=level, limit=limit, tolerance=tolerance, method=method
   5635 )
   5637 axis = self._get_axis_number(a)
   5638 obj = obj._reindex_with_indexers(
   5639     {axis: [new_index, indexer]},
   5640     fill_value=fill_value,
   5641     copy=copy,
   5642     allow_dups=False,
   5643 )

File /gpfs/gibbs/project/chung/ah2636/envs/TotalVI_env4/lib/python3.12/site-packages/pandas/core/indexes/base.py:4429, in Index.reindex(self, target, method, level, limit, tolerance)
   4426     raise ValueError("cannot handle a non-unique multi-index!")
   4427 elif not self.is_unique:
   4428     # GH#42568
-> 4429     raise ValueError("cannot reindex on an axis with duplicate labels")
   4430 else:
   4431     indexer, _ = self.get_indexer_non_unique(target)

ValueError: cannot reindex on an axis with duplicate labels

Versions:

1.2.0

canergen commented 10 hours ago

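The error occurs because some genes and proteins share the same name, so the combined gene and protein names used to index the differential expression results contain duplicate labels. Please use different names for genes and proteins, e.g. by adding "cite-" to all protein names. You can do this after training the model by changing model.adata.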