scverse / scanpy

Single-cell analysis in Python. Scales to >1M cells.
https://scanpy.readthedocs.io
BSD 3-Clause "New" or "Revised" License
1.9k stars 599 forks source link

rank_genes_groups_dotplot does not work when using reference and using rankby_abs, or setting values_to_plot='logfoldchanges' #2078

Open Xparx opened 2 years ago

Xparx commented 2 years ago

Note: Please read this guide detailing how to provide the necessary information for us to reproduce your bug.

As the title says: a specific combination of keyword arguments for ranking gene groups and then plotting them unexpectedly throws an error.

Minimal code sample (that we can copy&paste without having any data)

adata = sc.datasets.paul15()
sc.tl.rank_genes_groups(adata, groupby='paul15_clusters', key_added='GG', use_raw=False, reference='1Ery')
rax = sc.pl.rank_genes_groups_dotplot(adata, key='GG', # , rankby_abs= None,
                                      n_genes=3, cmap='PiYG_r', swap_axes=True,
                                      show=False, values_to_plot='logfoldchanges',
                                      vmin=None, vmax=None)
WARNING: In Scanpy 0.*, this returned logarithmized data. Now it returns non-logarithmized data.
... storing 'paul15_clusters' as categorical
Trying to set attribute `.uns` of view, copying.
WARNING: Default of the method has been changed to 't-test' from 't-test_overestim_var'
WARNING: It seems you use rank_genes_groups on the raw count data. Please logarithmize your data before calling rank_genes_groups.
ERROR: the given dot_color_df data frame has a different shape thanthe data frame used for the dot size. Both data frames needto have the same index and columns
---------------------------------------------------------------------------
KeyError                                  Traceback (most recent call last)
<ipython-input-110-708ec3ea001f> in <module>
      1 adata = sc.datasets.paul15()
      2 sc.tl.rank_genes_groups(adata, groupby='paul15_clusters', key_added='GG', use_raw=False, reference='1Ery')
----> 3 rax = sc.pl.rank_genes_groups_dotplot(adata, key='GG', # , rankby_abs= None,
      4                                       n_genes=3, cmap='PiYG_r', swap_axes=True,
      5                                       show=False, values_to_plot='logfoldchanges',

~/.virtualenvs/pytorch_latest/lib/python3.8/site-packages/scanpy/plotting/_tools/__init__.py in rank_genes_groups_dotplot(adata, groups, n_genes, groupby, values_to_plot, var_names, gene_symbols, min_logfoldchange, key, show, save, return_fig, **kwds)
    861     tl.rank_genes_groups
    862     """
--> 863     return _rank_genes_groups_plot(
    864         adata,
    865         plot_type='dotplot',

~/.virtualenvs/pytorch_latest/lib/python3.8/site-packages/scanpy/plotting/_tools/__init__.py in _rank_genes_groups_plot(adata, plot_type, groups, n_genes, groupby, values_to_plot, var_names, min_logfoldchange, key, show, save, return_fig, gene_symbols, **kwds)
    534             from .._dotplot import dotplot
    535 
--> 536             _pl = dotplot(
    537                 adata,
    538                 var_names,

~/.virtualenvs/pytorch_latest/lib/python3.8/site-packages/scanpy/plotting/_dotplot.py in dotplot(adata, var_names, groupby, use_raw, log, num_categories, expression_cutoff, mean_only_expressed, cmap, dot_max, dot_min, standard_scale, smallest_dot, title, colorbar_title, size_title, figsize, dendrogram, gene_symbols, var_group_positions, var_group_labels, var_group_rotation, layer, swap_axes, dot_color_df, show, save, ax, return_fig, vmin, vmax, vcenter, norm, **kwds)
    940         del kwds['color_map']
    941 
--> 942     dp = DotPlot(
    943         adata,
    944         var_names,

~/.virtualenvs/pytorch_latest/lib/python3.8/site-packages/scanpy/plotting/_dotplot.py in __init__(self, adata, var_names, groupby, use_raw, log, num_categories, categories_order, title, figsize, gene_symbols, var_group_positions, var_group_labels, var_group_rotation, layer, expression_cutoff, mean_only_expressed, standard_scale, dot_color_df, dot_size_df, ax, vmin, vmax, vcenter, norm, **kwds)
    215             # get the same order for rows and columns in the dot_color_df
    216             # using the order from the doc_size_df
--> 217             dot_color_df = dot_color_df.loc[dot_size_df.index][dot_size_df.columns]
    218 
    219         self.dot_color_df = dot_color_df

~/.virtualenvs/pytorch_latest/lib/python3.8/site-packages/pandas/core/indexing.py in __getitem__(self, key)
    893 
    894             maybe_callable = com.apply_if_callable(key, self.obj)
--> 895             return self._getitem_axis(maybe_callable, axis=axis)
    896 
    897     def _is_scalar_access(self, key: Tuple):

~/.virtualenvs/pytorch_latest/lib/python3.8/site-packages/pandas/core/indexing.py in _getitem_axis(self, key, axis)
   1111                     raise ValueError("Cannot index with multidimensional key")
   1112 
-> 1113                 return self._getitem_iterable(key, axis=axis)
   1114 
   1115             # nested tuple slicing

~/.virtualenvs/pytorch_latest/lib/python3.8/site-packages/pandas/core/indexing.py in _getitem_iterable(self, key, axis)
   1051 
   1052         # A collection of keys
-> 1053         keyarr, indexer = self._get_listlike_indexer(key, axis, raise_missing=False)
   1054         return self.obj._reindex_with_indexers(
   1055             {axis: [keyarr, indexer]}, copy=True, allow_dups=True

~/.virtualenvs/pytorch_latest/lib/python3.8/site-packages/pandas/core/indexing.py in _get_listlike_indexer(self, key, axis, raise_missing)
   1264             keyarr, indexer, new_indexer = ax._reindex_non_unique(keyarr)
   1265 
-> 1266         self._validate_read_indexer(keyarr, indexer, axis, raise_missing=raise_missing)
   1267         return keyarr, indexer
   1268 

~/.virtualenvs/pytorch_latest/lib/python3.8/site-packages/pandas/core/indexing.py in _validate_read_indexer(self, key, indexer, axis, raise_missing)
   1319 
   1320             with option_context("display.max_seq_items", 10, "display.width", 80):
-> 1321                 raise KeyError(
   1322                     "Passing list-likes to .loc or [] with any missing labels "
   1323                     "is no longer supported. "

KeyError: "Passing list-likes to .loc or [] with any missing labels is no longer supported. The following labels were missing: CategoricalIndex(['1Ery'], categories=['1Ery', '2Ery', '3Ery', '4Ery', '5Ery', '6Ery', '7MEP', '8Mk', ...], ordered=False, name='paul15_clusters', dtype='category'). See https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#deprecate-loc-reindex-listlike"

Versions

sc.logging.print_versions() WARNING: If you miss a compact list, please try `print_header`! ----- anndata 0.7.6 scanpy 1.8.2 sinfo 0.3.1 ----- PIL 8.2.0 anndata 0.7.6 autoreload NA backcall 0.2.0 cffi 1.14.5 configobj 5.0.6 cycler 0.10.0 cython_runtime NA dateutil 2.8.1 decorator 4.4.2 git 3.1.14 gitdb 4.0.7 google NA gpytorch 1.4.1 h5py 3.2.1 igraph 0.9.6 inferelator NA ipykernel 5.5.3 ipython_genutils 0.2.0 ipywidgets 7.6.3 jedi 0.18.0 joblib 1.0.1 kiwisolver 1.3.1 leidenalg 0.8.4 llvmlite 0.36.0 matplotlib 3.4.1 mpl_toolkits NA natsort 7.1.1 numba 0.53.1 numexpr 2.7.3 numpy 1.20.2 packaging 20.9 pandas 1.2.4 parso 0.8.2 pexpect 4.8.0 pickleshare 0.7.5 pkg_resources NA prompt_toolkit 3.0.18 ptyprocess 0.7.0 pycparser 2.20 pygments 2.8.1 pynndescent 0.5.2 pyparsing 2.4.7 pytz 2021.1 scanpy 1.8.2 scipy 1.6.3 seaborn 0.11.1 sinfo 0.3.1 sitecustomize NA six 1.15.0 sklearn 0.24.2 smmap 4.0.0 statsmodels 0.12.2 storemagic NA supirfactor NA tables 3.6.1 texttable 1.6.3 torch 1.9.0+cu102 tornado 6.1 tqdm 4.60.0 traitlets 5.0.5 typing_extensions NA umap 0.5.1 wcwidth 0.2.5 zmq 22.0.3 ----- IPython 7.22.0 jupyter_client 6.1.12 jupyter_core 4.7.1 notebook 6.3.0 ----- Python 3.8.5 (default, Jan 27 2021, 15:41:15) [GCC 9.3.0] Linux-5.4.0-91-generic-x86_64-with-glibc2.29 12 logical CPU cores, x86_64 ----- Session information updated at 2021-12-10 17:16
ivirshup commented 2 years ago

Thanks for the report. I can broadly reproduce the error for passing values_to_plot. The error I get is a little different, but I expect that's due to pandas versions.

A more minimal example:

import scanpy as sc

adata = sc.datasets.pbmc3k_processed().raw.to_adata()
sc.tl.rank_genes_groups(adata, groupby="louvain", reference="B cells")

# Errors with any of  ['scores', 'logfoldchanges', 'pvals', 'pvals_adj','log10_pvals', 'log10_pvals_adj']
sc.pl.rank_genes_groups_dotplot(adata, values_to_plot='logfoldchanges')
Traceback ```pytb ERROR: the given dot_color_df data frame has a different shape thanthe data frame used for the dot size. Both data frames needto have the same index and columns --------------------------------------------------------------------------- KeyError Traceback (most recent call last) /var/folders/bd/43q20k0n6z15tdfzxvd22r7c0000gn/T/ipykernel_62013/1545772980.py in 1 while len(possible_vals) > 0: ----> 2 sc.pl.rank_genes_groups_dotplot(adata, values_to_plot=possible_vals.pop()) 3 ~/github/scanpy/scanpy/plotting/_tools/__init__.py in rank_genes_groups_dotplot(adata, groups, n_genes, groupby, values_to_plot, var_names, gene_symbols, min_logfoldchange, key, show, save, return_fig, **kwds) 861 tl.rank_genes_groups 862 """ --> 863 return _rank_genes_groups_plot( 864 adata, 865 plot_type='dotplot', ~/github/scanpy/scanpy/plotting/_tools/__init__.py in _rank_genes_groups_plot(adata, plot_type, groups, n_genes, groupby, values_to_plot, var_names, min_logfoldchange, key, show, save, return_fig, gene_symbols, **kwds) 534 from .._dotplot import dotplot 535 --> 536 _pl = dotplot( 537 adata, 538 var_names, ~/github/scanpy/scanpy/plotting/_dotplot.py in dotplot(adata, var_names, groupby, use_raw, log, num_categories, expression_cutoff, mean_only_expressed, cmap, dot_max, dot_min, standard_scale, smallest_dot, title, colorbar_title, size_title, figsize, dendrogram, gene_symbols, var_group_positions, var_group_labels, var_group_rotation, layer, swap_axes, dot_color_df, show, save, ax, return_fig, vmin, vmax, vcenter, norm, **kwds) 940 del kwds['color_map'] 941 --> 942 dp = DotPlot( 943 adata, 944 var_names, ~/github/scanpy/scanpy/plotting/_dotplot.py in __init__(self, adata, var_names, groupby, use_raw, log, num_categories, categories_order, title, figsize, gene_symbols, var_group_positions, var_group_labels, var_group_rotation, layer, expression_cutoff, mean_only_expressed, standard_scale, dot_color_df, dot_size_df, ax, vmin, vmax, vcenter, norm, **kwds) 215 # get the 
same order for rows and columns in the dot_color_df 216 # using the order from the doc_size_df --> 217 dot_color_df = dot_color_df.loc[dot_size_df.index][dot_size_df.columns] 218 219 self.dot_color_df = dot_color_df /usr/local/lib/python3.9/site-packages/pandas/core/indexing.py in __getitem__(self, key) 929 930 maybe_callable = com.apply_if_callable(key, self.obj) --> 931 return self._getitem_axis(maybe_callable, axis=axis) 932 933 def _is_scalar_access(self, key: tuple): /usr/local/lib/python3.9/site-packages/pandas/core/indexing.py in _getitem_axis(self, key, axis) 1151 raise ValueError("Cannot index with multidimensional key") 1152 -> 1153 return self._getitem_iterable(key, axis=axis) 1154 1155 # nested tuple slicing /usr/local/lib/python3.9/site-packages/pandas/core/indexing.py in _getitem_iterable(self, key, axis) 1091 1092 # A collection of keys -> 1093 keyarr, indexer = self._get_listlike_indexer(key, axis) 1094 return self.obj._reindex_with_indexers( 1095 {axis: [keyarr, indexer]}, copy=True, allow_dups=True /usr/local/lib/python3.9/site-packages/pandas/core/indexing.py in _get_listlike_indexer(self, key, axis) 1312 keyarr, indexer, new_indexer = ax._reindex_non_unique(keyarr) 1313 -> 1314 self._validate_read_indexer(keyarr, indexer, axis) 1315 1316 if needs_i8_conversion(ax.dtype) or isinstance( /usr/local/lib/python3.9/site-packages/pandas/core/indexing.py in _validate_read_indexer(self, key, indexer, axis) 1375 1376 not_found = list(ensure_index(key)[missing_mask.nonzero()[0]].unique()) -> 1377 raise KeyError(f"{not_found} not in index") 1378 1379 KeyError: "['B cells'] not in index" ```

For rankby_abs it does error, but is that a valid argument to pass to this function?

Xparx commented 2 years ago

I "fixed" the issue I had by editing the DotPlot class in the _dotplot.py module, switching the top line for the bottom line.

# dot_color_df = dot_color_df.loc[dot_size_df.index][dot_size_df.columns]
dot_color_df = dot_color_df.reindex(dot_size_df.index).reindex(columns=dot_size_df.columns)

I'm not sure the output is what is desired, but for my case at least it is the same for cases where it worked before.

Xparx commented 2 years ago

I will retract the above snippet. It lets the function work for what it worked for before, but the new results are nonsense.