Allocate step fails with "ValueError: negative dimensions are not allowed" on large stereoseq dataset

saeyslab / napari-sparrow

Other

17 stars 0 forks source link

On a large Stereo-seq dataset (around 30 million cells and 25,000 genes), the allocation of transcripts to segmented cells fails.

This step

sdata = sp.tb.allocate( 
    sdata=sdata,
    allocate_from_shapes_layer=False,
    labels_layer='segmentation_mask',
    chunks=1500)

triggers the following error:

---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
Cell In[15], line 8
      1 # Assign transcripts to segmented cells/nuclei,
      2 # whose outlines are stored in the 'segmentation_mask_boundaries' layer.
      3 # sdata = sp.tb.allocate( 
      4 #     sdata=sdata,
      5 #     shapes_layer='segmentation_mask_boundaries'
      6 #   )
----> 8 sdata = sp.tb.allocate( 
      9     sdata=sdata,
     10     allocate_from_shapes_layer=False,
     11     labels_layer='segmentation_mask',
     12     chunks=1500
     13   )

File c:\Users\frank\Miniconda3\envs\napari-sparrow\lib\site-packages\sparrow\table\_allocation.py:192, in allocate(sdata, labels_layer, shapes_layer, points_layer, allocate_from_shapes_layer, chunks)
    188 cell_counts = combined_partitions.groupby([_CELL_INDEX, "gene"]).size()
    190 coordinates, cell_counts = dask.compute(coordinates, cell_counts, scheduler="threads")
--> 192 cell_counts = cell_counts.unstack(fill_value=0)
    193 # convert dtype of columns to "object", otherwise error writing to zarr.
    194 cell_counts.columns = cell_counts.columns.astype(str)

File c:\Users\frank\Miniconda3\envs\napari-sparrow\lib\site-packages\pandas\core\series.py:4615, in Series.unstack(self, level, fill_value, sort)
   4570 """
   4571 Unstack, also known as pivot, Series with MultiIndex to produce DataFrame.
   4572 
   (...)
   4611 b    2    4
   4612 """
   4613 from pandas.core.reshape.reshape import unstack
-> 4615 return unstack(self, level, fill_value, sort)

File c:\Users\frank\Miniconda3\envs\napari-sparrow\lib\site-packages\pandas\core\reshape\reshape.py:517, in unstack(obj, level, fill_value, sort)
    515 if is_1d_only_ea_dtype(obj.dtype):
    516     return _unstack_extension_series(obj, level, fill_value, sort=sort)
--> 517 unstacker = _Unstacker(
    518     obj.index, level=level, constructor=obj._constructor_expanddim, sort=sort
    519 )
    520 return unstacker.get_result(
    521     obj._values, value_columns=None, fill_value=fill_value
    522 )

File c:\Users\frank\Miniconda3\envs\napari-sparrow\lib\site-packages\pandas\core\reshape\reshape.py:154, in _Unstacker.__init__(self, index, level, constructor, sort)
    146 if num_cells > np.iinfo(np.int32).max:
    147     warnings.warn(
    148         f"The following operation may generate {num_cells} cells "
    149         f"in the resulting pandas object.",
    150         PerformanceWarning,
    151         stacklevel=find_stack_level(),
    152     )
--> 154 self._make_selectors()

File c:\Users\frank\Miniconda3\envs\napari-sparrow\lib\site-packages\pandas\core\reshape\reshape.py:206, in _Unstacker._make_selectors(self)
    203 self.full_shape = ngroups, stride
    205 selector = self.sorted_labels[-1] + stride * comp_index + self.lift
--> 206 mask = np.zeros(np.prod(self.full_shape), dtype=bool)
    207 mask.put(selector, True)
    209 if mask.sum() < len(self.index):

ValueError: negative dimensions are not allowed

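The error is due to a known 32-bit integer overflow issue in pandas: in `_Unstacker._make_selectors`, `np.prod(self.full_shape)` (number of cells × number of genes) exceeds the int32 maximum and wraps around, so `np.zeros` receives a negative or otherwise wrong size. It only occurs on Windows (where NumPy's default integer type has historically been 32-bit), not on macOS or Linux.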

---------------------------------------------------------------------------
IndexError                                Traceback (most recent call last)
Cell In[15], line 8
      1 # Assign transcripts to segmented cells/nuclei,
      2 # whose outlines are stored in the 'segmentation_mask_boundaries' layer.
      3 # sdata = sp.tb.allocate( 
      4 #     sdata=sdata,
      5 #     shapes_layer='segmentation_mask_boundaries'
      6 #   )
----> 8 sdata = sp.tb.allocate( 
      9     sdata=sdata,
     10     allocate_from_shapes_layer=False,
     11     labels_layer='segmentation_mask',
     12     chunks=1500 # we use a smaller than default value, hoping that it avoid the " negative dimensions are not allowed" error raised somewhere in pandas (Arne, perhaps triggered by many small cells in the chunk, and pandas building a pivot table)
     13   ) # chunks=2500 also gives the error on the full image

File K:\Frank\git\napari-sparrow\src\sparrow\table\_allocation.py:192, in allocate(sdata, labels_layer, shapes_layer, points_layer, allocate_from_shapes_layer, chunks)
    188 cell_counts = combined_partitions.groupby([_CELL_INDEX, "gene"]).size()
    190 coordinates, cell_counts = dask.compute(coordinates, cell_counts, scheduler="threads")
--> 192 cell_counts = cell_counts.unstack(fill_value=0)
    193 # convert dtype of columns to "object", otherwise error writing to zarr.
    194 cell_counts.columns = cell_counts.columns.astype(str)

File c:\Users\frankvn\Miniconda3\envs\napari-sparrow\lib\site-packages\pandas\core\series.py:4615, in Series.unstack(self, level, fill_value, sort)
   4570 """
   4571 Unstack, also known as pivot, Series with MultiIndex to produce DataFrame.
   4572 
   (...)
   4611 b    2    4
   4612 """
   4613 from pandas.core.reshape.reshape import unstack
-> 4615 return unstack(self, level, fill_value, sort)

File c:\Users\frankvn\Miniconda3\envs\napari-sparrow\lib\site-packages\pandas\core\reshape\reshape.py:517, in unstack(obj, level, fill_value, sort)
    515 if is_1d_only_ea_dtype(obj.dtype):
    516     return _unstack_extension_series(obj, level, fill_value, sort=sort)
--> 517 unstacker = _Unstacker(
    518     obj.index, level=level, constructor=obj._constructor_expanddim, sort=sort
    519 )
    520 return unstacker.get_result(
    521     obj._values, value_columns=None, fill_value=fill_value
    522 )

File c:\Users\frankvn\Miniconda3\envs\napari-sparrow\lib\site-packages\pandas\core\reshape\reshape.py:154, in _Unstacker.__init__(self, index, level, constructor, sort)
    146 if num_cells > np.iinfo(np.int32).max:
    147     warnings.warn(
    148         f"The following operation may generate {num_cells} cells "
    149         f"in the resulting pandas object.",
    150         PerformanceWarning,
    151         stacklevel=find_stack_level(),
    152     )
--> 154 self._make_selectors()

File c:\Users\frankvn\Miniconda3\envs\napari-sparrow\lib\site-packages\pandas\core\reshape\reshape.py:207, in _Unstacker._make_selectors(self)
    205 selector = self.sorted_labels[-1] + stride * comp_index + self.lift
    206 mask = np.zeros(np.prod(self.full_shape), dtype=bool)
--> 207 mask.put(selector, True)
    209 if mask.sum() < len(self.index):
    210     raise ValueError("Index contains duplicate entries, cannot reshape")

IndexError: index 1284583229 is out of bounds for axis 0 with size 1284576184

saeyslab / napari-sparrow

Allocate step fails with "ValueError: negative dimensions are not allowed" on large stereoseq dataset #199