saeyslab / napari-sparrow

Other
17 stars 0 forks source link

Allocate step fails with "ValueError: negative dimensions are not allowed" on large stereoseq dataset #199

Closed SilverViking closed 1 month ago

SilverViking commented 1 month ago

On a large stereoseq dataset (around 30 million cells and 25,000 genes), allocation of transcripts to segmented cells fails.

This step

sdata = sp.tb.allocate( 
    sdata=sdata,
    allocate_from_shapes_layer=False,
    labels_layer='segmentation_mask',
    chunks=1500) 

triggers the error

---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
Cell In[15], line 8
      1 # Assign transcripts to segmented cells/nuclei,
      2 # whose outlines are stored in the 'segmentation_mask_boundaries' layer.
      3 # sdata = sp.tb.allocate( 
      4 #     sdata=sdata,
      5 #     shapes_layer='segmentation_mask_boundaries'
      6 #   )
----> 8 sdata = sp.tb.allocate( 
      9     sdata=sdata,
     10     allocate_from_shapes_layer=False,
     11     labels_layer='segmentation_mask',
     12     chunks=1500
     13   )

File c:\Users\frank\Miniconda3\envs\napari-sparrow\lib\site-packages\sparrow\table\_allocation.py:192, in allocate(sdata, labels_layer, shapes_layer, points_layer, allocate_from_shapes_layer, chunks)
    188 cell_counts = combined_partitions.groupby([_CELL_INDEX, "gene"]).size()
    190 coordinates, cell_counts = dask.compute(coordinates, cell_counts, scheduler="threads")
--> 192 cell_counts = cell_counts.unstack(fill_value=0)
    193 # convert dtype of columns to "object", otherwise error writing to zarr.
    194 cell_counts.columns = cell_counts.columns.astype(str)

File c:\Users\frank\Miniconda3\envs\napari-sparrow\lib\site-packages\pandas\core\series.py:4615, in Series.unstack(self, level, fill_value, sort)
   4570 """
   4571 Unstack, also known as pivot, Series with MultiIndex to produce DataFrame.
   4572 
   (...)
   4611 b    2    4
   4612 """
   4613 from pandas.core.reshape.reshape import unstack
-> 4615 return unstack(self, level, fill_value, sort)

File c:\Users\frank\Miniconda3\envs\napari-sparrow\lib\site-packages\pandas\core\reshape\reshape.py:517, in unstack(obj, level, fill_value, sort)
    515 if is_1d_only_ea_dtype(obj.dtype):
    516     return _unstack_extension_series(obj, level, fill_value, sort=sort)
--> 517 unstacker = _Unstacker(
    518     obj.index, level=level, constructor=obj._constructor_expanddim, sort=sort
    519 )
    520 return unstacker.get_result(
    521     obj._values, value_columns=None, fill_value=fill_value
    522 )

File c:\Users\frank\Miniconda3\envs\napari-sparrow\lib\site-packages\pandas\core\reshape\reshape.py:154, in _Unstacker.__init__(self, index, level, constructor, sort)
    146 if num_cells > np.iinfo(np.int32).max:
    147     warnings.warn(
    148         f"The following operation may generate {num_cells} cells "
    149         f"in the resulting pandas object.",
    150         PerformanceWarning,
    151         stacklevel=find_stack_level(),
    152     )
--> 154 self._make_selectors()

File c:\Users\frank\Miniconda3\envs\napari-sparrow\lib\site-packages\pandas\core\reshape\reshape.py:206, in _Unstacker._make_selectors(self)
    203 self.full_shape = ngroups, stride
    205 selector = self.sorted_labels[-1] + stride * comp_index + self.lift
--> 206 mask = np.zeros(np.prod(self.full_shape), dtype=bool)
    207 mask.put(selector, True)
    209 if mask.sum() < len(self.index):

ValueError: negative dimensions are not allowed

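The error is due to a known 32-bit integer overflow issue in pandas: the size of the unstacked cell-by-gene table (`np.prod(self.full_shape)` in the traceback above, on the order of 30 million × 25,000 entries) does not fit in a 32-bit integer and wraps around to a negative value. The issue only occurs on Windows, not on macOS or Linux.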

SilverViking commented 1 month ago

For another stereoseq dataset, allocation fails with a different error, but the root cause is almost certainly the same:

---------------------------------------------------------------------------
IndexError                                Traceback (most recent call last)
Cell In[15], line 8
      1 # Assign transcripts to segmented cells/nuclei,
      2 # whose outlines are stored in the 'segmentation_mask_boundaries' layer.
      3 # sdata = sp.tb.allocate( 
      4 #     sdata=sdata,
      5 #     shapes_layer='segmentation_mask_boundaries'
      6 #   )
----> 8 sdata = sp.tb.allocate( 
      9     sdata=sdata,
     10     allocate_from_shapes_layer=False,
     11     labels_layer='segmentation_mask',
     12     chunks=1500   # we use a smaller than default value, hoping that it avoid the " negative dimensions are not allowed" error raised somewhere in pandas (Arne, perhaps triggered by many small cells in the chunk, and pandas building a pivot table)
     13   )  # chunks=2500 also gives the error on the full image

File K:\Frank\git\napari-sparrow\src\sparrow\table\_allocation.py:192, in allocate(sdata, labels_layer, shapes_layer, points_layer, allocate_from_shapes_layer, chunks)
    188 cell_counts = combined_partitions.groupby([_CELL_INDEX, "gene"]).size()
    190 coordinates, cell_counts = dask.compute(coordinates, cell_counts, scheduler="threads")
--> 192 cell_counts = cell_counts.unstack(fill_value=0)
    193 # convert dtype of columns to "object", otherwise error writing to zarr.
    194 cell_counts.columns = cell_counts.columns.astype(str)

File c:\Users\frankvn\Miniconda3\envs\napari-sparrow\lib\site-packages\pandas\core\series.py:4615, in Series.unstack(self, level, fill_value, sort)
   4570 """
   4571 Unstack, also known as pivot, Series with MultiIndex to produce DataFrame.
   4572 
   (...)
   4611 b    2    4
   4612 """
   4613 from pandas.core.reshape.reshape import unstack
-> 4615 return unstack(self, level, fill_value, sort)

File c:\Users\frankvn\Miniconda3\envs\napari-sparrow\lib\site-packages\pandas\core\reshape\reshape.py:517, in unstack(obj, level, fill_value, sort)
    515 if is_1d_only_ea_dtype(obj.dtype):
    516     return _unstack_extension_series(obj, level, fill_value, sort=sort)
--> 517 unstacker = _Unstacker(
    518     obj.index, level=level, constructor=obj._constructor_expanddim, sort=sort
    519 )
    520 return unstacker.get_result(
    521     obj._values, value_columns=None, fill_value=fill_value
    522 )

File c:\Users\frankvn\Miniconda3\envs\napari-sparrow\lib\site-packages\pandas\core\reshape\reshape.py:154, in _Unstacker.__init__(self, index, level, constructor, sort)
    146 if num_cells > np.iinfo(np.int32).max:
    147     warnings.warn(
    148         f"The following operation may generate {num_cells} cells "
    149         f"in the resulting pandas object.",
    150         PerformanceWarning,
    151         stacklevel=find_stack_level(),
    152     )
--> 154 self._make_selectors()

File c:\Users\frankvn\Miniconda3\envs\napari-sparrow\lib\site-packages\pandas\core\reshape\reshape.py:207, in _Unstacker._make_selectors(self)
    205 selector = self.sorted_labels[-1] + stride * comp_index + self.lift
    206 mask = np.zeros(np.prod(self.full_shape), dtype=bool)
--> 207 mask.put(selector, True)
    209 if mask.sum() < len(self.index):
    210     raise ValueError("Index contains duplicate entries, cannot reshape")

IndexError: index 1284583229 is out of bounds for axis 0 with size 1284576184

In this case the integer overflow probably wraps around to a positive number rather than a negative one, so the mask is allocated, but with a size that is too small, and the indices written into it fall out of bounds.