scverse / anndata

Annotated data.
http://anndata.readthedocs.io
BSD 3-Clause "New" or "Revised" License

Error when merging 2 different anndatas with int64 indptr csr_matrix chunked dask arrays #1356

Closed: mumichae closed this issue 3 months ago

mumichae commented 8 months ago


Report

I'm trying to merge two datasets of downloaded data, using dask arrays with sparse chunks and forcing scipy.sparse.csr_matrix.indptr to int64 (a workaround for the larger datasets I'm working with). This causes an error if I don't also cast indices to int64.

Code:

import numpy as np
import scanpy as sc
from dask import array as da
from scipy.sparse import csr_matrix

def csr_matrix_int64_indptr(x):
    x = csr_matrix(x)
    x.indptr = x.indptr.astype(np.int64)
    # x.indices = x.indices.astype(np.int64)  # seems to be necessary to avoid "ValueError: Output dtype not compatible with inputs."
    return x

# assuming adata.X and adata2.X are scipy.sparse.csr_matrix
adata.X = da.from_array(adata.X).map_blocks(csr_matrix_int64_indptr)
adata2.X = da.from_array(adata2.X).map_blocks(csr_matrix_int64_indptr)

sc.concat([adata, adata2]).X.persist()
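
For reference, the concatenation only goes through when the indices cast in the helper above is uncommented, so that both index arrays share one dtype. A sketch of the working variant (the helper name csr_matrix_int64 is just for illustration):

def csr_matrix_int64(x):
    x = csr_matrix(x)
    # cast both index arrays: mixing int32 indices with int64 indptr
    # is what triggers the ValueError below
    x.indptr = x.indptr.astype(np.int64)
    x.indices = x.indices.astype(np.int64)
    return x

Without the extra cast, persisting the concatenated result fails: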

Traceback:

---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
Cell In[24], line 1
----> 1 sc.concat([adata, adata2]).X.persist()

File /hpc/projects/hca_integration/mambaforge/envs/scanpy/lib/python3.10/site-packages/dask/base.py:352, in DaskMethodsMixin.persist(self, **kwargs)
    313 def persist(self, **kwargs):
    314     """Persist this dask collection into memory
    315
    316     This turns a lazy Dask collection into a Dask collection with the same
    (...)
    350     dask.persist
    351     """
--> 352     (result,) = persist(self, traverse=False, **kwargs)
    353     return result

File /hpc/projects/hca_integration/mambaforge/envs/scanpy/lib/python3.10/site-packages/dask/base.py:1002, in persist(traverse, optimize_graph, scheduler, *args, **kwargs)
    999     postpersists.append((rebuild, a_keys, state))
   1001 with shorten_traceback():
-> 1002     results = schedule(dsk, keys, **kwargs)
   1004 d = dict(zip(keys, results))
   1005 results2 = [r({k: d[k] for k in ks}, *s) for r, ks, s in postpersists]

File /hpc/projects/hca_integration/mambaforge/envs/scanpy/lib/python3.10/site-packages/dask/array/chunk.py:420, in getitem(obj, index)
    399 """Getitem function
    400
    401 This function creates a copy of the desired selection for array-like
    (...)
    417
    418 """
    419 try:
--> 420     result = obj[index]
    421 except IndexError as e:
    422     raise ValueError(
    423         "Array chunk size or shape is unknown. "
    424         "Possible solution with x.compute_chunk_sizes()"
    425     ) from e

File /hpc/projects/hca_integration/mambaforge/envs/scanpy/lib/python3.10/site-packages/scipy/sparse/_index.py:70, in IndexMixin.__getitem__(self, key)
     68         return self._get_sliceXslice(row, col)
     69     elif col.ndim == 1:
---> 70         return self._get_sliceXarray(row, col)
     71     raise IndexError('index results in >2 dimensions')
     72 elif row.ndim == 1:

File /hpc/projects/hca_integration/mambaforge/envs/scanpy/lib/python3.10/site-packages/scipy/sparse/_csr.py:207, in _csr_base._get_sliceXarray(self, row, col)
    206 def _get_sliceXarray(self, row, col):
--> 207     return self._major_slice(row)._minor_index_fancy(col)

File /hpc/projects/hca_integration/mambaforge/envs/scanpy/lib/python3.10/site-packages/scipy/sparse/_compressed.py:774, in _cs_matrix._minor_index_fancy(self, idx)
    772 col_offsets = np.zeros(N, dtype=idx_dtype)
    773 res_indptr = np.empty_like(self.indptr)
--> 774 csr_column_index1(k, idx, M, N, self.indptr, self.indices,
    775                   col_offsets, res_indptr)
    777 # pass 2: copy indices/data for selected idxs
    778 col_order = np.argsort(idx).astype(idx_dtype, copy=False)

ValueError: Output dtype not compatible with inputs.
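
The failure is at the scipy level rather than in anndata or dask: csr_column_index1 is a compiled sparsetools routine that expects indptr and indices to share a single index dtype, and with int64 indptr next to int32 indices no matching instantiation exists. A minimal, dask-free reproducer (a sketch, assuming the scipy 1.12 indexing path shown in the traceback above):

import numpy as np
from scipy import sparse

x = sparse.random(10, 10, density=0.5, format="csr")
x.indptr = x.indptr.astype(np.int64)  # indices stay int32

# slice rows, fancy-index columns: _get_sliceXarray -> _minor_index_fancy
x[:, np.array([0, 2, 4])]  # ValueError: Output dtype not compatible with inputs.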

Versions

-----
IPython             8.21.0
anndata             0.10.4
dask                2024.1.1
scanpy              1.10.0.dev198+g89869212
scipy               1.12.0
session_info        1.0.0
utils               NA
-----
PIL                 10.2.0
array_api_compat    1.4.1
asciitree           NA
asttokens           NA
click               8.1.7
cloudpickle         3.0.0
cycler              0.12.1
cython_runtime      NA
cytoolz             0.12.2
dateutil            2.8.2
decorator           5.1.1
distributed         2024.1.1
exceptiongroup      1.2.0
executing           2.0.1
fasteners           0.17.3
fsspec              2023.12.2
h5py                3.10.0
igraph              0.11.3
importlib_metadata  NA
jedi                0.19.1
jinja2              3.1.3
joblib              1.3.2
kiwisolver          1.4.5
legacy_api_wrap     NA
leidenalg           0.10.2
llvmlite            0.41.1
locket              NA
lz4                 4.3.3
markupsafe          2.1.4
matplotlib          3.8.2
mpl_toolkits        NA
msgpack             1.0.7
natsort             8.4.0
numba               0.58.1
numcodecs           0.12.1
numpy               1.26.3
packaging           23.2
pandas              2.2.0
parso               0.8.3
pickleshare         0.7.5
prompt_toolkit      3.0.42
psutil              5.9.8
pure_eval           0.2.2
pyarrow             15.0.0
pygments            2.17.2
pyparsing           3.1.1
pytz                2023.4
six                 1.16.0
sklearn             1.4.0
sortedcontainers    2.4.0
sparse              0.15.1
stack_data          0.6.2
tblib               3.0.0
texttable           1.7.0
threadpoolctl       3.2.0
tlz                 0.12.2
toolz               0.12.1
tornado             6.3.3
traitlets           5.14.1
typing_extensions   NA
wcwidth             0.2.13
yaml                6.0.1
zarr                2.16.1
zict                3.0.0
zipp                NA
zoneinfo            NA
-----
Python 3.10.13 | packaged by conda-forge | (main, Dec 23 2023, 15:36:39) [GCC 12.3.0]
Linux-4.18.0-477.27.1.el8_8.x86_64-x86_64-with-glibc2.28
-----
Session information updated at 2024-02-06 09:43

flying-sheep commented 3 months ago

Fixed in #1348