Report
I'm trying to merge 2 datasets of data downloaded from … using dask sparse arrays, forcing scipy.sparse.csr_matrix.indptr to int64 (a workaround for the larger datasets I'm working with). This causes a ValueError if I don't also cast indices to int64.
Code:
import numpy as np
import scanpy as sc
from dask import array as da
from scipy.sparse import csr_matrix

def csr_matrix_int64_indptr(x):
    x = csr_matrix(x)
    x.indptr = x.indptr.astype(np.int64)
    # x.indices = x.indices.astype(np.int64)  # uncommenting this seems to be
    # necessary to avoid "ValueError: Output dtype not compatible with inputs."
    return x

# assuming adata.X and adata2.X are scipy.sparse.csr_matrix
adata.X = da.from_array(adata.X).map_blocks(csr_matrix_int64_indptr)
adata2.X = da.from_array(adata2.X).map_blocks(csr_matrix_int64_indptr)
sc.concat([adata, adata2]).X.persist()
Traceback:
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
Cell In[24], line 1
----> 1 sc.concat([adata, adata2]).X.persist()
File /hpc/projects/hca_integration/mambaforge/envs/scanpy/lib/python3.10/site-packages/dask/base.py:352, in DaskMethodsMixin.persist(self, **kwargs)
313 def persist(self, **kwargs):
314 """Persist this dask collection into memory
315
316 This turns a lazy Dask collection into a Dask collection with the same
    (...)
350 dask.persist
351 """
--> 352 (result,) = persist(self, traverse=False, **kwargs)
353 return result
File /hpc/projects/hca_integration/mambaforge/envs/scanpy/lib/python3.10/site-packages/dask/base.py:1002, in persist(traverse, optimize_graph, scheduler, *args, **kwargs)
999 postpersists.append((rebuild, a_keys, state))
1001 with shorten_traceback():
-> 1002 results = schedule(dsk, keys, **kwargs)
1004 d = dict(zip(keys, results))
1005 results2 = [r({k: d[k] for k in ks}, *s) for r, ks, s in postpersists]
File /hpc/projects/hca_integration/mambaforge/envs/scanpy/lib/python3.10/site-packages/dask/array/chunk.py:420, in getitem(obj, index)
399 """Getitem function
400
401 This function creates a copy of the desired selection for array-like
    (...)
417
418 """
419 try:
--> 420 result = obj[index]
421 except IndexError as e:
422 raise ValueError(
423 "Array chunk size or shape is unknown. " "login-01" 09:32 06-Feb-24
424 "Possible solution with x.compute_chunk_sizes()"
425 ) from e
File /hpc/projects/hca_integration/mambaforge/envs/scanpy/lib/python3.10/site-packages/scipy/sparse/_index.py:70, in IndexMixin.__getitem__(self, key)
68 return self._get_sliceXslice(row, col)
69 elif col.ndim == 1:
---> 70 return self._get_sliceXarray(row, col)
71 raise IndexError('index results in >2 dimensions')
72 elif row.ndim == 1:
File /hpc/projects/hca_integration/mambaforge/envs/scanpy/lib/python3.10/site-packages/scipy/sparse/_csr.py:207, in _csr_base._get_sliceXarray(self, row, col)
206 def _get_sliceXarray(self, row, col):
--> 207 return self._major_slice(row)._minor_index_fancy(col)
File /hpc/projects/hca_integration/mambaforge/envs/scanpy/lib/python3.10/site-packages/scipy/sparse/_compressed.py:774, in _cs_matrix._minor_index_fancy(self, idx)
772 col_offsets = np.zeros(N, dtype=idx_dtype)
773 res_indptr = np.empty_like(self.indptr)
--> 774 csr_column_index1(k, idx, M, N, self.indptr, self.indices,
775 col_offsets, res_indptr)
777 # pass 2: copy indices/data for selected idxs
778 col_order = np.argsort(idx).astype(idx_dtype, copy=False)
ValueError: Output dtype not compatible with inputs.
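Uncommenting the indices cast in the helper above seems to avoid the error, presumably because scipy's compiled indexing routines (csr_column_index1 in the traceback) expect indptr and indices to share a dtype. A minimal sketch of the adjusted helper (csr_matrix_int64 is just an illustrative name):

import numpy as np
from scipy.sparse import csr_matrix

def csr_matrix_int64(x):
    x = csr_matrix(x)
    # cast both index arrays together so they share a dtype
    x.indptr = x.indptr.astype(np.int64)
    x.indices = x.indices.astype(np.int64)
    return x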
Versions