chanzuckerberg / cellxgene-census

CZ CELLxGENE Discover Census
https://chanzuckerberg.github.io/cellxgene-census/
MIT License
84 stars 22 forks source link

[python] ExperimentDataPipe fails unit tests with tiledbsoma 1.7rc1 #955

Closed bkmartinjr closed 9 months ago

bkmartinjr commented 9 months ago

The ExperimentDataPipe appears to rely on a private API in somacore (`somacore.query._fast_csr.read_scipy_csr`), which is no longer available in the upcoming tiledbsoma 1.7 release. This will need to be addressed before our package is updated.

E AttributeError: module 'somacore.query._fast_csr' has no attribute 'read_scipy_csr'

Details:

________________ test_non_batched[6-3-pytorch_x_value_gen-True] ________________
soma_experiment = <Experiment '/tmp/pytest-of-runner/pytest-0/test_non_batched_6_3_pytorch_x0/exp' (open for 'r') (2 items)
    'obs': '...xp/obs' (unopened)
    'ms': 'file:///tmp/pytest-of-runner/pytest-0/test_non_batched_6_3_pytorch_x0/exp/ms' (unopened)>
use_eager_fetch = True
    @pytest.mark.experimental
    # noinspection PyTestParametrized
    @pytest.mark.parametrize(
        "obs_range,var_range,X_value_gen,use_eager_fetch",
        [(6, 3, pytorch_x_value_gen, use_eager_fetch) for use_eager_fetch in (True, False)],
    )
    def test_non_batched(soma_experiment: Experiment, use_eager_fetch: bool) -> None:
        exp_data_pipe = ExperimentDataPipe(
            soma_experiment,
            measurement_name="RNA",
            X_name="raw",
            obs_column_names=["label"],
            use_eager_fetch=use_eager_fetch,
        )
        row_iter = iter(exp_data_pipe)

>       row = next(row_iter)
api/python/cellxgene_census/tests/experimental/ml/test_pytorch.py:145: 
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ 
../../_tool/Python/3.11.7/x64/lib/python3.11/site-packages/torch/utils/data/datapipes/_hook_iterator.py:173: in wrap_generator
    response = gen.send(None)
../../_tool/Python/3.11.7/x64/lib/python3.11/site-packages/cellxgene_census/experimental/ml/pytorch.py:584: in __iter__
    for datum_ in obs_and_x_iter:
../../_tool/Python/3.11.7/x64/lib/python3.11/site-packages/cellxgene_census/experimental/ml/pytorch.py:252: in __next__
    obs_partial, X_partial = self._read_partial_torch_batch(self.batch_size - len(obs))
../../_tool/Python/3.11.7/x64/lib/python3.11/site-packages/cellxgene_census/experimental/ml/pytorch.py:300: in _read_partial_torch_batch
    self.soma_chunk: _SOMAChunk = next(self.soma_chunk_iter)
../../_tool/Python/3.11.7/x64/lib/python3.11/site-packages/cellxgene_census/experimental/util/_eager_iter.py:33: in __next__
    res = self._future.result()
../../_tool/Python/3.11.7/x64/lib/python3.11/concurrent/futures/_base.py:449: in result
    return self.__get_result()
../../_tool/Python/3.11.7/x64/lib/python3.11/concurrent/futures/_base.py:401: in __get_result
    raise self._exception
../../_tool/Python/3.11.7/x64/lib/python3.11/concurrent/futures/thread.py:58: in run
    result = self.fn(*self.args, **self.kwargs)
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ 
self = <cellxgene_census.experimental.ml.pytorch._ObsAndXSOMAIterator object at 0x7f6be7a99850>
    def __next__(self) -> _SOMAChunk:
        pytorch_logger.debug("Retrieving next SOMA chunk...")
        start_time = time()

        # If no more batches to iterate through, raise StopIteration, as all iterators do when at end
        obs_joinids_chunk = next(self.obs_joinids_chunks_iter)

        obs_batch = (
            self.obs.read(
                coords=(obs_joinids_chunk,),
                column_names=self.obs_column_names,
            )
            .concat()
            .to_pandas()
            .set_index("soma_joinid")
        )
        assert obs_batch.shape[0] == obs_joinids_chunk.shape[0]

        # handle case of empty result (first batch has 0 rows)
        if len(obs_batch) == 0:
            raise StopIteration

        # reorder obs rows to match obs_joinids_chunk ordering, which may be shuffled
        obs_batch = obs_batch.reindex(obs_joinids_chunk, copy=False)

        # note: order of rows in returned CSR matches the order of the requested obs_joinids, so no need to reindex
>       X_batch = _fast_csr.read_scipy_csr(self.X, pa.array(obs_joinids_chunk), pa.array(self.var_joinids))
E       AttributeError: module 'somacore.query._fast_csr' has no attribute 'read_scipy_csr'
E       This exception is thrown by __iter__ of ExperimentDataPipe(batch_size=1, measurement_name='RNA', obs_column_names=['soma_joinid', 'label'], obs_query=None, return_sparse_X=False, shuffle=functools.partial(<function IterDataPipe.register_datapipe_as_function.<locals>.class_function at 0x7f6ca37fff60>, <class 'torch.utils.data.datapipes.iter.combinatorics.ShufflerIterDataPipe'>, False, ExperimentDataPipe), soma_chunk_size=149130808, use_eager_fetch=True, var_query=None)
../../_tool/Python/3.11.7/x64/lib/python3.11/site-packages/cellxgene_census/experimental/ml/pytorch.py:170: AttributeError