scverse / anndata

Annotated data.
http://anndata.readthedocs.io
BSD 3-Clause "New" or "Revised" License
577 stars 154 forks source link

TypeError when writing string columns to h5ad #1571

Open mtvector opened 3 months ago

mtvector commented 3 months ago

Please make sure these conditions are met

Report

I'm getting the following error when I attempt to write an h5ad file from an anndata:

#It occurs whether or not you try to force type casting
for col in adata.obs.select_dtypes(['object','string[python]','string']).columns:
    adatas[k].obs[col] = adatas[k].obs[col].astype('string')

adata.write_h5ad(working_filename)

Traceback:

TypeError                                 Traceback (most recent call last)
Cell In[26], line 12
---> 12 adata.write_h5ad(working_fn)

File [~/Matthew/utils/miniforge3/envs/scanpy/lib/python3.11/site-packages/anndata/_core/anndata.py:1929](https://aidc-ondemand-prd.corp.alleninstitute.org/node/n211/61332/lab/workspaces/auto-m/tree/Matthew/code/hmba-crossspecies-v1/preprocessing/rna/Matthew/utils/miniforge3/envs/scanpy/lib/python3.11/site-packages/anndata/_core/anndata.py#line=1928), in AnnData.write_h5ad(self, filename, compression, compression_opts, as_dense)
   1926 if filename is None:
   1927     filename = self.filename
-> 1929 write_h5ad(
   1930     Path(filename),
   1931     self,
   1932     compression=compression,
   1933     compression_opts=compression_opts,
   1934     as_dense=as_dense,
   1935 )
   1937 if self.isbacked:
   1938     self.file.filename = filename

File [~/Matthew/utils/miniforge3/envs/scanpy/lib/python3.11/site-packages/anndata/_io/h5ad.py:104](https://aidc-ondemand-prd.corp.alleninstitute.org/node/n211/61332/lab/workspaces/auto-m/tree/Matthew/code/hmba-crossspecies-v1/preprocessing/rna/Matthew/utils/miniforge3/envs/scanpy/lib/python3.11/site-packages/anndata/_io/h5ad.py#line=103), in write_h5ad(filepath, adata, as_dense, dataset_kwargs, **kwargs)
    102 elif adata.raw is not None:
    103     write_elem(f, "raw", adata.raw, dataset_kwargs=dataset_kwargs)
--> 104 write_elem(f, "obs", adata.obs, dataset_kwargs=dataset_kwargs)
    105 write_elem(f, "var", adata.var, dataset_kwargs=dataset_kwargs)
    106 write_elem(f, "obsm", dict(adata.obsm), dataset_kwargs=dataset_kwargs)

File [~/Matthew/utils/miniforge3/envs/scanpy/lib/python3.11/site-packages/anndata/_io/specs/registry.py:359](https://aidc-ondemand-prd.corp.alleninstitute.org/node/n211/61332/lab/workspaces/auto-m/tree/Matthew/code/hmba-crossspecies-v1/preprocessing/rna/Matthew/utils/miniforge3/envs/scanpy/lib/python3.11/site-packages/anndata/_io/specs/registry.py#line=358), in write_elem(store, k, elem, dataset_kwargs)
    335 def write_elem(
    336     store: GroupStorageType,
    337     k: str,
   (...)
    340     dataset_kwargs: Mapping[str, Any] = MappingProxyType({}),
    341 ) -> None:
    342     """
    343     Write an element to a storage group using anndata encoding.
    344 
   (...)
    357         E.g. for zarr this would be `chunks`, `compressor`.
    358     """
--> 359     Writer(_REGISTRY).write_elem(store, k, elem, dataset_kwargs=dataset_kwargs)

File [~/Matthew/utils/miniforge3/envs/scanpy/lib/python3.11/site-packages/anndata/_io/utils.py:243](https://aidc-ondemand-prd.corp.alleninstitute.org/node/n211/61332/lab/workspaces/auto-m/tree/Matthew/code/hmba-crossspecies-v1/preprocessing/rna/Matthew/utils/miniforge3/envs/scanpy/lib/python3.11/site-packages/anndata/_io/utils.py#line=242), in report_write_key_on_error.<locals>.func_wrapper(*args, **kwargs)
    241     raise ValueError("No element found in args.")
    242 try:
--> 243     return func(*args, **kwargs)
    244 except Exception as e:
    245     path = _get_display_path(store)

File [~/Matthew/utils/miniforge3/envs/scanpy/lib/python3.11/site-packages/anndata/_io/specs/registry.py:309](https://aidc-ondemand-prd.corp.alleninstitute.org/node/n211/61332/lab/workspaces/auto-m/tree/Matthew/code/hmba-crossspecies-v1/preprocessing/rna/Matthew/utils/miniforge3/envs/scanpy/lib/python3.11/site-packages/anndata/_io/specs/registry.py#line=308), in Writer.write_elem(self, store, k, elem, dataset_kwargs, modifiers)
    303 write_func = partial(
    304     self.find_writer(dest_type, elem, modifiers),
    305     _writer=self,
    306 )
    308 if self.callback is None:
--> 309     return write_func(store, k, elem, dataset_kwargs=dataset_kwargs)
    310 return self.callback(
    311     write_func,
    312     store,
   (...)
    316     iospec=self.registry.get_spec(elem),
    317 )

File [~/Matthew/utils/miniforge3/envs/scanpy/lib/python3.11/site-packages/anndata/_io/specs/registry.py:57](https://aidc-ondemand-prd.corp.alleninstitute.org/node/n211/61332/lab/workspaces/auto-m/tree/Matthew/code/hmba-crossspecies-v1/preprocessing/rna/Matthew/utils/miniforge3/envs/scanpy/lib/python3.11/site-packages/anndata/_io/specs/registry.py#line=56), in write_spec.<locals>.decorator.<locals>.wrapper(g, k, *args, **kwargs)
     55 @wraps(func)
     56 def wrapper(g: GroupStorageType, k: str, *args, **kwargs):
---> 57     result = func(g, k, *args, **kwargs)
     58     g[k].attrs.setdefault("encoding-type", spec.encoding_type)
     59     g[k].attrs.setdefault("encoding-version", spec.encoding_version)

File [~/Matthew/utils/miniforge3/envs/scanpy/lib/python3.11/site-packages/anndata/_io/specs/methods.py:709](https://aidc-ondemand-prd.corp.alleninstitute.org/node/n211/61332/lab/workspaces/auto-m/tree/Matthew/code/hmba-crossspecies-v1/preprocessing/rna/Matthew/utils/miniforge3/envs/scanpy/lib/python3.11/site-packages/anndata/_io/specs/methods.py#line=708), in write_dataframe(f, key, df, _writer, dataset_kwargs)
    704 _writer.write_elem(
    705     group, index_name, df.index._values, dataset_kwargs=dataset_kwargs
    706 )
    707 for colname, series in df.items():
    708     # TODO: this should write the "true" representation of the series (i.e. the underlying array or ndarray depending)
--> 709     _writer.write_elem(
    710         group, colname, series._values, dataset_kwargs=dataset_kwargs
    711     )

File [~/Matthew/utils/miniforge3/envs/scanpy/lib/python3.11/site-packages/anndata/_io/utils.py:243](https://aidc-ondemand-prd.corp.alleninstitute.org/node/n211/61332/lab/workspaces/auto-m/tree/Matthew/code/hmba-crossspecies-v1/preprocessing/rna/Matthew/utils/miniforge3/envs/scanpy/lib/python3.11/site-packages/anndata/_io/utils.py#line=242), in report_write_key_on_error.<locals>.func_wrapper(*args, **kwargs)
    241     raise ValueError("No element found in args.")
    242 try:
--> 243     return func(*args, **kwargs)
    244 except Exception as e:
    245     path = _get_display_path(store)

File [~/Matthew/utils/miniforge3/envs/scanpy/lib/python3.11/site-packages/anndata/_io/specs/registry.py:296](https://aidc-ondemand-prd.corp.alleninstitute.org/node/n211/61332/lab/workspaces/auto-m/tree/Matthew/code/hmba-crossspecies-v1/preprocessing/rna/Matthew/utils/miniforge3/envs/scanpy/lib/python3.11/site-packages/anndata/_io/specs/registry.py#line=295), in Writer.write_elem(self, store, k, elem, dataset_kwargs, modifiers)
    294 # Normalize k to absolute path
    295 if not PurePosixPath(k).is_absolute():
--> 296     k = str(PurePosixPath(store.name) / k)
    298 if k == "/":
    299     store.clear()

File [~/Matthew/utils/miniforge3/envs/scanpy/lib/python3.11/pathlib.py:477](https://aidc-ondemand-prd.corp.alleninstitute.org/node/n211/61332/lab/workspaces/auto-m/tree/Matthew/code/hmba-crossspecies-v1/preprocessing/rna/Matthew/utils/miniforge3/envs/scanpy/lib/python3.11/pathlib.py#line=476), in PurePath.__new__(cls, *args)
    475 if cls is PurePath:
    476     cls = PureWindowsPath if os.name == 'nt' else PurePosixPath
--> 477 return cls._from_parts(args)

File [~/Matthew/utils/miniforge3/envs/scanpy/lib/python3.11/pathlib.py:509](https://aidc-ondemand-prd.corp.alleninstitute.org/node/n211/61332/lab/workspaces/auto-m/tree/Matthew/code/hmba-crossspecies-v1/preprocessing/rna/Matthew/utils/miniforge3/envs/scanpy/lib/python3.11/pathlib.py#line=508), in PurePath._from_parts(cls, args)
    504 @classmethod
    505 def _from_parts(cls, args):
    506     # We need to call _parse_args on the instance, so as to get the
    507     # right flavour.
    508     self = object.__new__(cls)
--> 509     drv, root, parts = self._parse_args(args)
    510     self._drv = drv
    511     self._root = root

File [~/Matthew/utils/miniforge3/envs/scanpy/lib/python3.11/pathlib.py:493](https://aidc-ondemand-prd.corp.alleninstitute.org/node/n211/61332/lab/workspaces/auto-m/tree/Matthew/code/hmba-crossspecies-v1/preprocessing/rna/Matthew/utils/miniforge3/envs/scanpy/lib/python3.11/pathlib.py#line=492), in PurePath._parse_args(cls, args)
    491     parts += a._parts
    492 else:
--> 493     a = os.fspath(a)
    494     if isinstance(a, str):
    495         # Force-cast str subclasses to str (issue #21127)
    496         parts.append(str(a))

TypeError: expected str, bytes or os.PathLike object, not NoneType
Error raised while writing key 'orig.ident' of <class 'h5py._hl.group.Group'> to /??

This occurs for an adata like this:

AnnData object with n_obs × n_vars = 38856 × 27912
    obs: 'orig.ident', 'nCount_RNA', 'nFeature_RNA', 'doubcall'
    var: 'gene', 'original_symbol'
    layers: 'UMIs'

The .obs types are as follows:

orig.ident       string[python]
nCount_RNA              float64
nFeature_RNA              int32
doubcall             string[python]

And it seems that all of the string[python]-typed columns yield this problem.

Any insight you could give would be very helpful. Maybe I'm missing something obvious? Thanks!

Versions

anndata             0.10.8
h5py                3.11.0
matplotlib          3.9.1
numpy               1.26.4
pandas              2.2.2
scanpy              1.10.2
scipy               1.11.4
seaborn             0.13.2
session_info        1.0.0
v1utils             0.1.0
-----
PIL                 10.4.0
asttokens           NA
colorama            0.4.6
comm                0.2.2
cycler              0.12.1
cython_runtime      NA
dateutil            2.9.0
debugpy             1.8.2
decorator           5.1.1
executing           2.0.1
igraph              0.11.6
ipykernel           6.29.5
jedi                0.19.1
joblib              1.3.2
kiwisolver          1.4.5
legacy_api_wrap     NA
leidenalg           0.10.2
llvmlite            0.43.0
mpl_toolkits        NA
natsort             8.4.0
numba               0.60.0
packaging           24.1
parso               0.8.4
patsy               0.5.6
pickleshare         0.7.5
platformdirs        4.2.2
prompt_toolkit      3.0.47
psutil              6.0.0
pure_eval           0.2.3
pydev_ipython       NA
pydevconsole        NA
pydevd              2.9.5
pydevd_file_utils   NA
pydevd_plugins      NA
pydevd_tracing      NA
pygments            2.18.0
pyparsing           3.1.2
pytz                2023.3.post1
six                 1.16.0
sklearn             1.3.2
stack_data          0.6.2
statsmodels         0.14.2
texttable           1.7.0
threadpoolctl       3.2.0
tornado             6.4.1
traitlets           5.14.3
typing_extensions   NA
wcwidth             0.2.13
zmq                 26.0.3
-----
IPython             8.26.0
jupyter_client      8.6.2
jupyter_core        5.7.2
-----
Python 3.11.9 | packaged by conda-forge | (main, Apr 19 2024, 18:36:13) [GCC 12.3.0]
Linux-3.10.0-1160.25.1.el7.x86_64-x86_64-with-glibc2.17
-----
Session information updated at 2024-07-31 00:59
ilan-gold commented 3 months ago

I think this is related to both https://github.com/scverse/anndata/issues/1577 and https://github.com/scverse/anndata/issues/679. I cannot reproduce this, though:

from scanpy.datasets import pbmc3k_processed

adata = pbmc3k_processed()
adata.obs['louvain'] = adata.obs['louvain'].astype('string')
adata.write_h5ad('foo.h5ad')

works for me.

But

In [40]: ad.write_h5ad('foo.h5ad', adata, convert_strings_to_categoricals=False)
---------------------------------------------------------------------------
AttributeError                            Traceback (most recent call last)
Cell In[40], line 1
----> 1 ad.write_h5ad('foo.h5ad', adata, convert_strings_to_categoricals=False)

AttributeError: module 'anndata' has no attribute 'write_h5ad'

In [41]: ad._io.write_h5ad('foo.h5ad', adata, convert_strings_to_categoricals=False)
---------------------------------------------------------------------------
IORegistryError                           Traceback (most recent call last)
Cell In[41], line 1
----> 1 ad._io.write_h5ad('foo.h5ad', adata, convert_strings_to_categoricals=False)

File ~/Projects/Theis/anndata/src/anndata/_io/h5ad.py:103, in write_h5ad(filepath, adata, as_dense, convert_strings_to_categoricals, dataset_kwargs, **kwargs)
    101 elif adata.raw is not None:
    102     write_elem(f, "raw", adata.raw, dataset_kwargs=dataset_kwargs)
--> 103 write_elem(f, "obs", adata.obs, dataset_kwargs=dataset_kwargs)
    104 write_elem(f, "var", adata.var, dataset_kwargs=dataset_kwargs)
    105 write_elem(f, "obsm", dict(adata.obsm), dataset_kwargs=dataset_kwargs)

File ~/Projects/Theis/anndata/src/anndata/_io/specs/registry.py:432, in write_elem(store, k, elem, dataset_kwargs)
    408 def write_elem(
    409     store: GroupStorageType,
    410     k: str,
   (...)
    413     dataset_kwargs: Mapping[str, Any] = MappingProxyType({}),
    414 ) -> None:
    415     """
    416     Write an element to a storage group using anndata encoding.
    417
   (...)
    430         E.g. for zarr this would be `chunks`, `compressor`.
    431     """
--> 432     Writer(_REGISTRY).write_elem(store, k, elem, dataset_kwargs=dataset_kwargs)

File ~/Projects/Theis/anndata/src/anndata/_io/utils.py:247, in report_write_key_on_error.<locals>.func_wrapper(*args, **kwargs)
    245     raise ValueError("No element found in args.")
    246 try:
--> 247     return func(*args, **kwargs)
    248 except Exception as e:
    249     path = _get_display_path(store)

File ~/Projects/Theis/anndata/src/anndata/_io/specs/registry.py:357, in Writer.write_elem(self, store, k, elem, dataset_kwargs, modifiers)
    354 write_func = self.find_write_func(dest_type, elem, modifiers)
    356 if self.callback is None:
--> 357     return write_func(store, k, elem, dataset_kwargs=dataset_kwargs)
    358 return self.callback(
    359     write_func,
    360     store,
   (...)
    364     iospec=self.registry.get_spec(elem),
    365 )

File ~/Projects/Theis/anndata/src/anndata/_io/specs/registry.py:73, in write_spec.<locals>.decorator.<locals>.wrapper(g, k, *args, **kwargs)
     71 @wraps(func)
     72 def wrapper(g: GroupStorageType, k: str, *args, **kwargs):
---> 73     result = func(g, k, *args, **kwargs)
     74     g[k].attrs.setdefault("encoding-type", spec.encoding_type)
     75     g[k].attrs.setdefault("encoding-version", spec.encoding_version)

File ~/Projects/Theis/anndata/src/anndata/_io/specs/methods.py:863, in write_dataframe(f, key, df, _writer, dataset_kwargs)
    858 _writer.write_elem(
    859     group, index_name, df.index._values, dataset_kwargs=dataset_kwargs
    860 )
    861 for colname, series in df.items():
    862     # TODO: this should write the "true" representation of the series (i.e. the underlying array or ndarray depending)
--> 863     _writer.write_elem(
    864         group, colname, series._values, dataset_kwargs=dataset_kwargs
    865     )

File ~/Projects/Theis/anndata/src/anndata/_io/utils.py:247, in report_write_key_on_error.<locals>.func_wrapper(*args, **kwargs)
    245     raise ValueError("No element found in args.")
    246 try:
--> 247     return func(*args, **kwargs)
    248 except Exception as e:
    249     path = _get_display_path(store)

File ~/Projects/Theis/anndata/src/anndata/_io/specs/registry.py:354, in Writer.write_elem(self, store, k, elem, dataset_kwargs, modifiers)
    351 elif k in store:
    352     del store[k]
--> 354 write_func = self.find_write_func(dest_type, elem, modifiers)
    356 if self.callback is None:
    357     return write_func(store, k, elem, dataset_kwargs=dataset_kwargs)

File ~/Projects/Theis/anndata/src/anndata/_io/specs/registry.py:321, in Writer.find_write_func(self, dest_type, elem, modifiers)
    317         return self.registry.get_write(
    318             dest_type, pattern, modifiers, writer=self
    319         )
    320 # Raises IORegistryError
--> 321 return self.registry.get_write(dest_type, type(elem), modifiers, writer=self)

File ~/Projects/Theis/anndata/src/anndata/_io/specs/registry.py:137, in IORegistry.get_write(self, dest_type, src_type, modifiers, writer)
    134     dest_type = h5py.Group
    136 if (dest_type, src_type, modifiers) not in self.write:
--> 137     raise IORegistryError._from_write_parts(dest_type, src_type, modifiers)
    138 internal = self.write[(dest_type, src_type, modifiers)]
    139 return partial(internal, _writer=writer)

IORegistryError: No method registered for writing <class 'pandas.core.arrays.string_.StringArray'> into <class 'h5py._hl.group.Group'>
Error raised while writing key 'louvain' of <class 'h5py._hl.group.Group'> to /obs

errors on main, which is why I linked the issue. When I use 0.10.8, I can't reproduce your specific problem either (or any problem at all; the write_h5ad call works). Could you share a clearer reproducer?

github-actions[bot] commented 4 weeks ago

This issue has been automatically marked as stale because it has not had recent activity. Please add a comment if you want to keep the issue open. Thank you for your contributions!