RadioAstronomySoftwareGroup / pyuvdata

A pythonic interface for radio astronomy interferometry data (uvfits, miriad, others)
https://pyuvdata.readthedocs.io/en/latest/index.html
BSD 2-Clause "Simplified" License
83 stars 26 forks source link

can't parse `antenna_names` from file #1467

Closed wfarah closed 1 month ago

wfarah commented 1 month ago

Using pyuvdata==3.0.0, h5py==3.11.0, numpy==2.0.1, reading in a file throws this error:

File /opt/mnt/miniconda3/envs/pyuvdata/lib/python3.12/site-packages/pyuvdata/uvdata/uvdata.py:9883, in UVData.read_uvh5(self, filename, **kwargs)
   9876     raise ValueError(
   9877         "Reading multiple files from class specific "
   9878         "read functions is no longer supported. "
   9879         "Use the generic `uvdata.read` function instead."
   9880     )
   9882 uvh5_obj = uvh5.UVH5()
-> 9883 uvh5_obj.read_uvh5(filename, **kwargs)
   9884 self._convert_from_filetype(uvh5_obj)
   9885 del uvh5_obj

File /opt/mnt/miniconda3/envs/pyuvdata/lib/python3.12/site-packages/pyuvdata/uvdata/uvh5.py:1051, in UVH5.read_uvh5(self, filename, antenna_nums, antenna_names, ant_str, bls, frequencies, freq_chans, times, time_range, lsts, lst_range, polarizations, blt_inds, phase_center_ids, catalog_names, keep_all_metadata, read_data, data_array_dtype, multidim_index, remove_flex_pol, background_lsts, run_check, check_extra, run_check_acceptability, strict_uvw_antpos_check, fix_old_proj, fix_use_ant_pos, check_autos, fix_autos, use_future_array_shapes, blt_order, blts_are_rectangular, time_axis_faster_than_bls, recompute_nbls, astrometry_library)
   1048 self._filename.form = (1,)
   1050 # open hdf5 file for reading
-> 1051 self._read_header(
   1052     meta,
   1053     run_check=run_check,
   1054     check_extra=check_extra,
   1055     run_check_acceptability=run_check_acceptability,
   1056     background_lsts=background_lsts,
   1057     astrometry_library=astrometry_library,
   1058 )
   1060 if read_data:
   1061     # Now read in the data
   1062     self._get_data(
   1063         meta.datagrp,
   1064         antenna_nums=antenna_nums,
   (...)
   1080         multidim_index=multidim_index,
   1081     )

File /opt/mnt/miniconda3/envs/pyuvdata/lib/python3.12/site-packages/pyuvdata/uvdata/uvh5.py:680, in UVH5._read_header(self, filename, **kwargs)
    656 def _read_header(
    657     self, filename: str | Path | FastUVH5Meta | h5py.File | h5py.Group, **kwargs
    658 ):
    659     """
    660     Read header information from a UVH5 file.
    661
   (...)
    678     None
    679     """
--> 680     self._read_header_with_fast_meta(filename, **kwargs)

File /opt/mnt/miniconda3/envs/pyuvdata/lib/python3.12/site-packages/pyuvdata/uvdata/uvh5.py:502, in UVH5._read_header_with_fast_meta(self, filename, run_check, check_extra, run_check_acceptability, blt_order, blts_are_rectangular, time_axis_faster_than_bls, background_lsts, recompute_nbls, astrometry_library)
    490 self.time_array = obj.time_array
    491 required_telescope_keys = [
    492     "telescope_name",
    493     "latitude",
   (...)
    500     "antenna_positions",
    501 ]
--> 502 self.telescope = Telescope.from_hdf5(
    503     filename,
    504     required_keys=required_telescope_keys,
    505     run_check=run_check,
    506     check_extra=check_extra,
    507     run_check_acceptability=run_check_acceptability,
    508 )
    509 self._set_telescope_requirements()
    511 if "lst_array" in obj.header:

File /opt/mnt/miniconda3/envs/pyuvdata/lib/python3.12/site-packages/pyuvdata/telescopes.py:863, in Telescope.from_hdf5(cls, filename, required_keys, run_check, check_extra, run_check_acceptability)
    861 for attr, tel_attr in telescope_attrs.items():
    862     try:
--> 863         setattr(tel_obj, tel_attr, getattr(meta, attr))
    864     except (AttributeError, KeyError) as e:
    865         if attr in required_keys:

File /opt/mnt/miniconda3/envs/pyuvdata/lib/python3.12/functools.py:995, in cached_property.__get__(self, instance, owner)
    993 val = cache.get(self.attrname, _NOT_FOUND)
    994 if val is _NOT_FOUND:
--> 995     val = self.func(instance)
    996     try:
    997         cache[self.attrname] = val

File /opt/mnt/miniconda3/envs/pyuvdata/lib/python3.12/site-packages/pyuvdata/utils/io/hdf5.py:677, in HDF5Meta.antenna_names(self)
    674 @cached_property
    675 def antenna_names(self) -> list[str]:
    676     """The antenna names in the file."""
--> 677     return np.char.decode(self.header["antenna_names"][:], encoding="utf8")

File /opt/mnt/miniconda3/envs/pyuvdata/lib/python3.12/site-packages/numpy/_core/strings.py:481, in decode(a, encoding, errors)
    439 def decode(a, encoding=None, errors=None):
    440     r"""
    441     Calls :meth:`bytes.decode` element-wise.
    442
   (...)
    478
    479     """
    480     return _to_bytes_or_str_array(
--> 481         _vec_string(a, np.object_, 'decode', _clean_args(encoding, errors)),
    482         np.str_(''))

TypeError: string operation on non-string array

Upon further inspection, it seems that the np.char.decode call in HDF5Meta.antenna_names (the header-parsing property that reads antenna_names) is the culprit. I can reproduce the same error by calling it manually:

In [23]: import h5py

In [24]: f = h5py.File("./uvh5_60529_59611_075439_AzEl_0001.uvh5")

In [25]: header = f["/Header"]

In [26]: header["antenna_names"][:]
Out[26]:
array([b'1a', b'1b', b'1c', b'1d', b'1e', b'1f', b'1g', b'1h', b'1j',
       b'1k', b'2a', b'2b', b'2c', b'2d', b'2e', b'2f', b'2g', b'2h',
       b'2j', b'2k', b'2l', b'2m', b'3c', b'3d', b'3e', b'3f', b'3g',
       b'3h', b'3j', b'3l', b'4e', b'4f', b'4g', b'4h', b'4j', b'4k',
       b'4l', b'5b', b'5c', b'5e', b'5g', b'5h'], dtype=object)

In [27]: np.char.decode(header["antenna_names"][:], encoding="utf8")
---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
Input In [27], in <cell line: 1>()
----> 1 np.char.decode(header["antenna_names"][:], encoding="utf8")

File /opt/mnt/miniconda3/lib/python3.9/site-packages/numpy/core/defchararray.py:615, in decode(a, encoding, errors)
    572 @array_function_dispatch(_code_dispatcher)
    573 def decode(a, encoding=None, errors=None):
    574     r"""
    575     Calls ``bytes.decode`` element-wise.
    576
   (...)
    612
    613     """
    614     return _to_bytes_or_str_array(
--> 615         _vec_string(a, object_, 'decode', _clean_args(encoding, errors)))

TypeError: string operation on non-string array
bhazelton commented 1 month ago

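This bug specifically affects antenna_names that are stored as variable-length strings in the HDF5 file (as opposed to fixed-length strings). As the `dtype=object` output in the report shows, the variable-length strings are read back as an object-dtype array of bytes objects, which np.char.decode rejects with "string operation on non-string array".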

After some digging, I think it's clear that this was introduced in v3.0. Prior to that, the decoding was done in a loop (a list comprehension), so it wasn’t calling np.char.decode. The np.char.decode call was introduced as a performance improvement in version 3.0, but we didn’t have a test that included files with variable-length strings, so we didn’t realize we were introducing a bug.