euroargodev / argopy

A python library for Argo data beginners and experts
https://argopy.readthedocs.io
European Union Public License 1.2
176 stars 38 forks source link

Cache is not working with the ArgoIndex #345

Closed gmaze closed 4 months ago

gmaze commented 4 months ago

A while ago, I noticed some strange behaviour of the ArgoIndex.

When the cache argument is used, the ArgoIndex may fail to find the index file in the cachedir and raise a FileNotFoundError.

This happens with both the pandas and pyarrow backends.

MCVE Code Sample

from argopy import ArgoIndex
ArgoIndex(cache=True).load()

This code raises the following error:

---------------------------------------------------------------------------
FileNotFoundError                         Traceback (most recent call last)
Cell In[13], line 1
----> 1 ArgoIndex(cache=True).load()

File ~/git/github/euroargodev/argopy/argopy/stores/argo_index_pa.py:126, in indexstore_pyarrow.load(self, nrows, force)
    123         self.index = csv2index(f, self.index_path)
    125 if self.cache and self.index.shape[0] > 0:
--> 126     self._write(self.fs["client"], this_path, self.index, fmt=self.ext)
    127     self.index = self._read(self.fs["client"].fs, this_path)
    128     self.index_path_cache = this_path

File ~/git/github/euroargodev/argopy/argopy/stores/argo_index_proto.py:389, in ArgoIndexStoreProto._write(self, fs, path, obj, fmt)
    387 if fmt == "parquet":
    388     fmt = "pq"
--> 389 with fs.open(this_path, "wb") as handle:
    390     write_this[fmt](obj, handle)
    391     if fs.protocol == "memory":

File ~/git/github/euroargodev/argopy/argopy/stores/filesystems.py:190, in argo_store_proto.open(self, path, *args, **kwargs)
    188 self.register(path)
    189 # log.debug("Opening path: %s" % path)
--> 190 return self.fs.open(path, *args, **kwargs)

File ~/miniconda3/envs/argopy-pull318/lib/python3.9/site-packages/fsspec/implementations/cached.py:449, in CachingFileSystem.__getattribute__.<locals>.<lambda>(*args, **kw)
    399 def __getattribute__(self, item):
    400     if item in {
    401         "load_cache",
    402         "_open",
   (...)
    447         # all the methods defined in this class. Note `open` here, since
    448         # it calls `_open`, but is actually in superclass
--> 449         return lambda *args, **kw: getattr(type(self), item).__get__(self)(
    450             *args, **kw
    451         )
    452     if item in ["__reduce_ex__"]:
    453         raise AttributeError

File ~/miniconda3/envs/argopy-pull318/lib/python3.9/site-packages/fsspec/spec.py:1293, in AbstractFileSystem.open(self, path, mode, block_size, cache_options, compression, **kwargs)
   1291 else:
   1292     ac = kwargs.pop("autocommit", not self._intrans)
-> 1293     f = self._open(
   1294         path,
   1295         mode=mode,
   1296         block_size=block_size,
   1297         autocommit=ac,
   1298         cache_options=cache_options,
   1299         **kwargs,
   1300     )
   1301     if compression is not None:
   1302         from fsspec.compression import compr

File ~/miniconda3/envs/argopy-pull318/lib/python3.9/site-packages/fsspec/implementations/cached.py:449, in CachingFileSystem.__getattribute__.<locals>.<lambda>(*args, **kw)
    399 def __getattribute__(self, item):
    400     if item in {
    401         "load_cache",
    402         "_open",
   (...)
    447         # all the methods defined in this class. Note `open` here, since
    448         # it calls `_open`, but is actually in superclass
--> 449         return lambda *args, **kw: getattr(type(self), item).__get__(self)(
    450             *args, **kw
    451         )
    452     if item in ["__reduce_ex__"]:
    453         raise AttributeError

File ~/miniconda3/envs/argopy-pull318/lib/python3.9/site-packages/fsspec/implementations/cached.py:666, in WholeFileCacheFileSystem._open(self, path, mode, **kwargs)
    664 path = self._strip_protocol(path)
    665 if "r" not in mode:
--> 666     fn = self._make_local_details(path)
    667     user_specified_kwargs = {
    668         k: v
    669         for k, v in kwargs.items()
    670         # those kwargs were added by open(), we don't want them
    671         if k not in ["autocommit", "block_size", "cache_options"]
    672     }
    673     return LocalTempFile(self, path, mode=mode, fn=fn, **user_specified_kwargs)

File ~/miniconda3/envs/argopy-pull318/lib/python3.9/site-packages/fsspec/implementations/cached.py:449, in CachingFileSystem.__getattribute__.<locals>.<lambda>(*args, **kw)
    399 def __getattribute__(self, item):
    400     if item in {
    401         "load_cache",
    402         "_open",
   (...)
    447         # all the methods defined in this class. Note `open` here, since
    448         # it calls `_open`, but is actually in superclass
--> 449         return lambda *args, **kw: getattr(type(self), item).__get__(self)(
    450             *args, **kw
    451         )
    452     if item in ["__reduce_ex__"]:
    453         raise AttributeError

File ~/miniconda3/envs/argopy-pull318/lib/python3.9/site-packages/fsspec/implementations/cached.py:612, in WholeFileCacheFileSystem._make_local_details(self, path)
    605 hash = self._mapper(path)
    606 fn = os.path.join(self.storage[-1], hash)
    607 detail = {
    608     "original": path,
    609     "fn": hash,
    610     "blocks": True,
    611     "time": time.time(),
--> 612     "uid": self.fs.ukey(path),
    613 }
    614 self._metadata.update_file(path, detail)
    615 logger.debug("Copying %s to local cache", path)

File ~/miniconda3/envs/argopy-pull318/lib/python3.9/site-packages/fsspec/spec.py:1332, in AbstractFileSystem.ukey(self, path)
   1330 def ukey(self, path):
   1331     """Hash of file properties, to tell if it has changed"""
-> 1332     return sha256(str(self.info(path)).encode()).hexdigest()

File ~/miniconda3/envs/argopy-pull318/lib/python3.9/site-packages/fsspec/implementations/memory.py:160, in MemoryFileSystem.info(self, path, **kwargs)
    153     return {
    154         "name": path,
    155         "size": filelike.size,
    156         "type": "file",
    157         "created": getattr(filelike, "created", None),
    158     }
    159 else:
--> 160     raise FileNotFoundError(path)

FileNotFoundError: https://data-argo.ifremer.fr/ar_index_global_prof.txt/local.pq

Expected Output

We expect the index to load without error and display as follows:

<argoindex.pyarrow>
Host: https://data-argo.ifremer.fr/
Index: ar_index_global_prof.txt
Convention: ar_index_global_prof (Profile directory file of the Argo GDAC)
Loaded: True (2960762 records)
Searched: False

Versions

Output of `argopy.show_versions()` SYSTEM ------ commit: None python: 3.9.19 | packaged by conda-forge | (main, Mar 20 2024, 12:53:33) [Clang 16.0.6 ] python-bits: 64 OS: Darwin OS-release: 21.6.0 machine: x86_64 processor: i386 byteorder: little LC_ALL: en_US.UTF-8 LANG: en_US.UTF-8 LOCALE: en_US.UTF-8 libhdf5: 1.14.3 libnetcdf: 4.9.2 INSTALLED VERSIONS: CORE ------------------------ aiohttp : 3.9.3 argopy : 0.1.14 erddapy : 2.2.0 fsspec : 2024.3.1 netCDF4 : 1.6.5 packaging : 24.0 requests : 2.31.0 scipy : 1.13.0 toolz : 0.12.1 xarray : 2024.3.0 INSTALLED VERSIONS: EXT.UTIL ---------------------------- gsw : 3.6.17 tqdm : 4.66.2 zarr : 2.17.2 INSTALLED VERSIONS: EXT.PERF ---------------------------- dask : 2024.4.1 distributed : 2024.4.1 h5netcdf : 1.3.0 pyarrow : 15.0.2 INSTALLED VERSIONS: EXT.PLOT ---------------------------- IPython : 8.18.1 cartopy : 0.23.0 ipykernel : 6.29.3 ipywidgets : 8.1.2 matplotlib : 3.8.4 pyproj : 3.6.1 seaborn : 0.13.2 INSTALLED VERSIONS: DEV ----------------------- aiofiles : 23.2.1 black : 24.3.0 bottleneck : 1.3.8 cfgrib : 0.9.11.0 cftime : 1.6.3 conda : - flake8 : 7.0.0 nc_time_axis: - numpy : 1.26.4 pandas : 2.2.2 pip : 24.0 pytest : 8.1.1 pytest_cov : 5.0.0 pytest_env : 1.1.3 pytest_localftpserver: 0.0.0 setuptools : - sphinx : - INSTALLED VERSIONS: PIP ----------------------- pytest-reportlog: -