NeurodataWithoutBorders / pynwb

A Python API for working with Neurodata stored in the NWB Format
https://pynwb.readthedocs.io
Other
175 stars 85 forks source link

[Bug]: FileNotFoundError when streaming from Dandi archive #1644

Open mayofaulkner opened 1 year ago

mayofaulkner commented 1 year ago

What happened?

Hello,

I was following this tutorial on streaming NWB and got a FileNotFoundError when copying the code snippets provided.

Thanks for your help and your great documentation!!

Steps to Reproduce

from dandi.dandiapi import DandiAPIClient

dandiset_id = '000006'  # ephys dataset from the Svoboda Lab
filepath = 'sub-anm372795/sub-anm372795_ses-20170718.nwb'  # 450 kB file
with DandiAPIClient() as client:
    asset = client.get_dandiset(dandiset_id, 'draft').get_asset_by_path(filepath)
    s3_url = asset.get_content_url(follow_redirects=1, strip_query=True)
import fsspec
import pynwb
import h5py
from fsspec.implementations.cached import CachingFileSystem

# first, create a virtual filesystem based on the http protocol and use
# caching to save accessed data to RAM.
fs = CachingFileSystem(
    fs=fsspec.filesystem("http"),
    cache_storage="nwb-cache",  # Local folder for the cache
)

# next, open the file
with fs.open(s3_url, "rb") as f:
    with h5py.File(f) as file:
        with pynwb.NWBHDF5IO(file=file, load_namespaces=True) as io:
            nwbfile = io.read()
            print(nwbfile.acquisition['lick_times'].time_series['lick_left_times'].data[:])

Traceback

gaierror                                  Traceback (most recent call last)
File ~/opt/anaconda3/envs/nwb/lib/python3.9/site-packages/aiohttp/connector.py:1152, in TCPConnector._create_direct_connection(self, req, traces, timeout, client_error)
   1148 try:
   1149     # Cancelling this lookup should not cancel the underlying lookup
   1150     #  or else the cancel event will get broadcast to all the waiters
   1151     #  across all connections.
-> 1152     hosts = await asyncio.shield(host_resolved)
   1153 except asyncio.CancelledError:

File ~/opt/anaconda3/envs/nwb/lib/python3.9/site-packages/aiohttp/connector.py:874, in TCPConnector._resolve_host(self, host, port, traces)
    872         await trace.send_dns_resolvehost_start(host)
--> 874 addrs = await self._resolver.resolve(host, port, family=self._family)
    875 if traces:

File ~/opt/anaconda3/envs/nwb/lib/python3.9/site-packages/aiohttp/resolver.py:33, in ThreadedResolver.resolve(self, hostname, port, family)
     30 async def resolve(
     31     self, hostname: str, port: int = 0, family: int = socket.AF_INET
     32 ) -> List[Dict[str, Any]]:
---> 33     infos = await self._loop.getaddrinfo(
     34         hostname,
     35         port,
     36         type=socket.SOCK_STREAM,
     37         family=family,
     38         flags=socket.AI_ADDRCONFIG,
     39     )
     41     hosts = []

File ~/opt/anaconda3/envs/nwb/lib/python3.9/asyncio/base_events.py:861, in BaseEventLoop.getaddrinfo(self, host, port, family, type, proto, flags)
    859     getaddr_func = socket.getaddrinfo
--> 861 return await self.run_in_executor(
    862     None, getaddr_func, host, port, family, type, proto, flags)

File ~/opt/anaconda3/envs/nwb/lib/python3.9/concurrent/futures/thread.py:58, in _WorkItem.run(self)
     57 try:
---> 58     result = self.fn(*self.args, **self.kwargs)
     59 except BaseException as exc:

File ~/opt/anaconda3/envs/nwb/lib/python3.9/socket.py:954, in getaddrinfo(host, port, family, type, proto, flags)
    953 addrlist = []
--> 954 for res in _socket.getaddrinfo(host, port, family, type, proto, flags):
    955     af, socktype, proto, canonname, sa = res

gaierror: [Errno 8] nodename nor servname provided, or not known

The above exception was the direct cause of the following exception:

ClientConnectorError                      Traceback (most recent call last)
File ~/opt/anaconda3/envs/nwb/lib/python3.9/site-packages/fsspec/implementations/http.py:411, in HTTPFileSystem._info(self, url, **kwargs)
    409 try:
    410     info.update(
--> 411         await _file_info(
    412             self.encode_url(url),
    413             size_policy=policy,
    414             session=session,
    415             **self.kwargs,
    416             **kwargs,
    417         )
    418     )
    419     if info.get("size") is not None:

File ~/opt/anaconda3/envs/nwb/lib/python3.9/site-packages/fsspec/implementations/http.py:823, in _file_info(url, session, size_policy, **kwargs)
    822 elif size_policy == "get":
--> 823     r = await session.get(url, allow_redirects=ar, **kwargs)
    824 else:

File ~/opt/anaconda3/envs/nwb/lib/python3.9/site-packages/aiohttp/client.py:536, in ClientSession._request(self, method, str_or_url, params, data, json, cookies, headers, skip_auto_headers, auth, allow_redirects, max_redirects, compress, chunked, expect100, raise_for_status, read_until_eof, proxy, proxy_auth, timeout, verify_ssl, fingerprint, ssl_context, ssl, proxy_headers, trace_request_ctx, read_bufsize)
    535         assert self._connector is not None
--> 536         conn = await self._connector.connect(
    537             req, traces=traces, timeout=real_timeout
    538         )
    539 except asyncio.TimeoutError as exc:

File ~/opt/anaconda3/envs/nwb/lib/python3.9/site-packages/aiohttp/connector.py:540, in BaseConnector.connect(self, req, traces, timeout)
    539 try:
--> 540     proto = await self._create_connection(req, traces, timeout)
    541     if self._closed:

File ~/opt/anaconda3/envs/nwb/lib/python3.9/site-packages/aiohttp/connector.py:901, in TCPConnector._create_connection(self, req, traces, timeout)
    900 else:
--> 901     _, proto = await self._create_direct_connection(req, traces, timeout)
    903 return proto

File ~/opt/anaconda3/envs/nwb/lib/python3.9/site-packages/aiohttp/connector.py:1166, in TCPConnector._create_direct_connection(self, req, traces, timeout, client_error)
   1164     # in case of proxy it is not ClientProxyConnectionError
   1165     # it is problem of resolving proxy ip itself
-> 1166     raise ClientConnectorError(req.connection_key, exc) from exc
   1168 last_exc: Optional[Exception] = None

ClientConnectorError: Cannot connect to host dandiarchive.s3.amazonaws.com:443 ssl:default [nodename nor servname provided, or not known]

The above exception was the direct cause of the following exception:

FileNotFoundError                         Traceback (most recent call last)
Cell In[3], line 14
      8 fs = CachingFileSystem(
      9     fs=fsspec.filesystem("http"),
     10     cache_storage="nwb-cache",  # Local folder for the cache
     11 )
     13 # next, open the file
---> 14 with fs.open(s3_url, "rb") as f:
     15     with h5py.File(f) as file:
     16         with pynwb.NWBHDF5IO(file=file, load_namespaces=True) as io:

File ~/opt/anaconda3/envs/nwb/lib/python3.9/site-packages/fsspec/implementations/cached.py:444, in CachingFileSystem.__getattribute__.<locals>.<lambda>(*args, **kw)
    408 def __getattribute__(self, item):
    409     if item in [
    410         "load_cache",
    411         "_open",
   (...)
    442         # all the methods defined in this class. Note `open` here, since
    443         # it calls `_open`, but is actually in superclass
--> 444         return lambda *args, **kw: getattr(type(self), item).__get__(self)(
    445             *args, **kw
    446         )
    447     if item in ["__reduce_ex__"]:
    448         raise AttributeError

File ~/opt/anaconda3/envs/nwb/lib/python3.9/site-packages/fsspec/spec.py:1135, in AbstractFileSystem.open(self, path, mode, block_size, cache_options, compression, **kwargs)
   1133 else:
   1134     ac = kwargs.pop("autocommit", not self._intrans)
-> 1135     f = self._open(
   1136         path,
   1137         mode=mode,
   1138         block_size=block_size,
   1139         autocommit=ac,
   1140         cache_options=cache_options,
   1141         **kwargs,
   1142     )
   1143     if compression is not None:
   1144         from fsspec.compression import compr

File ~/opt/anaconda3/envs/nwb/lib/python3.9/site-packages/fsspec/implementations/cached.py:444, in CachingFileSystem.__getattribute__.<locals>.<lambda>(*args, **kw)
    408 def __getattribute__(self, item):
    409     if item in [
    410         "load_cache",
    411         "_open",
   (...)
    442         # all the methods defined in this class. Note `open` here, since
    443         # it calls `_open`, but is actually in superclass
--> 444         return lambda *args, **kw: getattr(type(self), item).__get__(self)(
    445             *args, **kw
    446         )
    447     if item in ["__reduce_ex__"]:
    448         raise AttributeError

File ~/opt/anaconda3/envs/nwb/lib/python3.9/site-packages/fsspec/implementations/cached.py:354, in CachingFileSystem._open(self, path, mode, block_size, autocommit, cache_options, **kwargs)
    352 # call target filesystems open
    353 self._mkcache()
--> 354 f = self.fs._open(
    355     path,
    356     mode=mode,
    357     block_size=block_size,
    358     autocommit=autocommit,
    359     cache_options=cache_options,
    360     cache_type="none",
    361     **kwargs,
    362 )
    363 if self.compression:
    364     comp = (
    365         infer_compression(path)
    366         if self.compression == "infer"
    367         else self.compression
    368     )

File ~/opt/anaconda3/envs/nwb/lib/python3.9/site-packages/fsspec/implementations/http.py:350, in HTTPFileSystem._open(self, path, mode, block_size, autocommit, cache_type, cache_options, size, **kwargs)
    348 kw["asynchronous"] = self.asynchronous
    349 kw.update(kwargs)
--> 350 size = size or self.info(path, **kwargs)["size"]
    351 session = sync(self.loop, self.set_session)
    352 if block_size and size:

File ~/opt/anaconda3/envs/nwb/lib/python3.9/site-packages/fsspec/asyn.py:114, in sync_wrapper.<locals>.wrapper(*args, **kwargs)
    111 @functools.wraps(func)
    112 def wrapper(*args, **kwargs):
    113     self = obj or args[0]
--> 114     return sync(self.loop, func, *args, **kwargs)

File ~/opt/anaconda3/envs/nwb/lib/python3.9/site-packages/fsspec/asyn.py:99, in sync(loop, func, timeout, *args, **kwargs)
     97     raise FSTimeoutError from return_result
     98 elif isinstance(return_result, BaseException):
---> 99     raise return_result
    100 else:
    101     return return_result

File ~/opt/anaconda3/envs/nwb/lib/python3.9/site-packages/fsspec/asyn.py:54, in _runner(event, coro, result, timeout)
     52     coro = asyncio.wait_for(coro, timeout=timeout)
     53 try:
---> 54     result[0] = await coro
     55 except Exception as ex:
     56     result[0] = ex

File ~/opt/anaconda3/envs/nwb/lib/python3.9/site-packages/fsspec/implementations/http.py:424, in HTTPFileSystem._info(self, url, **kwargs)
    421     except Exception as exc:
    422         if policy == "get":
    423             # If get failed, then raise a FileNotFoundError
--> 424             raise FileNotFoundError(url) from exc
    425         logger.debug(str(exc))
    427 return {"name": url, "size": None, **info, "type": "file"}

FileNotFoundError: https://dandiarchive.s3.amazonaws.com/blobs/43b/f3a/43bf3a81-4a0b-433f-b471-1f10303f9d35

Operating System

macOS

Python Executable

Conda

Python Version

3.9

Package Versions

environment_for_issue.txt

Code of Conduct

rly commented 1 year ago

Thanks for the bug report, @mayofaulkner. We narrowed the issue down to a corrupted or improperly configured cache: the cache metadata had been updated to say that the file existed locally, but the file was not actually there. Clearing the cache with fs.clear_cache() resolved the issue.

We will add a note about this to the documentation.