MITgcm / xmitgcm

Read MITgcm mds binary files into xarray
http://xmitgcm.readthedocs.io
MIT License

HTTP error when calling LLC4320 outputs #269

Open roxyboy opened 3 years ago

roxyboy commented 3 years ago

Dear xmitgcm community,

I started getting the following error when trying to load the LLC4320 output from the NASA ECCO portal using the llcreader:

from xmitgcm import llcreader
model = llcreader.ECCOPortalLLC2160Model()
dsw = model.get_dataset(varnames=['Eta'], type='latlon').sel(time=slice('2012-02-01','2012-04-30'),
                                                             i=slice(7600,None),i_g=slice(7600,None),
                                                             j=slice(4600,5300),j_g=slice(4600,5300),
                                                             k=0)
dsw.Eta.isel(time=0).plot()
---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
<ipython-input-9-555b716ec66c> in <module>
      3                                                              j=slice(4600,5300),j_g=slice(4600,5300),
      4                                                              k=0)
----> 5 dsw.Eta.isel(time=0).plot()

/srv/conda/envs/notebook/lib/python3.8/site-packages/xarray/plot/plot.py in __call__(self, **kwargs)
    444 
    445     def __call__(self, **kwargs):
--> 446         return plot(self._da, **kwargs)
    447 
    448     # we can't use functools.wraps here since that also modifies the name / qualname

/srv/conda/envs/notebook/lib/python3.8/site-packages/xarray/plot/plot.py in plot(darray, row, col, col_wrap, ax, hue, rtol, subplot_kws, **kwargs)
    161 
    162     """
--> 163     darray = darray.squeeze().compute()
    164 
    165     plot_dims = set(darray.dims)

/srv/conda/envs/notebook/lib/python3.8/site-packages/xarray/core/dataarray.py in compute(self, **kwargs)
    891         """
    892         new = self.copy(deep=False)
--> 893         return new.load(**kwargs)
    894 
    895     def persist(self, **kwargs) -> "DataArray":

/srv/conda/envs/notebook/lib/python3.8/site-packages/xarray/core/dataarray.py in load(self, **kwargs)
    865         dask.array.compute
    866         """
--> 867         ds = self._to_temp_dataset().load(**kwargs)
    868         new = self._from_temp_dataset(ds)
    869         self._variable = new._variable

/srv/conda/envs/notebook/lib/python3.8/site-packages/xarray/core/dataset.py in load(self, **kwargs)
    739 
    740             # evaluate all the dask arrays simultaneously
--> 741             evaluated_data = da.compute(*lazy_data.values(), **kwargs)
    742 
    743             for k, data in zip(lazy_data, evaluated_data):

/srv/conda/envs/notebook/lib/python3.8/site-packages/dask/base.py in compute(*args, **kwargs)
    561         postcomputes.append(x.__dask_postcompute__())
    562 
--> 563     results = schedule(dsk, keys, **kwargs)
    564     return repack([f(r, *a) for r, (f, a) in zip(results, postcomputes)])
    565 

/srv/conda/envs/notebook/lib/python3.8/site-packages/distributed/client.py in get(self, dsk, keys, workers, allow_other_workers, resources, sync, asynchronous, direct, retries, priority, fifo_timeout, actors, **kwargs)
   2653                     should_rejoin = False
   2654             try:
-> 2655                 results = self.gather(packed, asynchronous=asynchronous, direct=direct)
   2656             finally:
   2657                 for f in futures.values():

/srv/conda/envs/notebook/lib/python3.8/site-packages/distributed/client.py in gather(self, futures, errors, direct, asynchronous)
   1962             else:
   1963                 local_worker = None
-> 1964             return self.sync(
   1965                 self._gather,
   1966                 futures,

/srv/conda/envs/notebook/lib/python3.8/site-packages/distributed/client.py in sync(self, func, asynchronous, callback_timeout, *args, **kwargs)
    836             return future
    837         else:
--> 838             return sync(
    839                 self.loop, func, *args, callback_timeout=callback_timeout, **kwargs
    840             )

/srv/conda/envs/notebook/lib/python3.8/site-packages/distributed/utils.py in sync(loop, func, callback_timeout, *args, **kwargs)
    338     if error[0]:
    339         typ, exc, tb = error[0]
--> 340         raise exc.with_traceback(tb)
    341     else:
    342         return result[0]

/srv/conda/envs/notebook/lib/python3.8/site-packages/distributed/utils.py in f()
    322             if callback_timeout is not None:
    323                 future = asyncio.wait_for(future, callback_timeout)
--> 324             result[0] = yield future
    325         except Exception as exc:
    326             error[0] = sys.exc_info()

/srv/conda/envs/notebook/lib/python3.8/site-packages/tornado/gen.py in run(self)
    760 
    761                     try:
--> 762                         value = future.result()
    763                     except Exception:
    764                         exc_info = sys.exc_info()

/srv/conda/envs/notebook/lib/python3.8/site-packages/distributed/client.py in _gather(self, futures, errors, direct, local_worker)
   1827                             exc = CancelledError(key)
   1828                         else:
-> 1829                             raise exception.with_traceback(traceback)
   1830                         raise exc
   1831                     if errors == "skip":

/srv/conda/envs/notebook/lib/python3.8/site-packages/xmitgcm/llcreader/llcmodel.py in _get_facet_chunk()
    504         read_offset = start * dtype.itemsize # in bytes
    505         read_length  = (end - start) * dtype.itemsize # in bytes
--> 506         file.seek(read_offset)
    507         buffer = file.read(read_length)
    508         data = np.frombuffer(buffer, dtype=dtype)

/srv/conda/envs/notebook/lib/python3.8/site-packages/fsspec/implementations/http.py in seek()
    552 
    553     def seek(self, *args, **kwargs):
--> 554         raise ValueError("Cannot seek streaming HTTP file")
    555 
    556     async def _read(self, num=-1):

ValueError: Cannot seek streaming HTTP file

I checked with the NASA Ames Research Center, and it seems that their portal is up and running. I haven't been able to track down the cause of this error; could you please advise what I should try?

Thank you.

roxyboy commented 3 years ago

The same syntax was working fine until last week, so I'm not sure what's changed...

rabernat commented 3 years ago

The ECCO data portal is down unfortunately. Completely beyond our control.

rabernat commented 3 years ago

Or maybe this error is not related to that. Pinging @martindurant about a possible fsspec API change. Martin, does this error make sense to you?

    553     def seek(self, *args, **kwargs):
--> 554         raise ValueError("Cannot seek streaming HTTP file")
    555 
    556     async def _read(self, num=-1):

ValueError: Cannot seek streaming HTTP file
martindurant commented 3 years ago

This would indicate that the size of the file was not available, which may mean the server doesn't support it (unlikely, if it worked before). More likely, either the size is temporarily unavailable, or the server is doing some kind of forwarding. It is worth running debug on the exception above to see whether the response looks like data or like some error message.

If you can reach the same URL using requests and can fetch specific byte ranges with the Range header, but fsspec keeps falling back to streaming, then indeed there may be a bug in fsspec.
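For example, something along these lines (a sketch using the requests library and the Eta file URL quoted later in this thread) would show whether range requests are being honoured; a 206 response and a Content-Length/Accept-Ranges header mean random access should work:

import requests

url = ("https://data.nas.nasa.gov/ecco/download_data.php"
       "?file=/eccodata/llc_4320/compressed/0000154224/Eta.0000154224.data.shrunk")

# Does the server advertise a file size?
head = requests.head(url, allow_redirects=True)
print(head.status_code, head.headers.get("Content-Length"), head.headers.get("Accept-Ranges"))

# Does it honor byte-range requests? A 206 (Partial Content) status means seeking is possible.
part = requests.get(url, headers={"Range": "bytes=0-99"})
print(part.status_code, len(part.content))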

rabernat commented 3 years ago

As of right now, ~~the ECCO data portal IS down, which would be consistent with the failure to return the file size.~~

https://data.nas.nasa.gov/ecco/data.php?dir=/eccodata/llc_4320/compressed/0000154224

[image: screenshot of the ECCO data portal directory listing]

Correction: the "size" column on the HTML page is apparently independent of the HTTP headers.

roxyboy commented 3 years ago

The absence of "size" or "modification time" is indicative of the portal being broken.

I passed this on to the ECCO portal maintainer, and he said that the sizes are not always shown on the website. He asked whether I could download some data from the links, which I could, so he says the portal should be working fine...

roxyboy commented 3 years ago

This would indicate that the size of the file was not available, which may mean the server doesn't support it (unlikely, if it worked before). More likely, either the size is temporarily unavailable, or the server is doing some kind of forwarding. It is worth running debug on the exception above to see whether the response looks like data or like some error message.

If you can reach the same URL using requests and can fetch specific byte ranges with the Range header, but fsspec keeps falling back to streaming, then indeed there may be a bug in fsspec.

@martindurant I apologize in advance for my ignorance, but could you show me the syntax for debugging? I'm a beginner with fsspec...

martindurant commented 3 years ago

I meant literally typing debug, if this is being run in Jupyter/IPython. Otherwise, you'd have to put a try/except with pdb.pm() in the except branch. You would need to learn how to use pdb, though.
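For the non-notebook route, a minimal sketch, reusing the plotting call from the report above (this uses pdb.post_mortem(), which inspects the exception currently being handled; pdb.pm() does the same after the fact on the last uncaught exception):

import pdb

try:
    dsw.Eta.isel(time=0).plot()  # the call that raises the ValueError
except Exception:
    # drop into the debugger on the traceback of the exception being handled
    pdb.post_mortem()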

martindurant commented 3 years ago

Oh, I see this is within dask. You might want to run without dask, just as far as the exception. I have no idea how to do that with llcreader.ECCOPortalLLC2160Model.
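One way to surface the exception locally without digging into llcreader itself might be to force dask's synchronous scheduler, so the failing HTTP read runs in the calling process rather than on a distributed worker (a sketch, reusing dsw from the report above):

import dask

# Run the graph single-threaded in the local process instead of on distributed workers,
# so the exception is raised where pdb / %debug can reach it.
with dask.config.set(scheduler="synchronous"):
    dsw.Eta.isel(time=0).compute()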

rabernat commented 3 years ago

The following works with fsspec 0.8.5 and 2021.6.0:

import fsspec
url = "https://data.nas.nasa.gov/ecco/download_data.php?file=/eccodata/llc_4320/compressed/0000154224/Eta.0000154224.data.shrunk"
with fsspec.open(url, mode='rb') as fp:
    # test size works
    assert fp.size == 562551864
with fsspec.open(url, mode='rb') as fp:
    # test seek works
    fp.seek(2)

@roxyboy could you try the same code from your environment?
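To rule out a version mismatch, it may also help to report the installed versions on your end, e.g. something like:

import aiohttp  # fsspec's HTTP filesystem is built on aiohttp
import fsspec

print(fsspec.__version__, aiohttp.__version__)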

rabernat commented 3 years ago

Also, @roxyboy's example failed to reproduce for me on https://us-central1-b.gcp.pangeo.io/. The following code works fine:

from xmitgcm import llcreader
model = llcreader.ECCOPortalLLC2160Model()
dsw = model.get_dataset(varnames=['Eta'], type='latlon').sel(time=slice('2012-02-01','2012-04-30'),
                                                             i=slice(7600,None),i_g=slice(7600,None),
                                                             j=slice(4600,5300),j_g=slice(4600,5300),
                                                             k=0)
dsw.Eta.isel(time=0).plot()

[image: plot of Eta for the selected region]

roxyboy commented 3 years ago

Also, @roxyboy's example failed to reproduce for me on https://us-central1-b.gcp.pangeo.io/. The following code works fine:

from xmitgcm import llcreader
model = llcreader.ECCOPortalLLC2160Model()
dsw = model.get_dataset(varnames=['Eta'], type='latlon').sel(time=slice('2012-02-01','2012-04-30'),
                                                             i=slice(7600,None),i_g=slice(7600,None),
                                                             j=slice(4600,5300),j_g=slice(4600,5300),
                                                             k=0)
dsw.Eta.isel(time=0).plot()

[image: plot of Eta for the selected region]

It seems to be working again for me too... I'll close this, sorry for the disturbance.

rabernat commented 3 years ago

Our CI also picked up the same exact error a few days ago: https://github.com/MITgcm/xmitgcm/runs/2657251546?check_suite_focus=true#step:6:87

To me, this indicates an intermittent error on the ECCO data portal. It would be great to get confirmation from the NASA engineers as to whether this could be the case.
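One way to gather evidence for the intermittency would be to probe a single file repeatedly, e.g. something like the following (a sketch reusing the URL and seek test from the snippet above; the 10 attempts and 60-second interval are arbitrary):

import time
import fsspec

url = ("https://data.nas.nasa.gov/ecco/download_data.php"
       "?file=/eccodata/llc_4320/compressed/0000154224/Eta.0000154224.data.shrunk")

for attempt in range(10):
    try:
        with fsspec.open(url, mode="rb") as fp:
            fp.seek(2)  # only possible when the server reports a size / supports ranges
            status = f"ok, size={fp.size}"
    except ValueError as err:
        status = f"failed: {err}"
    print(f"attempt {attempt}: {status}")
    time.sleep(60)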

roxyboy commented 3 years ago

It seems to be working again for me too... I'll close this, sorry for the disturbance.

Changing the domain seems to give the error again:

dsw = model.get_dataset(varnames=['Eta'], type='latlon').sel(time='2012-02-01',
                                                             i=slice(1000,2000),i_g=slice(1000,2000),
                                                             j=slice(1500,3000),j_g=slice(1500,3000),
                                                             k=0)
dsw.Eta.isel(time=0).plot()
---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
<ipython-input-5-424a382310eb> in <module>
      3                                                              j=slice(1500,3000),j_g=slice(1500,3000),
      4                                                              k=0)
----> 5 dsw.Eta.isel(time=0).plot()

[... identical traceback to the one above, ending in xmitgcm/llcreader/llcmodel.py _get_facet_chunk() -> fsspec/implementations/http.py seek() ...]

ValueError: Cannot seek streaming HTTP file

Does this have to do with this new patch of data being saved as another file?

rabernat commented 3 years ago

No, I think the llc2160 files are indeed inaccessible right now. The status has changed multiple times just over the past few hours.

https://data.nas.nasa.gov/ecco/data.php?dir=/eccodata/llc_2160/compressed/0000092160

[image: screenshot of the directory listing; none of the links are active]

"Due to high demand our filesystem has become unresponsive."

rabernat commented 3 years ago

And now it's back.

roxyboy commented 3 years ago

import fsspec
url = "https://data.nas.nasa.gov/ecco/download_data.php?file=/eccodata/llc_4320/compressed/0000154224/Eta.0000154224.data.shrunk"
with fsspec.open(url, mode='rb') as fp:
    # test size works
    assert fp.size == 562551864
with fsspec.open(url, mode='rb') as fp:
    # test seek works
    fp.seek(2)

It seems that the connection to the ECCO portal is intermittent again. I'm getting the HTTP error again, and when trying the snippet above, I got an assertion error on the first try, but it ran on the second attempt.