ESGF / esgf-pyclient

Search client for the ESGF Search API
https://esgf-pyclient.readthedocs.io/en/latest/
BSD 3-Clause "New" or "Revised" License
32 stars 18 forks source link

logon for http request #84

Closed larsbuntemeyer closed 2 years ago

larsbuntemeyer commented 2 years ago

Hi everyone, I have one more thing where I am a little lost. I can access ESGF URLs via pyesgf, and it works fine for me with OPeNDAP. However, I regularly access CORDEX datasets, which require a logon to ESGF for data access. It works fine with OPeNDAP and xarray if I log on and search like this, e.g.,

import xarray as xr
import pyesgf
from pyesgf.logon import LogonManager
from pyesgf.search import SearchConnection

lm = LogonManager()

# logon
myproxy_host = 'esgf-data.dkrz.de'
lm.logon(hostname=myproxy_host, interactive=True, bootstrap=True)
print(lm.is_logged_on())

# search
conn = SearchConnection('http://esgf-data.dkrz.de/esg-search', distrib=False)
ctx = conn.new_context(project='CORDEX', experiment='evaluation', time_frequency='mon',
                       variable='tas', driving_model="ECMWF-ERAINT", domain="EUR-11")
result = ctx.search()
print(f"length: {len(result)}")

res = result[0]
ctx = res.file_context()
#ctx.facet_counts
dataset = ctx.search()

download_url = dataset[0].download_url
opendap_url = dataset[0].opendap_url

grafik

ds = xr.open_dataset(opendap_url)
ds

grafik

However, I can't access the file via the download_url, e.g.,

import fsspec
with fsspec.open(download_url, ssl=True) as f:
    ds = xr.open_dataset(f)

which gives a 401 Unauthorized error...

---------------------------------------------------------------------------
ClientResponseError                       Traceback (most recent call last)
File /opt/anaconda3/envs/pyesgf/lib/python3.10/site-packages/fsspec/implementations/http.py:391, in HTTPFileSystem._info(self, url, **kwargs)
    389 try:
    390     info.update(
--> 391         await _file_info(
    392             url,
    393             size_policy=policy,
    394             session=session,
    395             **self.kwargs,
    396             **kwargs,
    397         )
    398     )
    399     if info.get("size") is not None:

File /opt/anaconda3/envs/pyesgf/lib/python3.10/site-packages/fsspec/implementations/http.py:772, in _file_info(url, session, size_policy, **kwargs)
    771 async with r:
--> 772     r.raise_for_status()
    774     # TODO:
    775     #  recognise lack of 'Accept-Ranges',
    776     #                 or 'Accept-Ranges': 'none' (not 'bytes')
    777     #  to mean streaming only, no random access => return None

File /opt/anaconda3/envs/pyesgf/lib/python3.10/site-packages/aiohttp/client_reqrep.py:1004, in ClientResponse.raise_for_status(self)
   1003 self.release()
-> 1004 raise ClientResponseError(
   1005     self.request_info,
   1006     self.history,
   1007     status=self.status,
   1008     message=self.reason,
   1009     headers=self.headers,
   1010 )

ClientResponseError: 401, message='401', url=URL('https://cordexesg.dmi.dk/esg-orp/home.htm?redirect=http://cordexesg.dmi.dk/thredds/fileServer/cordex_general/cordex/output/EUR-11/DMI/ECMWF-ERAINT/evaluation/r1i1p1/DMI-HIRHAM5/v1/mon/tas/v20140620/tas_EUR-11_ECMWF-ERAINT_evaluation_r1i1p1_DMI-HIRHAM5_v1_mon_198901-199012.nc')

The above exception was the direct cause of the following exception:

FileNotFoundError                         Traceback (most recent call last)
Input In [5], in <cell line: 2>()
      1 import fsspec
----> 2 with fsspec.open(download_url, ssl=True) as f:
      3     ds = xr.open_dataset(f)

File /opt/anaconda3/envs/pyesgf/lib/python3.10/site-packages/fsspec/core.py:104, in OpenFile.__enter__(self)
    101 def __enter__(self):
    102     mode = self.mode.replace("t", "").replace("b", "") + "b"
--> 104     f = self.fs.open(self.path, mode=mode)
    106     self.fobjects = [f]
    108     if self.compression is not None:

File /opt/anaconda3/envs/pyesgf/lib/python3.10/site-packages/fsspec/spec.py:1037, in AbstractFileSystem.open(self, path, mode, block_size, cache_options, compression, **kwargs)
   1035 else:
   1036     ac = kwargs.pop("autocommit", not self._intrans)
-> 1037     f = self._open(
   1038         path,
   1039         mode=mode,
   1040         block_size=block_size,
   1041         autocommit=ac,
   1042         cache_options=cache_options,
   1043         **kwargs,
   1044     )
   1045     if compression is not None:
   1046         from fsspec.compression import compr

File /opt/anaconda3/envs/pyesgf/lib/python3.10/site-packages/fsspec/implementations/http.py:340, in HTTPFileSystem._open(self, path, mode, block_size, autocommit, cache_type, cache_options, size, **kwargs)
    338 kw["asynchronous"] = self.asynchronous
    339 kw.update(kwargs)
--> 340 size = size or self.info(path, **kwargs)["size"]
    341 session = sync(self.loop, self.set_session)
    342 if block_size and size:

File /opt/anaconda3/envs/pyesgf/lib/python3.10/site-packages/fsspec/asyn.py:86, in sync_wrapper.<locals>.wrapper(*args, **kwargs)
     83 @functools.wraps(func)
     84 def wrapper(*args, **kwargs):
     85     self = obj or args[0]
---> 86     return sync(self.loop, func, *args, **kwargs)

File /opt/anaconda3/envs/pyesgf/lib/python3.10/site-packages/fsspec/asyn.py:66, in sync(loop, func, timeout, *args, **kwargs)
     64     raise FSTimeoutError from return_result
     65 elif isinstance(return_result, BaseException):
---> 66     raise return_result
     67 else:
     68     return return_result

File /opt/anaconda3/envs/pyesgf/lib/python3.10/site-packages/fsspec/asyn.py:26, in _runner(event, coro, result, timeout)
     24     coro = asyncio.wait_for(coro, timeout=timeout)
     25 try:
---> 26     result[0] = await coro
     27 except Exception as ex:
     28     result[0] = ex

File /opt/anaconda3/envs/pyesgf/lib/python3.10/site-packages/fsspec/implementations/http.py:404, in HTTPFileSystem._info(self, url, **kwargs)
    401     except Exception as exc:
    402         if policy == "get":
    403             # If get failed, then raise a FileNotFoundError
--> 404             raise FileNotFoundError(url) from exc
    405         logger.debug(str(exc))
    407 return {"name": url, "size": None, **info, "type": "file"}

FileNotFoundError: http://cordexesg.dmi.dk/thredds/fileServer/cordex_general/cordex/output/EUR-11/DMI/ECMWF-ERAINT/evaluation/r1i1p1/DMI-HIRHAM5/v1/mon/tas/v20140620/tas_EUR-11_ECMWF-ERAINT_evaluation_r1i1p1_DMI-HIRHAM5_v1_mon_198901-199012.nc

I would be grateful for any ideas on how I can access CORDEX HTTP URLs. If I simply click on those HTTP URLs and log in (in the web portal), I can download the files, e.g., from the browser. However, I have no experience with logging in with an OpenID in Python for HTTP access...

bouweandela commented 2 years ago

The fsspec package uses aiohttp under the hood to retrieve the data. To use the ESGF credentials provided by LogonManager with aiohttp, you can do:

import ssl
import fsspec
import xarray as xr
from pyesgf.logon import LogonManager

# logon
manager = LogonManager()
if not manager.is_logged_on():
    myproxy_host = 'esgf-data.dkrz.de'
    manager.logon(hostname=myproxy_host, interactive=True, bootstrap=True)

# create SSL context
sslcontext = ssl.create_default_context(purpose=ssl.Purpose.SERVER_AUTH)
sslcontext.load_verify_locations(capath=manager.esgf_certs_dir)
sslcontext.load_cert_chain(manager.esgf_credentials)

# open file
download_url = 'http://cordexesg.dmi.dk/thredds/fileServer/cordex_general/cordex/output/EUR-11/DMI/ECMWF-ERAINT/evaluation/r1i1p1/DMI-HIRHAM5/v1/mon/tas/v20140620/tas_EUR-11_ECMWF-ERAINT_evaluation_r1i1p1_DMI-HIRHAM5_v1_mon_198901-199012.nc'
with fsspec.open(download_url, ssl=sslcontext) as file:
    ds = xr.open_dataset(file)
    print(ds)
<xarray.Dataset>
Dimensions:       (rlat: 412, rlon: 424, time: 24, bounds: 2)
Coordinates:
    lat           (rlat, rlon) float64 ...
    lon           (rlat, rlon) float64 ...
  * rlat          (rlat) float64 -23.38 -23.26 -23.16 ... 21.61 21.73 21.83
  * rlon          (rlon) float64 -28.38 -28.26 -28.16 ... 17.93 18.05 18.16
  * time          (time) datetime64[ns] 1989-01-16T12:00:00 ... 1990-12-16T12...
Dimensions without coordinates: bounds
Data variables:
    rotated_pole  |S1 ...
    tas           (time, rlat, rlon) float32 ...
    time_bnds     (time, bounds) datetime64[ns] ...
    height        float64 ...
Attributes: (12/23)
    CDI:                            Climate Data Interface version 1.5.3 (htt...
    history:                        Thu Nov 14 20:02:22 2013: /usr/local/bin/...
    institution:                    Danish Meteorological Institute
    Conventions:                    CF-1.6
    tracking_id:                    108d904e-e9e9-464b-85a7-dba596f6b5bc
    contact:                        obc@dmi.dk
    ...                             ...
    experiment_id:                  evaluation
    driving_experiment_name:        evaluation
    driving_experiment:             ECMWF-ERAINT,evaluation,r1i1p1
    driving_model_id:               ECMWF-ERAINT
    CORDEX_domain:                  EUR-11
    NCO:                            4.0.9
larsbuntemeyer commented 2 years ago

Thanks @bouweandela ! This is very much appreciated and solves my problem 100%!