nasa / Kamodo


Kamodo not working on data stored in s3 buckets #130

Closed rebeccaringuette closed 1 year ago

rebeccaringuette commented 1 year ago

Basic description of the issue: I am attempting to interpolate the MMS trajectory through SWMF GM and OpenGGCM GM model outputs stored in s3 buckets online, but the code fails in the same way for both executions, indicating an empty list of files.

More details: I am using OpenGGCM_GM converted files and SWMF_GM .out files in a model flythrough using the MMS trajectory. The code is adapted from the PyHC executable paper.

file_dir1 = 's3://helio-public/SWMF_GM/James_Webster_051716_1/'  # s3 location
variable_list = ['B_x','B_y', 'B_z']               # list of desired variable names
coord_sys = "GSM-car"                              # GSM and cartesian because we're using x,y,z from above 
output_dir = '/home/jovyan/efs/raringuette/Results/'  # directory where the user wants the output stored
file_name1 = 's3_SWMFGM_James_Webster_051716_1'                   # what the user wants the output files to be named
output_name1 = output_dir + file_name1 + ".csv"      # output dir plus output file name without extension
plot_coord = 'GSM'                                 # coordinate system chosen for output plots

# Convert to UTC timestamps
from datetime import datetime, timezone
sat_time = [time.replace(tzinfo=timezone.utc).timestamp() for time in epoch]  # trajectory times should fall within the model data's time range

# Use MMS trajectory acquired from PySPEDAS/SpacePy
sat_x = pos[:, 0] / 6378  # convert km to Earth radii (R_E = 6378 km)
sat_y = pos[:, 1] / 6378
sat_z = pos[:, 2] / 6378

results_swmfgm = S.ModelFlythrough(model1, file_dir1, variable_list, sat_time,
                                   sat_x, sat_y, sat_z, coord_sys,
                                   output_name=output_name1, plot_coord=plot_coord)

On efs and local storage on my machine, this executes flawlessly. However, I get the error below when the data is stored in an s3 bucket on HelioCloud.

---------------------------------------------------------------------------
IndexError                                Traceback (most recent call last)
Cell In[25], line 4
      2 import os.path
      3 if not os.path.isfile(output_name1):
----> 4     results_swmfgm = S.ModelFlythrough(model1, file_dir1, variable_list, sat_time, sat_x, sat_y, sat_z, 
      5                                        coord_sys, output_name=output_name1, plot_coord=plot_coord)
      6 else:  # if previously run, read in results
      7     results_swmfgm = S.O.SF_read(output_name1)

File /efs/raringuette/Kamodo/kamodo_ccmc/flythrough/SatelliteFlythrough.py:413, in ModelFlythrough(model, file_dir, variable_list, sat_time, c1, c2, c3, coord_sys, output_name, plot_coord, verbose, _print_units)
    408 coord_type, coord_grid = coord_sys.split('-')
    410 # get interpolated results
    411 # coord_type should be one of SpacePy's or AstroPy's coordinates
    412 # coord_grid is either 'sph' or 'car'
--> 413 results = U.Model_SatelliteFlythrough(model, file_dir, new_list,
    414                                       sat_time, c1, c2, c3,
    415                                       coord_type, coord_grid,
    416                                       verbose=verbose)
    418 # remove requested variables not found in the data
    419 var_list = [key for key in results.keys() if key not in
    420             ['utc_time', 'c1', 'c2', 'c3', 'net_idx']]

File /efs/raringuette/Kamodo/kamodo_ccmc/flythrough/SF_utilities.py:153, in Model_SatelliteFlythrough(model, file_dir, variable_list, sat_time, c1, c2, c3, coord_type, coord_grid, verbose)
    151 # initialize model reader and times dictionary
    152 reader = MW.Model_Reader(model)  # Kamodo gets imported here
--> 153 start_utcts, end_utcts, filedate = File_UTCTimes(model, file_dir)
    155 # cut off trajectory times not found in data in file_dir
    156 idx = where((sat_time >= start_utcts) & (sat_time <= end_utcts))[0]

File /efs/raringuette/Kamodo/kamodo_ccmc/flythrough/SF_utilities.py:207, in File_UTCTimes(model, file_dir)
    205 else:
    206     reader = MW.Model_Reader(model)
--> 207     ko = reader(file_dir, filetime=True)  # creates any preprocessed files
    208     times, filedate = ko.times, ko.filedate
    209     del ko

File /efs/raringuette/Kamodo/kamodo_ccmc/readers/swmfgm_4D.py:129, in MODEL.<locals>.MODEL.__init__(self, file_dir, variables_requested, filetime, verbose, gridded_int, printfiles, **kwargs)
    127 patterns = unique([basename(f)[:10] for f in files])
    128 # get time grid from files
--> 129 dt = sp.IdlFile(files[0]).attrs['time']
    130 if dt is not None:  # filedate given not always at midnight
    131     self.filedate = datetime.strptime(
    132         dt.isoformat()[:10], '%Y-%m-%d').replace(
    133         tzinfo=timezone.utc)

IndexError: list index out of range

Digging deeper, the initial problem is simply that glob doesn't work on s3 buckets. I was able to create a successful alternate 'glob' to handle this.

def glob(file_pattern):
    import s3fs
    s3 = s3fs.S3FileSystem(anon=False)
    s3_files = sorted(s3.glob(file_pattern))
    return ['s3://'+f for f in s3_files]
file_dir1 = 's3://helio-public/raringuette/SWMF_GM/James_Webster_051716_1/'  # helio-public
swmf_files = glob(file_dir1+'*.out')
swmf_files[::30]

Output: ['s3://helio-public/raringuette/SWMF_GM/James_Webster_051716_1/3d__var_1_e20151016-113200-000.out', 's3://helio-public/raringuette/SWMF_GM/James_Webster_051716_1/3d__var_1_e20151016-133200-000.out', 's3://helio-public/raringuette/SWMF_GM/James_Webster_051716_1/3d__var_1_e20151016-153200-000.out']

However, I can't even open the files using the methods Kamodo/spacepy uses for these files (see the related spacepy issue for the .out problem/solution). Attempting the typical Dataset call from netCDF4 on one of the OpenGGCM nc files produces the error below. First, demonstrating that the files exist...

# reusing the alternate glob defined above
file_dir2 = 's3://helio-public/raringuette/OpenGGCM_GM/Yihua_Zheng_040122_1/'
openggcm_files = glob(file_dir2+'*.nc')
openggcm_files[:5]

Output: ['s3://helio-public/raringuette/OpenGGCM_GM/Yihua_Zheng_040122_1/Yihua_Zheng_040122_1.3df_2015-10-16_11_30.nc', 's3://helio-public/raringuette/OpenGGCM_GM/Yihua_Zheng_040122_1/Yihua_Zheng_040122_1.3df_2015-10-16_11_31.nc', 's3://helio-public/raringuette/OpenGGCM_GM/Yihua_Zheng_040122_1/Yihua_Zheng_040122_1.3df_2015-10-16_11_32.nc', 's3://helio-public/raringuette/OpenGGCM_GM/Yihua_Zheng_040122_1/Yihua_Zheng_040122_1.3df_2015-10-16_11_33.nc', 's3://helio-public/raringuette/OpenGGCM_GM/Yihua_Zheng_040122_1/Yihua_Zheng_040122_1.3df_2015-10-16_11_34.nc']

Now, showing the code to open the file...

from netCDF4 import Dataset
cdf_data = Dataset(openggcm_files[0])
cdf_data

and the error produced.

Error:curl error: Problem with the SSL CA cert (path? access rights?)
curl error details: 
Warning:oc_open: Could not read url
---------------------------------------------------------------------------
OSError                                   Traceback (most recent call last)
Cell In[2], line 2
      1 from netCDF4 import Dataset
----> 2 cdf_data = Dataset(openggcm_files[0])
      3 cdf_data

File src/netCDF4/_netCDF4.pyx:2449, in netCDF4._netCDF4.Dataset.__init__()
File src/netCDF4/_netCDF4.pyx:2012, in netCDF4._netCDF4._ensure_nc_success()
OSError: [Errno -68] NetCDF: I/O failure: 's3://helio-public/raringuette/OpenGGCM_GM/Yihua_Zheng_040122_1/Yihua_Zheng_040122_1.3df_2015-10-16_11_30.nc'

Note that since 'helio-public' is a public s3 bucket, there should be no problem with access rights (or maybe I have missed something simple here). I suggest looking into a special build of netCDF4 to include with kamodo-ccmc to enable this (see the docs), including an alternate package chosen automatically when s3 buckets are detected in the file paths (S3netCDF4), or switching to xarray, which already handles this. If the xarray path is used, make sure to use

pip install "xarray[io]" 

to include the required additional backends (see the docs). You might also consider adding support for dask and acceleration in the installation. In the near future, this additional support for s3 access will be needed for all types of model outputs (binary, csv, ascii, h5, etc.).
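
For reference, here is a minimal sketch of the xarray route, assuming s3fs is installed and the h5netcdf backend is available (netCDF3 files would need engine='scipy' instead):

import s3fs
import xarray as xr

# anon=False matches the snippets above; helio-public is a public bucket
s3 = s3fs.S3FileSystem(anon=False)
fname = ('s3://helio-public/raringuette/OpenGGCM_GM/Yihua_Zheng_040122_1/'
         'Yihua_Zheng_040122_1.3df_2015-10-16_11_30.nc')
with s3.open(fname, 'rb') as f:
    ds = xr.open_dataset(f, engine='h5netcdf')  # netCDF4/HDF5 files
    print(ds.data_vars)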

rebeccaringuette commented 1 year ago

I found a solution that works nicely without xarray. It requires adding two packages as dependencies for kamodo-ccmc, but allows two others to be removed. The two packages needed are s3fs and h5netcdf, both installable via pip; h5py and netCDF4 are no longer needed.
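
The two new dependencies can be installed with:

pip install s3fs h5netcdf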

The previous method of accessing nc files with netCDF4.Dataset can be replaced with the method below, which works for files stored in s3 buckets and in normal storage.

from h5netcdf.legacyapi import Dataset as Dataset_leg
import s3fs

def Dataset(filename, access='r'):
    if filename[:2] == 's3':  # file lives in an s3 bucket
        s3 = s3fs.S3FileSystem(anon=False)
        fgrab = s3.open(filename, access + 'b')  # h5netcdf needs a binary file object
        return Dataset_leg(fgrab)
    else:  # local or efs storage
        return Dataset_leg(filename, access)

Notice that the new definition of 'Dataset' automatically performs the correct operation based on whether the file is stored in an s3 bucket or in normal storage. This should be tested on WACCM-X files because the h0 files are produced with the 'NETCDF3_64BIT_OFFSET' option due to the large file sizes generated. This code can go into the reader_utilities.py script and be imported from there so that only the import statements in the affected readers need to be changed.
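
For illustration, a call site then looks the same for both storage types (the local path below is hypothetical):

# s3 file: opened through s3fs behind the scenes
cdf_s3 = Dataset('s3://helio-public/raringuette/OpenGGCM_GM/Yihua_Zheng_040122_1/'
                 'Yihua_Zheng_040122_1.3df_2015-10-16_11_30.nc')
print(cdf_s3.variables.keys())

# local/efs file: same call, handled by h5netcdf directly (hypothetical path)
cdf_local = Dataset('/home/jovyan/efs/raringuette/OpenGGCM_GM/example.nc')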

The normal file search method using glob will need to be replaced by the code below, which also automatically performs the correct operation based on the filename. This code would go nicely in the reader_utilities.py script, from which glob should be imported for all uses. Then, only the import statement in the readers will need to be changed.

from glob import glob as glob_leg
import s3fs

def glob(file_pattern):
    if file_pattern[:2] == 's3':
        s3 = s3fs.S3FileSystem(anon=False)
        s3_files = sorted(s3.glob(file_pattern))
        return ['s3://'+f for f in s3_files]
    else:
        return glob_leg(file_pattern)
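
Call sites then stay identical for both storage types (the local path below is hypothetical):

# the same call works for s3 and local/efs paths
files_s3 = glob('s3://helio-public/raringuette/SWMF_GM/James_Webster_051716_1/*.out')
files_local = glob('/home/jovyan/efs/raringuette/SWMF_GM/*.out')  # hypothetical path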

The code to replace calls to h5py is

import h5netcdf as h5py  # works for s3 and efs
import s3fs

def convert(filename, access='r'):
    if filename[:2] == 's3':
        s3=s3fs.S3FileSystem(anon=False)
        fgrab = s3.open(filename, access+'b')
        return [fgrab]
    else:
        return [filename, access]
h5_data = h5py.File(*convert(filename))

where convert should be stored in the reader_utilities.py script. This remains to be tested in the relevant readers. Note that h5netcdf.File and h5netcdf.legacyapi.Dataset both break in the normal/efs case if a file object is given instead of the filename, which is why convert passes the filename through for local storage. Also note that this does NOT enable writing netcdf/h5 files to s3, so file conversions on the cloud will not be supported.

Since all of the file formats after file conversions are either .h5, .nc or .out files, this reduces the remaining file access problem to reading the two text files produced by each reader and the general I/O in SF_output.py. The open statements in the _readtimelist function in reader_utilities.py should be replaced with a call to the function below, which should offer the same resulting behavior for text files on local/efs or s3 storage. This has not been tested.

import s3fs

def _open(filename):
    if filename[:2] == 's3':  # file lives in an s3 bucket
        s3 = s3fs.S3FileSystem(anon=False)
        return s3.open(filename, 'r')  # text mode, matching the builtin open below
    else:
        return open(filename)

Reading the csv and ascii files from s3 may be as simple as replacing line 149 in _SFcsvreader with a call to the function above. The behavior of this function with the csv package has not been tested. Writing csv and ascii files directly to s3 buckets may be possible with the function, but this has not been tested. A related issue on xarray's github may be useful if others are interested in writing files to s3.
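
For example, an untested sketch of reading a csv through _open (the path here is hypothetical):

import csv

# hypothetical, untested: pass the file object from _open to csv.reader
with _open('s3://helio-public/raringuette/Results/example.csv') as f:
    rows = [line for line in csv.reader(f)]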

lrastaet commented 1 year ago

Does _open() not work as file argument to IdlFile()?

rebeccaringuette commented 1 year ago

No, because spacepy performs the open call itself (see the traceback below). The change has to happen on the spacepy side for the s3 issue.

---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
Cell In[5], line 1
----> 1 MW.Variable_Search('magnetic', model, file_dir)

File ~/efs/raringuette/Kamodo/kamodo_ccmc/flythrough/model_wrapper.py:279, in Variable_Search(search_string, model, file_dir, return_dict)
    277         return new_dict
    278 elif file_dir != '' and model != '':
--> 279     ko_var_dict = Model_Variables(model, file_dir, return_dict=True)
    280     new_dict = {name: [value[0], value[-4]+'-'+value[-3],
    281                        value[-2], value[-1]] for name, value in
    282                 ko_var_dict.items() if search_string in value[0].lower()}
    283     if new_dict == {}:

File ~/efs/raringuette/Kamodo/kamodo_ccmc/flythrough/model_wrapper.py:184, in Model_Variables(model, file_dir, return_dict)
    182 else:
    183     reader = Model_Reader(model)
--> 184     ko = reader(file_dir, variables_requested='all')
    186     # either return or print nested_dictionary
    187     if return_dict:

File ~/efs/raringuette/Kamodo/kamodo_ccmc/readers/swmfgm_4D.py:128, in MODEL.<locals>.MODEL.__init__(self, file_dir, variables_requested, filetime, verbose, gridded_int, printfiles, **kwargs)
    126 patterns = unique([basename(f)[:10] for f in files])
    127 # get time grid from files
--> 128 dt = sp.IdlFile(RU._open(files[0]),
    129                 sort_unstructured=False).attrs['time']
    130 if dt is not None:  # filedate given not always at midnight
    131     self.filedate = datetime.strptime(
    132         dt.isoformat()[:10], '%Y-%m-%d').replace(
    133         tzinfo=timezone.utc)

File ~/users_conda_envs/PyHCs3/lib/python3.10/site-packages/spacepy/pybats/__init__.py:1220, in IdlFile.__init__(self, filename, iframe, header, keep_case, sort_unstructured, *args, **kwargs)
   1216 super(IdlFile, self).__init__(*args, **kwargs)  # Init as PbData.
   1218 # Gather information about the file: format, endianess (if necessary),
   1219 # number of picts/frames, etc.:
-> 1220 fmt, endchar, inttype, floattype = _probe_idlfile(filename)
   1221 self.attrs['file'] = filename   # Save file name.
   1222 self.attrs['format'] = fmt        # Save file format.

File ~/users_conda_envs/PyHCs3/lib/python3.10/site-packages/spacepy/pybats/__init__.py:807, in _probe_idlfile(filename)
    804 inttype = np.dtype(np.int32)
    805 floattype = np.dtype(np.float32)
--> 807 with open(filename, 'rb') as f:
    808     # On the first try, we may fail because of wrong-endianess.
    809     # If that is the case, swap that endian and try again.
    810     inttype.newbyteorder(endian)
    812     try:
    813         # Try to parse with little endian byte ordering:

TypeError: expected str, bytes or os.PathLike object, not TextIOWrapper

rebeccaringuette commented 1 year ago

This issue is solved in pull request #131, both for netCDF4 and netCDF3 files (and for h5 files, too), with the exceptions noted in this issue.