fsspec / gdrivefs

Google drive implementation of fsspec
BSD 2-Clause "Simplified" License
38 stars 20 forks source link

I/O operation on closed file #17

Closed tjcrone closed 3 years ago

tjcrone commented 3 years ago

I am trying to open an Xarray dataset on Google Drive. No problem listing the contents of a directory or reading a Pandas dataframe with read_csv(). However, xr.open_dataset() causes a strange "I/O operation on closed file" error, not when the dataset is opened, but when I try to print out the details. It appears to print some of the details and then error out with "closed file". Any idea what I am doing wrong? Thanks. I'm at HEAD on gdrivefs and 0.8.4 on fsspec. I believe this would be a working example for anyone:

gdfs = gdrivefs.GoogleDriveFileSystem(root_file_id='1PCBDhk5f3v5PoPCY3Rdcqgy4S_Yj2kCC', token='cache')
of = gdfs.open('CRND0103-2017-NY_Millbrook_3_W.nc')
with of as f:
    ds = xr.open_dataset(f)
ds
---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
/srv/conda/envs/notebook/lib/python3.7/site-packages/IPython/core/formatters.py in __call__(self, obj)
    343             method = get_real_method(obj, self.print_method)
    344             if method is not None:
--> 345                 return method()
    346             return None
    347         else:

/srv/conda/envs/notebook/lib/python3.7/site-packages/xarray/core/dataset.py in _repr_html_(self)
   1665         if OPTIONS["display_style"] == "text":
   1666             return f"<pre>{escape(repr(self))}</pre>"
-> 1667         return formatting_html.dataset_repr(self)
   1668 
   1669     def info(self, buf=None) -> None:

/srv/conda/envs/notebook/lib/python3.7/site-packages/xarray/core/formatting_html.py in dataset_repr(ds)
    277         dim_section(ds),
    278         coord_section(ds.coords),
--> 279         datavar_section(ds.data_vars),
    280         attr_section(ds.attrs),
    281     ]

/srv/conda/envs/notebook/lib/python3.7/site-packages/xarray/core/formatting_html.py in _mapping_section(mapping, name, details_func, max_items_collapse, enabled)
    167     return collapsible_section(
    168         name,
--> 169         details=details_func(mapping),
    170         n_items=n_items,
    171         enabled=enabled,

/srv/conda/envs/notebook/lib/python3.7/site-packages/xarray/core/formatting_html.py in summarize_vars(variables)
    133     vars_li = "".join(
    134         f"<li class='xr-var-item'>{summarize_variable(k, v)}</li>"
--> 135         for k, v in variables.items()
    136     )
    137 

/srv/conda/envs/notebook/lib/python3.7/site-packages/xarray/core/formatting_html.py in <genexpr>(.0)
    133     vars_li = "".join(
    134         f"<li class='xr-var-item'>{summarize_variable(k, v)}</li>"
--> 135         for k, v in variables.items()
    136     )
    137 

/srv/conda/envs/notebook/lib/python3.7/site-packages/xarray/core/formatting_html.py in summarize_variable(name, var, is_index, dtype, preview)
    108     preview = preview or escape(inline_variable_array_repr(variable, 35))
    109     attrs_ul = summarize_attrs(var.attrs)
--> 110     data_repr = short_data_repr_html(variable)
    111 
    112     attrs_icon = _icon("icon-file-text2")

/srv/conda/envs/notebook/lib/python3.7/site-packages/xarray/core/formatting_html.py in short_data_repr_html(array)
     22         return internal_data._repr_html_()
     23     else:
---> 24         text = escape(short_data_repr(array))
     25         return f"<pre>{text}</pre>"
     26 

/srv/conda/envs/notebook/lib/python3.7/site-packages/xarray/core/formatting.py in short_data_repr(array)
    461         return limit_lines(repr(array.data), limit=40)
    462     elif array._in_memory or array.size < 1e5:
--> 463         return short_numpy_repr(array)
    464     else:
    465         # internal xarray array type

/srv/conda/envs/notebook/lib/python3.7/site-packages/xarray/core/formatting.py in short_numpy_repr(array)
    435 
    436 def short_numpy_repr(array):
--> 437     array = np.asarray(array)
    438 
    439     # default to lower precision so a full (abbreviated) line can fit on

/srv/conda/envs/notebook/lib/python3.7/site-packages/numpy/core/_asarray.py in asarray(a, dtype, order)
     81 
     82     """
---> 83     return array(a, dtype, copy=False, order=order)
     84 
     85 

/srv/conda/envs/notebook/lib/python3.7/site-packages/xarray/core/common.py in __array__(self, dtype)
    130 
    131     def __array__(self: Any, dtype: DTypeLike = None) -> np.ndarray:
--> 132         return np.asarray(self.values, dtype=dtype)
    133 
    134     def __repr__(self) -> str:

/srv/conda/envs/notebook/lib/python3.7/site-packages/xarray/core/variable.py in values(self)
    455     def values(self):
    456         """The variable's data as a numpy.ndarray"""
--> 457         return _as_array_or_item(self._data)
    458 
    459     @values.setter

/srv/conda/envs/notebook/lib/python3.7/site-packages/xarray/core/variable.py in _as_array_or_item(data)
    258     TODO: remove this (replace with np.asarray) once these issues are fixed
    259     """
--> 260     data = np.asarray(data)
    261     if data.ndim == 0:
    262         if data.dtype.kind == "M":

/srv/conda/envs/notebook/lib/python3.7/site-packages/numpy/core/_asarray.py in asarray(a, dtype, order)
     81 
     82     """
---> 83     return array(a, dtype, copy=False, order=order)
     84 
     85 

/srv/conda/envs/notebook/lib/python3.7/site-packages/xarray/core/indexing.py in __array__(self, dtype)
    675 
    676     def __array__(self, dtype=None):
--> 677         self._ensure_cached()
    678         return np.asarray(self.array, dtype=dtype)
    679 

/srv/conda/envs/notebook/lib/python3.7/site-packages/xarray/core/indexing.py in _ensure_cached(self)
    672     def _ensure_cached(self):
    673         if not isinstance(self.array, NumpyIndexingAdapter):
--> 674             self.array = NumpyIndexingAdapter(np.asarray(self.array))
    675 
    676     def __array__(self, dtype=None):

/srv/conda/envs/notebook/lib/python3.7/site-packages/numpy/core/_asarray.py in asarray(a, dtype, order)
     81 
     82     """
---> 83     return array(a, dtype, copy=False, order=order)
     84 
     85 

/srv/conda/envs/notebook/lib/python3.7/site-packages/xarray/core/indexing.py in __array__(self, dtype)
    651 
    652     def __array__(self, dtype=None):
--> 653         return np.asarray(self.array, dtype=dtype)
    654 
    655     def __getitem__(self, key):

/srv/conda/envs/notebook/lib/python3.7/site-packages/numpy/core/_asarray.py in asarray(a, dtype, order)
     81 
     82     """
---> 83     return array(a, dtype, copy=False, order=order)
     84 
     85 

/srv/conda/envs/notebook/lib/python3.7/site-packages/xarray/core/indexing.py in __array__(self, dtype)
    555     def __array__(self, dtype=None):
    556         array = as_indexable(self.array)
--> 557         return np.asarray(array[self.key], dtype=None)
    558 
    559     def transpose(self, order):

/srv/conda/envs/notebook/lib/python3.7/site-packages/xarray/backends/h5netcdf_.py in __getitem__(self, key)
     27     def __getitem__(self, key):
     28         return indexing.explicit_indexing_adapter(
---> 29             key, self.shape, indexing.IndexingSupport.OUTER_1VECTOR, self._getitem
     30         )
     31 

/srv/conda/envs/notebook/lib/python3.7/site-packages/xarray/core/indexing.py in explicit_indexing_adapter(key, shape, indexing_support, raw_indexing_method)
    835     """
    836     raw_key, numpy_indices = decompose_indexer(key, shape, indexing_support)
--> 837     result = raw_indexing_method(raw_key.tuple)
    838     if numpy_indices.tuple:
    839         # index the loaded np.ndarray

/srv/conda/envs/notebook/lib/python3.7/site-packages/xarray/backends/h5netcdf_.py in _getitem(self, key)
     36         with self.datastore.lock:
     37             array = self.get_array(needs_lock=False)
---> 38             return array[key]
     39 
     40 

/srv/conda/envs/notebook/lib/python3.7/site-packages/h5netcdf/core.py in __getitem__(self, key)
    144 
    145     def __getitem__(self, key):
--> 146         return self._h5ds[key]
    147 
    148     def __setitem__(self, key, value):

h5py/_objects.pyx in h5py._objects.with_phil.wrapper()

h5py/_objects.pyx in h5py._objects.with_phil.wrapper()

/srv/conda/envs/notebook/lib/python3.7/site-packages/h5py/_hl/dataset.py in __getitem__(self, args)
    571         mspace = h5s.create_simple(mshape)
    572         fspace = selection.id
--> 573         self.id.read(mspace, fspace, arr, mtype, dxpl=self._dxpl)
    574 
    575         # Patch up the output for NumPy

h5py/_objects.pyx in h5py._objects.with_phil.wrapper()

h5py/_objects.pyx in h5py._objects.with_phil.wrapper()

h5py/h5d.pyx in h5py.h5d.DatasetID.read()

h5py/_proxy.pyx in h5py._proxy.dset_rw()

h5py/_proxy.pyx in h5py._proxy.H5PY_H5Dread()

h5py/defs.pyx in h5py.defs.H5Dread()

h5py/h5fd.pyx in h5py.h5fd.H5FD_fileobj_read()

/srv/conda/envs/notebook/lib/python3.7/site-packages/fsspec/spec.py in readinto(self, b)
   1407         """
   1408         out = memoryview(b).cast("B")
-> 1409         data = self.read(out.nbytes)
   1410         out[: len(data)] = data
   1411         return len(data)

/srv/conda/envs/notebook/lib/python3.7/site-packages/fsspec/spec.py in read(self, length)
   1392             length = self.size - self.loc
   1393         if self.closed:
-> 1394             raise ValueError("I/O operation on closed file.")
   1395         logger.debug("%s read: %i - %i" % (self, self.loc, self.loc + length))
   1396         if length == 0:

ValueError: I/O operation on closed file.
<xarray.Dataset>
Dimensions:                  (index: 365)
Coordinates:
  * index                    (index) int64 0 1 2 3 4 5 ... 360 361 362 363 364
Data variables:
    WBANNO                   (index) int64 ...
    LST_DATE                 (index) int64 ...
    CRX_VN                   (index) float64 ...
    LONGITUDE                (index) float64 ...
    LATITUDE                 (index) float64 ...
    T_DAILY_MAX              (index) float64 ...
    T_DAILY_MIN              (index) float64 ...
    T_DAILY_MEAN             (index) float64 ...
    T_DAILY_AVG              (index) float64 ...
    P_DAILY_CALC             (index) float64 ...
    SOLARAD_DAILY            (index) float64 ...
    SUR_TEMP_DAILY_TYPE      (index) object ...
    SUR_TEMP_DAILY_MAX       (index) float64 ...
    SUR_TEMP_DAILY_MIN       (index) float64 ...
    SUR_TEMP_DAILY_AVG       (index) float64 ...
    RH_DAILY_MAX             (index) float64 ...
    RH_DAILY_MIN             (index) float64 ...
    RH_DAILY_AVG             (index) float64 ...
    SOIL_MOISTURE_5_DAILY    (index) float64 ...
    SOIL_MOISTURE_10_DAILY   (index) float64 ...
    SOIL_MOISTURE_20_DAILY   (index) float64 ...
    SOIL_MOISTURE_50_DAILY   (index) float64 ...
    SOIL_MOISTURE_100_DAILY  (index) float64 ...
    SOIL_TEMP_5_DAILY        (index) float64 ...
    SOIL_TEMP_10_DAILY       (index) float64 ...
    SOIL_TEMP_20_DAILY       (index) float64 ...
    SOIL_TEMP_50_DAILY       (index) float64 ...
    SOIL_TEMP_100_DAILY      (index) float64 ...
tjcrone commented 3 years ago

I should point out that this appeared to work for some of my students intermittently, which is odd. I don't think I managed to get the entire HTML repr of the dataset printed out properly.

martindurant commented 3 years ago

Perhaps you want to keep the file open explicitly:

f = gdfs.open('CRND0103-2017-NY_Millbrook_3_W.nc').open()

instead of using `with` - but I don't know why the h5py driver needs to read additional metadata after the initial parse. Actually, I am assuming the driver backend - maybe you have different "engines" available to xarray than the students, causing the different behaviour.

tjcrone commented 3 years ago

We are all working on the same Pangeo JupyterHub and the same notebook image, so we should have the same engines.

Anyway, tried your suggestion and got this:

f = gdfs.open('CRND0103-2017-NY_Millbrook_3_W.nc').open()

---------------------------------------------------------------------------
AttributeError                            Traceback (most recent call last)
<ipython-input-11-076798fa5f88> in <module>
----> 1 f = gdfs.open('CRND0103-2017-NY_Millbrook_3_W.nc').open()

AttributeError: 'GoogleDriveFile' object has no attribute 'open'

And it appears to be correct about that:

dir(f)
['DEFAULT_BLOCK_SIZE',
 '__abstractmethods__',
 '__class__',
 '__del__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__enter__',
 '__eq__',
 '__exit__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__iter__',
 '__le__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__next__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '_abc_impl',
 '_checkClosed',
 '_checkReadable',
 '_checkSeekable',
 '_checkWritable',
 '_closed',
 '_fetch_range',
 '_initiate_upload',
 '_upload_chunk',
 'autocommit',
 'blocksize',
 'cache',
 'close',
 'closed',
 'commit',
 'details',
 'discard',
 'end',
 'file_id',
 'fileno',
 'flush',
 'fs',
 'info',
 'isatty',
 'kwargs',
 'loc',
 'mode',
 'path',
 'read',
 'readable',
 'readinto',
 'readinto1',
 'readline',
 'readlines',
 'readuntil',
 'seek',
 'seekable',
 'size',
 'start',
 'tell',
 'truncate',
 'writable',
 'write',
 'writelines']

Thanks for fielding these dumb questions. Once I have a good sense of which of these issues are mine and which are issues with the back-end, I will start working on fixes.

martindurant commented 3 years ago

Sorry, my mistake: I was referring to fsspec.open. Your code could be:

gdfs = gdrivefs.GoogleDriveFileSystem(root_file_id='1PCBDhk5f3v5PoPCY3Rdcqgy4S_Yj2kCC', token='cache')
f = gdfs.open('CRND0103-2017-NY_Millbrook_3_W.nc')
ds = xr.open_dataset(f)
ds
...
f.close()
tjcrone commented 3 years ago

This suggestion works great. Thank you, @martindurant, much appreciated.