aekiss opened 2 years ago
This workaround doesn't work for me:
import cosima_cookbook as cc
session = cc.database.create_session()
data = cc.querying.getvar('01deg_jra55v140_iaf_cycle4', 'phy', session, ncfile = '%monthly%')
It fails with
---------------------------------------------------------------------------
KeyError Traceback (most recent call last)
/g/data/hh5/public/apps/miniconda3/envs/analysis3-22.01/lib/python3.9/site-packages/xarray/core/dataset.py in _copy_listed(self, names)
1358 try:
-> 1359 variables[name] = self._variables[name]
1360 except KeyError:
KeyError: 'time_bnds'
During handling of the above exception, another exception occurred:
KeyError Traceback (most recent call last)
/jobfs/57796133.gadi-pbs/ipykernel_1260084/3696651004.py in <cell line: 1>()
----> 1 data = cc.querying.getvar('01deg_jra55v140_iaf_cycle4', 'phy', session, ncfile = '%monthly%')
/g/data/hh5/public/apps/miniconda3/envs/analysis3-22.01/lib/python3.9/site-packages/cosima_cookbook/querying.py in getvar(expt, variable, session, ncfile, start_time, end_time, n, frequency, attrs, attrs_unique, **kwargs)
358 ncfiles = list(str(f.NCFile.ncfile_path) for f in ncfiles)
359
--> 360 ds = xr.open_mfdataset(
361 ncfiles,
362 parallel=True,
/g/data/hh5/public/apps/miniconda3/envs/analysis3-22.01/lib/python3.9/site-packages/xarray/backends/api.py in open_mfdataset(paths, chunks, concat_dim, compat, preprocess, engine, data_vars, coords, combine, parallel, join, attrs_file, combine_attrs, **kwargs)
914 # calling compute here will return the datasets/file_objs lists,
915 # the underlying datasets will still be stored as dask arrays
--> 916 datasets, closers = dask.compute(datasets, closers)
917
918 # Combine all datasets, closing them in case of a ValueError
/g/data/hh5/public/apps/miniconda3/envs/analysis3-22.01/lib/python3.9/site-packages/dask/base.py in compute(traverse, optimize_graph, scheduler, get, *args, **kwargs)
573 postcomputes.append(x.__dask_postcompute__())
574
--> 575 results = schedule(dsk, keys, **kwargs)
576 return repack([f(r, *a) for r, (f, a) in zip(results, postcomputes)])
577
/g/data/hh5/public/apps/miniconda3/envs/analysis3-22.01/lib/python3.9/site-packages/dask/threaded.py in get(dsk, result, cache, num_workers, pool, **kwargs)
79 pool = MultiprocessingPoolExecutor(pool)
80
---> 81 results = get_async(
82 pool.submit,
83 pool._max_workers,
/g/data/hh5/public/apps/miniconda3/envs/analysis3-22.01/lib/python3.9/site-packages/dask/local.py in get_async(submit, num_workers, dsk, result, cache, get_id, rerun_exceptions_locally, pack_exception, raise_exception, callbacks, dumps, loads, chunksize, **kwargs)
506 _execute_task(task, data) # Re-execute locally
507 else:
--> 508 raise_exception(exc, tb)
509 res, worker_id = loads(res_info)
510 state["cache"][key] = res
/g/data/hh5/public/apps/miniconda3/envs/analysis3-22.01/lib/python3.9/site-packages/dask/local.py in reraise(exc, tb)
314 if exc.__traceback__ is not tb:
315 raise exc.with_traceback(tb)
--> 316 raise exc
317
318
/g/data/hh5/public/apps/miniconda3/envs/analysis3-22.01/lib/python3.9/site-packages/dask/local.py in execute_task(key, task_info, dumps, loads, get_id, pack_exception)
219 try:
220 task, data = loads(task_info)
--> 221 result = _execute_task(task, data)
222 id = get_id()
223 result = dumps((result, id))
/g/data/hh5/public/apps/miniconda3/envs/analysis3-22.01/lib/python3.9/site-packages/dask/core.py in _execute_task(arg, cache, dsk)
117 # temporaries by their reference count and can execute certain
118 # operations in-place.
--> 119 return func(*(_execute_task(a, cache) for a in args))
120 elif not ishashable(arg):
121 return arg
/g/data/hh5/public/apps/miniconda3/envs/analysis3-22.01/lib/python3.9/site-packages/cosima_cookbook/querying.py in _preprocess(d)
354 # otherwise, figure out if we need any ancilliary data
355 # like time_bounds
--> 356 return d[variables]
357
358 ncfiles = list(str(f.NCFile.ncfile_path) for f in ncfiles)
/g/data/hh5/public/apps/miniconda3/envs/analysis3-22.01/lib/python3.9/site-packages/xarray/core/dataset.py in __getitem__(self, key)
1498 return self._construct_dataarray(key)
1499 else:
-> 1500 return self._copy_listed(key)
1501
1502 def __setitem__(self, key: Hashable | list[Hashable] | Mapping, value) -> None:
/g/data/hh5/public/apps/miniconda3/envs/analysis3-22.01/lib/python3.9/site-packages/xarray/core/dataset.py in _copy_listed(self, names)
1359 variables[name] = self._variables[name]
1360 except KeyError:
-> 1361 ref_name, var_name, var = _get_virtual_variable(
1362 self._variables, name, self._level_coords, self.dims
1363 )
/g/data/hh5/public/apps/miniconda3/envs/analysis3-22.01/lib/python3.9/site-packages/xarray/core/dataset.py in _get_virtual_variable(variables, key, level_vars, dim_sizes)
167 ref_var = dim_var.to_index_variable().get_level_variable(ref_name)
168 else:
--> 169 ref_var = variables[ref_name]
170
171 if var_name is None:
KeyError: 'time_bnds'
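For reference, the root cause visible in the traceback is xarray's behaviour when a requested variable is missing: cosima_cookbook's _preprocess selects a list of variables (including ancillary ones like time_bounds) from each file, and Dataset._copy_listed raises KeyError for any name that isn't present. A minimal sketch of the same failure, with an illustrative dataset:

import xarray as xr

# A dataset containing only the data variable, with no 'time_bnds' present:
ds = xr.Dataset({"phy": ("time", [1.0, 2.0])})

# Selecting a list of variables goes through Dataset._copy_listed, which
# raises KeyError for any missing name - as in the traceback above:
ds[["phy", "time_bnds"]]  # KeyError: 'time_bnds'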
@angus-g can you take a look please?
@aidanheerdegen the Data Explorer also provides incorrect code for these cases, e.g.
cc.querying.getvar(expt='01deg_jra55v140_iaf_cycle4', variable='phy',
                   session=session, frequency='static',
                   attrs={'cell_methods': 'time: mean'}, n=1)
@aekiss looks like it might be because you're using conda/analysis3-22.01, which doesn't include #294?
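A quick way to check which cosima_cookbook release the active kernel provides (a sketch, assuming the package is distributed under the name cosima-cookbook):

from importlib.metadata import version

# Print the installed cosima_cookbook version; it needs to be a release
# that includes the fix from #294.
print(version("cosima-cookbook"))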
d'oh, sorry! I should have thought of that.
For future reference, the conda/analysis3-22.07 kernel (or later) is needed for the workaround. However, even with conda/analysis3-22.07 the Data Explorer still gives the same (incorrect) code - see above.
This issue has been mentioned on ACCESS Hive Community Forum. There might be relevant details there:
https://forum.access-hive.org.au/t/building-a-cosima-dataset-on-time-averaged-files/415/9
Just copying this from Slack so we don't lose it:
Some of the BGC output in IAF cycle 4 is saved as one time level per file. In these cases the COSIMA Cookbook indexing considers it "static", so the frequency filter in cc.querying.getvar doesn't work, e.g. a query with the expected output frequency returns nothing. A workaround is to filter on the filename (e.g. ncfile='%monthly%', as in the snippet at the top of this issue), but it would be nice not to have to resort to this, because the output frequency isn't included in the filename for many of the experiments.
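Putting the two behaviours side by side - a sketch, where the frequency string '1 monthly' is only a guess at what one would naturally pass (the point is that no frequency filter matches, because the files are indexed as 'static'):

import cosima_cookbook as cc

session = cc.database.create_session()

# Returns nothing: the one-time-level-per-file BGC output is indexed with
# frequency='static', so filtering on the expected output frequency fails.
# data = cc.querying.getvar('01deg_jra55v140_iaf_cycle4', 'phy', session,
#                           frequency='1 monthly')

# Workaround (conda/analysis3-22.07 kernel or later): filter on the filename.
data = cc.querying.getvar('01deg_jra55v140_iaf_cycle4', 'phy', session,
                          ncfile='%monthly%')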