Closed jthorton closed 11 months ago
This issue appears to be related: https://github.com/h5py/h5py/issues/1948
The issue seems to be that fletcher32 does not support vlen datatypes. This line in QCPortal seems to set fletcher32 to True:
dataset_kwargs = {"chunks": True, "fletcher32": True}
Perhaps just setting this to False would resolve this?
Working through this now; have other issues I'm hitting in trying to export e.g. SPICE PubChem Set 1 Single Points Dataset v1.2:
/home/david/Projects/threads/molmat/openforcefield/qca-troubleshooting/hdf5-broken-714/QCFractal/qcfractal/interface/collections/dataset.py:273: FutureWarning: Passing a set as an indexer is deprecated and will raise in a future version. Use a list instead.
return ret.reset_index().set_index("name").loc[subset].reset_index().set_index("index")
---------------------------------------------------------------------------
DimensionalityError Traceback (most recent call last)
Input In [13], in <cell line: 1>()
----> 1 ds.to_file('test.hdf5', 'hdf5')
File ~/Projects/threads/molmat/openforcefield/qca-troubleshooting/hdf5-broken-714/QCFractal/qcfractal/interface/collections/dataset.py:240, in Dataset.to_file(self, path, encoding)
237 elif encoding.lower() in ["hdf5", "h5"]:
238 from . import HDF5View
--> 240 HDF5View(path).write(self)
241 else:
242 raise NotImplementedError(f"Unsupported encoding: {encoding}")
File ~/Projects/threads/molmat/openforcefield/qca-troubleshooting/hdf5-broken-714/QCFractal/qcfractal/interface/collections/dataset_view.py:384, in HDF5View.write(self, ds)
382 gv_spec["stoich"] = gv_spec.pop("stoichiometry")
383 dataset_name = self._normalize_hdf5_name(name)
--> 384 df = ds.get_values(name=name, force=True, native=True)
385 assert df.shape[1] == 1
387 driver = specification["driver"]
File ~/Projects/threads/molmat/openforcefield/qca-troubleshooting/hdf5-broken-714/QCFractal/qcfractal/interface/collections/dataset.py:618, in Dataset.get_values(self, method, basis, keywords, program, driver, name, native, subset, force)
569 def get_values(
570 self,
571 method: Optional[Union[str, List[str]]] = None,
(...)
579 force: bool = False,
580 ) -> pd.DataFrame:
581 """
582 Obtains values matching the search parameters provided for the expected `return_result` values.
583 Defaults to the standard programs and keywords if not provided.
(...)
616 A DataFrame of values with columns corresponding to methods and rows corresponding to molecule entries.
617 """
--> 618 return self._get_values(
619 method=method,
620 basis=basis,
621 keywords=keywords,
622 program=program,
623 driver=driver,
624 name=name,
625 native=native,
626 subset=subset,
627 force=force,
628 )
File ~/Projects/threads/molmat/openforcefield/qca-troubleshooting/hdf5-broken-714/QCFractal/qcfractal/interface/collections/dataset.py:656, in Dataset._get_values(self, native, force, subset, **spec)
651 if driver is not None and driver != self.data.default_driver:
652 raise KeyError(
653 f"For native values, driver ({driver}) must be the same as the dataset's default driver "
654 f"({self.data.default_driver}). Consider using get_records instead."
655 )
--> 656 df = self._get_native_values(subset=subset_set, force=force, **spec_nodriver)
657 ret.append(df)
659 if native in {False, None}:
File ~/Projects/threads/molmat/openforcefield/qca-troubleshooting/hdf5-broken-714/QCFractal/qcfractal/interface/collections/dataset.py:744, in Dataset._get_native_values(self, subset, method, basis, keywords, program, name, force)
742 for query in new_queries:
743 qname = query["name"]
--> 744 new_data[qname] *= constants.conversion_factor(units[qname], self.units)
745 self._column_metadata[qname].update({"native": True, "units": self.units})
747 self._update_cache(new_data)
File ~/.conda/envs/qcfractal-broken-hdf5/lib/python3.8/site-packages/qcelemental/physical_constants/context.py:332, in PhysicalConstantsContext.conversion_factor(self, base_unit, conv_unit)
329 factor /= conv_unit.magnitude
330 conv_unit = conv_unit.units
--> 332 return self.ureg.convert(factor, base_unit, conv_unit)
File ~/.conda/envs/qcfractal-broken-hdf5/lib/python3.8/site-packages/pint/registry.py:947, in BaseRegistry.convert(self, value, src, dst, inplace)
944 if src == dst:
945 return value
--> 947 return self._convert(value, src, dst, inplace)
File ~/.conda/envs/qcfractal-broken-hdf5/lib/python3.8/site-packages/pint/registry.py:1830, in ContextRegistry._convert(self, value, src, dst, inplace)
1826 src = self._active_ctx.transform(a, b, self, src)
1828 value, src = src._magnitude, src._units
-> 1830 return super()._convert(value, src, dst, inplace)
File ~/.conda/envs/qcfractal-broken-hdf5/lib/python3.8/site-packages/pint/registry.py:1433, in NonMultiplicativeRegistry._convert(self, value, src, dst, inplace)
1428 raise DimensionalityError(
1429 src, dst, extra_msg=f" - In destination units, {ex}"
1430 )
1432 if not (src_offset_unit or dst_offset_unit):
-> 1433 return super()._convert(value, src, dst, inplace)
1435 src_dim = self._get_dimensionality(src)
1436 dst_dim = self._get_dimensionality(dst)
File ~/.conda/envs/qcfractal-broken-hdf5/lib/python3.8/site-packages/pint/registry.py:980, in BaseRegistry._convert(self, value, src, dst, inplace, check_dimensionality)
977 # If the source and destination dimensionality are different,
978 # then the conversion cannot be performed.
979 if src_dim != dst_dim:
--> 980 raise DimensionalityError(src, dst, src_dim, dst_dim)
982 # Here src and dst have only multiplicative units left. Thus we can
983 # convert with a factor.
984 factor, _ = self._get_root_units(src / dst)
DimensionalityError: Cannot convert from 'hartree / bohr' ([length] * [mass] / [time] ** 2) to 'kilocalorie / mole' ([length] ** 2 * [mass] / [substance] / [time] ** 2)
This means that it's trying to convert a force or gradient into an energy. Perhaps the HDF5 export code was written before we started storing gradients?
The error is triggered by this line, where a driver='gradient' data object is requesting conversion from its native units (hartree/bohr) to self.units:
new_data[qname] *= constants.conversion_factor(units[qname], self.units)
self.units returns self._units, which is initialized to self.data.default_units, which is set to kcal/mol by DataModel to match the default_driver='energy'. When a different driver (for gradient) is used, we're still looking up the default driver data model, and getting the wrong units.
I'm not entirely sure how this data model was intended to work---presumably each driver was intended to have its own DataModel populated---but a simple fix is to add another line right after this line:
au_units = {"energy": "hartree", "gradient": "hartree/bohr", "hessian": "hartree/bohr**2"}
that would provide the desired post-conversion units:
units = {"energy": "kcal/mol", "gradient": "kcal/mol/angstrom", "hessian": "kcal/mol/angstrom**2"}
and to just look up the appropriate units in the call to conversion_factor and the next line:
new_data[qname] *= constants.conversion_factor(units[qname], self.units)
self._column_metadata[qname].update({"native": True, "units": self.units})
Alternatively, it's possible that self.units should instead be pre-populated by a dictionary mapping different driver values to different units that the Dataset should use, or that base units for energy and length should be provided to appropriately adjust the au quantities into desired units.
You can also just set the units on the dataset, for example ds.units = "Hartree / Bohr", then call to_file and this should work. One issue is that only the result from the driver will be saved to file, which in this case is the gradient whereas we want at least the energy and gradient and possibly other SCF properties. It might make sense to implement this in openff-qcsubmit alongside the new result filtering features where we can design an hdf5 format that would be most useful?
@jchodera I have put together a very quick prototype of how this might work in qcsubmit see the issue with an example HDF5 file for you to test and the associated PR for an explanation of the data layout.
I've created #715 that fixes HDF5 export functionality, but as @jthorton points out this does not feature much information; as constructed it only gives gradient information. Is this sufficient for openmm/spice-dataset#11 @jchodera?
Rather than working to expand this functionality here, I'd rather us focus on getting the functionality we want in QCSubmit if possible. We'll get what we need into openforcefield/openff-qcsubmit#197.
@bennybp and I want to get a reasonable export pathway into QCFractal proper, but this should go into the next major iteration, not the current Dataset implementation. I'm focused on getting QCSubmit into compatible shape with QCFractal next here.
v0.50 does not have HDF5 support (yet), so this issue is moot now.
Describe the bug Exporting single point datasets to HDF5 files seems to be broken.
To Reproduce
File ~/mambaforge/envs/bespokefit/lib/python3.8/site-packages/qcportal/collections/dataset.py:240, in Dataset.to_file(self, path, encoding) 237 elif encoding.lower() in ["hdf5", "h5"]: 238 from . import HDF5View --> 240 HDF5View(path).write(self) 241 else: 242 raise NotImplementedError(f"Unsupported encoding: {encoding}")
File ~/mambaforge/envs/bespokefit/lib/python3.8/site-packages/qcportal/collections/dataset_view.py:321, in HDF5View.write(self, ds) 319 molecules = ds.get_molecules(force=True) 320 mol_shape = (len(molecules),) --> 321 mol_geometry = molecule_group.create_dataset( 322 "geometry", shape=mol_shape, dtype=vlen_double_t, **dataset_kwargs 323 ) 324 mol_symbols = molecule_group.create_dataset("symbols", shape=mol_shape, dtype=vlen_utf8_t, **dataset_kwargs) 325 mol_schema = molecule_group.create_dataset("schema", shape=mol_shape, dtype=bytes_t, **dataset_kwargs)
File ~/mambaforge/envs/bespokefit/lib/python3.8/site-packages/h5py/_hl/group.py:149, in Group.create_dataset(self, name, shape, dtype, data, **kwds) 146 parent_path, name = name.rsplit(b'/', 1) 147 group = self.require_group(parent_path) --> 149 dsid = dataset.make_new_dset(group, shape, dtype, data, name, **kwds) 150 dset = dataset.Dataset(dsid) 151 return dset
File ~/mambaforge/envs/bespokefit/lib/python3.8/site-packages/h5py/_hl/dataset.py:142, in make_new_dset(parent, shape, dtype, data, name, chunks, compression, shuffle, fletcher32, maxshape, compression_opts, fillvalue, scaleoffset, track_times, external, track_order, dcpl, allow_unknown_filter) 138 else: 139 sid = h5s.create_simple(shape, maxshape) --> 142 dset_id = h5d.create(parent.id, name, tid, sid, dcpl=dcpl) 144 if (data is not None) and (not isinstance(data, Empty)): 145 dset_id.write(h5s.ALL, h5s.ALL, data)
File h5py/_objects.pyx:54, in h5py._objects.with_phil.wrapper()
File h5py/_objects.pyx:55, in h5py._objects.with_phil.wrapper()
File h5py/h5d.pyx:87, in h5py.h5d.create()
ValueError: Unable to create dataset (not suitable for filters)
Expected behavior This should make a HDF5 view of the dataset.
Additional context QCPortal version 0.15.8 h5py version 3.6.0