Open lilyminium opened 3 years ago
I don't have an immediate idea of where this comes from -- maybe looping over the distance matrix calculation? -- but I get a 1.2-2x speedup that saves 180 milliseconds over 558 conformers, which is just over the recommended conformers to generate for am1bccelf10 in OpenEye.
Helper functions that make up the overall electrostatic energy function This is from code I wrote for another purpose, so it's split up into several helper functions.
from typing import Set, Tuple
from rdkit import Chem
from rdkit.Chem import AllChem
import numpy as np
import itertools
from typing_extensions import Literal
def compute_mmff_charges(rdmol: "rdkit.Chem.Mol",
forcefield: Literal["MMFF94", "MMFF94s"] = "MMFF94",
normalize_partial_charges: bool = True,
) -> np.ndarray:
mps = AllChem.MMFFGetMoleculeProperties(rdmol, forcefield)
n_atoms = rdmol.GetNumAtoms()
charges = np.array([mps.GetMMFFPartialCharge(i) for i in range(n_atoms)])
if normalize_partial_charges:
total_charge = Chem.GetFormalCharge(rdmol)
partial_charges = charges.sum()
offset = (total_charge - partial_charges) / n_atoms
charges += offset
return charges
def get_exclusions(rdmol: "rdkit.Chem.Mol") -> Set[Tuple[int, int]]:
exclusions = set()
for i, atom in enumerate(rdmol.GetAtoms()):
partners = [b.GetOtherAtomIdx(i) for b in atom.GetBonds()]
exclusions |= set(tuple(sorted([i, x])) for x in partners)
exclusions |= set(itertools.combinations(sorted(partners), 2))
return exclusions
def compute_distance_matrix(coordinates: np.ndarray) -> np.ndarray:
dist_sq = np.einsum('ijk,ilk->ijl', coordinates, coordinates)
diag = np.einsum("ijj->ij", dist_sq)
a, b = diag.shape
dist_sq += dist_sq - diag.reshape((a, 1, b)) - diag.reshape((a, b, 1))
diag[:] = -0.0
return np.sqrt(-dist_sq)
Current function
import copy
from openff.toolkit.utils import RDKitToolkitWrapper
wrapper = RDKitToolkitWrapper()
def original_electrostatic(molecule):
molecule_copy = copy.deepcopy(molecule)
wrapper.assign_partial_charges(molecule_copy, "mmff94")
conformer_energies = [
wrapper._elf_compute_electrostatic_energy(molecule_copy, conformer)
for conformer in molecule_copy.conformers
]
return conformer_energies
My function
def new_electrostatic(molecule):
rdmol = molecule.to_rdkit()
conformers = np.array([c.GetPositions() for c in rdmol.GetConformers()])
distances = compute_distance_matrix(conformers)
inverse_distances = np.reciprocal(distances,
out=np.zeros_like(distances),
where=~np.isclose(distances, 0))
charges = np.abs(compute_mmff_charges(rdmol)).reshape(-1, 1)
charge_products = charges @ charges.T
excl_i, excl_j = zip(*get_exclusions(rdmol))
charge_products[(excl_i, excl_j)] = 0.0
charge_products[(excl_j, excl_i)] = 0.0
energies = inverse_distances * charge_products
return 0.5 * energies.sum(axis=(1, 2))
Testing
from openff.toolkit.topology.molecule import Molecule, unit
from numpy.testing import assert_allclose
import timeit
SMILES = ["CCOCCOCC", "CCCCCOCCCOCC", "CCCCCOCCCOCCOCCC", "CCCCCCCOCCCCCOCCCCOCCCCCOCCCC"]
def compare():
for smi in SMILES:
offmol = Molecule.from_smiles(smi)
offmol.generate_conformers(n_conformers=1000000)
n_conf = len(offmol.conformers)
assert_allclose(original_electrostatic(offmol),
new_electrostatic(offmol))
number = 10
original_ = timeit.timeit(lambda: original_electrostatic(offmol),
number=number)
new_ = timeit.timeit(lambda: new_electrostatic(offmol),
number=number)
original = 1000 * original_/ number
new = 1000 * new_ / number
speedup = original / new
shape_text = f"{n_conf} conformers"
text = (f"{shape_text:>20s}: {original:.3f} => {new:.3f} "
f"msec ({speedup:.1f}x speedup over {number} iterations)")
print(text)
compare()
2 conformers: 2.846 => 1.342 msec (2.1x speedup over 10 iterations)
57 conformers: 30.103 => 17.130 msec (1.8x speedup over 10 iterations)
558 conformers: 405.555 => 225.990 msec (1.8x speedup over 10 iterations)
4537 conformers: 5522.692 => 4344.659 msec (1.3x speedup over 10 iterations)
Is your feature request related to a problem? Please describe.
Currently
RDKitToolkitWrapper._elf_compute_electrostatic_energy
computes the energy for one conformer at a time. Perhaps it would be easier and more efficient to do them all at once? (the_elf
also doesn't make as much sense when it's one conformer at a time).Below I point out which lines do not need to be repeated for each conformer (--), and which are easily done in a vectorized array (++)
Describe the solution you'd like
Describe alternatives you've considered
Additional context