openforcefield / openff-toolkit

The Open Forcefield Toolkit provides implementations of the SMIRNOFF format, parameterization engine, and other tools. Documentation available at http://open-forcefield-toolkit.readthedocs.io
http://openforcefield.org
MIT License
313 stars 92 forks source link

Compute electrostatic energy for all conformers at once? #1092

Open lilyminium opened 3 years ago

lilyminium commented 3 years ago

Is your feature request related to a problem? Please describe.

Currently RDKitToolkitWrapper._elf_compute_electrostatic_energy computes the energy for one conformer at a time. Perhaps it would be easier and more efficient to do them all at once? (the _elf also doesn't make as much sense when it's one conformer at a time).

Below I point out which lines do not need to be repeated for each conformer (--), and which are easily done in a vectorized array (++)

-        if molecule.partial_charges is None:
-            raise ValueError("The molecule has no partial charges assigned.")

-        partial_charges = np.abs(
-            molecule.partial_charges.value_in_unit(unit.elementary_charge)
-        ).reshape(-1, 1)

        # Build an exclusion list for 1-2 and 1-3 interactions.
-        excluded_x, excluded_y = zip(
-            *{
-                *[(bond.atom1_index, bond.atom2_index) for bond in molecule.bonds],
-                *[
-                    (angle[0].molecule_atom_index, angle[-1].molecule_atom_index)
-                    for angle in molecule.angles
-                ],
-            }
-        )

        # Build the distance matrix between all pairs of atoms.
        coordinates = conformer.value_in_unit(unit.angstrom)

        # Equation 1: (a - b)^2 = a^2 - 2ab + b^2
        # distances_squared will eventually wind up as the squared distances
        # although it is first computed as the ab portion of Eq 1
+        distances_squared = coordinates.dot(coordinates.T)
        # np.einsum is both faster than np.diag, and not read-only
        # we know that a^2 == b^2 == diag(ab)
+        diag = np.einsum("ii->i", distances_squared)
        # we modify in-place so we can use the `diag` view
        # to make the diagonals 0
+        distances_squared += distances_squared - diag - diag[..., np.newaxis]
        # Handle edge cases where the squared distance is slightly negative due to
        # precision issues
+        diag[:] = -0.0  # this is somewhat faster than np.fill_diagonal
+        distances = np.sqrt(-distances_squared)

+        inverse_distances = np.reciprocal(
+            distances, out=np.zeros_like(distances), where=~np.isclose(distances, 0.0)
+        )

        # Multiply by the charge products.
-        charge_products = partial_charges @ partial_charges.T

-        charge_products[excluded_x, excluded_y] = 0.0
-        charge_products[excluded_y, excluded_x] = 0.0

+        interaction_energies = inverse_distances * charge_products

+        return 0.5 * interaction_energies.sum()

Describe the solution you'd like

Describe alternatives you've considered

Additional context

lilyminium commented 3 years ago

I don't have an immediate idea of where this comes from -- maybe looping over the distance matrix calculation? -- but I get a 1.2-2x speedup that saves 180 milliseconds over 558 conformers, which is just over the recommended conformers to generate for am1bccelf10 in OpenEye.

Helper functions that make up the overall electrostatic energy function This is from code I wrote for another purpose, so it's split up into several helper functions.

from typing import Set, Tuple
from rdkit import Chem
from rdkit.Chem import AllChem
import numpy as np
import itertools
from typing_extensions import Literal

def compute_mmff_charges(rdmol: "rdkit.Chem.Mol",
                         forcefield: Literal["MMFF94", "MMFF94s"] = "MMFF94",
                         normalize_partial_charges: bool = True,
                        ) -> np.ndarray:
    mps = AllChem.MMFFGetMoleculeProperties(rdmol, forcefield)
    n_atoms = rdmol.GetNumAtoms()
    charges = np.array([mps.GetMMFFPartialCharge(i) for i in range(n_atoms)])

    if normalize_partial_charges:
        total_charge = Chem.GetFormalCharge(rdmol)
        partial_charges = charges.sum()
        offset = (total_charge - partial_charges) / n_atoms
        charges += offset
    return charges

def get_exclusions(rdmol: "rdkit.Chem.Mol") -> Set[Tuple[int, int]]:
    exclusions = set()
    for i, atom in enumerate(rdmol.GetAtoms()):
        partners = [b.GetOtherAtomIdx(i) for b in atom.GetBonds()]
        exclusions |= set(tuple(sorted([i, x])) for x in partners)
        exclusions |= set(itertools.combinations(sorted(partners), 2))
    return exclusions

def compute_distance_matrix(coordinates: np.ndarray) -> np.ndarray:
    dist_sq = np.einsum('ijk,ilk->ijl', coordinates, coordinates)
    diag = np.einsum("ijj->ij", dist_sq)
    a, b = diag.shape
    dist_sq += dist_sq - diag.reshape((a, 1, b)) - diag.reshape((a, b, 1))
    diag[:] = -0.0
    return np.sqrt(-dist_sq)

Current function

import copy
from openff.toolkit.utils import RDKitToolkitWrapper
wrapper = RDKitToolkitWrapper()

def original_electrostatic(molecule):
    molecule_copy = copy.deepcopy(molecule)

    wrapper.assign_partial_charges(molecule_copy, "mmff94")

    conformer_energies = [
        wrapper._elf_compute_electrostatic_energy(molecule_copy, conformer)
        for conformer in molecule_copy.conformers
    ]
    return conformer_energies

My function

def new_electrostatic(molecule):
    rdmol = molecule.to_rdkit()
    conformers = np.array([c.GetPositions() for c in rdmol.GetConformers()])
    distances = compute_distance_matrix(conformers)
    inverse_distances = np.reciprocal(distances,
                                      out=np.zeros_like(distances),
                                      where=~np.isclose(distances, 0))

    charges = np.abs(compute_mmff_charges(rdmol)).reshape(-1, 1)
    charge_products = charges @ charges.T

    excl_i, excl_j = zip(*get_exclusions(rdmol))
    charge_products[(excl_i, excl_j)] = 0.0
    charge_products[(excl_j, excl_i)] = 0.0

    energies = inverse_distances * charge_products
    return 0.5 * energies.sum(axis=(1, 2))

Testing

from openff.toolkit.topology.molecule import Molecule, unit
from numpy.testing import assert_allclose
import timeit

SMILES = ["CCOCCOCC", "CCCCCOCCCOCC", "CCCCCOCCCOCCOCCC", "CCCCCCCOCCCCCOCCCCOCCCCCOCCCC"]

def compare():
    for smi in SMILES:
        offmol = Molecule.from_smiles(smi)
        offmol.generate_conformers(n_conformers=1000000)
        n_conf = len(offmol.conformers)

        assert_allclose(original_electrostatic(offmol),
                        new_electrostatic(offmol))

        number = 10

        original_ = timeit.timeit(lambda: original_electrostatic(offmol),
                                  number=number)
        new_ = timeit.timeit(lambda: new_electrostatic(offmol),
                             number=number)
        original = 1000 *  original_/ number
        new = 1000 * new_ / number

        speedup = original / new
        shape_text = f"{n_conf} conformers"
        text = (f"{shape_text:>20s}: {original:.3f} => {new:.3f} "
                f"msec ({speedup:.1f}x speedup over {number} iterations)")
        print(text)
compare()
        2 conformers: 2.846 => 1.342 msec (2.1x speedup over 10 iterations)
       57 conformers: 30.103 => 17.130 msec (1.8x speedup over 10 iterations)
      558 conformers: 405.555 => 225.990 msec (1.8x speedup over 10 iterations)
     4537 conformers: 5522.692 => 4344.659 msec (1.3x speedup over 10 iterations)