NaN after restarting simulation

RorroArt commented 1 year ago

Code:

import jax 
import jax.numpy as jnp
import numpy as np

import sys

import hoomd
import hoomd.md

from ecv.models import AutoEncoder
from ecv.models.encoders import MLP, EGNN
from ecv.models.decoders import SimpleRelDecoder, SimpleDecoder

from ecv.cv import CVLearner

from pysages.methods import SpectralABF

from optax import adam

def generate_context(**kwargs):

    """
    Generates a simulation context, we pass this function to the attribute
    `run` of our sampling method.
    """

    hoomd.context.initialize("--mode=gpu")
    pi = np.pi
    kT = 0.596161
    dt = 0.02045
    ### System Definition
    snapshot = hoomd.data.make_snapshot(
        N = 14,
        box = hoomd.data.boxdim(Lx = 41, Ly = 41, Lz = 41),
        particle_types = ['C', 'H'],
        bond_types = ["CC", "CH"],
        angle_types = ["CCC", "CCH", "HCH"],
        dihedral_types = ["CCCC", "HCCC", "HCCH"],
        pair_types = ["CCCC", "HCCC", "HCCH"],
        dtype = "double"
    )

    snapshot.particles.typeid[0] = 0
    snapshot.particles.typeid[1:4] = 1
    snapshot.particles.typeid[4] = 0
    snapshot.particles.typeid[5:7] = 1
    snapshot.particles.typeid[7] = 0
    snapshot.particles.typeid[8:10] = 1
    snapshot.particles.typeid[10] = 0
    snapshot.particles.typeid[11:14] = 1

    positions = np.array([
        [-2.990196,  0.097881,  0.000091],
        [-2.634894, -0.911406,  0.001002],
        [-2.632173,  0.601251, -0.873601],
        [-4.060195,  0.099327, -0.000736],
        [-2.476854,  0.823942,  1.257436],
        [-2.832157,  1.833228,  1.256526],
        [-2.834877,  0.320572,  2.131128],
        [-0.936856,  0.821861,  1.258628],
        [-0.578833,  1.325231,  0.384935],
        [-0.581553, -0.187426,  1.259538],
        [-0.423514,  1.547922,  2.515972],
        [-0.781537,  1.044552,  3.389664],
        [ 0.646485,  1.546476,  2.516800],
        [-0.778816,  2.557208,  2.515062]
    ])

    reference_box_low_coords = np.array([-22.206855, -19.677099, -19.241968])
    box_low_coords = np.array([
        -snapshot.box.Lx / 2,
        -snapshot.box.Ly / 2,
        -snapshot.box.Lz / 2
    ])
    positions += (box_low_coords - reference_box_low_coords)

    snapshot.particles.position[:] = positions[:]

    mC = 12.00
    mH = 1.008
    snapshot.particles.mass[:] = [
        mC, mH, mH, mH,
        mC, mH, mH,
        mC, mH, mH,
        mC, mH, mH, mH
    ]

    reference_charges = np.array([
        -0.180000, 0.060000, 0.060000, 0.060000,
        -0.120000, 0.060000, 0.060000,
        -0.120000, 0.060000, 0.060000,
        -0.180000, 0.060000, 0.060000, 0.060000]
    )
    charge_conversion = 18.22262
    snapshot.particles.charge[:] = charge_conversion * reference_charges[:]

    snapshot.bonds.resize(13)
    snapshot.bonds.typeid[0:3] = 1
    snapshot.bonds.typeid[3] = 0
    snapshot.bonds.typeid[4:6] = 1
    snapshot.bonds.typeid[6] = 0
    snapshot.bonds.typeid[7:9] = 1
    snapshot.bonds.typeid[9] = 0
    snapshot.bonds.typeid[10:13] = 1

    snapshot.bonds.group[:] = [
        [0, 2], [0, 1], [0, 3], [0, 4],
        [4, 5], [4, 6], [4, 7],
        [7, 8], [7, 9], [7, 10],
        [10, 11], [10, 12], [10, 13]
    ]

    snapshot.angles.resize(24)
    snapshot.angles.typeid[0:2] = 2
    snapshot.angles.typeid[2] = 1
    snapshot.angles.typeid[3] = 2
    snapshot.angles.typeid[4:8] = 1
    snapshot.angles.typeid[8] = 0
    snapshot.angles.typeid[9] = 2
    snapshot.angles.typeid[10:14] = 1
    snapshot.angles.typeid[14] = 0
    snapshot.angles.typeid[15] = 2
    snapshot.angles.typeid[16:21] = 1
    snapshot.angles.typeid[21:24] = 2

    snapshot.angles.group[:] = [
        [1, 0, 2], [2, 0, 3], [2, 0, 4],
        [1, 0, 3], [1, 0, 4], [3, 0, 4],
        [0, 4, 5], [0, 4, 6], [0, 4, 7],
        [5, 4, 6], [5, 4, 7], [6, 4, 7],
        [4, 7, 8], [4, 7, 9], [4, 7, 10],
        [8, 7, 9], [8, 7, 10], [9, 7, 10],
        [7, 10, 11], [7, 10, 12], [7, 10, 13],
        [11, 10, 12], [11, 10, 13], [12, 10, 13]
    ]

    snapshot.dihedrals.resize(27)
    snapshot.dihedrals.typeid[0:2] = 2
    snapshot.dihedrals.typeid[2] = 1
    snapshot.dihedrals.typeid[3:5] = 2
    snapshot.dihedrals.typeid[5] = 1
    snapshot.dihedrals.typeid[6:8] = 2
    snapshot.dihedrals.typeid[8:11] = 1
    snapshot.dihedrals.typeid[11] = 0
    snapshot.dihedrals.typeid[12:14] = 2
    snapshot.dihedrals.typeid[14] = 1
    snapshot.dihedrals.typeid[15:17] = 2
    snapshot.dihedrals.typeid[17:21] = 1
    snapshot.dihedrals.typeid[21:27] = 2

    snapshot.dihedrals.group[:] = [
        [2, 0, 4, 5], [2, 0, 4, 6], [2, 0, 4, 7],
        [1, 0, 4, 5], [1, 0, 4, 6], [1, 0, 4, 7],
        [3, 0, 4, 5], [3, 0, 4, 6], [3, 0, 4, 7],
        [0, 4, 7, 8], [0, 4, 7, 9], [0, 4, 7, 10],
        [5, 4, 7, 8], [5, 4, 7, 9], [5, 4, 7, 10],
        [6, 4, 7, 8], [6, 4, 7, 9], [6, 4, 7, 10],
        [4, 7, 10, 11], [4, 7, 10, 12], [4, 7, 10, 13],
        [8, 7, 10, 11], [8, 7, 10, 12], [8, 7, 10, 13],
        [9, 7, 10, 11], [9, 7, 10, 12], [9, 7, 10, 13]
    ]

    snapshot.pairs.resize(27)
    snapshot.pairs.typeid[0:1] = 0
    snapshot.pairs.typeid[1:11] = 1
    snapshot.pairs.typeid[11:27] = 2
    snapshot.pairs.group[:] = [
        # CCCC
        [0, 10],
        # HCCC
        [0, 8], [0, 9], [5, 10], [6, 10],
        [1, 7], [2, 7], [3, 7],
        [11, 4], [12, 4], [13, 4],
        # HCCH
        [1, 5], [1, 6], [2, 5], [2, 6], [3, 5], [3, 6],
        [5, 8], [6, 8], [5, 9], [6, 9],
        [8, 11], [8, 12], [8, 13], [9, 11], [9, 12], [9, 13]
    ]

    hoomd.init.read_snapshot(snapshot)

    ### Set interactions
    nl_ex = hoomd.md.nlist.cell()
    nl_ex.reset_exclusions(exclusions = ["1-2", "1-3", "1-4"])

    lj = hoomd.md.pair.lj(r_cut = 12.0, nlist = nl_ex)
    lj.pair_coeff.set('C', 'C', epsilon = 0.07, sigma = 3.55)
    lj.pair_coeff.set('H', 'H', epsilon = 0.03, sigma = 2.42)
    lj.pair_coeff.set('C', 'H', epsilon = np.sqrt(0.07*0.03), sigma = np.sqrt(3.55*2.42))

    coulomb = hoomd.md.charge.pppm(hoomd.group.charged(), nlist = nl_ex)
    coulomb.set_params(Nx = 64, Ny = 64, Nz = 64, order = 6, rcut = 12.0)

    harmonic = hoomd.md.bond.harmonic()
    harmonic.bond_coeff.set("CC", k = 2*268.0, r0 = 1.529)
    harmonic.bond_coeff.set("CH", k = 2*340.0, r0 = 1.09)

    angle = hoomd.md.angle.harmonic()
    angle.angle_coeff.set("CCC", k = 2*58.35, t0 = 112.7 * pi / 180)
    angle.angle_coeff.set("CCH", k = 2*37.5, t0 = 110.7 * pi / 180)
    angle.angle_coeff.set("HCH", k = 2*33.0, t0 = 107.8 * pi / 180)

    dihedral = hoomd.md.dihedral.opls()
    dihedral.dihedral_coeff.set("CCCC", k1 = 1.3, k2 = -0.05, k3 = 0.2, k4 = 0.0)
    dihedral.dihedral_coeff.set("HCCC", k1 = 0.0, k2 = 0.0, k3 = 0.3, k4 = 0.0)
    dihedral.dihedral_coeff.set("HCCH", k1 = 0.0, k2 = 0.0, k3 = 0.3, k4 = 0.0)

    lj_special_pairs = hoomd.md.special_pair.lj()
    lj_special_pairs.pair_coeff.set("CCCC", epsilon = 0.07, sigma = 3.55, r_cut = 12.0)
    lj_special_pairs.pair_coeff.set("HCCH", epsilon = 0.03, sigma = 2.42, r_cut = 12.0)
    lj_special_pairs.pair_coeff.set("HCCC",
        epsilon = np.sqrt(0.07 * 0.03), sigma = np.sqrt(3.55 * 2.42), r_cut = 12.0
    )

    coulomb_special_pairs = hoomd.md.special_pair.coulomb()
    coulomb_special_pairs.pair_coeff.set("CCCC", alpha = 0.5, r_cut = 12.0)
    coulomb_special_pairs.pair_coeff.set("HCCC", alpha = 0.5, r_cut = 12.0)
    coulomb_special_pairs.pair_coeff.set("HCCH", alpha = 0.5, r_cut = 12.0)

    hoomd.md.integrate.mode_standard(dt = dt)
    integrator = hoomd.md.integrate.nvt(group = hoomd.group.all(), kT = kT, tau = 100*dt)
    integrator.randomize_velocities(seed = 42)

    return hoomd.context.current

model = AutoEncoder(
    encoder=MLP(HIDDEN, LATENT_SIZE, n_layers=N_LAYERS),
    decoder=SimpleRelDecoder(14, 3, HIDDEN),
    latent_size=LATENT_SIZE
)

learner = CVLearner(
    n_molecules=14,
    simulation_context=generate_context,
    method=SpectralABF,
    model=model,
    optimizer=optimizer,
)

def coords_target_fn(batch):
    coords = batch.x
    return coords.flatten()

def distances_target_fn(batch):
    coords = batch.x
    coords = jnp.broadcast_to(coords.squeeze(0), (14, 14, 3))
    y = jnp.linalg.norm(coords - jnp.transpose(coords, axes=(1,0,2)), axis=-1)
    return y

def loss_fn(params, _model, x, y, key):
    y_pred, mean, var = _model.apply(params, x, key)
    reconstruction_loss = jnp.abs(y - y_pred).mean()
    kl_loss = -0.5 * jnp.sum(1 + var -jnp.power(mean, 2) - jnp.exp(var))
    return reconstruction_loss + kl_loss

data = learner.generate_initial_data(steps=10000, interval=10)
learner.train_model(key=jax.random.PRNGKey(12), loss_fn=loss_fn, target_fn=distances_target_fn, epochs=20)

params, _ = learner.model_params
key = jax.random.PRNGKey(12)

projections = jnp.array([model.encode(params, batch, key)[0] for batch in data])

lower, upper = projections.min(), projections.max()
distance = upper - lower
step = distance / 20 
lower -= step; upper += step

from pysages.colvars.core import CollectiveVariable

from typing import NamedTuple

class Batch(NamedTuple):
    x: jnp.ndarray

def compute_cv(positions):
    return model.encode(params, Batch(positions), key)[0].squeeze(0)

class CV(CollectiveVariable):
    @property
    def function(self):
        return compute_cv

import pysages
from pysages.grids import Grid
import pickle

cvs = [CV([0,1,2,3,4,5,6,7,8,9,10,11,12,13] )]
grid = Grid(lower=lower, upper=upper, shape=(64,), periodic=True)
method = SpectralABF(cvs, grid)
result = pysages.run(method, generate_context, 1)

with open("restart.pickle", "wb") as f:
    pickle.dump(result, f)

with open("restart.pickle", "rb") as f:
    result = pickle.load(f)

result = pysages.run(result, generate_context, 400000)

Then it logs the following:

notice(2): Group "all" created containing 14 particles
notice(2): -- Neighborlist exclusion statistics -- :
notice(2): Particles with 7 exclusions             : 6
notice(2): Particles with 10 exclusions             : 6
notice(2): Particles with 13 exclusions             : 2
notice(2): Neighbors included by diameter          : no
notice(2): Neighbors excluded when in the same body: no
notice(2): Group "charged" created containing 14 particles
-----
You are using PPPM. Please cite the following:
* D N LeBard, B G Levine, S A Barr, A Jusufi, S Sanders, M L Klein, and A Z
  Panagiotopoulos. "Self-assembly of coarse-grained ionic surfactants
  accelerated by graphics processing units", Journal of Computational Physics 8
  (2012) 2385-2397
-----
** starting run **
notice(2): charge.pppm: RMS error: 1.29239e-08
**ERROR**: Particle with unique tag 12 has NaN for its position.

---------------------------------------------------------------------------
RuntimeError                              Traceback (most recent call last)
Cell In[16], line 1
----> 1 result = pysages.run(result, generate_context, 400000)

File [~/miniconda3/envs/sages_2/lib/python3.8/site-packages/plum/function.py:378](https://vscode-remote+ssh-002dremote-002btitans-002ddepablogroup-002educkdns-002eorg.vscode-resource.vscode-cdn.net/home/rorroart/models/ecv/~/miniconda3/envs/sages_2/lib/python3.8/site-packages/plum/function.py:378), in Function.__call__(self, *args, **kw_args)
    374 except KeyError:
    375     # Cache miss. Run the resolver based on the arguments.
    376     method, return_type = self.resolve_method(args, types)
--> 378 return _convert(method(*args, **kw_args), return_type)

File [~/miniconda3/envs/sages_2/lib/python3.8/site-packages/pysages/methods/core.py:303](https://vscode-remote+ssh-002dremote-002btitans-002ddepablogroup-002educkdns-002eorg.vscode-resource.vscode-cdn.net/home/rorroart/models/ecv/~/miniconda3/envs/sages_2/lib/python3.8/site-packages/pysages/methods/core.py:303), in run(result, context_generator, timesteps, context_args, post_run_action, config, **kwargs)
    301 with config.executor as ex:
    302     result_args = zip(result.states, callbacks, result.snapshots)
--> 303     futures = [submit_work(ex, ReplicaResult(method, *args)) for args in result_args]
    305 results = [future.result() for future in futures]
    306 states = [r.states for r in results]

File [~/miniconda3/envs/sages_2/lib/python3.8/site-packages/pysages/methods/core.py:303](https://vscode-remote+ssh-002dremote-002btitans-002ddepablogroup-002educkdns-002eorg.vscode-resource.vscode-cdn.net/home/rorroart/models/ecv/~/miniconda3/envs/sages_2/lib/python3.8/site-packages/pysages/methods/core.py:303), in (.0)
    301 with config.executor as ex:
    302     result_args = zip(result.states, callbacks, result.snapshots)
--> 303     futures = [submit_work(ex, ReplicaResult(method, *args)) for args in result_args]
    305 results = [future.result() for future in futures]
    306 states = [r.states for r in results]
...
--> 201 context.current.system.run(int(tsteps), callback_period, callback, limit_hours, int(limit_multiple));
    202 if not quiet:
    203     context.msg.notice(1, "** run complete **\n")

RuntimeError: Error computing cell list

ap-uchi commented 10 months ago

Hello!

I believe I also might have a similar issue with Spectral ABF for my system using OpenMM. Using a time step of 2 fs and a temperature of ~500 K, the simulation will crash (Energy or particle coordinate is NaN) within 31,000 steps. Reducing the temperature to 300 K and keeping the same time step size allows my system to run for 354,000 steps before crashing as well. I've tried running within the crash threshold (<300,000 steps), saving the results, and restarting it, but this results in an immediate crash.

I've tried using NoseHooverIntegrator, LangevinIntegrator, and LangevinMiddleIntegrator. Reducing the time step size prolongs the crash but doesn't prevent it after running for extended periods of time.

Please let me know if additional information is needed @pabloferz ! Thanks for your help.

pabloferz commented 9 months ago

@ap-uchi is this for any of the systems in the examples or for your own? What are your CVs?

ap-uchi commented 9 months ago

@pabloferz I have reproduced this issue in multiple different systems - multiple types of polymers with DP 30 and with DP 2, but in both I have used the colvars.coordinates.Distance collective variable. I can send over my simulation files if you would like as well!

System attached below: twomer_ABF.tar.gz

pabloferz commented 9 months ago

Yes, having the simulation files would make tracking the issue a bit easier.

SSAGESLabs / PySAGES

NaN after restarting simulation #279