markovmodel / PyEMMA

🚂 Python API for Emma's Markov Model Algorithms 🚂
http://pyemma.org
GNU Lesser General Public License v3.0
307 stars 119 forks source link

How to add featurizer to Amber NetCDF files? #922

Closed jeiros closed 8 years ago

jeiros commented 8 years ago

Hi all,

I'm trying to understand how would I load only a subset of features from a NetCDF Amber simulation file. From the documentation of the load function, I can use features (MDFeaturizer, optional, default = None) – a featurizer object specifying how molecular dynamics files should be read (e.g. intramolecular distances, angles, dihedrals, etc).. The problem I'm facing is that, to load a NetCDF file, I am forced to use the top parameter. If I use both, only the featurizer is tried to be used and then I get an error again, as I'm forced to use a topology when loading a NetCDF file. Is there a way to do this? Here's what I've tried:

import pyemma
import mdtraj as md
topfile = md.load_prmtop('WT-ff14SB_clean.prmtop')
feat = pyemma.coordinates.featurizer(topfile)
feat.add_backbone_torsions()
print(feat.dimension())
print(type(feat))
836
<class 'pyemma.coordinates.data.featurization.featurizer.MDFeaturizer'>

Using just the topology object works fine

traj1 = pyemma.coordinates.load('run9/05_Prod_WTff14SB_0000-0135ns_run9.nc', top=topfile)
print(traj1.shape)
(6750, 20436)

Try using the backbone dihedral features

traj2 = pyemma.coordinates.load('run9/05_Prod_WTff14SB_0000-0135ns_run9.nc', features=feat)
/Users/je714/anaconda3/lib/python3.5/site-packages/pyEMMA-2.2-py3.5-macosx-10.5-x86_64.egg/pyemma/util/_ext/shimmodule.py:131: PyEMMA_DeprecationWarning: Access to a moved module 'pyemma.msm.flux' detected! Please use 'msmtools.flux' in the future.
  category=PyEMMA_DeprecationWarning)
/Users/je714/anaconda3/lib/python3.5/site-packages/pyEMMA-2.2-py3.5-macosx-10.5-x86_64.egg/pyemma/util/_ext/shimmodule.py:131: PyEMMA_DeprecationWarning: Access to a moved module 'pyemma.msm.analysis' detected! Please use 'msmtools.analysis' in the future.
  category=PyEMMA_DeprecationWarning)
/Users/je714/anaconda3/lib/python3.5/site-packages/pyEMMA-2.2-py3.5-macosx-10.5-x86_64.egg/pyemma/util/_ext/shimmodule.py:131: PyEMMA_DeprecationWarning: Access to a moved module 'pyemma.msm.dtraj' detected! Please use 'msmtools.dtraj' in the future.
  category=PyEMMA_DeprecationWarning)
/Users/je714/anaconda3/lib/python3.5/site-packages/pyEMMA-2.2-py3.5-macosx-10.5-x86_64.egg/pyemma/util/_ext/shimmodule.py:131: PyEMMA_DeprecationWarning: Access to a moved module 'pyemma.msm.generation' detected! Please use 'msmtools.generation' in the future.
  category=PyEMMA_DeprecationWarning)
/Users/je714/anaconda3/lib/python3.5/site-packages/pyEMMA-2.2-py3.5-macosx-10.5-x86_64.egg/pyemma/util/_ext/shimmodule.py:131: PyEMMA_DeprecationWarning: Access to a moved module 'pyemma.msm.estimation' detected! Please use 'msmtools.estimation' in the future.
  category=PyEMMA_DeprecationWarning)

---------------------------------------------------------------------------

ValueError                                Traceback (most recent call last)

<ipython-input-4-547a63d7fdb3> in <module>()
----> 1 traj2 = pyemma.coordinates.load('run9/05_Prod_WTff14SB_0000-0135ns_run9.nc', features=feat)

/Users/je714/anaconda3/lib/python3.5/site-packages/pyEMMA-2.2-py3.5-macosx-10.5-x86_64.egg/pyemma/coordinates/api.py in load(trajfiles, features, top, stride, chunk_size, **kw)
    204             and (any(isinstance(item, (list, tuple, _string_types)) for item in trajfiles)
    205                  or len(trajfiles) is 0)):
--> 206         reader = create_file_reader(trajfiles, top, features, chunk_size=chunk_size if chunk_size is not None else 0, **kw)
    207         trajs = reader.get_output(stride=stride)
    208         if len(trajs) == 1:

/Users/je714/anaconda3/lib/python3.5/site-packages/pyEMMA-2.2-py3.5-macosx-10.5-x86_64.egg/pyemma/coordinates/data/util/reader_utils.py in create_file_reader(input_files, topology, featurizer, chunk_size, **kw)
     99 
    100                     reader = FeatureReader(input_list, featurizer=featurizer, topologyfile=topology,
--> 101                                            chunksize=chunk_size)
    102                 else:
    103                     if suffix in ['.npy', '.npz']:

/Users/je714/anaconda3/lib/python3.5/site-packages/pyEMMA-2.2-py3.5-macosx-10.5-x86_64.egg/pyemma/coordinates/data/feature_reader.py in __init__(self, trajectories, topologyfile, chunksize, featurizer)
    118 
    119         # Check that the topology and the files in the filelist can actually work together
--> 120         self._assert_toptraj_consistency()
    121 
    122     @property

/Users/je714/anaconda3/lib/python3.5/site-packages/pyEMMA-2.2-py3.5-macosx-10.5-x86_64.egg/pyemma/coordinates/data/feature_reader.py in _assert_toptraj_consistency(self)
    161     def _assert_toptraj_consistency(self):
    162         r""" Check if the topology and the filenames of the reader have the same n_atoms"""
--> 163         traj = mdtraj.load_frame(self.filenames[0], index=0, top=self.topfile)
    164         desired_n_atoms = self.featurizer.topology.n_atoms
    165         assert traj.xyz.shape[1] == desired_n_atoms, "Mismatch in the number of atoms between the topology" \

/Users/je714/anaconda3/lib/python3.5/site-packages/mdtraj/core/trajectory.py in load_frame(filename, index, top, atom_indices, **kwargs)
    305         _assert_files_or_dirs_exist(filename)
    306 
--> 307     return loader(filename, frame=index, **kwargs)
    308 
    309 

/Users/je714/anaconda3/lib/python3.5/site-packages/mdtraj/formats/netcdf.py in load_netcdf(filename, top, stride, atom_indices, frame)
     88     from mdtraj.core.trajectory import _parse_topology, Trajectory
     89     if top is None:
---> 90         raise ValueError('"top" argument is required for load_netcdf')
     91 
     92     topology = _parse_topology(top)

ValueError: "top" argument is required for load_netcdf

Using both the featurizer object and the topology object

traj3 = pyemma.coordinates.load('run9/05_Prod_WTff14SB_0000-0135ns_run9.nc', features=feat, top=topfile)
06-09-16 16:47:19 pyemma.coordinates.data.feature_reader.FeatureReader[1] WARNING  Both a topology file and a featurizer were given as arguments. Only featurizer gets respected in this case.

/Users/je714/anaconda3/lib/python3.5/site-packages/pyEMMA-2.2-py3.5-macosx-10.5-x86_64.egg/pyemma/util/_ext/shimmodule.py:131: PyEMMA_DeprecationWarning: Access to a moved module 'pyemma.msm.flux' detected! Please use 'msmtools.flux' in the future.
  category=PyEMMA_DeprecationWarning)
/Users/je714/anaconda3/lib/python3.5/site-packages/pyEMMA-2.2-py3.5-macosx-10.5-x86_64.egg/pyemma/util/_ext/shimmodule.py:131: PyEMMA_DeprecationWarning: Access to a moved module 'pyemma.msm.analysis' detected! Please use 'msmtools.analysis' in the future.
  category=PyEMMA_DeprecationWarning)
/Users/je714/anaconda3/lib/python3.5/site-packages/pyEMMA-2.2-py3.5-macosx-10.5-x86_64.egg/pyemma/util/_ext/shimmodule.py:131: PyEMMA_DeprecationWarning: Access to a moved module 'pyemma.msm.dtraj' detected! Please use 'msmtools.dtraj' in the future.
  category=PyEMMA_DeprecationWarning)
/Users/je714/anaconda3/lib/python3.5/site-packages/pyEMMA-2.2-py3.5-macosx-10.5-x86_64.egg/pyemma/util/_ext/shimmodule.py:131: PyEMMA_DeprecationWarning: Access to a moved module 'pyemma.msm.generation' detected! Please use 'msmtools.generation' in the future.
  category=PyEMMA_DeprecationWarning)
/Users/je714/anaconda3/lib/python3.5/site-packages/pyEMMA-2.2-py3.5-macosx-10.5-x86_64.egg/pyemma/util/_ext/shimmodule.py:131: PyEMMA_DeprecationWarning: Access to a moved module 'pyemma.msm.estimation' detected! Please use 'msmtools.estimation' in the future.
  category=PyEMMA_DeprecationWarning)

---------------------------------------------------------------------------

ValueError                                Traceback (most recent call last)

<ipython-input-5-09ad6c683c75> in <module>()
----> 1 traj3 = pyemma.coordinates.load('run9/05_Prod_WTff14SB_0000-0135ns_run9.nc', features=feat, top=topfile)

/Users/je714/anaconda3/lib/python3.5/site-packages/pyEMMA-2.2-py3.5-macosx-10.5-x86_64.egg/pyemma/coordinates/api.py in load(trajfiles, features, top, stride, chunk_size, **kw)
    204             and (any(isinstance(item, (list, tuple, _string_types)) for item in trajfiles)
    205                  or len(trajfiles) is 0)):
--> 206         reader = create_file_reader(trajfiles, top, features, chunk_size=chunk_size if chunk_size is not None else 0, **kw)
    207         trajs = reader.get_output(stride=stride)
    208         if len(trajs) == 1:

/Users/je714/anaconda3/lib/python3.5/site-packages/pyEMMA-2.2-py3.5-macosx-10.5-x86_64.egg/pyemma/coordinates/data/util/reader_utils.py in create_file_reader(input_files, topology, featurizer, chunk_size, **kw)
     99 
    100                     reader = FeatureReader(input_list, featurizer=featurizer, topologyfile=topology,
--> 101                                            chunksize=chunk_size)
    102                 else:
    103                     if suffix in ['.npy', '.npz']:

/Users/je714/anaconda3/lib/python3.5/site-packages/pyEMMA-2.2-py3.5-macosx-10.5-x86_64.egg/pyemma/coordinates/data/feature_reader.py in __init__(self, trajectories, topologyfile, chunksize, featurizer)
    118 
    119         # Check that the topology and the files in the filelist can actually work together
--> 120         self._assert_toptraj_consistency()
    121 
    122     @property

/Users/je714/anaconda3/lib/python3.5/site-packages/pyEMMA-2.2-py3.5-macosx-10.5-x86_64.egg/pyemma/coordinates/data/feature_reader.py in _assert_toptraj_consistency(self)
    161     def _assert_toptraj_consistency(self):
    162         r""" Check if the topology and the filenames of the reader have the same n_atoms"""
--> 163         traj = mdtraj.load_frame(self.filenames[0], index=0, top=self.topfile)
    164         desired_n_atoms = self.featurizer.topology.n_atoms
    165         assert traj.xyz.shape[1] == desired_n_atoms, "Mismatch in the number of atoms between the topology" \

/Users/je714/anaconda3/lib/python3.5/site-packages/mdtraj/core/trajectory.py in load_frame(filename, index, top, atom_indices, **kwargs)
    305         _assert_files_or_dirs_exist(filename)
    306 
--> 307     return loader(filename, frame=index, **kwargs)
    308 
    309 

/Users/je714/anaconda3/lib/python3.5/site-packages/mdtraj/formats/netcdf.py in load_netcdf(filename, top, stride, atom_indices, frame)
     88     from mdtraj.core.trajectory import _parse_topology, Trajectory
     89     if top is None:
---> 90         raise ValueError('"top" argument is required for load_netcdf')
     91 
     92     topology = _parse_topology(top)

ValueError: "top" argument is required for load_netcdf

Sorry if this is a basic thing to do but I haven't found a way to do this in the docs.

marscher commented 8 years ago

Thank you for your detailed report. We have not tested netcdf extensively, so we really appreciate. Can you confirm, the same behaviour if you use the "source" function of coordinates?

marscher commented 8 years ago

The _assert_toptraj_consistency method actually passes the top argument to the underlying loader (netcdf), so it is not obvious why the netcdf loader of mdtraj complains it is lacking the top argument.

marscher commented 8 years ago

I think we assume that topfile is really a file not a mdtraj.Topology object (which the documentation is actually clear about). So our type-checking gets somehow confused, because "topfile" isn't a string.

topfile = md.load_prmtop('WT-ff14SB_clean.prmtop') # just set this to the path.
feat = pyemma.coordinates.featurizer(topfile)

However we should support passing Topology objects, too.

marscher commented 8 years ago

the load function of mdtraj type checks three cases, we could also support.

  1. str (path)
  2. Trajectory
  3. Topology

However the "topologyfile" argument is then somehow misleading.

jeiros commented 8 years ago

Hi @marscher ,

I've used the source function as follows:

topfile = md.load_prmtop('WT-ff14SB_clean.prmtop')
traj_files = pyemma.coordinates.source(sorted(glob('*/05*nc')), top=topfile)
traj_files.featurizer.add_backbone_torsions()
traj_files.featurizer.dimension()
836

So in that case it works fine

jeiros commented 8 years ago

Also, if I just pass the string of the .prmtop file, the load function gives an OSError because it doesn't recognize the extension of the file:

traj4 = pyemma.coordinates.load('run9/05_Prod_WTff14SB_0000-0135ns_run9.nc', top='./WT-ff14SB_clean.prmtop')
/Users/je714/anaconda3/lib/python3.5/site-packages/pyEMMA-2.2-py3.5-macosx-10.5-x86_64.egg/pyemma/util/_ext/shimmodule.py:131: PyEMMA_DeprecationWarning: Access to a moved module 'pyemma.msm.analysis' detected! Please use 'msmtools.analysis' in the future.
  category=PyEMMA_DeprecationWarning)
/Users/je714/anaconda3/lib/python3.5/site-packages/pyEMMA-2.2-py3.5-macosx-10.5-x86_64.egg/pyemma/util/_ext/shimmodule.py:131: PyEMMA_DeprecationWarning: Access to a moved module 'pyemma.msm.generation' detected! Please use 'msmtools.generation' in the future.
  category=PyEMMA_DeprecationWarning)
/Users/je714/anaconda3/lib/python3.5/site-packages/pyEMMA-2.2-py3.5-macosx-10.5-x86_64.egg/pyemma/util/_ext/shimmodule.py:131: PyEMMA_DeprecationWarning: Access to a moved module 'pyemma.msm.flux' detected! Please use 'msmtools.flux' in the future.
  category=PyEMMA_DeprecationWarning)
/Users/je714/anaconda3/lib/python3.5/site-packages/pyEMMA-2.2-py3.5-macosx-10.5-x86_64.egg/pyemma/util/_ext/shimmodule.py:131: PyEMMA_DeprecationWarning: Access to a moved module 'pyemma.msm.estimation' detected! Please use 'msmtools.estimation' in the future.
  category=PyEMMA_DeprecationWarning)
/Users/je714/anaconda3/lib/python3.5/site-packages/pyEMMA-2.2-py3.5-macosx-10.5-x86_64.egg/pyemma/util/_ext/shimmodule.py:131: PyEMMA_DeprecationWarning: Access to a moved module 'pyemma.msm.dtraj' detected! Please use 'msmtools.dtraj' in the future.
  category=PyEMMA_DeprecationWarning)
---------------------------------------------------------------------------
KeyError                                  Traceback (most recent call last)
/Users/je714/anaconda3/lib/python3.5/site-packages/mdtraj/core/trajectory.py in load(filename_or_filenames, discard_overlapping_frames, **kwargs)
    397     try:
--> 398         #loader = _LoaderRegistry[extension][0]
    399         loader = FormatRegistry.loaders[extension]

KeyError: '.prmtop'

During handling of the above exception, another exception occurred:

OSError                                   Traceback (most recent call last)
<ipython-input-26-47cb77248b17> in <module>()
----> 1 traj4 = pyemma.coordinates.load('run9/05_Prod_WTff14SB_0000-0135ns_run9.nc', top='./WT-ff14SB_clean.prmtop')

/Users/je714/anaconda3/lib/python3.5/site-packages/pyEMMA-2.2-py3.5-macosx-10.5-x86_64.egg/pyemma/coordinates/api.py in load(trajfiles, features, top, stride, chunk_size, **kw)

/Users/je714/anaconda3/lib/python3.5/site-packages/pyEMMA-2.2-py3.5-macosx-10.5-x86_64.egg/pyemma/coordinates/data/util/reader_utils.py in create_file_reader(input_files, topology, featurizer, chunk_size, **kw)

/Users/je714/anaconda3/lib/python3.5/site-packages/pyEMMA-2.2-py3.5-macosx-10.5-x86_64.egg/pyemma/coordinates/data/feature_reader.py in __init__(self, trajectories, topologyfile, chunksize, featurizer)

/Users/je714/anaconda3/lib/python3.5/site-packages/pyEMMA-2.2-py3.5-macosx-10.5-x86_64.egg/pyemma/coordinates/data/featurization/featurizer.py in __init__(self, topfile)

/Users/je714/anaconda3/lib/python3.5/site-packages/mdtraj/core/trajectory.py in load(filename_or_filenames, discard_overlapping_frames, **kwargs)
    400     except KeyError:
    401         raise IOError('Sorry, no loader for filename=%s (extension=%s) '
--> 402                       'was found. I can only load files '
    403                       'with extensions in %s' % (filename, extension, FormatRegistry.loaders.keys()))
    404 

OSError: Sorry, no loader for filename=./WT-ff14SB_clean.prmtop (extension=.prmtop) was found. I can only load files with extensions in dict_keys(['.hdf5', '.restrt', '.mol2', '.ncdf', '.ncrst', '.binpos', '.stk', '.lammpstrj', '.gro', '.xyz', '.trr', '.lh5', '.pdb.gz', '.xml', '.pdb', '.crd', '.netcdf', '.mdcrd', '.hoomdxml', '.xtc', '.xyz.gz', '.nc', '.dtr', '.rst7', '.h5', '.dcd', '.arc', '.inpcrd'])

Edit: This is why I actually passed the topology as an mdtraj topology object, because although in the docs of the load function the top argument is meant to be a string, it doesn't work for files ending in .prmtop.

marscher commented 8 years ago

ah I see. You should have said that in the first place :+1: