Source code for mbgdml.analysis.rdf

# MIT License
#
# Copyright (c) 2022-2023, Alex M. Maldonado
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.

"""Compute RDF curves under periodic boundary conditions."""

import ray
import numpy as np
from ..periodic import Cell
from ..utils import gen_combs, chunk_iterable
from ..logger import GDMLLogger

log = GDMLLogger(__name__)


# Possible ray task.
def _bin_distances(R, R_idxs, atom_pairs, rdf_settings, cell_vectors):
    r"""Compute relevant RDF data for one or more structures.

    Parameters
    ----------
    R : :obj:`numpy.ndarray` or :obj:`numpy.memmap`, ndim: ``3``
        Atomic coordinates of one or more structures.
    R_idxs : :obj:`int` or :obj:`list`
        Indices of ``R`` to compute RDF contributions.
    atom_pairs : :obj:`numpy.ndarray`, ndim: ``2``
        Indices of all atom pairs to consider for each structure.
    rdf_settings : :obj:`dict`
        Keyword arguments for :func:`numpy.histogram` to bin distances.
    cell_vectors : :obj:`numpy.ndarray`
        The three periodic cell vectors. For example, a cube of length 16.0 would
        be ``[[16.0, 0.0, 0.0], [0.0, 16.0, 0.0], [0.0, 0.0, 16.0]]``.

    Returns
    -------
    :obj:`numpy.ndarray`
        Histogram count of distances.
    :obj:`float`
        Cumulative volume for this set of structures.
    :obj:`int`
        Number of structures computed here.
    """
    R = R[R_idxs]
    if R.ndim == 2:
        R = R[None, ...]
    n_R = R.shape[0]

    cell = Cell(cell_vectors, cell_vectors[0][0] / 2, True)

    # Compute histogram of distances for structure(s)
    D = R[:, atom_pairs[:, 1]] - R[:, atom_pairs[:, 0]]
    new_shape = (np.prod(D.shape[:2]), 3)
    D = D.reshape(new_shape)
    D = cell.d_mic(D, check_cutoff=False)  # Should check_cutoff be false?
    dists = np.linalg.norm(D, ord=2, axis=1).flatten()
    count, _ = np.histogram(dists, **rdf_settings)

    # Determine volume contribution.
    vol_contrib = n_R * cell.volume

    return count, vol_contrib, n_R


[docs]class RDF: r"""Handles calculating the radial distribution function (RDF), :math:`g(r)`, of a constant volume simulation. """ def __init__( self, Z, entity_ids, comp_ids, cell_vectors, bin_width=0.05, rdf_range=(0.0, 15.0), inter_only=True, use_ray=False, ray_address="auto", n_workers=1, ): r""" Parameters ---------- Z : :obj:`numpy.ndarray`, ndim: ``1`` Atomic numbers of all atoms in the system. entity_ids : :obj:`numpy.ndarray`, ndim: ``1`` Integers that specify which fragment each atom belongs to for all structures. comp_ids : :obj:`numpy.ndarray`, ndim: ``1`` Labels for each ``entity_id`` used to determine the desired entity for RDF computations. cell_vectors : :obj:`numpy.ndarray` The three cell vectors. inter_only : :obj:`bool`, default: ``True`` Only intermolecular distances are allowed. If ``True``, atoms that have the same ``entity_id`` are ignored. use_ray : :obj:`bool`, default: ``False`` Use `ray <https://docs.ray.io/en/latest/>`__ to parallelize computations. n_workers : :obj:`int`, default: ``1`` Total number of workers available for ray. This is ignored if ``use_ray`` is ``False``. ray_address : :obj:`str`, default: ``"auto"`` Ray cluster address to connect to. """ # Store data self.Z = Z self.entity_ids = entity_ids self.comp_ids = comp_ids self.cell_vectors = cell_vectors self.bin_width = bin_width self.rdf_range = rdf_range self.inter_only = inter_only self.use_ray = use_ray self.n_workers = n_workers # Setup ray if use_ray: if not ray.is_initialized(): log.debug("ray is not initialized") # Try to connect to already running ray service (from ray cli). try: log.debug("Trying to connect to ray at address %r", ray_address) ray.init(address=ray_address) except ConnectionError: log.debug("Failed to connect to ray at %r", ray_address) log.debug("Trying to initialize ray with %d cores", n_workers) ray.init(num_cpus=n_workers) log.debug("Successfully initialized ray") else: log.debug("Ray was already initialized") self._max_chunk_size = 300 def _setup(self, comp_id_pair, entity_idxs): r"""Prepare to do RDF computation. Parameters ---------- comp_id_pair : :obj:`tuple` The component ID of the entities to consider. entity_idxs : :obj:`tuple`, ndim: ``1`` or ``2`` The atom indices in each component to compute distances from. """ # Setup histogram rdf_span = self.rdf_range[-1] - self.rdf_range[0] nbins = int(rdf_span / self.bin_width) self._hist_settings = {"bins": nbins, "range": self.rdf_range} count, edges = np.histogram([-1], **self._hist_settings) count = count.astype(np.float64) count *= 0.0 self._count = count self.edges = edges self.bins = 0.5 * (edges[:-1] + edges[1:]) # Cumulative volume for rdf normalization. self._cuml_volume = 0.0 # Cumulative volume. self._n_analyzed = 0 # Number of structures analyzed # Compute atom pairs indices. atom_sets = [] for i, comp_id in enumerate(comp_id_pair): entity_idx = entity_idxs[i] # Could contain an int or tuple avail_entities = np.where(comp_id == self.comp_ids)[0] # Convert entity_ids into atom indices sets = [] for entity_id in avail_entities: if isinstance(entity_idx, (tuple, list)): entity_idx = list(entity_idx) elif isinstance(entity_idx, int): entity_idx = [entity_idx] sets.extend( np.argwhere(entity_id == self.entity_ids).T[0][entity_idx].tolist() ) atom_sets.append(sets) self._atom_sets = atom_sets # Invalid or unwanted values will be labeled with -1 to drop later. atom_pairs = np.empty( (len(tuple(gen_combs(atom_sets))), len(atom_sets)), dtype=np.int32 ) self._n_pairs, self._n_sets = atom_pairs.shape i = 0 for comb in gen_combs(atom_sets): # Check if the pair is on the same entity if requested. if self.inter_only: if len(set(self.entity_ids[list(comb)])) == 1: atom_pairs[i] = -1 i += 1 continue atom_pairs[i] = comb i += 1 atom_pairs = atom_pairs[atom_pairs >= 0] self._atom_pairs = atom_pairs.reshape( (int(len(atom_pairs) / self._n_sets), self._n_sets) )
[docs] def run(self, R, comp_id_pair, entity_idxs, step=1): r"""Perform the RDF computation. Parameters ---------- R : :obj:`numpy.ndarray`, ndim: ``3`` Cartesian coordinates of all atoms in the system under periodic boundary conditions. .. tip:: We recommend a memory-map for large ``R`` to reduce memory requirements. comp_id_pair : :obj:`tuple` The component ID of the entities to consider. For example, ``('h2o', 'h2o')`` or ``('h2o', 'meoh')``. entity_idxs : :obj:`tuple`, ndim: ``1`` or ``2`` The atom indices in each component to compute distances from. step : :obj:`int` Number of structures/frames to skip between each analyzed frame. Returns ------- :obj:`numpy.ndarray` ``self.bins``: :math:`r` as the midpoint of each histogram bin. :obj:`numpy.ndarray` ``self.results``: :math:`g(r)` with respect to ``bins``. Examples -------- Suppose we want to compute the :math:`O_{w}`-:math:`O_{m}` RDF where :math:`O_{w}` and :math:`O_{m}` are the oxygen atoms of water and methanol, respectively. We can define our system as such. >>> import numpy as np >>> Z = np.array([8, 1, 1, 8, 1, 6, 1, 1, 1]*25) >>> entity_ids = np.array([0, 0, 0, 1, 1, 1, 1, 1, 1]*25) >>> comp_ids = np.array(['h2o', 'meoh']*25) >>> cell_vectors = np.array([[16., 0., 0.], [0., 16., 0.], [0., 0., 16.]]) .. note:: The above information is an arbitrary system made up of 25 water and 25 methanol molecules. They are contained in a 16 Angstrom periodic box where the atoms are always in the same order: water, methanol, water, methanol, water, etc. This information completely specifies our system to prepare for computing the RDF. We initialize our object with this information. >>> from mbgdml.analysis.rdf import RDF >>> rdf = RDF(Z, entity_ids, comp_ids, cell_vectors) From here we need to specify what RDF to compute with ``rdf.run()``. We assume the Cartesian coordinates for ``R`` are already loaded as a 3D array or memory-map. The last two pieces of information are ``comp_id_pair`` and ``entity_idxs``. ``comp_id_pair`` specifies what components or species we want to compute our RDF with respect to. In this example, we want :math:`O_{w}`-:math:`O_{m}`. ``entity_idxs`` specifies which atom in each entity to use. Oxygen is the first atom in both water and methanol (i.e., index of ``0``). >>> comp_id_pair = ('h2o', 'meoh') >>> entity_idxs = (0, 0) We can then compute our RDF! >>> bins, gr = rdf.run(R, comp_id_pair, entity_idxs) Notes ----- ``inter_only`` only comes into play when there is a chance of also computing intramolecular distances during the RDF calculation. Take the hydroxyl OH RDF in a pure methanol simulation for instance. Our ``comp_id_pair`` and ``entity_idxs`` would be ``('meoh', 'meoh')`` and ``(0, 1)``. The O-H intramolecular bond distance would be a perfectly valid atom pair. Usually we are interested in intermolecular distances. ``inter_only`` controls whether intramolecular distances are included (``inter_only = False``) or not (``inter_only = True``). ``entity_idxs`` can specify one or more atoms for each component. For example, if you wanted to compute the OH RDF of pure water you could use ``entity_idxs = (0, (1, 2))``. TODO: Support different cell sizes for each structure. """ self._setup(comp_id_pair, entity_idxs) # Computing histogram. # Serial operation. if not self.use_ray: for i in range(0, len(R), step): count, volume_contrib, n_R = _bin_distances( R, i, self._atom_pairs, self._hist_settings, self.cell_vectors ) self._count += count self._cuml_volume += volume_contrib self._n_analyzed += n_R # Parallel operation with ray. else: _bin_distances_remote = ray.remote(_bin_distances) chunk_size = min( self._max_chunk_size, int(len(tuple(range(0, len(R), step))) / self.n_workers), ) chunker = chunk_iterable(range(0, len(R), step), chunk_size) R = ray.put(R) atom_pairs = ray.put(self._atom_pairs) hist_settings = ray.put(self._hist_settings) cell_vectors = ray.put(self.cell_vectors) # Initialize ray workers workers = [] def add_worker(workers, chunker): try: chunk = list(next(chunker)) except StopIteration: return workers.append( _bin_distances_remote.remote( R, chunk, atom_pairs, hist_settings, cell_vectors ) ) return for _ in range(self.n_workers): add_worker(workers, chunker) while len(workers) != 0: done_id, workers = ray.wait(workers) count, volume_contrib, n_R = ray.get(done_id)[0] self._count += count self._cuml_volume += volume_contrib self._n_analyzed += n_R add_worker(workers, chunker) # Normalize the RDF norm = self._n_analyzed # Number of analyzed frames # Volume in each radial shell vols = np.power(self.edges, 3) norm *= 4 / 3 * np.pi * np.diff(vols) # Array of shape self.edges # Average number density N = self._n_pairs avg_volume = self._cuml_volume / self._n_analyzed norm *= N / avg_volume self.results = self._count / norm return self.bins, self.results