# MIT License
#
# Copyright (c) 2022-2023, Alex M. Maldonado
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
"""Compute RDF curves under periodic boundary conditions."""
import ray
import numpy as np
from ..periodic import Cell
from ..utils import gen_combs, chunk_iterable
from ..logger import GDMLLogger
log = GDMLLogger(__name__)
# Possible ray task.
def _bin_distances(R, R_idxs, atom_pairs, rdf_settings, cell_vectors):
r"""Compute relevant RDF data for one or more structures.
Parameters
----------
R : :obj:`numpy.ndarray` or :obj:`numpy.memmap`, ndim: ``3``
Atomic coordinates of one or more structures.
R_idxs : :obj:`int` or :obj:`list`
Indices of ``R`` to compute RDF contributions.
atom_pairs : :obj:`numpy.ndarray`, ndim: ``2``
Indices of all atom pairs to consider for each structure.
rdf_settings : :obj:`dict`
Keyword arguments for :func:`numpy.histogram` to bin distances.
cell_vectors : :obj:`numpy.ndarray`
The three periodic cell vectors. For example, a cube of length 16.0 would
be ``[[16.0, 0.0, 0.0], [0.0, 16.0, 0.0], [0.0, 0.0, 16.0]]``.
Returns
-------
:obj:`numpy.ndarray`
Histogram count of distances.
:obj:`float`
Cumulative volume for this set of structures.
:obj:`int`
Number of structures computed here.
"""
R = R[R_idxs]
if R.ndim == 2:
R = R[None, ...]
n_R = R.shape[0]
cell = Cell(cell_vectors, cell_vectors[0][0] / 2, True)
# Compute histogram of distances for structure(s)
D = R[:, atom_pairs[:, 1]] - R[:, atom_pairs[:, 0]]
new_shape = (np.prod(D.shape[:2]), 3)
D = D.reshape(new_shape)
D = cell.d_mic(D, check_cutoff=False) # Should check_cutoff be false?
dists = np.linalg.norm(D, ord=2, axis=1).flatten()
count, _ = np.histogram(dists, **rdf_settings)
# Determine volume contribution.
vol_contrib = n_R * cell.volume
return count, vol_contrib, n_R
[docs]class RDF:
r"""Handles calculating the radial distribution function (RDF),
:math:`g(r)`, of a constant volume simulation.
"""
def __init__(
self,
Z,
entity_ids,
comp_ids,
cell_vectors,
bin_width=0.05,
rdf_range=(0.0, 15.0),
inter_only=True,
use_ray=False,
ray_address="auto",
n_workers=1,
):
r"""
Parameters
----------
Z : :obj:`numpy.ndarray`, ndim: ``1``
Atomic numbers of all atoms in the system.
entity_ids : :obj:`numpy.ndarray`, ndim: ``1``
Integers that specify which fragment each atom belongs to for all
structures.
comp_ids : :obj:`numpy.ndarray`, ndim: ``1``
Labels for each ``entity_id`` used to determine the desired entity
for RDF computations.
cell_vectors : :obj:`numpy.ndarray`
The three cell vectors.
inter_only : :obj:`bool`, default: ``True``
Only intermolecular distances are allowed. If ``True``, atoms that
have the same ``entity_id`` are ignored.
use_ray : :obj:`bool`, default: ``False``
Use `ray <https://docs.ray.io/en/latest/>`__ to parallelize
computations.
n_workers : :obj:`int`, default: ``1``
Total number of workers available for ray. This is ignored if ``use_ray``
is ``False``.
ray_address : :obj:`str`, default: ``"auto"``
Ray cluster address to connect to.
"""
# Store data
self.Z = Z
self.entity_ids = entity_ids
self.comp_ids = comp_ids
self.cell_vectors = cell_vectors
self.bin_width = bin_width
self.rdf_range = rdf_range
self.inter_only = inter_only
self.use_ray = use_ray
self.n_workers = n_workers
# Setup ray
if use_ray:
if not ray.is_initialized():
log.debug("ray is not initialized")
# Try to connect to already running ray service (from ray cli).
try:
log.debug("Trying to connect to ray at address %r", ray_address)
ray.init(address=ray_address)
except ConnectionError:
log.debug("Failed to connect to ray at %r", ray_address)
log.debug("Trying to initialize ray with %d cores", n_workers)
ray.init(num_cpus=n_workers)
log.debug("Successfully initialized ray")
else:
log.debug("Ray was already initialized")
self._max_chunk_size = 300
def _setup(self, comp_id_pair, entity_idxs):
r"""Prepare to do RDF computation.
Parameters
----------
comp_id_pair : :obj:`tuple`
The component ID of the entities to consider.
entity_idxs : :obj:`tuple`, ndim: ``1`` or ``2``
The atom indices in each component to compute distances from.
"""
# Setup histogram
rdf_span = self.rdf_range[-1] - self.rdf_range[0]
nbins = int(rdf_span / self.bin_width)
self._hist_settings = {"bins": nbins, "range": self.rdf_range}
count, edges = np.histogram([-1], **self._hist_settings)
count = count.astype(np.float64)
count *= 0.0
self._count = count
self.edges = edges
self.bins = 0.5 * (edges[:-1] + edges[1:])
# Cumulative volume for rdf normalization.
self._cuml_volume = 0.0 # Cumulative volume.
self._n_analyzed = 0 # Number of structures analyzed
# Compute atom pairs indices.
atom_sets = []
for i, comp_id in enumerate(comp_id_pair):
entity_idx = entity_idxs[i] # Could contain an int or tuple
avail_entities = np.where(comp_id == self.comp_ids)[0]
# Convert entity_ids into atom indices
sets = []
for entity_id in avail_entities:
if isinstance(entity_idx, (tuple, list)):
entity_idx = list(entity_idx)
elif isinstance(entity_idx, int):
entity_idx = [entity_idx]
sets.extend(
np.argwhere(entity_id == self.entity_ids).T[0][entity_idx].tolist()
)
atom_sets.append(sets)
self._atom_sets = atom_sets
# Invalid or unwanted values will be labeled with -1 to drop later.
atom_pairs = np.empty(
(len(tuple(gen_combs(atom_sets))), len(atom_sets)), dtype=np.int32
)
self._n_pairs, self._n_sets = atom_pairs.shape
i = 0
for comb in gen_combs(atom_sets):
# Check if the pair is on the same entity if requested.
if self.inter_only:
if len(set(self.entity_ids[list(comb)])) == 1:
atom_pairs[i] = -1
i += 1
continue
atom_pairs[i] = comb
i += 1
atom_pairs = atom_pairs[atom_pairs >= 0]
self._atom_pairs = atom_pairs.reshape(
(int(len(atom_pairs) / self._n_sets), self._n_sets)
)
[docs] def run(self, R, comp_id_pair, entity_idxs, step=1):
r"""Perform the RDF computation.
Parameters
----------
R : :obj:`numpy.ndarray`, ndim: ``3``
Cartesian coordinates of all atoms in the system under periodic
boundary conditions.
.. tip::
We recommend a memory-map for large ``R`` to reduce memory
requirements.
comp_id_pair : :obj:`tuple`
The component ID of the entities to consider. For example,
``('h2o', 'h2o')`` or ``('h2o', 'meoh')``.
entity_idxs : :obj:`tuple`, ndim: ``1`` or ``2``
The atom indices in each component to compute distances from.
step : :obj:`int`
Number of structures/frames to skip between each analyzed frame.
Returns
-------
:obj:`numpy.ndarray`
``self.bins``: :math:`r` as the midpoint of each histogram bin.
:obj:`numpy.ndarray`
``self.results``: :math:`g(r)` with respect to ``bins``.
Examples
--------
Suppose we want to compute the :math:`O_{w}`-:math:`O_{m}` RDF where
:math:`O_{w}` and :math:`O_{m}` are the oxygen atoms of water and
methanol, respectively. We can define our system as such.
>>> import numpy as np
>>> Z = np.array([8, 1, 1, 8, 1, 6, 1, 1, 1]*25)
>>> entity_ids = np.array([0, 0, 0, 1, 1, 1, 1, 1, 1]*25)
>>> comp_ids = np.array(['h2o', 'meoh']*25)
>>> cell_vectors = np.array([[16., 0., 0.], [0., 16., 0.], [0., 0., 16.]])
.. note::
The above information is an arbitrary system made up of 25 water and
25 methanol molecules. They are contained in a 16 Angstrom periodic box
where the atoms are always in the same order: water, methanol, water,
methanol, water, etc.
This information completely specifies our system to prepare for computing
the RDF. We initialize our object with this information.
>>> from mbgdml.analysis.rdf import RDF
>>> rdf = RDF(Z, entity_ids, comp_ids, cell_vectors)
From here we need to specify what RDF to compute with ``rdf.run()``.
We assume the Cartesian coordinates for ``R`` are already loaded as
a 3D array or memory-map.
The last two pieces of information are ``comp_id_pair`` and
``entity_idxs``. ``comp_id_pair`` specifies what components or species
we want to compute our RDF with respect to. In this example, we want
:math:`O_{w}`-:math:`O_{m}`. ``entity_idxs`` specifies which atom in
each entity to use. Oxygen is the first atom in both water and
methanol (i.e., index of ``0``).
>>> comp_id_pair = ('h2o', 'meoh')
>>> entity_idxs = (0, 0)
We can then compute our RDF!
>>> bins, gr = rdf.run(R, comp_id_pair, entity_idxs)
Notes
-----
``inter_only`` only comes into play when there is a chance of also
computing intramolecular distances during the RDF calculation. Take the
hydroxyl OH RDF in a pure methanol simulation for instance. Our
``comp_id_pair`` and ``entity_idxs`` would be ``('meoh', 'meoh')`` and
``(0, 1)``. The O-H intramolecular bond distance would be a perfectly
valid atom pair. Usually we are interested in intermolecular distances.
``inter_only`` controls whether intramolecular distances are included
(``inter_only = False``) or not (``inter_only = True``).
``entity_idxs`` can specify one or more atoms for each component. For
example, if you wanted to compute the OH RDF of pure water you could
use ``entity_idxs = (0, (1, 2))``.
TODO: Support different cell sizes for each structure.
"""
self._setup(comp_id_pair, entity_idxs)
# Computing histogram.
# Serial operation.
if not self.use_ray:
for i in range(0, len(R), step):
count, volume_contrib, n_R = _bin_distances(
R, i, self._atom_pairs, self._hist_settings, self.cell_vectors
)
self._count += count
self._cuml_volume += volume_contrib
self._n_analyzed += n_R
# Parallel operation with ray.
else:
_bin_distances_remote = ray.remote(_bin_distances)
chunk_size = min(
self._max_chunk_size,
int(len(tuple(range(0, len(R), step))) / self.n_workers),
)
chunker = chunk_iterable(range(0, len(R), step), chunk_size)
R = ray.put(R)
atom_pairs = ray.put(self._atom_pairs)
hist_settings = ray.put(self._hist_settings)
cell_vectors = ray.put(self.cell_vectors)
# Initialize ray workers
workers = []
def add_worker(workers, chunker):
try:
chunk = list(next(chunker))
except StopIteration:
return
workers.append(
_bin_distances_remote.remote(
R, chunk, atom_pairs, hist_settings, cell_vectors
)
)
return
for _ in range(self.n_workers):
add_worker(workers, chunker)
while len(workers) != 0:
done_id, workers = ray.wait(workers)
count, volume_contrib, n_R = ray.get(done_id)[0]
self._count += count
self._cuml_volume += volume_contrib
self._n_analyzed += n_R
add_worker(workers, chunker)
# Normalize the RDF
norm = self._n_analyzed # Number of analyzed frames
# Volume in each radial shell
vols = np.power(self.edges, 3)
norm *= 4 / 3 * np.pi * np.diff(vols) # Array of shape self.edges
# Average number density
N = self._n_pairs
avg_volume = self._cuml_volume / self._n_analyzed
norm *= N / avg_volume
self.results = self._count / norm
return self.bins, self.results