Source code for mbgdml.data.predictset

# MIT License
#
# Copyright (c) 2020-2023, Alex M. Maldonado
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.

import ray
import numpy as np
from .. import __version__ as mbgdml_version
from .basedata import mbGDMLData
from ..mbe import mbePredict, decomp_to_total
from ..logger import GDMLLogger

log = GDMLLogger(__name__)


[docs]class PredictSet(mbGDMLData): r"""A predict set is a data set with mbGDML predicted energy and forces instead of training data. When analyzing many structures using mbGDML it is easier to predict all many-body contributions once and then analyze the stored data. """ def __init__(self, pset=None, Z_key="Z", R_key="R", E_key="E", F_key="F"): r""" Parameters ---------- pset : :obj:`str` or :obj:`dict`, default: ``None`` Predict set path or dictionary to initialize with. Z_key : :obj:`str`, default: ``Z`` :obj:`dict` key in ``pset`` for atomic numbers. R_key : :obj:`str`, default: ``R`` :obj:`dict` key in ``pset`` for Cartesian coordinates. E_key : :obj:`str`, default: ``E`` :obj:`dict` key in ``pset`` for energies. F_key : :obj:`str`, default: ``F`` :obj:`dict` key in ``pset`` for atomic forces. """ self.name = "predictset" r"""File name of the predict set. Default: ``"predictset"`` :type: :obj:`str` """ self.type = "p" self.mbgdml_version = mbgdml_version self._loaded = False self._predicted = False # Set keys for atomic properties. self.Z_key = Z_key self.R_key = R_key self.E_key = E_key self.F_key = F_key if pset is not None: self.load(pset)
[docs] def prepare(self): r"""Prepares a predict set by calculated the decomposed energy and force contributions. Note ----- You must load a dataset first to specify ``Z``, ``R``, ``entity_ids``, and ``comp_ids``. """ E_nbody, F_nbody, entity_combs, nbody_orders = self.mbePredict.predict_decomp( self.Z, self.R, self.entity_ids, self.comp_ids ) self.nbody_orders = nbody_orders for i, order in enumerate(nbody_orders): setattr(self, f"E_{order}", E_nbody[i]) setattr(self, f"F_{order}", F_nbody[i]) setattr(self, f"entity_combs_{order}", entity_combs[i]) self._predicted = True
[docs] def asdict(self): r"""Converts object into a custom :obj:`dict`. Returns ------- :obj:`dict` """ pset = { "type": np.array("p"), "version": np.array(self.mbgdml_version), "name": np.array(self.name), "theory": np.array(self.theory), self.Z_key: self.Z, self.R_key: self.R, "r_unit": np.array(self.r_unit), "E_true": self.E_true, "e_unit": np.array(self.e_unit), "F_true": self.F_true, "entity_ids": self.entity_ids, "comp_ids": self.comp_ids, "nbody_orders": self.nbody_orders, "models_md5": self.models_md5, } for nbody_order in self.nbody_orders: pset[f"E_{nbody_order}"] = getattr(self, f"E_{nbody_order}") pset[f"F_{nbody_order}"] = getattr(self, f"F_{nbody_order}") pset[f"entity_combs_{nbody_order}"] = getattr( self, f"entity_combs_{nbody_order}" ) if self._loaded is False and self._predicted is False: if not hasattr(self, "dataset") or not hasattr(self, "mbgdml"): raise AttributeError("No data can be predicted or is not loaded.") self.prepare() return pset
@property def e_unit(self): r"""Units of energy. Options are ``eV``, ``hartree``, ``kcal/mol``, and ``kJ/mol``. :type: :obj:`str` """ return self._e_unit @e_unit.setter def e_unit(self, var): self._e_unit = var @property def E_true(self): r"""True energies from data set. :type: :obj:`numpy.ndarray` """ return self._E_true @E_true.setter def E_true(self, var): self._E_true = var # pylint: disable=invalid-name @property def F_true(self): r"""True forces from data set. :type: :obj:`numpy.ndarray` """ return self._F_true @F_true.setter def F_true(self, var): self._F_true = var # pylint: disable=invalid-name @property def entity_ids(self): r"""1D array specifying which atoms belong to which entities. An entity represents a related set of atoms such as a single molecule, several molecules, or a functional group. For mbGDML, an entity usually corresponds to a model trained to predict energies and forces of those atoms. Each ``entity_id`` is an :obj:`int` starting from ``0``. It is conceptually similar to PDBx/mmCIF ``_atom_site.label_entity_ids`` data item. Examples -------- A single water molecule would be ``[0, 0, 0]``. A water (three atoms) and methanol (six atoms) molecule in the same structure would be ``[0, 0, 0, 1, 1, 1, 1, 1, 1]``. :type: :obj:`numpy.ndarray` """ if hasattr(self, "_entity_ids"): return self._entity_ids return np.array([]) @entity_ids.setter def entity_ids(self, var): self._entity_ids = np.array(var) @property def comp_ids(self): r"""A 1D array relating ``entity_id`` to a fragment label for chemical components or species. Labels could be ``WAT`` or ``h2o`` for water, ``MeOH`` for methanol, ``bz`` for benzene, etc. There are no standardized labels for species. The index of the label is the respective ``entity_id``. For example, a water and methanol molecule could be ``['h2o', 'meoh']``. Examples -------- Suppose we have a structure containing a water and methanol molecule. We can use the labels of ``h2o`` and ``meoh`` (which could be anything): ``['h2o', 'meoh']``. Note that the ``entity_id`` is a :obj:`str`. :type: :obj:`numpy.ndarray` """ if hasattr(self, "_comp_ids"): return self._comp_ids return np.array([]) @property def theory(self): r"""The level of theory used to compute energy and gradients of the data set. :type: :obj:`str` """ if hasattr(self, "_theory"): return self._theory return "n/a" @theory.setter def theory(self, var): self._theory = var
[docs] def load(self, pset): r"""Reads predict data set and loads data. Parameters ---------- pset : :obj:`str` or :obj:`dict` Path to predict set ``npz`` file or a dictionary. """ if isinstance(pset, str): pset = dict(np.load(pset, allow_pickle=True)) elif not isinstance(pset, dict): raise TypeError(f"{type(pset)} is not supported.") self.name = str(pset["name"][()]) self.theory = str(pset["theory"][()]) self.version = str(pset["version"][()]) self.Z = pset[self.Z_key] self.R = pset[self.R_key] self.r_unit = str(pset["r_unit"][()]) self.e_unit = str(pset["e_unit"][()]) self.E_true = pset[f"{self.E_key}_true"] self.F_true = pset[f"{self.F_key}_true"] self.entity_ids = pset["entity_ids"] self.comp_ids = pset["comp_ids"] self.nbody_orders = pset["nbody_orders"] self.models_md5 = pset["models_md5"] for i in pset["nbody_orders"]: setattr(self, f"{self.E_key}_{i}", pset[f"{self.E_key}_{i}"]) setattr(self, f"{self.F_key}_{i}", pset[f"{self.F_key}_{i}"]) setattr(self, f"entity_combs_{i}", pset[f"entity_combs_{i}"]) self._loaded = True
@comp_ids.setter def comp_ids(self, var): self._comp_ids = np.array(var)
[docs] def nbody_predictions( self, nbody_orders, use_ray=False, n_workers=1, ray_address="auto" ): r"""Energies and forces of all structures including ``nbody_order`` contributions. Predict sets have data that is broken down into many-body contributions. This function sums the many-body contributions up to the specified level; for example, ``3`` returns the energy and force predictions when including one, two, and three body contributions/corrections. Parameters ---------- nbody_orders : :obj:`list` of :obj:`int` :math:`n`-body orders to include. use_ray : :obj:`bool`, default: ``False`` Use `ray <https://docs.ray.io/en/latest/>`__ to parallelize computations. n_workers : :obj:`int`, default: ``1`` Total number of workers available for ray. This is ignored if ``use_ray`` is ``False``. ray_address : :obj:`str`, default: ``"auto"`` Ray cluster address to connect to. Returns ------- :obj:`numpy.ndarray` Energy of structures up to and including n-order corrections. :obj:`numpy.ndarray` Forces of structures up to an including n-order corrections. """ if use_ray: if not ray.is_initialized(): log.debug("ray is not initialized") # Try to connect to already running ray service (from ray cli). try: log.debug("Trying to connect to ray at address %r", ray_address) ray.init(address=ray_address) except ConnectionError: log.debug("Failed to connect to ray at %r", ray_address) log.debug("Trying to initialize ray with %d cores", n_workers) ray.init(num_cpus=n_workers) log.debug("Successfully initialized ray") else: log.debug("Ray was already initialized") E = np.zeros(self.E_true.shape) F = np.zeros(self.F_true.shape) for nbody_order in nbody_orders: E_decomp = getattr(self, f"{self.E_key}_{nbody_order}") F_decomp = getattr(self, f"{self.F_key}_{nbody_order}") entity_combs = getattr(self, f"entity_combs_{nbody_order}") if len(nbody_orders) == 1: return E_decomp, F_decomp E_nbody, F_nbody = decomp_to_total( E_decomp, F_decomp, self.entity_ids, entity_combs ) E += E_nbody F += F_nbody return E, F
[docs] def load_dataset(self, dset, Z_key="Z", R_key="R", E_key="E", F_key="F"): r"""Loads data set in preparation to create a predict set. Parameters ---------- dset : :obj:`str` or :obj:`dict` Path to data set or :obj:`dict` with at least the following data: Z_key : :obj:`str`, default: ``Z`` :obj:`dict` key in ``dset_path`` for atomic numbers. R_key : :obj:`str`, default: ``R`` :obj:`dict` key in ``dset_path`` for atomic Cartesian coordinates. E_key : :obj:`str`, default: ``E`` :obj:`dict` key in ``dset_path`` for energies. F_key : :obj:`str`, default: ``F`` :obj:`dict` key in ``dset_path`` for atomic forces. """ if isinstance(dset, str): dset = dict(np.load(dset, allow_pickle=True)) elif not isinstance(dset, dict): raise TypeError(f"{type(dset)} is not supported") self.Z = dset[Z_key] self.R = dset[R_key] E = dset[E_key] if not isinstance(dset[E_key], np.ndarray): E = np.array(E) if E.ndim == 0: E = np.array([E]) self.E_true = E self.F_true = dset[F_key] self.entity_ids = dset["entity_ids"] self.comp_ids = dset["comp_ids"] if isinstance(dset["theory"], np.ndarray): self.theory = str(dset["theory"].item()) else: self.theory = dset["theory"] if isinstance(dset["r_unit"], np.ndarray): self.r_unit = str(dset["r_unit"].item()) else: self.r_unit = dset["r_unit"] if isinstance(dset["e_unit"], np.ndarray): self.e_unit = str(dset["e_unit"].item()) else: self.e_unit = dset["e_unit"]
[docs] def load_models( self, models, predict_model, use_ray=False, n_workers=1, ray_address="auto", wkr_chunk_size=100, ): r"""Loads model(s) in preparation to create a predict set. Parameters ---------- models : :obj:`list` of :obj:`mbgdml.models.Model` Machine learning model objects that contain all information to make predictions using ``predict_model``. predict_model : ``callable`` A function that takes ``Z, R, entity_ids, nbody_gen, model`` and computes energies and forces. This will be turned into a ray remote function if ``use_ray = True``. This can return total properties or all individual :math:`n`-body energies and forces. use_ray : :obj:`bool`, default: ``False`` Use `ray <https://docs.ray.io/en/latest/>`__ to parallelize computations. n_workers : :obj:`int`, default: ``1`` Total number of workers available for ray. This is ignored if ``use_ray`` is ``False``. ray_address : :obj:`str`, default: ``"auto"`` Ray cluster address to connect to. wkr_chunk_size : :obj:`int`, default: ``100`` Number of :math:`n`-body structures to assign to each spawned worker with ray. """ if use_ray: assert ray.is_initialized() self.mbePredict = mbePredict( models, predict_model, use_ray, n_workers, ray_address, wkr_chunk_size ) models_md5, nbody_orders = [], [] for model in self.mbePredict.models: if use_ray: model = ray.get(model) nbody_orders.append(model.nbody_order) try: models_md5.append(model.md5) except AttributeError: pass self.nbody_orders = np.array(nbody_orders) self.models_md5 = np.array(models_md5)