Source code for mbgdml.data.dataset

# MIT License
#
# Copyright (c) 2020-2023, Alex M. Maldonado
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.

import os
import numpy as np
from cclib.parser.utils import convertor
from .basedata import mbGDMLData
from .. import utils
from .. import _version
from ..logger import GDMLLogger

mbgdml_version = _version.get_versions()["version"]

log = GDMLLogger(__name__)


[docs]class DataSet(mbGDMLData): r"""For creating, loading, manipulating, and using data sets.""" def __init__(self, dset_path=None, Z_key="Z", R_key="R", E_key="E", F_key="F"): """ Parameters ---------- dset_path : :obj:`str`, optional Path to a `npz` file. Z_key : :obj:`str`, default: ``Z`` :obj:`dict` key in ``dset_path`` for atomic numbers. R_key : :obj:`str`, default: ``R`` :obj:`dict` key in ``dset_path`` for Cartesian coordinates. E_key : :obj:`str`, default: ``E`` :obj:`dict` key in ``dset_path`` for energies. F_key : :obj:`str`, default: ``F`` :obj:`dict` key in ``dset_path`` for atomic forces. """ self.type = "d" self.name = "dataset" # Set keys for atomic properties. self.Z_key = Z_key self.R_key = R_key self.E_key = E_key self.F_key = F_key if dset_path is not None: self.load(dset_path) @property def name(self): r"""Human-readable label for the data set. :type: :obj:`str` """ if hasattr(self, "_name"): return self._name return None @name.setter def name(self, var): self._name = str(var) @property def r_prov_ids(self): r"""Specifies structure sets IDs/labels and corresponding MD5 hashes. Keys are the Rset IDs (:obj:`int`) and values are MD5 hashes (:obj:`str`) for the particular structure set. This is used as a breadcrumb trail that specifies where each structure in the data set originates from. Examples -------- >>> dset.r_prov_ids {0: '2339670ad87a606cb11a72191dfd9f58'} :type: :obj:`dict` """ if hasattr(self, "_r_prov_ids"): return self._r_prov_ids return {} @r_prov_ids.setter def r_prov_ids(self, var): self._r_prov_ids = var @property def r_prov_specs(self): r"""An array specifying where each structure in ``R`` originates from. A ``(n_R, 1 + n_entity)`` array where each row contains the Rset ID from ``r_prov_ids`` (e.g., 0, 1, 2, etc.) then the structure index and entity_ids from the original full structure in the structure set. If there has been no previous sampling, an array of shape (1, 0) is returned. :type: :obj:`numpy.ndarray` Examples -------- >>> dset.r_prov_specs # [r_prov_id, r_index, entity_1, entity_2, entity_3] array([[0, 985, 46, 59, 106], [0, 174, 51, 81, 128]]) """ if hasattr(self, "_r_prov_specs"): return self._r_prov_specs return np.array([[]], dtype="int_") @r_prov_specs.setter def r_prov_specs(self, var): self._r_prov_specs = var @property def F(self): r"""Atomic forces of atoms in structure(s). A :obj:`numpy.ndarray` with shape of ``(m, n, 3)`` where ``m`` is the number of structures and ``n`` is the number of atoms with three Cartesian components. :type: :obj:`numpy.ndarray` """ if hasattr(self, "_F"): return self._F return np.array([[[]]]) @F.setter def F(self, var): self._F = var # pylint: disable=invalid-name @property def E(self): r"""The energies of structure(s). A :obj:`numpy.ndarray` with shape of ``(n,)`` where ``n`` is the number of atoms. :type: :obj:`numpy.ndarray` """ if hasattr(self, "_E"): return self._E return np.array([]) @E.setter def E(self, var): self._E = var # pylint: disable=invalid-name @property def e_unit(self): r"""Units of energy. Options are ``'eV'``, ``'hartree'``, ``'kcal/mol'``, and ``'kJ/mol'``. :type: :obj:`str` """ if hasattr(self, "_e_unit"): return self._e_unit return "n/a" @e_unit.setter def e_unit(self, var): self._e_unit = var @property def theory(self): r"""The level of theory used to compute energy and gradients of the data set. :type: :obj:`str` """ if hasattr(self, "_theory"): return self._theory return "n/a" @theory.setter def theory(self, var): self._theory = var @property def E_min(self): # pylint: disable=invalid-name r"""Minimum energy of all structures. :type: :obj:`float` """ return float(np.min(self.E.ravel())) @property def E_max(self): # pylint: disable=invalid-name r"""Maximum energy of all structures. :type: :obj:`float` """ return float(np.max(self.E.ravel())) @property def E_var(self): # pylint: disable=invalid-name r"""Energy variance. :type: :obj:`float` """ return float(np.var(self.E.ravel())) @property def E_mean(self): # pylint: disable=invalid-name r"""Mean of all energies. :type: :obj:`float` """ return float(np.mean(self.E.ravel())) @property def F_min(self): # pylint: disable=invalid-name r"""Minimum atomic force in all structures. :type: :obj:`float` """ return float(np.min(self.F.ravel())) @property def F_max(self): # pylint: disable=invalid-name r"""Maximum atomic force in all structures. :type: :obj:`float` """ return float(np.max(self.F.ravel())) @property def F_var(self): # pylint: disable=invalid-name r"""Force variance. :type: :obj:`float` """ return float(np.var(self.F.ravel())) @property def F_mean(self): # pylint: disable=invalid-name r"""Mean of all forces. :type: :obj:`float` """ return float(np.mean(self.F.ravel())) @property def md5(self): r"""Unique MD5 hash of data set. Notes ----- ``Z`` and ``R`` are always used to generate the MD5 hash. If available, :obj:`mbgdml.data.DataSet.E` and :obj:`mbgdml.data.DataSet.F` are used. :type: :obj:`str` """ try: return self.asdict()["md5"].item().decode() except BaseException: print("Not enough information in dset for MD5") raise @property def entity_ids(self): r"""1D array specifying which atoms belong to which entities. An entity represents a related set of atoms such as a single molecule, several molecules, or a functional group. For mbGDML, an entity usually corresponds to a model trained to predict energies and forces of those atoms. Each ``entity_id`` is an :obj:`int` starting from ``0``. It is conceptually similar to PDBx/mmCIF ``_atom_site.label_entity_ids`` data item. Examples -------- A single water molecule would be ``[0, 0, 0]``. A water (three atoms) and methanol (six atoms) molecule in the same structure would be ``[0, 0, 0, 1, 1, 1, 1, 1, 1]``. :type: :obj:`numpy.ndarray` """ if hasattr(self, "_entity_ids"): return self._entity_ids return np.array([]) @entity_ids.setter def entity_ids(self, var): self._entity_ids = np.array(var) @property def comp_ids(self): r"""A 1D array relating ``entity_id`` to a fragment label for chemical components or species. Labels could be ``WAT`` or ``h2o`` for water, ``MeOH`` for methanol, ``bz`` for benzene, etc. There are no standardized labels for species. The index of the label is the respective ``entity_id``. For example, a water and methanol molecule could be ``['h2o', 'meoh']``. Examples -------- Suppose we have a structure containing a water and methanol molecule. We can use the labels of ``h2o`` and ``meoh`` (which could be anything): ``['h2o', 'meoh']``. Note that the ``entity_id`` is a :obj:`str`. :type: :obj:`numpy.ndarray` """ if hasattr(self, "_comp_ids"): return self._comp_ids return np.array([]) @comp_ids.setter def comp_ids(self, var): self._comp_ids = np.array(var) @property def mb(self): r"""Many-body expansion order of this data set. This is :obj:`None` if the data set does not contain many-body energies and forces. :type: :obj:`int` """ if hasattr(self, "_mb"): return self._mb return None @mb.setter def mb(self, var): self._mb = int(var) @property def mb_dsets_md5(self): r"""All MD5 hash of data sets used to remove n-body contributions from data sets. :type: :obj:`numpy.ndarray` """ if hasattr(self, "_mb_dsets_md5"): return self._mb_dsets_md5 return np.array([]) @mb_dsets_md5.setter def mb_dsets_md5(self, var): self._mb_dsets_md5 = var.astype(str) @property def mb_models_md5(self): r"""All MD5 hash of models used to remove n-body contributions from models. :type: :obj:`numpy.ndarray` """ if hasattr(self, "_mb_models_md5"): return self._mb_models_md5 return np.array([]) @mb_models_md5.setter def mb_models_md5(self, var): self._mb_models_md5 = var.astype(str) # pylint: disable-next=invalid-name
[docs] def convertE(self, E_units): r"""Convert energies and updates ``e_unit``. Parameters ---------- E_units : :obj:`str` Desired units of energy. Options are ``'eV'``, ``'hartree'``, ``'kcal/mol'``, and ``'kJ/mol'``. """ self._E = convertor(self.E, self.e_unit, E_units) self.e_unit = E_units
# pylint: disable-next=invalid-name
[docs] def convertR(self, R_units): r"""Convert coordinates and updates ``r_unit``. Parameters ---------- R_units : :obj:`str` Desired units of coordinates. Options are ``'Angstrom'`` or ``'bohr'``. """ self._R = convertor(self.R, self.r_unit, R_units) self.r_unit = R_units
# pylint: disable-next=invalid-name
[docs] def convertF(self, force_e_units, force_r_units, e_units, r_units): r"""Convert forces. Does not change ``e_unit`` or ``r_unit``. Parameters ---------- force_e_units : :obj:`str` Specifies package-specific energy units used in calculation. Available units are ``'eV'``, ``'hartree'``, ``'kcal/mol'``, and ``'kJ/mol'``. force_r_units : :obj:`str` Specifies package-specific distance units used in calculation. Available units are ``'Angstrom'`` and ``'bohr'``. e_units : :obj:`str` Desired units of energy. Available units are ``'eV'``, ``'hartree'``, ``'kcal/mol'``, and ``'kJ/mol'``. r_units : :obj:`str` Desired units of distance. Available units are ``'Angstrom'`` and ``'bohr'``. """ self._F = utils.convert_forces( self.F, force_e_units, force_r_units, e_units, r_units )
def _update(self, dataset): r"""Updates object attributes. Parameters ---------- dataset : :obj:`dict` Contains all information and arrays stored in data set. """ self.name = dataset["name"].item() self._Z = dataset[self.Z_key] self._R = dataset[self.R_key] self._E = dataset[self.E_key] self._F = dataset[self.F_key] self._r_unit = dataset["r_unit"].item() try: self._e_unit = dataset["e_unit"].item() except KeyError: self._e_unit = "n/a" try: self.mbgdml_version = dataset["mbgdml_version"].item() except KeyError: # Some old data sets do not have this information. # This is unessential, so we will just ignore this. pass try: self.theory = dataset["theory"].item() except KeyError: self.theory = "n/a" # mbGDML added data set information. if "mb" in dataset.keys(): self.mb = dataset["mb"].item() if "mb_models_md5" in dataset.keys(): self.mb_models_md5 = dataset["mb_models_md5"] if "mb_dsets_md5" in dataset.keys(): self.mb_dsets_md5 = dataset["mb_dsets_md5"] try: self.r_prov_specs = dataset["r_prov_specs"][()] self.r_prov_ids = dataset["r_prov_ids"][()] self.entity_ids = dataset["entity_ids"] self.comp_ids = dataset["comp_ids"] except KeyError: pass if "centered" in dataset.keys(): self.centered = dataset["centered"].item()
[docs] def load(self, dataset_path): r"""Read data set. Parameters ---------- dataset_path : :obj:`str` Path to NumPy ``npz`` file. """ dataset_npz = np.load(dataset_path, allow_pickle=True) npz_type = dataset_npz.f.type.item() if npz_type != "d": raise ValueError(f"{npz_type} is not a data set.") self._update(dict(dataset_npz))
# pylint: disable-next=too-many-branches
[docs] def asdict(self, gdml_keys=True): r"""Converts object into a custom :obj:`dict`. Parameters ---------- gdml_keys : :obj:`bool`, default: ``True`` Data sets can use any keys to specify atomic data. However, mbGDML uses the standard of ``Z`` for atomic numbers, ``R`` for structure coordinates, ``E`` for energies, and ``F`` for forces. Using this option changes the data set keys to GDML keys. Returns ------- :obj:`dict` """ # Data always available for data sets. dataset = { "type": np.array("d"), "mbgdml_version": np.array(mbgdml_version), "name": np.array(self.name), "r_prov_ids": np.array(self.r_prov_ids), "r_prov_specs": np.array(self.r_prov_specs), self.Z_key: np.array(self.Z), self.R_key: np.array(self.R), "r_unit": np.array(self.r_unit), "entity_ids": self.entity_ids, "comp_ids": self.comp_ids, } if gdml_keys: md5_properties = ["Z", "R"] else: md5_properties = [self.Z_key, self.R_key] # When starting a new data set from a structure set, there will not be # any energy or force data. Thus, we try to add the data if available, # but will not error out if the data is not available. # Theory. try: dataset["theory"] = np.array(self.theory) except BaseException: pass # Energies. try: dataset[self.E_key] = np.array(self.E) dataset["e_unit"] = np.array(self.e_unit) dataset["E_min"] = np.array(self.E_min) dataset["E_max"] = np.array(self.E_max) dataset["E_mean"] = np.array(self.E_mean) dataset["E_var"] = np.array(self.E_var) if gdml_keys: md5_properties.append("E") else: md5_properties.append(self.E_key) except BaseException: pass # Forces. try: dataset[self.F_key] = np.array(self.F) dataset["F_min"] = np.array(self.F_min) dataset["F_max"] = np.array(self.F_max) dataset["F_mean"] = np.array(self.F_mean) dataset["F_var"] = np.array(self.F_var) if gdml_keys: md5_properties.append("F") else: md5_properties.append(self.F_key) except BaseException: pass # mbGDML information. if hasattr(self, "mb") and self.mb is not None: dataset["mb"] = np.array(self.mb) if len(self.mb_models_md5) > 0: dataset["mb_models_md5"] = np.array(self.mb_models_md5) if len(self.mb_dsets_md5) > 0: dataset["mb_dsets_md5"] = np.array(self.mb_dsets_md5) if hasattr(self, "centered"): dataset["centered"] = np.array(self.centered) if gdml_keys: dataset["Z"] = dataset.pop(self.Z_key) dataset["R"] = dataset.pop(self.R_key) dataset["E"] = dataset.pop(self.E_key) dataset["F"] = dataset.pop(self.F_key) dataset["md5"] = np.array(utils.md5_data(dataset, md5_properties)) return dataset
[docs] def print(self): r"""Prints all structure coordinates, energies, and forces of a data set.""" num_config = self.R.shape[0] for config in range(num_config): r_prov_spec = self.r_prov_specs[config] print(f"-----Configuration {config}-----") print( f"r_prov_id: {int(r_prov_spec[0])} " f"Structure index: {int(r_prov_spec[1])}" ) print(f"Molecule indices: {r_prov_spec[2:]}") print(f"Coordinates:\n{self.R[config]}") print(f"Energy: {self.E[config]}") print(f"Forces:\n{self.F[config]}\n")
[docs] def write_xyz(self, save_dir): r"""Saves xyz file of all structures in data set. Parameters ---------- save_dir : :obj:`str` """ xyz_path = os.path.join(save_dir, self.name) utils.write_xyz(xyz_path, self.Z, self.R)