Source code for mbgdml.data.dataset

# MIT License
#
# Copyright (c) 2020-2023, Alex M. Maldonado
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.

import os
import numpy as np
from cclib.parser.utils import convertor
from .basedata import mbGDMLData
from .. import utils
from .. import _version
from ..logger import GDMLLogger

mbgdml_version = _version.get_versions()["version"]

log = GDMLLogger(__name__)


[docs]class DataSet(mbGDMLData):
    r"""For creating, loading, manipulating, and using data sets."""

    def __init__(self, dset_path=None, Z_key="Z", R_key="R", E_key="E", F_key="F"):
        """
        Parameters
        ----------
        dset_path : :obj:`str`, optional
            Path to a `npz` file.
        Z_key : :obj:`str`, default: ``Z``
            :obj:`dict` key in ``dset_path`` for atomic numbers.
        R_key : :obj:`str`, default: ``R``
            :obj:`dict` key in ``dset_path`` for Cartesian coordinates.
        E_key : :obj:`str`, default: ``E``
            :obj:`dict` key in ``dset_path`` for energies.
        F_key : :obj:`str`, default: ``F``
            :obj:`dict` key in ``dset_path`` for atomic forces.
        """
        self.type = "d"
        self.name = "dataset"

        # Set keys for atomic properties.
        self.Z_key = Z_key
        self.R_key = R_key
        self.E_key = E_key
        self.F_key = F_key

        if dset_path is not None:
            self.load(dset_path)

    @property
    def name(self):
        r"""Human-readable label for the data set.

        :type: :obj:`str`
        """
        if hasattr(self, "_name"):
            return self._name

        return None

    @name.setter
    def name(self, var):
        self._name = str(var)

    @property
    def r_prov_ids(self):
        r"""Specifies structure sets IDs/labels and corresponding MD5 hashes.

        Keys are the Rset IDs (:obj:`int`) and values are MD5 hashes
        (:obj:`str`) for the particular structure set.

        This is used as a breadcrumb trail that specifies where each structure
        in the data set originates from.

        Examples
        --------
        >>> dset.r_prov_ids
        {0: '2339670ad87a606cb11a72191dfd9f58'}

        :type: :obj:`dict`
        """
        if hasattr(self, "_r_prov_ids"):
            return self._r_prov_ids

        return {}

    @r_prov_ids.setter
    def r_prov_ids(self, var):
        self._r_prov_ids = var

    @property
    def r_prov_specs(self):
        r"""An array specifying where each structure in ``R`` originates from.

        A ``(n_R, 1 + n_entity)`` array where each row contains the Rset ID
        from ``r_prov_ids`` (e.g., 0, 1, 2, etc.) then the structure index and
        entity_ids from the original full structure in the structure set.

        If there has been no previous sampling, an array of shape (1, 0)
        is returned.

        :type: :obj:`numpy.ndarray`

        Examples
        --------
        >>> dset.r_prov_specs  # [r_prov_id, r_index, entity_1, entity_2, entity_3]
        array([[0, 985, 46, 59, 106],
               [0, 174, 51, 81, 128]])
        """
        if hasattr(self, "_r_prov_specs"):
            return self._r_prov_specs

        return np.array([[]], dtype="int_")

    @r_prov_specs.setter
    def r_prov_specs(self, var):
        self._r_prov_specs = var

    @property
    def F(self):
        r"""Atomic forces of atoms in structure(s).

        A :obj:`numpy.ndarray` with shape of ``(m, n, 3)`` where ``m`` is the
        number of structures and ``n`` is the number of atoms with three
        Cartesian components.

        :type: :obj:`numpy.ndarray`
        """
        if hasattr(self, "_F"):
            return self._F

        return np.array([[[]]])

    @F.setter
    def F(self, var):
        self._F = var  # pylint: disable=invalid-name

    @property
    def E(self):
        r"""The energies of structure(s).

        A :obj:`numpy.ndarray` with shape of ``(n,)`` where ``n`` is the number
        of atoms.

        :type: :obj:`numpy.ndarray`
        """
        if hasattr(self, "_E"):
            return self._E

        return np.array([])

    @E.setter
    def E(self, var):
        self._E = var  # pylint: disable=invalid-name

    @property
    def e_unit(self):
        r"""Units of energy. Options are ``'eV'``, ``'hartree'``,
        ``'kcal/mol'``, and ``'kJ/mol'``.

        :type: :obj:`str`
        """
        if hasattr(self, "_e_unit"):
            return self._e_unit

        return "n/a"

    @e_unit.setter
    def e_unit(self, var):
        self._e_unit = var

    @property
    def theory(self):
        r"""The level of theory used to compute energy and gradients of the data
        set.

        :type: :obj:`str`
        """
        if hasattr(self, "_theory"):
            return self._theory

        return "n/a"

    @theory.setter
    def theory(self, var):
        self._theory = var

    @property
    def E_min(self):  # pylint: disable=invalid-name
        r"""Minimum energy of all structures.

        :type: :obj:`float`
        """
        return float(np.min(self.E.ravel()))

    @property
    def E_max(self):  # pylint: disable=invalid-name
        r"""Maximum energy of all structures.

        :type: :obj:`float`
        """
        return float(np.max(self.E.ravel()))

    @property
    def E_var(self):  # pylint: disable=invalid-name
        r"""Energy variance.

        :type: :obj:`float`
        """
        return float(np.var(self.E.ravel()))

    @property
    def E_mean(self):  # pylint: disable=invalid-name
        r"""Mean of all energies.

        :type: :obj:`float`
        """
        return float(np.mean(self.E.ravel()))

    @property
    def F_min(self):  # pylint: disable=invalid-name
        r"""Minimum atomic force in all structures.

        :type: :obj:`float`
        """
        return float(np.min(self.F.ravel()))

    @property
    def F_max(self):  # pylint: disable=invalid-name
        r"""Maximum atomic force in all structures.

        :type: :obj:`float`
        """
        return float(np.max(self.F.ravel()))

    @property
    def F_var(self):  # pylint: disable=invalid-name
        r"""Force variance.

        :type: :obj:`float`
        """
        return float(np.var(self.F.ravel()))

    @property
    def F_mean(self):  # pylint: disable=invalid-name
        r"""Mean of all forces.

        :type: :obj:`float`
        """
        return float(np.mean(self.F.ravel()))

    @property
    def md5(self):
        r"""Unique MD5 hash of data set.

        Notes
        -----
        ``Z`` and ``R`` are always used to generate the MD5 hash. If available,
        :obj:`mbgdml.data.DataSet.E` and :obj:`mbgdml.data.DataSet.F` are used.

        :type: :obj:`str`
        """
        try:
            return self.asdict()["md5"].item().decode()
        except BaseException:
            print("Not enough information in dset for MD5")
            raise

    @property
    def entity_ids(self):
        r"""1D array specifying which atoms belong to which entities.

        An entity represents a related set of atoms such as a single molecule,
        several molecules, or a functional group. For mbGDML, an entity usually
        corresponds to a model trained to predict energies and forces of those
        atoms. Each ``entity_id`` is an :obj:`int` starting from ``0``.

        It is conceptually similar to PDBx/mmCIF ``_atom_site.label_entity_ids``
        data item.

        Examples
        --------
        A single water molecule would be ``[0, 0, 0]``. A water (three atoms)
        and methanol (six atoms) molecule in the same structure would be
        ``[0, 0, 0, 1, 1, 1, 1, 1, 1]``.

        :type: :obj:`numpy.ndarray`
        """
        if hasattr(self, "_entity_ids"):
            return self._entity_ids

        return np.array([])

    @entity_ids.setter
    def entity_ids(self, var):
        self._entity_ids = np.array(var)

    @property
    def comp_ids(self):
        r"""A 1D array relating ``entity_id`` to a fragment label for chemical
        components or species. Labels could be ``WAT`` or ``h2o`` for water,
        ``MeOH`` for methanol, ``bz`` for benzene, etc. There are no
        standardized labels for species. The index of the label is the
        respective ``entity_id``. For example, a water and methanol molecule
        could be ``['h2o', 'meoh']``.

        Examples
        --------
        Suppose we have a structure containing a water and methanol molecule.
        We can use the labels of ``h2o`` and ``meoh`` (which could be
        anything): ``['h2o', 'meoh']``. Note that the
        ``entity_id`` is a :obj:`str`.

        :type: :obj:`numpy.ndarray`
        """
        if hasattr(self, "_comp_ids"):
            return self._comp_ids

        return np.array([])

    @comp_ids.setter
    def comp_ids(self, var):
        self._comp_ids = np.array(var)

    @property
    def mb(self):
        r"""Many-body expansion order of this data set. This is :obj:`None` if the
        data set does not contain many-body energies and forces.

        :type: :obj:`int`
        """
        if hasattr(self, "_mb"):
            return self._mb

        return None

    @mb.setter
    def mb(self, var):
        self._mb = int(var)

    @property
    def mb_dsets_md5(self):
        r"""All MD5 hash of data sets used to remove n-body contributions from
        data sets.

        :type: :obj:`numpy.ndarray`
        """
        if hasattr(self, "_mb_dsets_md5"):
            return self._mb_dsets_md5

        return np.array([])

    @mb_dsets_md5.setter
    def mb_dsets_md5(self, var):
        self._mb_dsets_md5 = var.astype(str)

    @property
    def mb_models_md5(self):
        r"""All MD5 hash of models used to remove n-body contributions from
        models.

        :type: :obj:`numpy.ndarray`
        """
        if hasattr(self, "_mb_models_md5"):
            return self._mb_models_md5

        return np.array([])

    @mb_models_md5.setter
    def mb_models_md5(self, var):
        self._mb_models_md5 = var.astype(str)

    # pylint: disable-next=invalid-name
[docs]    def convertE(self, E_units):
        r"""Convert energies and updates ``e_unit``.

        Parameters
        ----------
        E_units : :obj:`str`
            Desired units of energy. Options are ``'eV'``, ``'hartree'``,
            ``'kcal/mol'``, and ``'kJ/mol'``.
        """
        self._E = convertor(self.E, self.e_unit, E_units)
        self.e_unit = E_units

    # pylint: disable-next=invalid-name
[docs]    def convertR(self, R_units):
        r"""Convert coordinates and updates ``r_unit``.

        Parameters
        ----------
        R_units : :obj:`str`
            Desired units of coordinates. Options are ``'Angstrom'`` or
            ``'bohr'``.
        """
        self._R = convertor(self.R, self.r_unit, R_units)
        self.r_unit = R_units

    # pylint: disable-next=invalid-name
[docs]    def convertF(self, force_e_units, force_r_units, e_units, r_units):
        r"""Convert forces.

        Does not change ``e_unit`` or ``r_unit``.

        Parameters
        ----------
        force_e_units : :obj:`str`
            Specifies package-specific energy units used in calculation.
            Available units are ``'eV'``, ``'hartree'``, ``'kcal/mol'``, and
            ``'kJ/mol'``.
        force_r_units : :obj:`str`
            Specifies package-specific distance units used in calculation.
            Available units are ``'Angstrom'`` and ``'bohr'``.
        e_units : :obj:`str`
            Desired units of energy. Available units are ``'eV'``,
            ``'hartree'``, ``'kcal/mol'``, and ``'kJ/mol'``.
        r_units : :obj:`str`
            Desired units of distance. Available units are ``'Angstrom'`` and
            ``'bohr'``.
        """
        self._F = utils.convert_forces(
            self.F, force_e_units, force_r_units, e_units, r_units
        )

    def _update(self, dataset):
        r"""Updates object attributes.

        Parameters
        ----------
        dataset : :obj:`dict`
            Contains all information and arrays stored in data set.
        """
        self.name = dataset["name"].item()
        self._Z = dataset[self.Z_key]
        self._R = dataset[self.R_key]
        self._E = dataset[self.E_key]
        self._F = dataset[self.F_key]
        self._r_unit = dataset["r_unit"].item()
        try:
            self._e_unit = dataset["e_unit"].item()
        except KeyError:
            self._e_unit = "n/a"
        try:
            self.mbgdml_version = dataset["mbgdml_version"].item()
        except KeyError:
            # Some old data sets do not have this information.
            # This is unessential, so we will just ignore this.
            pass
        try:
            self.theory = dataset["theory"].item()
        except KeyError:
            self.theory = "n/a"
        # mbGDML added data set information.
        if "mb" in dataset.keys():
            self.mb = dataset["mb"].item()
        if "mb_models_md5" in dataset.keys():
            self.mb_models_md5 = dataset["mb_models_md5"]
        if "mb_dsets_md5" in dataset.keys():
            self.mb_dsets_md5 = dataset["mb_dsets_md5"]

        try:
            self.r_prov_specs = dataset["r_prov_specs"][()]
            self.r_prov_ids = dataset["r_prov_ids"][()]
            self.entity_ids = dataset["entity_ids"]
            self.comp_ids = dataset["comp_ids"]
        except KeyError:
            pass

        if "centered" in dataset.keys():
            self.centered = dataset["centered"].item()

[docs]    def load(self, dataset_path):
        r"""Read data set.

        Parameters
        ----------
        dataset_path : :obj:`str`
            Path to NumPy ``npz`` file.
        """
        dataset_npz = np.load(dataset_path, allow_pickle=True)
        npz_type = dataset_npz.f.type.item()

        if npz_type != "d":
            raise ValueError(f"{npz_type} is not a data set.")

        self._update(dict(dataset_npz))

    # pylint: disable-next=too-many-branches
[docs]    def asdict(self, gdml_keys=True):
        r"""Converts object into a custom :obj:`dict`.

        Parameters
        ----------
        gdml_keys : :obj:`bool`, default: ``True``
            Data sets can use any keys to specify atomic data. However, mbGDML uses
            the standard of ``Z`` for atomic numbers, ``R`` for structure coordinates,
            ``E`` for energies, and ``F`` for forces. Using this option changes the
            data set keys to GDML keys.

        Returns
        -------
        :obj:`dict`
        """
        # Data always available for data sets.
        dataset = {
            "type": np.array("d"),
            "mbgdml_version": np.array(mbgdml_version),
            "name": np.array(self.name),
            "r_prov_ids": np.array(self.r_prov_ids),
            "r_prov_specs": np.array(self.r_prov_specs),
            self.Z_key: np.array(self.Z),
            self.R_key: np.array(self.R),
            "r_unit": np.array(self.r_unit),
            "entity_ids": self.entity_ids,
            "comp_ids": self.comp_ids,
        }
        if gdml_keys:
            md5_properties = ["Z", "R"]
        else:
            md5_properties = [self.Z_key, self.R_key]

        # When starting a new data set from a structure set, there will not be
        # any energy or force data. Thus, we try to add the data if available,
        # but will not error out if the data is not available.
        # Theory.
        try:
            dataset["theory"] = np.array(self.theory)
        except BaseException:
            pass

        # Energies.
        try:
            dataset[self.E_key] = np.array(self.E)
            dataset["e_unit"] = np.array(self.e_unit)
            dataset["E_min"] = np.array(self.E_min)
            dataset["E_max"] = np.array(self.E_max)
            dataset["E_mean"] = np.array(self.E_mean)
            dataset["E_var"] = np.array(self.E_var)
            if gdml_keys:
                md5_properties.append("E")
            else:
                md5_properties.append(self.E_key)
        except BaseException:
            pass

        # Forces.
        try:
            dataset[self.F_key] = np.array(self.F)
            dataset["F_min"] = np.array(self.F_min)
            dataset["F_max"] = np.array(self.F_max)
            dataset["F_mean"] = np.array(self.F_mean)
            dataset["F_var"] = np.array(self.F_var)
            if gdml_keys:
                md5_properties.append("F")
            else:
                md5_properties.append(self.F_key)
        except BaseException:
            pass

        # mbGDML information.
        if hasattr(self, "mb") and self.mb is not None:
            dataset["mb"] = np.array(self.mb)
        if len(self.mb_models_md5) > 0:
            dataset["mb_models_md5"] = np.array(self.mb_models_md5)
        if len(self.mb_dsets_md5) > 0:
            dataset["mb_dsets_md5"] = np.array(self.mb_dsets_md5)

        if hasattr(self, "centered"):
            dataset["centered"] = np.array(self.centered)

        if gdml_keys:
            dataset["Z"] = dataset.pop(self.Z_key)
            dataset["R"] = dataset.pop(self.R_key)
            dataset["E"] = dataset.pop(self.E_key)
            dataset["F"] = dataset.pop(self.F_key)

        dataset["md5"] = np.array(utils.md5_data(dataset, md5_properties))
        return dataset

[docs]    def print(self):
        r"""Prints all structure coordinates, energies, and forces of a data set."""
        num_config = self.R.shape[0]
        for config in range(num_config):
            r_prov_spec = self.r_prov_specs[config]
            print(f"-----Configuration {config}-----")
            print(
                f"r_prov_id: {int(r_prov_spec[0])}     "
                f"Structure index: {int(r_prov_spec[1])}"
            )
            print(f"Molecule indices: {r_prov_spec[2:]}")
            print(f"Coordinates:\n{self.R[config]}")
            print(f"Energy: {self.E[config]}")
            print(f"Forces:\n{self.F[config]}\n")

[docs]    def write_xyz(self, save_dir):
        r"""Saves xyz file of all structures in data set.

        Parameters
        ----------
        save_dir : :obj:`str`
        """
        xyz_path = os.path.join(save_dir, self.name)
        utils.write_xyz(xyz_path, self.Z, self.R)