Source code for mbgdml.train

# MIT License
#
# Copyright (c) 2018-2020, Stefan Chmiela
# Copyright (c) 2020-2023, Alex M. Maldonado
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.

import os
import shutil
import numpy as np
from bayes_opt import BayesianOptimization, SequentialDomainReductionTransformer

from .models import gdmlModel
from .analysis.problematic import ProblematicStructures
from .predictors import predict_gdml
from ._gdml.train import GDMLTrain, model_errors, add_valid_errors
from ._gdml.train import save_model, get_test_idxs
from .utils import save_json
from .losses import loss_f_rmse, mae, rmse
from .logger import GDMLLogger

log = GDMLLogger(__name__)


class mbGDMLTrain:
    r"""Train many-body GDML models."""

    def __init__(
        self,
        entity_ids,
        comp_ids,
        use_sym=True,
        use_E=True,
        use_E_cstr=False,
        use_cprsn=False,
        solver="analytic",
        lam=1e-10,
        solver_tol=1e-4,
        use_torch=False,
        max_processes=None,
    ):
        """
        Parameters
        ----------
        entity_ids : :obj:`numpy.ndarray`
            Model ``entity_ids``.
        comp_ids : :obj:`numpy.ndarray`
            Model ``comp_ids``.
        use_sym : :obj:`bool`, default: ``True``
            Whether to identify and include symmetries when training GDML
            models. This usually increases training and prediction times, but
            comes with accuracy improvements.
        use_E : :obj:`bool`, default: ``True``
            Whether to reconstruct the potential energy surface (``True``)
            with or (``False``) without energy labels. It is highly
            recommended to train with energies.
        use_E_cstr : :obj:`bool`, default: ``False``
            Whether to include energies as part of the model training.
            ``True`` adds another column of alphas that are trained to the
            energies. This is rarely useful for higher-order *n*-body models.
        use_cprsn : :obj:`bool`, default: ``False``
            Compress the kernel matrix along symmetric degrees of freedom to
            try to reduce training time. Usually does not provide significant
            benefits.
        solver : :obj:`str`, default: ``'analytic'``
            The GDML solver to use, either ``analytic`` or ``iterative``.
        lam : :obj:`float`, default: ``1e-10``
            Hyperparameter lambda (regularization strength). This generally
            does not need to change.
        solver_tol : :obj:`float`, default: ``1e-4``
            Solver tolerance.
        use_torch : :obj:`bool`, default: ``False``
            Use PyTorch to enable GPU acceleration.
        max_processes : :obj:`int`, default: :obj:`None`
            The maximum number of cores to use for the training process. Will
            be determined automatically if not specified.
        """
        self.entity_ids = entity_ids
        self.comp_ids = comp_ids
        self.use_sym = use_sym
        self.use_E = use_E
        self.use_E_cstr = use_E_cstr
        self.use_cprsn = use_cprsn
        self.solver = solver
        self.lam = lam
        self.solver_tol = solver_tol
        self.interact_cut_off = None
        self.use_torch = use_torch
        self.max_processes = max_processes
        self.GDMLTrain = GDMLTrain(max_processes=max_processes, use_torch=use_torch)

        self.sigma_grid = [
            2, 25, 50, 100, 200, 300, 400, 500, 700, 900, 1100, 1500, 2000,
            2500, 3000, 4000, 5000, 6000, 7000, 8000, 9000, 10000, 20000,
            50000, 100000,
        ]
        r"""Determining reasonable ``sigma_bounds`` is difficult without some
        prior experience with the system. Even then, the optimal ``sigma`` can
        change drastically depending on the training set size. ``sigma_grid``
        assists with determining optimal ``sigma_bounds`` by first performing
        a coarse grid search. The Bayesian optimization then starts with
        bounds informed by the grid-search minimum. It is recommended to
        choose ``sigma_bounds`` at least as large as your ``sigma_grid``; the
        bounds are updated internally. The number of probes done during the
        initial grid search is subtracted from the Bayesian optimization
        ``init_points``.

        We recommend that the grid include several lower sigmas (< 50), a few
        medium sigmas (< 500), and several large sigmas that span up to at
        least 1000 for higher-order models.

        **Default**

        .. code-block:: python

            [
                2, 25, 50, 100, 200, 300, 400, 500, 700, 900, 1100, 1500,
                2000, 2500, 3000, 4000, 5000, 6000, 7000, 8000, 9000, 10000,
                20000, 50000, 100000
            ]

        :type: :obj:`list`
        """
        self.bayes_opt_params = {
            "init_points": 10,
            "n_iter": 10,
            "alpha": 1e-7,
            "acq": "ucb",
            "kappa": 1.5,
        }
        r"""Bayesian optimization parameters.

        **Default**

        .. code-block:: python

            {
                'init_points': 10, 'n_iter': 10, 'alpha': 1e-7,
                'acq': 'ucb', 'kappa': 1.5
            }

        :type: :obj:`dict`
        """
        self.bayes_opt_params_final = None
        r"""Bayesian optimization parameters for the final model. If
        :obj:`None`, then ``bayes_opt_params`` are used.

        Default: :obj:`None`

        :type: :obj:`dict`
        """
        self.sigma_bounds = (2, 400)
        r"""Kernel length-scale bounds for the Bayesian optimization. This is
        only used if ``sigma_grid`` is :obj:`None`.

        Default: ``(2, 400)``

        :type: :obj:`tuple`
        """
        self.check_energy_pred = True
        r"""Return the model with the lowest loss that also predicts
        reasonable energies. If ``False``, the model with the lowest loss is
        not checked for reasonable energy predictions.

        Sometimes GDML kernels are unable to accurately reconstruct potential
        energies even if the force predictions are accurate. This is
        especially prevalent in many-body models with low and high sigmas
        (i.e., sigmas less than 5 or greater than 500).

        Default: ``True``

        :type: :obj:`bool`
        """
        self.loss_func = loss_f_rmse
        r"""Loss function for validation. The input of this function is the
        dictionary of :obj:`mbgdml._gdml.train.add_valid_errors`, which
        contains ``force`` and ``energy`` errors.

        Default: :obj:`mbgdml.losses.loss_f_rmse`

        :type: ``callable``
        """
        self.loss_kwargs = {}
        r"""Loss function keyword arguments with the exception of the
        validation ``results`` dictionary.

        :type: :obj:`dict`
        """
        self.require_E_eval = True  # pylint: disable=invalid-name
        r"""Require energy evaluations even if the energy errors are terrible.
        If ``False``, this defaults to sGDML behavior (which does not work
        well with *n*-body training).

        Default: ``True``

        :type: :obj:`bool`
        """
        self.keep_tasks = False
        r"""Keep all models trained during the training task if ``True``. They
        are removed by default.

        :type: :obj:`bool`
        """
        self.bayes_opt_n_check_rising = 4
        r"""Number of additional ``sigma_grid`` probes to check if the loss
        continues to rise after finding a minimum.

        We often perform a grid search prior to Bayesian optimization.
        Sometimes, with :math:`n`-body training, the loss starts rising but
        then falls again to a lower value. Thus, we probe some extra (larger)
        sigmas to check whether the loss falls again. If it does, the grid
        search is restarted.

        :type: :obj:`int`
        """
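
    # A minimal sketch (not part of the original module) of how these
    # attributes are typically adjusted after construction. The ``trainer``
    # instance and the specific grid, bounds, and parameter values below are
    # illustrative assumptions only:
    #
    #     trainer = mbGDMLTrain(entity_ids, comp_ids, use_torch=False)
    #     trainer.sigma_grid = [2, 10, 25, 50, 100, 200, 500, 1000]  # coarser grid
    #     trainer.sigma_bounds = (2, 1000)   # only used if sigma_grid is None
    #     trainer.bayes_opt_params = {"init_points": 5, "n_iter": 15, "alpha": 1e-7}
    #     trainer.check_energy_pred = True   # skip models with unusable energies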

    def min_memory_analytic(self, n_train, n_atoms):
        r"""Minimum memory recommendation for training analytically.

        GDML currently only supports closed-form (i.e., analytic) solutions.
        Thus, the entire kernel matrix must be in memory, which requires
        :math:`(M * 3N)^2` double-precision (8 byte) entries. This provides a
        rough estimate of memory requirements.

        Parameters
        ----------
        n_train : :obj:`int`
            Number of training structures.
        n_atoms : :obj:`int`
            Number of atoms in a single structure.

        Returns
        -------
        :obj:`float`
            Minimum memory requirements in MB.
        """
        return (8 * ((n_train * 3 * n_atoms) ** 2)) * 1e-6
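
    # Rough worked example of the estimate above (illustrative numbers only):
    # 1,000 training structures with 12 atoms each give a kernel matrix of
    # (1000 * 3 * 12)^2 = 36,000^2 entries, so about
    # 8 bytes * 36,000^2 = 1.0368e10 bytes; min_memory_analytic(1000, 12)
    # returns 10368.0 MB (roughly 10.4 GB) before any solver overhead.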

    def __del__(self):
        # pylint: disable=undefined-variable, global-variable-undefined
        global glob

        if "glob" in globals():
            del glob

    def create_task(
        self,
        train_dataset,
        n_train,
        valid_dataset,
        n_valid,
        sigma,
        train_idxs=None,
        valid_idxs=None,
    ):
        r"""Create a single training task that can be used as a template for
        hyperparameter searches.

        Parameters
        ----------
        train_dataset : :obj:`mbgdml.data.DataSet`
            Dataset for training a model on.
        n_train : :obj:`int`
            The number of training points to sample.
        valid_dataset : :obj:`mbgdml.data.DataSet`
            Dataset for validating a model on.
        n_valid : :obj:`int`
            The number of validation points to sample, without replacement.
        sigma : :obj:`float` or :obj:`int`
            Kernel length scale of the desired model.
        train_idxs : :obj:`numpy.ndarray`, default: :obj:`None`
            The specific indices of structures to train the model on. If
            :obj:`None`, the training set is sampled automatically.
        valid_idxs : :obj:`numpy.ndarray`, default: :obj:`None`
            The specific indices of structures to validate models on. If
            :obj:`None`, structures will be automatically determined.
        """
        self.task = self.GDMLTrain.create_task(
            train_dataset,
            n_train,
            valid_dataset,
            n_valid,
            sigma,
            lam=self.lam,
            use_sym=self.use_sym,
            use_E=self.use_E,
            use_E_cstr=self.use_E_cstr,
            use_cprsn=self.use_cprsn,
            solver=self.solver,
            solver_tol=self.solver_tol,
            idxs_train=train_idxs,
            idxs_valid=valid_idxs,
        )
        return self.task

    def train_model(self, task):
        r"""Train a GDML model from a task.

        Parameters
        ----------
        task : :obj:`dict`
            Training task.

        Returns
        -------
        :obj:`dict`
            Trained (not validated or tested) model.
        """
        model = self.GDMLTrain.train(task, require_E_eval=self.require_E_eval)
        return model

    def test_model(self, model, dataset, n_test=None):
        r"""Test a model and add mbGDML modifications.

        Parameters
        ----------
        model : :obj:`mbgdml.models.gdmlModel`
            Model to test.
        dataset : :obj:`dict`
            Test dataset.
        n_test : :obj:`int`, default: :obj:`None`
            Number of test points to use. Defaults to all available
            structures.

        Returns
        -------
        :obj:`mbgdml.models.gdmlModel`
            Tested and finalized many-body GDML model.
        :obj:`dict`
            Force (and, if available, energy) MAE and RMSE of the test.
        """
        n_test, E_errors, F_errors = model_errors(
            model.model_dict,
            dataset,
            is_valid=False,
            n_test=n_test,
            max_processes=self.max_processes,
            use_torch=self.use_torch,
        )
        model.model_dict["n_test"] = np.array(n_test)

        results_test = {"force": {"mae": mae(F_errors), "rmse": rmse(F_errors)}}
        if E_errors is not None:
            results_test["energy"] = {"mae": mae(E_errors), "rmse": rmse(E_errors)}

        # Adding mbGDML-specific modifications to the model.
        model.add_modifications(dataset)
        if "mb" in dataset.keys():
            model.model_dict["mb"] = int(dataset["mb"][()])
        if "mb_models_md5" in dataset.keys():
            model.model_dict["mb_models_md5"] = dataset["mb_models_md5"]

        return model, results_test

    def save_valid_csv(self, save_dir, valid_json):
        r"""Write a CSV summary of validation statistics.

        This makes it easier to see the trend of sigma versus validation
        error.

        Parameters
        ----------
        save_dir : :obj:`str`
            Where to save the CSV file.
        valid_json : :obj:`dict`
            The validation JSON curated during the training routine.
        """
        import pandas as pd  # pylint: disable=import-outside-toplevel

        sigmas = np.array(valid_json["sigmas"])
        sigma_argsort = np.argsort(sigmas)

        sigmas = sigmas[sigma_argsort]
        losses = np.array(valid_json["losses"])[sigma_argsort]
        e_mae = np.array(valid_json["energy"]["mae"])[sigma_argsort]
        e_rmse = np.array(valid_json["energy"]["rmse"])[sigma_argsort]
        f_mae = np.array(valid_json["force"]["mae"])[sigma_argsort]
        f_rmse = np.array(valid_json["force"]["rmse"])[sigma_argsort]

        df = pd.DataFrame(
            {
                "sigma": sigmas,
                "losses": losses,
                "e_mae": e_mae,
                "e_rmse": e_rmse,
                "f_mae": f_mae,
                "f_rmse": f_rmse,
            }
        )
        df.to_csv(os.path.join(save_dir, "valid.csv"), index=False)

    def save_idxs(self, model, dataset, save_dir, n_test):
        r"""Save npy files of the dataset splits (training, validation, and
        test).

        Parameters
        ----------
        model : :obj:`mbgdml.models.gdmlModel`
            Many-body GDML model.
        dataset : :obj:`dict`
            Dataset used for training, validation, and testing.
        save_dir : :obj:`str`
            Where to save the npy files.
        n_test : :obj:`int`
            Number of test points used to determine the test indices.
        """
        log.info("\n# Writing train, valid, and test indices #")
        train_idxs = model.model_dict["idxs_train"]
        valid_idxs = model.model_dict["idxs_valid"]
        np.save(os.path.join(save_dir, "train_idxs"), train_idxs)
        np.save(os.path.join(save_dir, "valid_idxs"), valid_idxs)

        test_idxs = get_test_idxs(model.model_dict, dataset, n_test=n_test)
        np.save(os.path.join(save_dir, "test_idxs"), test_idxs)

    def plot_bayes_opt_gp(self, optimizer):
        r"""Prepare a plot of the Bayesian optimization Gaussian process.

        Parameters
        ----------
        optimizer : ``bayes_opt.BayesianOptimization``

        Returns
        -------
        ``object``
            A matplotlib figure object.
        """
        # pylint: disable=protected-access, import-outside-toplevel
        import matplotlib.pyplot as plt

        params = optimizer.space.params.flatten()
        losses = -optimizer.space.target

        sigma_bounds = optimizer._space.bounds[0]
        lower_bound = min(min(params), sigma_bounds[0])
        upper_bound = max(max(params), sigma_bounds[1])
        x = np.linspace(lower_bound, upper_bound, 10000)

        # pylint: disable-next=invalid-name
        mu, sigma = optimizer._gp.predict(x.reshape(-1, 1), return_std=True)

        fig, ax = plt.subplots(
            nrows=1, ncols=1, figsize=(5.5, 4), constrained_layout=True
        )

        ax.plot(x, -mu, label="Prediction")
        ax.fill_between(
            x,
            -mu - 1.9600 * sigma,
            -mu + 1.9600 * sigma,
            alpha=0.1,
            label=r"95% confidence",
        )
        ax.scatter(params, losses, c="red", s=8, zorder=10, label="Observations")

        ax.set_xlabel("Sigma")
        ax.set_ylabel(self.loss_func.__name__)
        ax.set_xlim(left=lower_bound, right=upper_bound)

        plt.legend()

        return fig

    # pylint: disable-next=too-many-branches, too-many-statements
    def bayes_opt(
        self,
        dataset,
        model_name,
        n_train,
        n_valid,
        n_test=None,
        save_dir=".",
        is_final=False,
        use_domain_opt=False,
        plot_bo=True,
        train_idxs=None,
        valid_idxs=None,
        overwrite=False,
        write_json=True,
        write_idxs=True,
    ):
        r"""Train a GDML model using Bayesian optimization for sigma.

        Uses the `Bayesian optimization
        <https://github.com/fmfn/BayesianOptimization>`__ package to
        automatically find the optimal sigma. This maximizes the negative
        validation loss. ``gp_params`` can be used to specify options of the
        ``BayesianOptimization.maximize()`` method.

        A sequential domain reduction optimizer can be used to accelerate
        convergence to an optimal sigma (when requested).

        Parameters
        ----------
        dataset : :obj:`mbgdml.data.DataSet`
            Dataset to train, validate, and test a model on.
        model_name : :obj:`str`
            User-defined model name without the ``'.npz'`` file extension.
        n_train : :obj:`int`
            The number of training points to use.
        n_valid : :obj:`int`
            The number of validation points to use.
        n_test : :obj:`int`, default: :obj:`None`
            The number of test points to test the validated GDML model.
            Defaults to testing all available structures.
        save_dir : :obj:`str`, default: ``'.'``
            Path to train and save the mbGDML model. Defaults to the current
            directory.
        is_final : :obj:`bool`
            Whether to use ``bayes_opt_params_final`` instead of
            ``bayes_opt_params``.
        use_domain_opt : :obj:`bool`, default: ``False``
            Whether to use a sequential domain reduction optimizer. This
            sometimes crashes.
        plot_bo : :obj:`bool`, default: ``True``
            Plot the Bayesian optimization Gaussian process.
        train_idxs : :obj:`numpy.ndarray`, default: :obj:`None`
            The specific indices of structures to train the model on. If
            :obj:`None`, the training set is sampled automatically.
        valid_idxs : :obj:`numpy.ndarray`, default: :obj:`None`
            The specific indices of structures to validate models on. If
            :obj:`None`, structures will be automatically determined.
        overwrite : :obj:`bool`, default: ``False``
            Overwrite existing files.
        write_json : :obj:`bool`, default: ``True``
            Write a JSON file containing information about the training job.
        write_idxs : :obj:`bool`, default: ``True``
            Write npy files for training, validation, and test indices.

        Returns
        -------
        :obj:`dict`
            Optimal many-body GDML model.
        ``bayes_opt.BayesianOptimization``
            The Bayesian optimizer object.
        """
        t_job = log.t_start()
        log.info(
            "#############################\n"
            "#          Training         #\n"
            "#   Bayesian Optimization   #\n"
            "#############################\n"
        )

        if write_json:
            job_json = {
                "model": {},
                "testing": {},
                "validation": {},
                "training": {"idxs": []},
            }

        if train_idxs is not None:
            assert n_train == len(train_idxs)
            assert len(set(train_idxs)) == len(train_idxs)
        if valid_idxs is not None:
            assert n_valid == len(valid_idxs)
            assert len(set(valid_idxs)) == len(valid_idxs)

        dset_dict = dataset.asdict(gdml_keys=True)

        task_dir = os.path.join(save_dir, "tasks")
        if os.path.exists(task_dir):
            if overwrite:
                shutil.rmtree(task_dir)
            else:
                raise FileExistsError("Task directory already exists")
        os.makedirs(task_dir, exist_ok=overwrite)

        task = self.create_task(
            dset_dict,
            n_train,
            dset_dict,
            n_valid,
            None,  # Do not specify sigma; should not have any effect.
            train_idxs=train_idxs,
            valid_idxs=valid_idxs,
        )

        valid_json = {
            "sigmas": [],
            "losses": [],
            "force": {"mae": [], "rmse": []},
            "energy": {"mae": [], "rmse": []},
            "idxs": [],
        }

        def opt_func(sigma):
            r"""Compute the negative validation loss that the Bayesian
            optimization maximizes. Also updates ``valid_json``.

            Parameters
            ----------
            sigma : :obj:`float`
                Kernel length scale for a validation run.

            Returns
            -------
            :obj:`float`
                Negative loss.
            """
            task["sig"] = sigma
            model_trial = self.train_model(task)

            if len(valid_json["idxs"]) == 0:
                valid_json["idxs"] = model_trial["idxs_valid"].tolist()

            valid_results, model_trial = add_valid_errors(
                model_trial,
                dset_dict,
                overwrite=True,
                max_processes=self.max_processes,
                use_torch=self.use_torch,
            )

            # pylint: disable-next=invalid-name
            l = self.loss_func(valid_results, **self.loss_kwargs)

            valid_json["losses"].append(l)
            valid_json["sigmas"].append(float(model_trial["sig"]))
            valid_json["energy"]["mae"].append(mae(valid_results["energy"]))
            valid_json["energy"]["rmse"].append(rmse(valid_results["energy"]))
            valid_json["force"]["mae"].append(mae(valid_results["force"]))
            valid_json["force"]["rmse"].append(rmse(valid_results["force"]))

            model_trial_path = os.path.join(task_dir, f"model-{float(sigma)}.npz")
            save_model(model_trial, model_trial_path)

            return -l

        sigma_bounds = self.sigma_bounds
        opt_kwargs = {"f": opt_func, "pbounds": {"sigma": sigma_bounds}, "verbose": 0}
        if use_domain_opt:
            bounds_transformer = SequentialDomainReductionTransformer()
            opt_kwargs["bounds_transformer"] = bounds_transformer
        optimizer = BayesianOptimization(**opt_kwargs)

        # Use optimizer.probe to run the grid search and to update sigma_bounds.
        # If no minimum is found, the provided sigma_bounds are used.
        # For every probe we reduce the number of initial points in the
        # Bayesian optimization.
        if is_final:
            if self.bayes_opt_params_final is not None:
                gp_params = self.bayes_opt_params_final
            else:
                gp_params = self.bayes_opt_params
        else:
            gp_params = self.bayes_opt_params

        # Addressing the changed handling of gp_params:
        # https://github.com/bayesian-optimization/BayesianOptimization/commit/ac435483f24e666866a395423aa3eb4d65278fea
        if gp_params is not None:
            init_points = gp_params.get("init_points", 5)
            n_iter = gp_params.get("n_iter", 25)
            del_keys = ["init_points", "n_iter"]
            for key in del_keys:
                # pylint: disable-next=consider-iterating-dictionary
                if key in gp_params.keys():
                    del gp_params[key]

        sigma_grid = self.sigma_grid
        if sigma_grid is not None:
            log.info("# Initial grid search #")
            sigma_grid.sort()

            def probe_sigma(sigma):
                optimizer.probe({"sigma": sigma}, lazy=False)

            def constant_loss_rising(valid_json, min_idxs_orig):
                r"""Check for sustained rising loss.

                Parameters
                ----------
                valid_json : :obj:`dict`
                    Data from validation calculations.
                min_idxs_orig : :obj:`int`
                    Index of the minimum.

                Returns
                -------
                :obj:`bool`
                    If the loss has continued to rise after identifying a
                    minimum.
                """
                if self.check_energy_pred:
                    if valid_json["energy"]["rmse"][min_idxs_orig] is None:
                        # Restart to find a better minimum that predicts energies.
                        return False

                losses = valid_json["losses"]
                # Check if losses start falling after the minimum.
                for i in range(min_idxs_orig + 1, len(losses)):
                    if losses[i] < losses[i - 1]:
                        # We will restart the grid search to find another minimum.
                        return False

                return True

            loss_rising = False
            # Extra sigmas to check after losses rise.
            do_extra = self.bayes_opt_n_check_rising
            for i, sigma in enumerate(sigma_grid):
                probe_sigma(sigma)
                init_points -= 1

                # Check to see if losses have started to rise.
                # Only check if loss_rising is False to avoid changing back
                # to False if the losses start falling (for do_extra).
                if len(valid_json["sigmas"]) >= 2 and not loss_rising:
                    if valid_json["losses"][-1] > valid_json["losses"][-2]:
                        loss_rising = True
                        min_idxs_orig = len(valid_json["sigmas"]) - 2

                if loss_rising:
                    # We do extra sigmas to ensure we did not have a premature
                    # rising loss.
                    if i != len(sigma_grid) - 1 and do_extra > 0:
                        do_extra -= 1
                        continue

                    # Once do_extra is complete we check losses.
                    restart_grid_search = constant_loss_rising(
                        valid_json, min_idxs_orig
                    )
                    if not restart_grid_search:
                        # Losses started lowering again, or the energy
                        # prediction check failed. Restart the grid search.
                        loss_rising = False
                        do_extra = self.bayes_opt_n_check_rising
                    else:
                        # Loss has continued to rise; good chance we found the
                        # minimum. Update the sigma upper bound to the largest
                        # sigma we have already checked.
                        sigma_bounds = (
                            sigma_bounds[0],
                            np.max(valid_json["sigmas"]),
                        )
                        optimizer.set_bounds({"sigma": sigma_bounds})
                        # Stop the grid search and start Bayesian optimization.
                        break

            # Ensure init_points cannot be negative.
            init_points = max(init_points, 0)

        log.info("# Bayesian optimization #")
        optimizer.set_gp_params(**gp_params)
        optimizer.maximize(init_points=init_points, n_iter=n_iter)

        results = optimizer.res
        losses = np.array([-res["target"] for res in results])
        min_idxs = np.argsort(losses)
        for idx in min_idxs:
            sigma_best = results[idx]["params"]["sigma"]
            if self.check_energy_pred:
                e_rmse = valid_json["energy"]["rmse"][idx]
                if e_rmse is None:
                    # Go to the next lowest loss.
                    continue
                # Found the lowest loss.
                break
            break

        model_best_path = os.path.join(task_dir, f"model-{sigma_best}.npz")
        model_best = gdmlModel(model_best_path, comp_ids=self.comp_ids)

        # Testing the model.
        model_best, results_test = self.test_model(
            model_best, dset_dict, n_test=n_test
        )

        # Including final JSON information and writing.
        if write_json:
            job_json["training"]["idxs"] = model_best.model_dict[
                "idxs_train"
            ].tolist()
            job_json["testing"]["n_test"] = int(model_best.model_dict["n_test"][()])
            job_json["testing"]["energy"] = results_test["energy"]
            job_json["testing"]["force"] = results_test["force"]

            job_json["model"]["sigma"] = float(model_best.model_dict["sig"][()])
            job_json["model"]["n_symm"] = len(model_best.model_dict["perms"])
            job_json["validation"] = valid_json
            save_json(os.path.join(save_dir, "training.json"), job_json)
            self.save_valid_csv(save_dir, valid_json)

        if write_idxs:
            self.save_idxs(model_best, dset_dict, save_dir, n_test)

        if not self.keep_tasks:
            shutil.rmtree(task_dir)

        # Saving the model.
        save_path = os.path.join(save_dir, model_name)
        model_best.save(save_path)

        # Bayesian optimization plot.
        if plot_bo:
            fig = self.plot_bayes_opt_gp(optimizer)
            fig.savefig(os.path.join(save_dir, "bayes_opt.png"), dpi=600)

        log.t_stop(t_job, message="\nJob duration : {time} s", precision=2)
        return model_best.model_dict, optimizer

    # pylint: disable-next=too-many-branches, too-many-statements
    def active_train(
        self,
        dataset,
        model_name,
        n_train_init,
        n_train_final,
        n_valid,
        model0=None,
        n_train_step=100,
        n_test=None,
        save_dir=".",
        overwrite=False,
        write_json=True,
        write_idxs=True,
    ):
        r"""Train a GDML model by using Bayesian optimization and adding
        problematic (high error) structures to the training set.

        Each model is trained with :meth:`mbgdml.train.mbGDMLTrain.bayes_opt`.

        Parameters
        ----------
        dataset : :obj:`mbgdml.data.DataSet`
            Data set from which the training, validation, and test sets are
            derived.
        model_name : :obj:`str`
            User-defined model name without the ``'.npz'`` file extension.
        n_train_init : :obj:`int`
            Initial size of the training set. If ``model0`` is provided, this
            is the size of that model.
        n_train_final : :obj:`int`
            Training set size of the final model.
        n_valid : :obj:`int`
            Size of the validation set to be used for each training task.
            Different structures are sampled for each training task.
        model0 : :obj:`dict`, default: :obj:`None`
            Initial model to start training with. Training indices will be
            taken from here.
        n_train_step : :obj:`int`, default: ``100``
            Number of problematic structures to add to the training set on
            each iteration.
        n_test : :obj:`int`, default: :obj:`None`
            The number of test points to test the validated GDML model.
            Defaults to testing all available structures.
        save_dir : :obj:`str`, default: ``'.'``
            Path to train and save the mbGDML model.
        overwrite : :obj:`bool`, default: ``False``
            Overwrite existing files.
        write_json : :obj:`bool`, default: ``True``
            Write a JSON file containing information about the training job.
        write_idxs : :obj:`bool`, default: ``True``
            Write npy files for training, validation, and test indices.

        Returns
        -------
        :obj:`dict`
            The final many-body GDML model.
        """
        log.log_package()
        t_job = log.t_start()
        log.info(
            "###################\n"
            "#     Active      #\n"
            "#    Training     #\n"
            "###################\n"
        )

        log.info("Initial training set size : %r", n_train_init)
        if model0 is not None:
            log.info("Initial model was provided")
            log.info("Taking training indices from model")
            train_idxs = model0["idxs_train"]
            n_train = len(train_idxs)
            log.log_array(train_idxs, level=10)

            prob_s = ProblematicStructures(
                [gdmlModel(model0, comp_ids=self.comp_ids)], predict_gdml
            )
            save_dir_i = os.path.join(save_dir, f"train{n_train}")
            os.makedirs(save_dir_i, exist_ok=overwrite)
            prob_idxs = prob_s.find(dataset, n_train_step, save_dir=save_dir_i)
            train_idxs = np.concatenate((train_idxs, prob_idxs))
            log.info("Extended the training set to %d", len(train_idxs))
            n_train = len(train_idxs)
        else:
            log.info("An initial model will be trained")
            n_train = n_train_init
            train_idxs = None

        sigma_bounds = self.sigma_bounds
        while n_train <= n_train_final:
            save_dir_i = os.path.join(save_dir, f"train{n_train}")
            model, _ = self.bayes_opt(
                dataset,
                model_name + f"-train{n_train}",
                n_train,
                n_valid,
                n_test=n_test,
                save_dir=save_dir_i,
                train_idxs=train_idxs,
                valid_idxs=None,
                overwrite=overwrite,
                write_json=write_json,
                write_idxs=write_idxs,
            )

            # Check sigma bounds.
            buffer = 5
            lower_buffer = min(sigma_bounds) + buffer
            upper_buffer = max(sigma_bounds) - buffer
            if model["sig"] <= lower_buffer:
                log.warning(
                    "WARNING: Optimal sigma is within %r from the lower sigma bound",
                    buffer,
                )
            elif model["sig"] >= upper_buffer:
                log.warning(
                    "WARNING: Optimal sigma is within %r from the upper sigma bound",
                    buffer,
                )

            train_idxs = model["idxs_train"]
            prob_s = ProblematicStructures(
                [gdmlModel(model, comp_ids=self.comp_ids)], predict_gdml
            )
            prob_idxs = prob_s.find(dataset, n_train_step, save_dir=save_dir_i)
            train_idxs = np.concatenate((train_idxs, prob_idxs))
            n_train = len(train_idxs)

        log.t_stop(t_job, message="\nJob duration : {time} s", precision=2)
        return model
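

# A minimal end-to-end sketch of how this trainer is typically driven; it is
# illustrative only. The data set file, split sizes, and model name below are
# assumptions, and we assume that ``mbgdml.data.DataSet`` can be constructed
# from an ``.npz`` path and exposes ``entity_ids``/``comp_ids`` attributes.
if __name__ == "__main__":
    from mbgdml.data import DataSet

    # Hypothetical many-body data set containing coordinates, energies,
    # forces, entity_ids, and comp_ids.
    dset = DataSet("2h2o-dataset.npz")

    trainer = mbGDMLTrain(
        entity_ids=dset.entity_ids,
        comp_ids=dset.comp_ids,
        use_sym=True,
        use_E=True,
        use_torch=False,
    )
    # Optional: use a smaller grid for a quick exploratory run.
    trainer.sigma_grid = [2, 25, 50, 100, 200, 500, 1000]

    # Bayesian optimization over sigma; returns the best model dictionary and
    # the bayes_opt optimizer object.
    model_dict, optimizer = trainer.bayes_opt(
        dset,
        model_name="2h2o-model",
        n_train=300,
        n_valid=100,
        n_test=None,
        save_dir="./2h2o-train",
        overwrite=True,
    )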