Source code for gemseo.mlearning.core.calibration

# -*- coding: utf-8 -*-
# Copyright 2021 IRT Saint Exupéry, https://www.irt-saintexupery.com
#
# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU Lesser General Public
# License version 3 as published by the Free Software Foundation.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
# Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public License
# along with this program; if not, write to the Free Software Foundation,
# Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.

# Contributors:
#    INITIAL AUTHORS - API and implementation and/or documentation
#        :author: Matthias De Lozzo
#    OTHER AUTHORS   - MACROSCOPIC CHANGES
"""Calibration of a machine learning algorithm.

A machine learning algorithm depends on hyper-parameters,
e.g. the number of clusters for a clustering algorithm,
the regularization constant for a regression model,
the kernel for a Gaussian process regression, ...
Its ability to generalize the information learned during the training stage,
and thus to avoid over-fitting,
which is an over-reliance on the learning data set,
depends on the values of these hyper-parameters.
Thus,
the hyper-parameters minimizing the learning quality measure are rarely
those minimizing the generalization one.
Classically,
the generalization error decreases before growing again as the model becomes more complex,
while the learning error keeps decreasing.
This phenomenon is known as the bias-variance trade-off.

In this module,
the :class:`.MLAlgoCalibration` class aims to calibrate the hyper-parameters
in order to minimize this measure of the generalization quality
over a calibration parameter space.
This class relies on the :class:`.MLAlgoAssessor` class
which is a discipline (:class:`.MDODiscipline`)
built from a machine learning algorithm (:class:`.MLAlgo`),
a dataset (:class:`.Dataset`),
a quality measure (:class:`.MLQualityMeasure`)
and various options for the data scaling,
the quality measure
and the machine learning algorithm.
The inputs of this discipline are hyper-parameters of the machine learning algorithm
while the output is the quality criterion.
"""

from __future__ import division, unicode_literals

from typing import Dict, Iterable, List, Optional, Union

from numpy import argmax, argmin, array, ndarray

from gemseo.algos.design_space import DesignSpace
from gemseo.algos.doe.doe_factory import DOEFactory
from gemseo.core.dataset import Dataset
from gemseo.core.discipline import MDODiscipline
from gemseo.core.doe_scenario import DOEScenario
from gemseo.core.mdo_scenario import MDOScenario
from gemseo.core.scenario import ScenarioInputDataType
from gemseo.mlearning.core.factory import MLAlgoFactory
from gemseo.mlearning.core.ml_algo import MLAlgo, MLAlgoParameterType, TransformerType
from gemseo.mlearning.qual_measure.quality_measure import MLQualityMeasure

# Type of the options passed to a quality measure:
# option names mapped to boolean, integer or dataset values.
MeasureOptionsType = Dict[str, Union[bool, int, Dataset]]


class MLAlgoAssessor(MDODiscipline):
    """Discipline assessing the quality of a machine learning algorithm.

    This quality depends on the values of parameters to calibrate
    with the :class:`.MLAlgoCalibration`.

    Attributes:
        algo (str): The name of a machine learning algorithm.
        measure (MLQualityMeasure): The measure to assess
            the machine learning algorithm.
        measure_options (Dict[str,Union[int,Dataset]]): The options
            of the quality measure.
        parameters (Dict[str,MLAlgoParameterType]): The options
            of the machine learning algorithm,
            updated with the values of the calibrated parameters
            at each execution of the assessor.
        data (Dataset): The learning dataset.
        transformer (TransformerType): The transformation strategy for data groups.
        algos (List(MLAlgo)): The instances of the machine learning algorithm
            (one per execution of the machine learning algorithm assessor).
    """

    CRITERION = "criterion"
    LEARNING = "learning"
    MULTIOUTPUT = "multioutput"

    def __init__(
        self,
        algo,  # type: str
        dataset,  # type: Dataset
        parameters,  # type: Iterable[str]
        measure,  # type: MLQualityMeasure
        measure_options=None,  # type: Optional[MeasureOptionsType]
        transformer=None,  # type: Optional[TransformerType]
        **algo_options  # type: MLAlgoParameterType
    ):  # type: (...) -> None
        """
        Args:
            algo: The name of a machine learning algorithm.
            dataset: A learning dataset.
            parameters: The parameters of the machine learning algorithm
                to calibrate.
            measure: A measure to assess the machine learning algorithm.
            measure_options: The options of the quality measure.
                If "multioutput" is missing,
                it is added with False as value.
                If None, do not use quality measure options.
                The options are copied:
                the mapping passed by the caller is never modified.
            transformer: The strategies to transform the variables.
                The values are instances of :class:`.Transformer`
                while the keys are the names of
                either the variables
                or the groups of variables,
                e.g. "inputs" or "outputs" in the case of the regression algorithms.
                If a group is specified,
                the :class:`.Transformer` will be applied
                to all the variables of this group.
                If None, do not transform the variables.
            **algo_options: The options of the machine learning algorithm.

        Raises:
            ValueError: If the measure option "multioutput" is True.
        """
        super(MLAlgoAssessor, self).__init__()
        self.input_grammar.initialize_from_data_names(parameters)
        self.output_grammar.initialize_from_data_names([self.CRITERION, self.LEARNING])
        self.algo = algo
        self.measure = measure
        # Work on a copy: the "multioutput" entry forced below must not leak
        # into the dictionary owned by the caller.
        self.measure_options = dict(measure_options or {})
        self.parameters = algo_options
        self.data = dataset
        self.transformer = transformer
        self.algos = []
        if self.measure_options.get(self.MULTIOUTPUT, False):
            raise ValueError("MLAlgoAssessor does not support multioutput.")

        self.measure_options[self.MULTIOUTPUT] = False

    def _run(self):
        # type: (...) -> None
        """Run method.

        This method creates a new instance of the machine learning algorithm,
        from the current input values of the :class:`.MLAlgoAssessor`
        merged into its algorithm options.
        It trains it on the learning dataset
        and measures its quality with the :class:`.MLQualityMeasure`.
        The trained algorithm is appended to :attr:`algos`.
        """
        inputs = self.get_input_data()
        for name, value in inputs.items():
            # Unwrap one-element values so that the algorithm receives
            # the element itself rather than a one-element container.
            if len(value) == 1:
                inputs[name] = value[0]

        self.parameters.update(inputs)
        algo = MLAlgoFactory().create(
            self.algo, data=self.data, transformer=self.transformer, **self.parameters
        )
        algo.learn()
        measure = self.measure(algo)
        # Evaluation without the user options, stored as the "learning" output.
        # NOTE(review): presumably computed on the learning data -
        # confirm against MLQualityMeasure.evaluate.
        learning = measure.evaluate(multioutput=False)
        criterion = measure.evaluate(**self.measure_options)
        self.store_local_data(criterion=array([criterion]), learning=array([learning]))
        self.algos.append(algo)


class MLAlgoCalibration(object):
    """Calibration of a machine learning algorithm.

    Attributes:
        algo_assessor (MLAlgoAssessor): The assessor for the machine learning
            algorithm.
        calibration_space (DesignSpace): The space defining the calibration
            variables.
        maximize_objective (bool): Whether to maximize the quality measure.
        dataset (Optional[Dataset]): The history of the assessor evaluations,
            exported from its cache by the last call to :meth:`execute`.
            None before the first call.
        optimal_parameters (Dict[str,ndarray]): The optimal parameters
            for the machine learning algorithm.
        optimal_criterion (float): The optimal quality measure.
        optimal_algorithm (MLAlgo): The optimal machine learning algorithm.
        scenario (Scenario): The scenario used to calibrate
            the machine learning algorithm.
    """

    def __init__(
        self,
        algo,  # type: str
        dataset,  # type: Dataset
        parameters,  # type: Iterable[str]
        calibration_space,  # type: DesignSpace
        measure,  # type: MLQualityMeasure
        measure_options=None,  # type: Optional[MeasureOptionsType]
        transformer=None,  # type: Optional[TransformerType]
        **algo_options  # type: MLAlgoParameterType
    ):  # type: (...) -> None
        """
        Args:
            algo: The name of a machine learning algorithm.
            dataset: A learning dataset.
            parameters: The parameters of the machine learning algorithm
                to calibrate.
            calibration_space: The space defining the calibration variables.
            measure: A measure to assess the machine learning algorithm.
            measure_options: The options of the quality measure.
                If None, do not use the quality measure options.
            transformer: The transformation strategy for the data groups.
                If None, do not transform data.
            **algo_options: The options of the machine learning algorithm.
        """
        assessor = MLAlgoAssessor(
            algo,
            dataset,
            parameters,
            measure,
            measure_options,
            transformer,
            **algo_options
        )
        # The full memory cache keeps every evaluation;
        # execute() exports it as the calibration history.
        assessor.set_cache_policy(assessor.MEMORY_FULL_CACHE)
        self.algo_assessor = assessor
        self.calibration_space = calibration_space
        self.maximize_objective = not measure.SMALLER_IS_BETTER
        self.dataset = None
        self.optimal_parameters = None
        self.optimal_criterion = None
        self.optimal_algorithm = None
        self.scenario = None

    def execute(
        self,
        input_data,  # type: ScenarioInputDataType
    ):  # type: (...) -> None
        """Calibrate the machine learning algorithm from a driver.

        The driver can be either a DOE or an optimizer.

        Each call starts from an empty evaluation cache
        and an empty list of trained algorithms,
        so that the history and :attr:`algos` describe the same evaluations.

        Args:
            input_data: The driver properties.
        """
        if DOEFactory().is_available(input_data["algo"]):
            scenario_class = DOEScenario
        else:
            scenario_class = MDOScenario

        self.scenario = scenario_class(
            [self.algo_assessor],
            "DisciplinaryOpt",
            self.algo_assessor.CRITERION,
            self.calibration_space,
            maximize_objective=self.maximize_objective,
        )
        discipline = self.scenario.disciplines[0]
        discipline.cache.clear()
        # The history exported below only covers this run, so the trained
        # algorithms of previous runs must be dropped as well; otherwise the
        # index of the best criterion would point into a stale, longer list.
        # Rebinding (rather than clearing in place) leaves any list already
        # handed out to a caller untouched.
        discipline.algos = []
        self.scenario.execute(input_data)
        x_opt = self.scenario.design_space.get_current_x_dict()
        f_opt = self.scenario.get_optimum().f_opt
        self.dataset = discipline.cache.export_to_dataset(by_group=False)
        find_best = argmax if self.maximize_objective else argmin
        best_index = find_best(self.get_history(self.algo_assessor.CRITERION))
        self.optimal_parameters = x_opt
        self.optimal_criterion = f_opt
        self.optimal_algorithm = self.algos[best_index]

    def get_history(
        self,
        name,  # type: str
    ):  # type: (...) -> Optional[ndarray]
        """Return the history of a variable.

        Args:
            name: The name of the variable.

        Returns:
            The history of the variable,
            or None if no calibration has been executed yet.
        """
        if self.dataset is None:
            return None

        return self.dataset.data[name]

    @property
    def algos(self):  # type: (...) -> List[MLAlgo]
        """The trained machine learning algorithms.

        Before the first call to :meth:`execute`,
        these are the algorithms stored by the assessor.
        """
        if self.scenario is None:
            # No scenario exists yet: read the assessor directly
            # instead of failing on a None scenario.
            return self.algo_assessor.algos

        return self.scenario.disciplines[0].algos