Source code for gemseo_mlearning.adaptive.distribution

# Copyright 2021 IRT Saint Exupéry, https://www.irt-saintexupery.com
#
# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU Lesser General Public
# License version 3 as published by the Free Software Foundation.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
# Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public License
# along with this program; if not, write to the Free Software Foundation,
# Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
"""This module defines the notion of distribution of a machine learning algorithm.

Once a :class:`.MLAlgo` has been trained,
assessing its quality is important before using it.

One can measure not only its global quality (e.g. with a :class:`.MLQualityMeasure`)
but also its local one.

The :class:`.MLRegressorDistribution` class addresses the latter case,
by quantifying the robustness of a machine learning algorithm to a learning point.
The more robust it is,
the less variability it has around this point.

.. note::

    For now, this notion of distribution is not defined for any :class:`.MLAlgo`
    but only for instances of :class:`.MLRegressionAlgo`.

The :class:`.MLRegressorDistribution` can be particularly useful to:

- study the robustness of a :class:`.MLAlgo` w.r.t. learning dataset elements,
- evaluate acquisition criteria for adaptive learning purposes
  (see :class:`.MLDataAcquisition` and :class:`.MLDataAcquisitionCriterion`),
- etc.

The abstract :class:`.MLRegressorDistribution` class has two subclasses:

- :class:`.KrigingDistribution`:
    the :class:`.MLRegressionAlgo` is a Kriging model
    and this assessor takes advantage of the underlying Gaussian stochastic process,
- :class:`.RegressorDistribution`:
    this class is based on resampling methods,
    such as bootstrap,
    cross-validation
    or leave-one-out.
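
A minimal usage sketch follows
(the ``algo`` object, ``input_data`` and the import path
are illustrative assumptions, not part of this module):

.. code-block:: python

    from gemseo_mlearning.adaptive.distributions.kriging_distribution import (
        KrigingDistribution,  # assumed location of the concrete subclass
    )

    distribution = KrigingDistribution(algo)  # algo: a trained Kriging model
    distribution.learn()  # train from the whole learning dataset
    mean = distribution.compute_mean(input_data)
    variance = distribution.compute_variance(input_data)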

.. seealso::

    :class:`.KrigingDistribution`
    :class:`.RegressorDistribution`
    :class:`.MLDataAcquisition`
    :class:`.MLDataAcquisitionCriterion`
    :class:`.MLDataAcquisitionCriterionFactory`
"""
from __future__ import annotations

import logging

from docstring_inheritance import GoogleDocstringInheritanceMeta
from gemseo.core.dataset import Dataset
from gemseo.mlearning.core.ml_algo import DataType
from gemseo.mlearning.regression import regression
from gemseo.mlearning.regression.regression import MLRegressionAlgo
from numpy import ndarray

LOGGER = logging.getLogger(__name__)


class MLRegressorDistribution(metaclass=GoogleDocstringInheritanceMeta):
    """Distribution related to a regression model."""

    algo: MLRegressionAlgo
    """The regression model."""

    _samples: list[int]
    """The indices of the learning samples in the learning dataset."""

    _transform_input_group: bool
    """Whether to transform the input group."""

    _transform_output_group: bool
    """Whether to transform the output group."""

    _input_variables_to_transform: list[str]
    """The names of the input variables to be transformed."""

    _output_variables_to_transform: list[str]
    """The names of the output variables to be transformed."""

    def __init__(self, algo: MLRegressionAlgo) -> None:
        """# noqa: D205 D212 D415
        Args:
            algo: A regression model.
        """
        self.algo = algo
        self._samples = []
        self._transform_input_group = self.algo._transform_input_group
        self._transform_output_group = self.algo._transform_output_group
        self._input_variables_to_transform = self.algo._input_variables_to_transform
        self._output_variables_to_transform = self.algo._output_variables_to_transform

    @property
    def learning_set(self) -> Dataset:
        """The learning dataset used by the original machine learning algorithm."""
        return self.algo.learning_set

    @property
    def input_names(self) -> list[str]:
        """The names of the original machine learning algorithm inputs."""
        return self.algo.input_names

    @property
    def output_names(self) -> list[str]:
        """The names of the original machine learning algorithm outputs."""
        return self.algo.output_names

    @property
    def output_dimension(self) -> int:
        """The dimension of the machine learning output space."""
        return self.algo.output_dimension

    def learn(self, samples: list[int] | None = None) -> None:
        """Train the machine learning algorithm from the learning dataset.

        Args:
            samples: The indices of the learning samples.
                If ``None``, use the whole learning dataset.
        """
        self._samples = list(samples or range(len(self.learning_set)))
        self.algo.learn(self._samples)

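    # Illustrative sketch: train from all the samples or from a subset
    # (``distribution`` is an assumed instance of a concrete subclass):
    #
    #     distribution.learn()  # use every sample of the learning dataset
    #     distribution.learn(samples=[0, 2, 5])  # use samples 0, 2 and 5 only
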
    def predict(
        self,
        input_data: DataType,
    ) -> DataType:
        """Predict the output of the original machine learning algorithm.

        The user can specify the input data either as a NumPy array,
        e.g. :code:`array([1., 2., 3.])`
        or as a dictionary,
        e.g. :code:`{'a': array([1.]), 'b': array([2., 3.])}`.

        The output data type will be consistent with the input data type.

        Args:
            input_data: The input data.

        Returns:
            The predicted output data.
        """
        return self.algo.predict(input_data)

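    # Illustrative sketch of the two accepted input formats
    # (``distribution`` is an assumed instance; assumes ``from numpy import array``):
    #
    #     output_array = distribution.predict(array([1.0, 2.0, 3.0]))
    #     output_dict = distribution.predict(
    #         {"a": array([1.0]), "b": array([2.0, 3.0])}
    #     )
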
    def compute_confidence_interval(
        self,
        input_data: DataType,
        level: float = 0.95,
    ) -> tuple[dict[str, ndarray], dict[str, ndarray]] | tuple[ndarray, ndarray] | None:
        """Predict the lower bounds and upper bounds from input data.

        The user can specify the input data either as a NumPy array,
        e.g. :code:`array([1., 2., 3.])`
        or as a dictionary,
        e.g. :code:`{'a': array([1.]), 'b': array([2., 3.])}`.

        The output data type will be consistent with the input data type.

        Args:
            input_data: The input data.
            level: A quantile level.

        Returns:
            The lower and upper bound values.
        """
        raise NotImplementedError

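    # Illustrative sketch, on a concrete subclass implementing this method
    # (``distribution`` and ``input_data`` are assumptions):
    #
    #     lower, upper = distribution.compute_confidence_interval(
    #         input_data, level=0.95
    #     )
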
    @regression.MLRegressionAlgo.DataFormatters.format_input_output
    def compute_mean(
        self,
        input_data: DataType,
    ) -> DataType:
        """Compute the mean from input data.

        The user can specify the input data either as a NumPy array,
        e.g. :code:`array([1., 2., 3.])`
        or as a dictionary,
        e.g. :code:`{'a': array([1.]), 'b': array([2., 3.])}`.

        The output data type will be consistent with the input data type.

        Args:
            input_data: The input data.

        Returns:
            The mean value.
        """
        raise NotImplementedError

    @regression.MLRegressionAlgo.DataFormatters.format_input_output
    def compute_variance(
        self,
        input_data: DataType,
    ) -> DataType:
        """Compute the variance from input data.

        The user can specify the input data either as a NumPy array,
        e.g. :code:`array([1., 2., 3.])`
        or as a dictionary,
        e.g. :code:`{'a': array([1.]), 'b': array([2., 3.])}`.

        The output data type will be consistent with the input data type.

        Args:
            input_data: The input data.

        Returns:
            The variance value.
        """
        raise NotImplementedError

    @regression.MLRegressionAlgo.DataFormatters.format_input_output
    def compute_standard_deviation(
        self,
        input_data: DataType,
    ) -> DataType:
        """Compute the standard deviation from input data.

        The user can specify the input data either as a NumPy array,
        e.g. :code:`array([1., 2., 3.])`
        or as a dictionary,
        e.g. :code:`{'a': array([1.]), 'b': array([2., 3.])}`.

        The output data type will be consistent with the input data type.

        Args:
            input_data: The input data.

        Returns:
            The standard deviation value.
        """
        return self.compute_variance(input_data) ** 0.5

    @regression.MLRegressionAlgo.DataFormatters.format_input_output
    def compute_expected_improvement(
        self, input_data: DataType, fopt: float, maximize: bool = False
    ) -> DataType:
        """Compute the expected improvement from input data.

        The user can specify the input data either as a NumPy array,
        e.g. :code:`array([1., 2., 3.])`
        or as a dictionary,
        e.g. :code:`{'a': array([1.]), 'b': array([2., 3.])}`.

        The output data type will be consistent with the input data type.

        Args:
            input_data: The input data.
            fopt: The current optimum value.
            maximize: Whether the optimum to improve on is a maximum;
                otherwise, a minimum.

        Returns:
            The expected improvement value.
        """
        raise NotImplementedError

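    # Illustrative sketch: local statistics and expected improvement
    # for a minimization problem whose current optimum is ``fopt``
    # (``distribution`` and ``input_data`` are assumptions):
    #
    #     sigma = distribution.compute_standard_deviation(input_data)
    #     ei = distribution.compute_expected_improvement(input_data, fopt)
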
    def change_learning_set(self, learning_set: Dataset) -> None:
        """Re-train the machine learning algorithm from a new learning set.

        Args:
            learning_set: The new learning set.
        """
        self.algo.learning_set = learning_set
        self.learn()
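
    # Illustrative sketch: swap in a new learning set and re-train in one call
    # (``new_dataset`` is an assumed :class:`.Dataset`):
    #
    #     distribution.change_learning_set(new_dataset)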