Source code for gemseo_mlearning.adaptive.distribution

# Copyright 2021 IRT Saint Exupéry, https://www.irt-saintexupery.com
#
# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU Lesser General Public
# License version 3 as published by the Free Software Foundation.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
# Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public License
# along with this program; if not, write to the Free Software Foundation,
# Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
"""This module defines the notion of distribution of a machine learning algorithm.

Once a :class:`.MLAlgo` has been trained,
assessing its quality is important before using it.

One can not only measure its global quality (e.g. from a :class:`.MLQualityMeasure`)
but also its local one.

The :class:`.MLRegressorDistribution` class addresses the latter case,
by quantifying the robustness of a machine learning algorithm to a learning point.
The more robust it is,
the less variability it has around this point.

.. note::

    For now, one does not consider any :class:`.MLAlgo`
    but instances of :class:`.MLRegressionAlgo`.

The :class:`.MLRegressorDistribution` can be particularly useful to:

- study the robustness of a :class:`.MLAlgo` w.r.t. learning dataset elements,
- evaluate acquisition criteria for adaptive learning purposes
  (see :class:`.MLDataAcquisition` and :class:`.MLDataAcquisitionCriterion`),
- etc.

The abstract :class:`.MLRegressorDistribution` class is derived into two classes:

- :class:`.KrigingDistribution`:
    the :class:`.MLRegressionAlgo` is a Kriging model
    and this assessor takes advantage of the underlying Gaussian stochastic process,
- :class:`.RegressorDistribution`:
    this class is based on sampling methods,
    such as bootstrap,
    cross-validation
    or leave-one-out.

.. seealso::

    KrigingDistribution
    RegressorDistribution
    MLDataAcquisition
    MLDataAcquisitionCriterion
    MLDataAcquisitionCriterionFactory
"""
from __future__ import annotations

import logging

from docstring_inheritance import GoogleDocstringInheritanceMeta
from gemseo.core.dataset import Dataset
from gemseo.mlearning.core.ml_algo import DataType
from gemseo.mlearning.regression import regression
from gemseo.mlearning.regression.regression import MLRegressionAlgo
from numpy import ndarray

LOGGER = logging.getLogger(__name__)


[docs]class MLRegressorDistribution(metaclass=GoogleDocstringInheritanceMeta):
    """Distribution related to a regression model."""

    algo: MLRegressionAlgo
    """The regression model."""

    _samples: list[int]
    """The indices of the learning samples in the learning dataset."""

    _transform_input_group: bool
    """Whether to transform the input group."""

    _transform_output_group: bool
    """Whether to transform the output group."""

    _input_variables_to_transform: list[str]
    """The names of the input variables to be transformed."""

    _output_variables_to_transform: list[str]
    """The names of the output variables to be transformed."""

    def __init__(self, algo: MLRegressionAlgo) -> None:
        """# noqa: D205 D212 D415
        Args:
            algo: A regression model.
        """
        self.algo = algo
        self._samples = []
        self._transform_input_group = self.algo._transform_input_group
        self._transform_output_group = self.algo._transform_output_group
        self._input_variables_to_transform = self.algo._input_variables_to_transform
        self._output_variables_to_transform = self.algo._output_variables_to_transform

    @property
    def learning_set(self) -> Dataset:
        """The learning dataset used by the original machine learning algorithm."""
        return self.algo.learning_set

    @property
    def input_names(self) -> list[str]:
        """The names of the original machine learning algorithm inputs."""
        return self.algo.input_names

    @property
    def output_names(self) -> list[str]:
        """The names of the original machine learning algorithm outputs."""
        return self.algo.output_names

    @property
    def output_dimension(self) -> int:
        """The dimension of the machine learning output space."""
        return self.algo.output_dimension

[docs]    def learn(self, samples: list[int] | None = None) -> None:
        """Train the machine learning algorithm from the learning dataset.

        Args:
            samples: The indices of the learning samples.
                If ``None``, use the whole learning dataset
        """
        self._samples = samples or range(len(self.learning_set))
        self.algo.learn(self._samples)

[docs]    def predict(
        self,
        input_data: DataType,
    ) -> DataType:
        """Predict the output of the original machine learning algorithm.

        The user can specify the input data either as a NumPy array,
        e.g. :code:`array([1., 2., 3.])`
        or as a dictionary,
        e.g.  :code:`{'a': array([1.]), 'b': array([2., 3.])}`.

        The output data type will be consistent with the input data type.

        Args:
            input_data: The input data.

        Returns:
            The predicted output data.
        """
        return self.algo.predict(input_data)

[docs]    def compute_confidence_interval(
        self,
        input_data: DataType,
        level: float = 0.95,
    ) -> tuple[dict[str, ndarray], dict[str, ndarray], tuple[ndarray, ndarray]] | None:
        """Predict the lower bounds and upper bounds from input data.

        The user can specify the input data either as a NumPy array,
        e.g. :code:`array([1., 2., 3.])`
        or as a dictionary,
        e.g.  :code:`{'a': array([1.]), 'b': array([2., 3.])}`.

        The output data type will be consistent with the input data type.

        Args:
            input_data: The input data.
            level: A quantile level.

        Returns:
            The lower and upper bound values.
        """
        raise NotImplementedError

    @regression.MLRegressionAlgo.DataFormatters.format_input_output
    def compute_mean(
        self,
        input_data: DataType,
    ) -> DataType:
        """Compute the mean from input data.

        The user can specify the input data either as a NumPy array,
        e.g. :code:`array([1., 2., 3.])`
        or as a dictionary,
        e.g.  :code:`{'a': array([1.]), 'b': array([2., 3.])}`.

        The output data type will be consistent with the input data type.

        Args:
            input_data: The input data.

        Returns:
            The mean value.
        """
        raise NotImplementedError

    @regression.MLRegressionAlgo.DataFormatters.format_input_output
    def compute_variance(
        self,
        input_data: DataType,
    ) -> DataType:
        """Compute the variance from input data.

        The user can specify the input data either as a NumPy array,
        e.g. :code:`array([1., 2., 3.])`
        or as a dictionary,
        e.g.  :code:`{'a': array([1.]), 'b': array([2., 3.])}`.

        The output data type will be consistent with the input data type.

        Args:
            input_data: The input data.

        Returns:
            The variance value.
        """
        raise NotImplementedError

    @regression.MLRegressionAlgo.DataFormatters.format_input_output
    def compute_standard_deviation(
        self,
        input_data: DataType,
    ) -> DataType:
        """Compute the standard deviation from input data.

        The user can specify the input data either as a NumPy array,
        e.g. :code:`array([1., 2., 3.])`
        or as a dictionary,
        e.g.  :code:`{'a': array([1.]), 'b': array([2., 3.])}`.

        The output data type will be consistent with the input data type.

        Args:
            input_data: The input data.

        Returns:
            The standard deviation value.
        """
        return self.compute_variance(input_data) ** 0.5

    @regression.MLRegressionAlgo.DataFormatters.format_input_output
    def compute_expected_improvement(
        self, input_data: DataType, fopt: float, maximize: bool = False
    ) -> DataType:
        """Compute the expected improvement from input data.

        The user can specify the input data either as a NumPy array,
        e.g. :code:`array([1., 2., 3.])`
        or as a dictionary,
        e.g.  :code:`{'a': array([1.]), 'b': array([2., 3.])}`.

        The output data type will be consistent with the input data type.

        Args:
            input_data: The input data.
            fopt: The current optimum value.
            maximize: The type of optimum to seek.

        Returns:
            The expected improvement value.
        """
        raise NotImplementedError

[docs]    def change_learning_set(self, learning_set: Dataset) -> None:
        """Re-train the machine learning algorithm relying on the initial learning set.

        Args:
            learning_set: The new learning set.
        """
        self.algo.learning_set = learning_set
        self.learn()