Source code for gemseo.disciplines.surrogate

# Copyright 2021 IRT Saint Exupéry, https://www.irt-saintexupery.com
#
# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU Lesser General Public
# License version 3 as published by the Free Software Foundation.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
# Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public License
# along with this program; if not, write to the Free Software Foundation,
# Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
# Contributors:
#    INITIAL AUTHORS - initial API and implementation and/or initial
#                         documentation
#        :author: Matthias De Lozzo
#    OTHER AUTHORS   - MACROSCOPIC CHANGES
"""Surrogate discipline."""

from __future__ import annotations

import logging
from typing import TYPE_CHECKING
from typing import Any

from gemseo.core.discipline import Discipline
from gemseo.mlearning.regression.algos.base_regressor import BaseRegressor
from gemseo.mlearning.regression.algos.factory import RegressorFactory
from gemseo.mlearning.regression.quality.factory import RegressorQualityFactory
from gemseo.post.mlearning.ml_regressor_quality_viewer import MLRegressorQualityViewer
from gemseo.utils.constants import READ_ONLY_EMPTY_DICT
from gemseo.utils.string_tools import MultiLineString
from gemseo.utils.string_tools import pretty_str

if TYPE_CHECKING:
    from collections.abc import Iterable
    from collections.abc import Mapping
    from collections.abc import Sequence

    from numpy import ndarray

    from gemseo.datasets.io_dataset import IODataset
    from gemseo.mlearning.core.algos.ml_algo import TransformerType
    from gemseo.mlearning.regression.algos.base_regressor_settings import (
        BaseRegressorSettings,
    )
    from gemseo.mlearning.regression.quality.base_regressor_quality import (
        BaseRegressorQuality,
    )
    from gemseo.typing import StrKeyMapping

# Module-level logger, named after this module through ``__name__``.
LOGGER = logging.getLogger(__name__)


class SurrogateDiscipline(Discipline):
    """A discipline wrapping a regression model built from a dataset.

    Examples:
        >>> import numpy as np
        >>> from gemseo.datasets.io_dataset import IODataset
        >>> from gemseo.disciplines.surrogate import SurrogateDiscipline
        >>>
        >>> # Create an input-output dataset.
        >>> dataset = IODataset()
        >>> dataset.add_input_variable("x", np.array([[1.0], [2.0], [3.0]]))
        >>> dataset.add_output_variable("y", np.array([[3.0], [5.0], [6.0]]))
        >>>
        >>> # Build a surrogate discipline relying on a linear regression model.
        >>> surrogate_discipline = SurrogateDiscipline("LinearRegressor", dataset)
        >>>
        >>> # Assess its quality with the R2 measure.
        >>> r2 = surrogate_discipline.get_error_measure("R2Measure")
        >>> learning_r2 = r2.compute_learning_measure()
        >>>
        >>> # Execute the surrogate discipline, with default or custom input values.
        >>> surrogate_discipline.execute()
        >>> surrogate_discipline.execute({"x": np.array([1.5])})
    """

    regression_model: BaseRegressor
    """The regression model called by the surrogate discipline."""

    def __init__(
        self,
        surrogate: str | BaseRegressor | BaseRegressorSettings,
        data: IODataset | None = None,
        # TODO: API: remove in favor of settings or surrogate as BaseRegressorSettings.
        transformer: TransformerType = BaseRegressor.DEFAULT_TRANSFORMER,
        # TODO: API: rename to name.
        disc_name: str = "",
        default_input_data: dict[str, ndarray] = READ_ONLY_EMPTY_DICT,
        input_names: Sequence[str] = (),
        output_names: Sequence[str] = (),
        **settings: Any,
    ) -> None:
        """
        Args:
            surrogate: Either a regressor class name,
                a regressor instance
                or regressor settings.
            data: The dataset to train the regression model.
                If ``None``, the regression model is supposed to be trained.
            transformer: The strategies to transform the variables.
                This argument is ignored
                when ``surrogate`` is a :class:`.BaseRegressor`;
                in this case,
                these strategies are defined
                with the ``transformer`` argument of this :class:`.BaseRegressor`,
                whose default value is :attr:`.BaseMLAlgo.IDENTITY`,
                which means no transformation.
                In the other cases,
                the values of the dictionary are instances of
                :class:`.BaseTransformer`
                while the keys can be variable names,
                the group name ``"inputs"``
                or the group name ``"outputs"``.
                If a group name is specified,
                the :class:`.BaseTransformer` will be applied
                to all the variables of this group.
                If :attr:`.BaseMLAlgo.IDENTITY`, do not transform the variables.
                The :attr:`.BaseRegressor.DEFAULT_TRANSFORMER` uses
                the :class:`.MinMaxScaler` strategy
                for both input and output variables.
                This argument is ignored
                when the type of ``surrogate`` is :class:`.BaseRegressorSettings`.
            disc_name: The name of the discipline.
                If empty,
                the concatenation of the short name of the surrogate algorithm
                and the name of the training dataset is used.
            default_input_data: The default values of the input variables.
                If empty, the center of the learning input space is used.
            input_names: The names of the input variables of the discipline.
                If empty and ``surrogate`` is a regressor instance,
                all input variables of the regressor are used.
                If empty and ``surrogate`` is not a regressor instance,
                all input variables mentioned in the training dataset are used.
                If the type of ``surrogate`` is :class:`.BaseRegressorSettings`,
                ``surrogate.input_names`` is ignored
                and replaced by ``input_names``.
            output_names: The names of the output variables of the discipline.
                If empty and ``surrogate`` is a regressor instance,
                all output variables of the regressor are used.
                If empty and ``surrogate`` is not a regressor instance,
                all output variables mentioned in the training dataset are used.
                If the type of ``surrogate`` is :class:`.BaseRegressorSettings`,
                ``surrogate.output_names`` is ignored
                and replaced by ``output_names``.
            **settings: The settings of the machine learning algorithm.
                These arguments are ignored
                when the type of ``surrogate`` is :class:`.BaseRegressorSettings`.

        Raises:
            ValueError: If the training dataset is missing
                whilst the regression model is not trained.
        """  # noqa: D205, D212, D415
        if isinstance(surrogate, BaseRegressor):
            # An existing regressor is wrapped as is; ``data`` is not used.
            regressor = surrogate
        else:
            # Any other kind of ``surrogate`` has to be built from a dataset.
            if data is None:
                msg = "data is required to train the surrogate model."
                raise ValueError(msg)

            if isinstance(surrogate, str):
                regressor = RegressorFactory().create(
                    surrogate,
                    data,
                    transformer=transformer,
                    input_names=input_names,
                    output_names=output_names,
                    **settings,
                )
            else:
                # Settings object: its variable names are overridden
                # by the ones passed to the discipline.
                surrogate.input_names = input_names
                surrogate.output_names = output_names
                regressor = RegressorFactory().create(
                    surrogate._TARGET_CLASS_NAME, data, settings_model=surrogate
                )

        self.regression_model = regressor
        if not regressor.is_trained:
            regressor.learn()

        # The f-string is evaluated only when no explicit name is given.
        name = (
            disc_name
            or f"{regressor.SHORT_ALGO_NAME}_{regressor.learning_set.name}"
        )
        super().__init__(name)
        self._initialize_grammars(input_names, output_names)
        self._set_default_inputs(default_input_data)
        self.add_differentiated_inputs()
        self.add_differentiated_outputs()

        # Probe the regressor at the default inputs:
        # if it cannot provide a Jacobian, fall back to finite differences.
        try:
            regressor.predict_jacobian(self.io.input_grammar.defaults)
            self.linearization_mode = self.LinearizationMode.AUTO
        except NotImplementedError:
            self.linearization_mode = self.LinearizationMode.FINITE_DIFFERENCES

    def _get_string_representation(self) -> MultiLineString:
        """Build the multi-line description shared by the text and HTML views.

        Returns:
            The description of the discipline, its dataset and its regressor.
        """
        description = MultiLineString()
        description.add("Surrogate discipline: {}", self.name)
        description.indent()
        model = self.regression_model
        learning_set = model.learning_set
        rows = (
            ("Dataset name: {}", learning_set.name),
            ("Dataset size: {}", len(learning_set)),
            ("Surrogate model: {}", model.__class__.__name__),
            ("Inputs: {}", pretty_str(model.input_names)),
            ("Outputs: {}", pretty_str(model.output_names)),
            ("Linearization mode: {}", self.linearization_mode),
        )
        for template, value in rows:
            description.add(template, value)

        return description

    def __repr__(self) -> str:
        """Return the plain-text form of the string representation."""
        representation = self._get_string_representation()
        return str(representation)

    def _repr_html_(self) -> str:
        """Return the HTML form of the string representation."""
        representation = self._get_string_representation()
        return representation._repr_html_()

    def _initialize_grammars(
        self, input_names: Iterable[str] = (), output_names: Iterable[str] = ()
    ) -> None:
        """Fill the input and output grammars from variable names.

        Args:
            input_names: The names of the discipline inputs.
                If empty, use all the inputs of the regression model.
            output_names: The names of the discipline outputs.
                If empty, use all the outputs of the regression model.
        """
        model = self.regression_model
        self.io.input_grammar.update_from_names(input_names or model.input_names)
        self.io.output_grammar.update_from_names(
            output_names or model.output_names
        )

    def _set_default_inputs(
        self,
        default_input_data: Mapping[str, ndarray] = READ_ONLY_EMPTY_DICT,
    ) -> None:
        """Define the default values of the inputs.

        Args:
            default_input_data: The default values of the inputs.
                If empty, use the center of the learning input space.
        """
        self.io.input_grammar.defaults = (
            default_input_data or self.regression_model.input_space_center
        )

    def _run(self, input_data: StrKeyMapping) -> StrKeyMapping | None:
        """Predict the outputs with the regression model.

        A warning is logged when the input point is outside
        the domain of validity of the regression model.

        Args:
            input_data: The input data.

        Returns:
            The predicted output values, flattened, indexed by variable name.
        """
        self.__check_validity_domain(input_data)
        predictions = self.regression_model.predict(input_data)
        return {
            output_name: output_value.flatten()
            for output_name, output_value in predictions.items()
        }

    def _compute_jacobian(
        self,
        input_names: Iterable[str] = (),
        output_names: Iterable[str] = (),
    ) -> None:
        """Store the Jacobian predicted by the regression model in ``jac``.

        The Jacobian is evaluated at the current input data of the discipline.

        Args:
            input_names: Not used by this implementation.
            output_names: Not used by this implementation.
        """
        current_inputs = self.io.get_input_data()
        self.__check_validity_domain(current_inputs)
        self.jac = self.regression_model.predict_jacobian(current_inputs)

    def __check_validity_domain(self, input_data: Mapping[str, ndarray]) -> None:
        """Warn when a point is outside the domain of validity of the surrogate.

        Nothing is raised: a ``ValueError`` coming from the check
        is turned into a logged warning.

        Args:
            input_data: The input data to be checked.
        """
        validity_domain = self.regression_model.validity_domain
        try:
            validity_domain.check_membership(
                validity_domain.convert_dict_to_array(input_data)
            )
        except ValueError:
            LOGGER.warning(
                (
                    "The surrogate discipline %s is used at an input point "
                    "outside its domain of validity: %s."
                ),
                self.name,
                # workaround because input_data is updated somewhere with output_data.
                dict(input_data),
            )

    def get_quality_viewer(self) -> MLRegressorQualityViewer:
        """Return a viewer of the quality of the underlying regressor.

        Returns:
            A viewer of the quality of the underlying regressor.
        """
        viewer = MLRegressorQualityViewer(self.regression_model)
        return viewer

    def get_error_measure(
        self,
        measure_name: str,
        **measure_options: Any,
    ) -> BaseRegressorQuality:
        """Return an error measure.

        Args:
            measure_name: The class name of the error measure.
            **measure_options: The options of the error measure.

        Returns:
            The error measure.
        """
        quality_factory = RegressorQualityFactory()
        return quality_factory.create(
            measure_name, algo=self.regression_model, **measure_options
        )