Source code for gemseo.mlearning.resampling.resampler

# Copyright 2021 IRT Saint Exupéry, https://www.irt-saintexupery.com
#
# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU Lesser General Public
# License version 3 as published by the Free Software Foundation.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
# Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public License
# along with this program; if not, write to the Free Software Foundation,
# Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
"""A base class for resampling and surrogate modeling."""

from __future__ import annotations

from abc import abstractmethod
from copy import deepcopy
from typing import TYPE_CHECKING

from numpy import concatenate
from numpy import ndarray
from numpy import vstack

from gemseo.utils.metaclasses import ABCGoogleDocstringInheritanceMeta
from gemseo.utils.seeder import SEED

if TYPE_CHECKING:
    from numpy.typing import NDArray

    from gemseo.mlearning.core.ml_algo import MLAlgo
    from gemseo.mlearning.resampling.splits import Splits


class Resampler(metaclass=ABCGoogleDocstringInheritanceMeta):
    """A base class for resampling and surrogate modeling."""

    name: str
    """The name of the resampler.

    Use the class name by default.
    """

    _sample_indices: NDArray[int]
    """The original indices of the samples."""

    _seed: int | None
    """The seed to initialize the random generator.

    If ``None``,
    then fresh, unpredictable entropy will be pulled from the OS.
    """

    _splits: Splits
    """The train-test splits resulting from the splitting of the samples.

    A train-test split is a partition
    whose first component contains the indices of the learning samples
    and the second one the indices of the test samples.
    """

    _n_splits: int
    """The number of train-test splits."""

    def __init__(
        self,
        sample_indices: NDArray[int],
        n_splits: int,
        seed: int | None = SEED,
    ) -> None:
        """
        Args:
            sample_indices: The original indices of the samples.
            n_splits: The number of train-test splits.
            seed: The seed to initialize the random generator.
                If ``None``,
                then fresh, unpredictable entropy will be pulled from the OS.
        """  # noqa: D205 D212
        # The three attributes below must be set before calling
        # ``_create_splits`` because subclasses may read them there.
        self._n_splits = n_splits
        self._seed = seed
        self._sample_indices = sample_indices
        self._splits = self._create_splits()
        self.name = self.__class__.__name__

    @abstractmethod
    def _create_splits(self) -> Splits:
        """Create the train-test splits."""

    @property
    def sample_indices(self) -> NDArray[int]:
        """The indices of the samples after shuffling."""
        return self._sample_indices

    @property
    def seed(self) -> int | None:
        """The seed to initialize the random generator.

        If ``None``,
        then fresh, unpredictable entropy will be pulled from the OS.
        """
        return self._seed

    @property
    def splits(self) -> Splits:
        """The train-test splits resulting from the splitting of the samples.

        A train-test split is a partition
        whose first component contains the indices of the learning samples
        and the second one the indices of the test samples.
        """
        return self._splits

    def __eq__(self, other: object) -> bool:
        """Check whether two resamplers define the same train-test splits.

        Args:
            other: The object to compare with.

        Returns:
            Whether the two resamplers have equal train-test splits;
            ``NotImplemented`` when ``other`` is not a resampler,
            so that Python can fall back to its default comparison
            instead of failing on a missing attribute.
        """
        if not isinstance(other, Resampler):
            return NotImplemented
        return self._splits == other._splits

    def execute(
        self,
        model: MLAlgo,
        return_models: bool,
        predict: bool,
        stack_predictions: bool,
        fit_transformers: bool,
        store_sampling_result: bool,
        input_data: ndarray,
        output_data_shape: tuple[int, ...],
    ) -> tuple[list[MLAlgo], list[ndarray] | ndarray]:
        """Apply the resampling technique to a machine learning model.

        Args:
            model: The machine learning model.
            return_models: Whether the sub-models resulting from resampling
                are returned.
            predict: Whether the sub-models resulting from resampling
                do prediction on the input data of their corresponding test
                samples.
            stack_predictions: Whether the sub-predictions are stacked.
            fit_transformers: Whether to re-fit the transformers.
            store_sampling_result: Whether to store the sampling results
                in the attribute :class:`~.MLAlgo.resampling_results`
                of the original model.
            input_data: The input data.
            output_data_shape: The shape of the output data array.

        Returns:
            First the sub-models resulting from resampling
            (an empty list if ``return_models`` is ``False``),
            then the predictions, either per fold or stacked
            (an empty list if ``predict`` is ``False``).
        """
        # Reuse a stored result when it was produced with the same splits.
        # NOTE: the stored result is returned as is, whatever the values of
        # ``return_models``, ``predict`` and ``stack_predictions`` passed to
        # the current call.
        if self.name in model.resampling_results:
            resampler, sub_models, predictions = model.resampling_results[self.name]
            if self == resampler:
                return sub_models, predictions

        # When the sub-models are not returned, a single copy of the model is
        # re-trained on each split instead of creating one copy per split.
        if not return_models:
            sub_model = deepcopy(model)

        predictions = []
        sub_models = []
        for split in self._splits:
            if return_models:
                sub_model = deepcopy(model)
                sub_models.append(sub_model)

            sub_model.learn(samples=split.train, fit_transformers=fit_transformers)
            if predict:
                predictions.append(sub_model.predict(input_data[split.test]))

        if predict:
            predictions = self._post_process_predictions(
                predictions, output_data_shape, stack_predictions
            )

        if store_sampling_result:
            model.resampling_results[self.name] = (self, sub_models, predictions)

        return sub_models, predictions

    def _post_process_predictions(
        self,
        predictions: list[ndarray],
        output_data_shape: tuple[int, ...],
        stack_predictions: bool,
    ) -> ndarray | list[ndarray]:
        """Stack the predictions if required.

        Args:
            predictions: The predictions per fold.
            output_data_shape: The shape of the full learning output data.
            stack_predictions: Whether to stack the predictions.

        Returns:
            The predictions, either stacked or as is.
        """
        if not stack_predictions:
            return predictions

        # One-dimensional output data are joined end to end;
        # otherwise the per-fold predictions are stacked row-wise.
        function = concatenate if len(output_data_shape) == 1 else vstack
        return function(predictions)