# Source code for gemseo.mlearning.resampling.resampler
# Copyright 2021 IRT Saint Exupéry, https://www.irt-saintexupery.com
#
# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU Lesser General Public
# License version 3 as published by the Free Software Foundation.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
# Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public License
# along with this program; if not, write to the Free Software Foundation,
# Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
"""A base class for resampling and surrogate modeling."""
from __future__ import annotations
from abc import abstractmethod
from copy import deepcopy
from typing import TYPE_CHECKING
from numpy import concatenate
from numpy import ndarray
from numpy import vstack
from gemseo.utils.metaclasses import ABCGoogleDocstringInheritanceMeta
from gemseo.utils.seeder import SEED
if TYPE_CHECKING:
from numpy.typing import NDArray
from gemseo.mlearning.core.ml_algo import MLAlgo
from gemseo.mlearning.resampling.splits import Splits
class Resampler(metaclass=ABCGoogleDocstringInheritanceMeta):
    """A base class for resampling and surrogate modeling."""

    name: str
    """The name of the resampler.

    Use the class name by default.
    """

    _sample_indices: NDArray[int]
    """The original indices of the samples."""

    _seed: int | None
    """The seed to initialize the random generator.

    If ``None``,
    then fresh, unpredictable entropy will be pulled from the OS.
    """

    _splits: Splits
    """The train-test splits resulting from the splitting of the samples.

    A train-test split is a partition whose first component contains the indices of the
    learning samples and the second one the indices of the test samples.
    """

    _n_splits: int
    """The number of train-test splits."""

    def __init__(
        self,
        sample_indices: NDArray[int],
        n_splits: int,
        seed: int | None = SEED,
    ) -> None:
        """
        Args:
            sample_indices: The original indices of the samples.
            n_splits: The number of train-test splits.
            seed: The seed to initialize the random generator.
                If ``None``,
                then fresh, unpredictable entropy will be pulled from the OS.
        """  # noqa: D205 D212
        self._n_splits = n_splits
        self._seed = seed
        self._sample_indices = sample_indices
        # The concrete subclass decides how the samples are partitioned.
        self._splits = self._create_splits()
        self.name = self.__class__.__name__

    @abstractmethod
    def _create_splits(self) -> Splits:
        """Create the train-test splits."""

    @property
    def sample_indices(self) -> NDArray[int]:
        """The indices of the samples after shuffling."""
        return self._sample_indices

    @property
    def seed(self) -> int | None:
        """The seed to initialize the random generator.

        If ``None``, fresh, unpredictable entropy is pulled from the OS.
        """
        # NOTE: the annotation matches ``_seed``; ``None`` is a valid value.
        return self._seed

    @property
    def splits(self) -> Splits:
        """The train-test splits resulting from the splitting of the samples.

        A train-test split is a partition whose first component contains the indices of
        the learning samples and the second one the indices of the test samples.
        """
        return self._splits

    def __eq__(self, other: object) -> bool:
        # Two resamplers are equal when they generate the same train-test splits,
        # whatever their concrete classes or seeds.
        if not isinstance(other, Resampler):
            # Let Python fall back to the other operand's __eq__ instead of
            # raising AttributeError on a missing ``_splits``.
            return NotImplemented
        return self._splits == other._splits

    def execute(
        self,
        model: MLAlgo,
        return_models: bool,
        predict: bool,
        stack_predictions: bool,
        fit_transformers: bool,
        store_sampling_result: bool,
        input_data: ndarray,
        output_data_shape: tuple[int, ...],
    ) -> tuple[list[MLAlgo], list[ndarray] | ndarray]:
        """Apply the resampling technique to a machine learning model.

        Args:
            model: The machine learning model.
            return_models: Whether the sub-models resulting
                from resampling are returned.
            predict: Whether the sub-models resulting from sampling do prediction
                on their corresponding learning data.
            stack_predictions: Whether the sub-predictions are stacked.
            fit_transformers: Whether to re-fit the transformers.
            store_sampling_result: Whether to store the sampling results
                in the attribute :class:`~.MLAlgo.resampling_results`
                of the original model.
            input_data: The input data.
            output_data_shape: The shape of the output data array.

        Returns:
            First the sub-models resulting from resampling
            if ``return_models`` is ``True``
            then the predictions, either per fold or stacked.

        Raises:
            ValueError: When the model is
                neither a supervised algorithm nor a clustering one.
        """
        # Reuse cached results when this exact resampling was already performed.
        if self.name in model.resampling_results:
            (resampler, sub_models, predictions) = model.resampling_results[self.name]
            if self == resampler:
                return sub_models, predictions

        if not return_models:
            # A single copy is re-trained for every split instead of one per split.
            sub_model = deepcopy(model)

        predictions = []
        sub_models = []
        for split in self._splits:
            if return_models:
                sub_model = deepcopy(model)
                sub_models.append(sub_model)

            sub_model.learn(samples=split.train, fit_transformers=fit_transformers)
            if predict:
                predictions.append(sub_model.predict(input_data[split.test]))

        if predict:
            predictions = self._post_process_predictions(
                predictions, output_data_shape, stack_predictions
            )

        if store_sampling_result:
            model.resampling_results[self.name] = (self, sub_models, predictions)

        return sub_models, predictions

    def _post_process_predictions(
        self,
        predictions: list[ndarray],
        output_data_shape: tuple[int, ...],
        stack_predictions: bool,
    ) -> ndarray | list[ndarray]:
        """Stack the predictions if required.

        Args:
            predictions: The predictions per fold.
            output_data_shape: The shape of the full learning output data.
            stack_predictions: Whether to stack the predictions.

        Returns:
            The predictions, either stacked or as is.
        """
        if stack_predictions:
            # 1-D outputs are joined end-to-end; multi-dimensional ones row-wise.
            function = concatenate if len(output_data_shape) == 1 else vstack
            return function(predictions)

        return predictions