# Source code for gemseo.mlearning.resampling.cross_validation

# Copyright 2021 IRT Saint Exupéry, https://www.irt-saintexupery.com
#
# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU Lesser General Public
# License version 3 as published by the Free Software Foundation.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
# Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public License
# along with this program; if not, write to the Free Software Foundation,
# Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
"""A cross-validation tool for resampling and surrogate modeling."""

from __future__ import annotations

from typing import TYPE_CHECKING

from numpy import array_split
from numpy import concatenate
from numpy import empty
from numpy import ndarray
from numpy import setdiff1d
from numpy import vstack
from numpy.random import default_rng

from gemseo.mlearning.resampling.base_resampler import BaseResampler
from gemseo.mlearning.resampling.split import Split
from gemseo.mlearning.resampling.splits import Splits
from gemseo.utils.seeder import SEED

if TYPE_CHECKING:
    from numpy.typing import NDArray

    from gemseo.mlearning import MLAlgo



class CrossValidation(BaseResampler):
    """A cross-validation tool for resampling and surrogate modeling.

    The sample indices, optionally shuffled, are cut into ``n_folds`` consecutive
    chunks. Each chunk is used once as the test part of a split, and the
    remaining indices form the corresponding training part.
    """

    __randomize: bool
    """Whether the sample indices are shuffled before splitting."""

    __shuffled_sample_indices: NDArray[int]
    """The indices of the samples after shuffling."""

    def __init__(
        self,
        sample_indices: NDArray[int],
        n_folds: int = 5,
        randomize: bool = False,
        seed: int | None = SEED,
    ) -> None:
        """
        Args:
            sample_indices: The indices of the samples to split into folds.
                They are copied, so the array passed by the caller
                is never shuffled in place.
            n_folds: The number of folds.
            randomize: Whether the sample indices are shuffled before splitting.
            seed: The seed of the random generator used for shuffling;
                it is also forwarded to the base class.
        """  # noqa: D205 D212
        # These attributes are set before calling the base constructor;
        # NOTE(review): presumably the base constructor creates the splits
        # through ``_create_splits``, which reads the shuffled indices -- confirm
        # before reordering.
        self.__randomize = randomize
        self.__shuffled_sample_indices = sample_indices.copy()
        if randomize:
            default_rng(seed).shuffle(self.__shuffled_sample_indices)
        super().__init__(sample_indices, n_splits=n_folds, seed=seed)
        # One sample per fold: this is a leave-one-out cross-validation.
        if len(sample_indices) == n_folds:
            self.name = "LeaveOneOut"

    def execute(
        self,
        model: MLAlgo,
        return_models: bool = False,
        input_data: ndarray | None = None,
        stack_predictions: bool = True,
        fit_transformers: bool = True,
        store_sampling_result: bool = False,
    ) -> tuple[list[MLAlgo], list[ndarray] | ndarray]:
        """
        Args:
            stack_predictions: Whether the sub-predictions are stacked
                into a single array
                in which the prediction associated with the sample index ``i``
                is stored at position ``i``
                (this matches the order of the ``sample_indices``
                passed at instantiation
                when these are ``0, 1, 2, ...``).
                This argument is ignored when ``input_data`` is ``None``.
        """  # noqa: D205, D212, D415
        # Pure pass-through: this override only changes the default values
        # and the documentation of the arguments.
        return super().execute(
            model,
            return_models=return_models,
            input_data=input_data,
            stack_predictions=stack_predictions,
            fit_transformers=fit_transformers,
            store_sampling_result=store_sampling_result,
        )

    def _create_splits(self) -> Splits:
        """Create one train-test split per fold.

        Returns:
            The splits, in the order of the folds.
        """
        shuffled_indices = self.__shuffled_sample_indices
        # array_split tolerates a number of samples
        # that is not a multiple of the number of folds.
        folds = array_split(shuffled_indices, self._n_splits)
        # setdiff1d returns the sorted unique indices that are not in the fold,
        # so the training indices are sorted even when the samples are shuffled.
        return Splits(*[
            Split(setdiff1d(shuffled_indices, test_indices), test_indices)
            for test_indices in folds
        ])

    @property
    def shuffled_sample_indices(self) -> NDArray[int]:
        """The indices of the samples after shuffling.

        They are a copy of the original indices, in the same order, when
        ``randomize`` is ``False``.
        """
        return self.__shuffled_sample_indices

    @property
    def n_folds(self) -> int:
        """The number of folds."""
        return self._n_splits

    @property
    def randomize(self) -> bool:
        """Whether the sample indices are shuffled before splitting."""
        return self.__randomize

    def _post_process_predictions(
        self,
        predictions: list[ndarray],
        stack_predictions: bool,
    ) -> ndarray | list[ndarray]:
        """Stack the sub-predictions into a single array if requested.

        Args:
            predictions: The sub-predictions;
                NOTE(review): presumably one array per fold, in the order of the
                folds and of their test indices -- confirm against the base class.
            stack_predictions: Whether to stack the sub-predictions.

        Returns:
            Either the list of sub-predictions left untouched
            or a single array
            where the ``k``-th stacked prediction is stored
            at the position given by the ``k``-th shuffled sample index.
        """
        if not stack_predictions:
            return predictions

        first_prediction = predictions[0]
        n_predictions = sum(len(prediction) for prediction in predictions)
        # The buffer uses the default dtype of numpy.empty (float64).
        if first_prediction.ndim == 1:
            stacked_predictions = concatenate(predictions)
            final_predictions = empty((n_predictions,))
        else:
            stacked_predictions = vstack(predictions)
            final_predictions = empty((n_predictions, first_prediction.shape[1]))

        # Undo the shuffling: each prediction goes to the row of its sample index.
        # This requires every sample index to be lower than n_predictions.
        final_predictions[self.__shuffled_sample_indices] = stacked_predictions
        return final_predictions