# Source code for gemseo.uncertainty.distributions.base_distribution_fitter

# Copyright 2021 IRT Saint Exupéry, https://www.irt-saintexupery.com
#
# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU Lesser General Public
# License version 3 as published by the Free Software Foundation.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
# Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public License
# along with this program; if not, write to the Free Software Foundation,
# Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
"""Fitting a probability distribution to data using a UQ library."""

from __future__ import annotations

import logging
from abc import abstractmethod
from collections.abc import Iterable
from collections.abc import Mapping
from collections.abc import MutableSequence
from typing import TYPE_CHECKING
from typing import Any
from typing import ClassVar
from typing import Generic
from typing import TypeVar
from typing import Union

from strenum import StrEnum

from gemseo.utils.metaclasses import ABCGoogleDocstringInheritanceMeta

if TYPE_CHECKING:
    from gemseo.typing import RealArray
    from gemseo.typing import StrKeyMapping

# The module-level logger, named after this module.
LOGGER = logging.getLogger(__name__)

# The formatted result of a significance test:
# a boolean followed by a mapping from names to float values.
FittingTestResultType = tuple[bool, Mapping[str, float]]
# A goodness-of-fit measure:
# either a formatted significance test result or a plain number.
MeasureType = Union[FittingTestResultType, float]
# The type of the probability distributions returned by a fitter.
_DistributionT = TypeVar("_DistributionT")



class BaseDistributionFitter(
    Generic[_DistributionT], metaclass=ABCGoogleDocstringInheritanceMeta
):
    """Base class to fit a probability distribution from data using a UQ library.

    A subclass is expected to define
    the enumerations :attr:`.DistributionName`,
    :attr:`.FittingCriterion` and :attr:`.SignificanceTest`
    and to implement the abstract methods
    :meth:`.fit` and ``_compute_measure``.
    """

    _data: RealArray
    """The data array."""

    _samples: Any
    """The samples, i.e. the flattened version of the data array."""

    _CRITERIA_TO_WRAPPED_OBJECTS: ClassVar[StrKeyMapping]
    """Fitting criteria to objects of the UQ library."""

    DistributionName: ClassVar[type[StrEnum]]
    """The names of the probability distributions in the UQ library."""

    FittingCriterion: ClassVar[type[StrEnum]]
    """The names of the fitting criteria."""

    SignificanceTest: ClassVar[type[StrEnum]]
    """The names of the fitting criteria that are statistical significance tests."""

    class SelectionCriterion(StrEnum):
        """The selection criteria."""

        FIRST = "first"
        """Select the first distribution satisfying a fitting criterion."""

        BEST = "best"
        """Select the distribution that best satisfies a fitting criterion."""

    _FITTING_CRITERIA_TO_MINIMIZE: ClassVar[set[str]] = set()
    """The fitting criteria to minimize (the others are to be maximized)."""

    def __init__(self, data: RealArray) -> None:
        """
        Args:
            data: A data array.
        """  # noqa: D205,D212,D415
        # Go through the property setter so that the samples are set too.
        self.data = data

    @property
    def data(self) -> RealArray:
        """The data array."""
        return self._data

    @data.setter
    def data(self, data_: RealArray) -> None:
        self._data = data_
        # Keep a flattened version of the data alongside the original array.
        self._samples = data_.ravel()

    @abstractmethod
    def fit(
        self,
        distribution: DistributionName,  # noqa: F821
    ) -> _DistributionT:
        """Fit a probability distribution to the data.

        Args:
            distribution: The name of a probability distribution in the UQ library.

        Returns:
            The probability distribution fitted to the data.
        """

    def compute_measure(
        self,
        distribution: _DistributionT | DistributionName,  # noqa: F821
        criterion: FittingCriterion,  # noqa: F821
        level: float = 0.05,
    ) -> MeasureType:
        """Measure a goodness-of-fit of a probability distribution fitted to data.

        Args:
            distribution: Either a |g| probability distribution fitted to :attr:`.data`
                or the name of a probability distribution in the UQ library.
            criterion: The name of the fitting criterion
                to measure the goodness-of-fit of the probability distribution.
            level: A test level,
                i.e. the risk of committing a Type 1 error,
                that is an incorrect rejection of a true null hypothesis,
                for criteria based on test hypothesis.

        Returns:
            The goodness-of-fit of the probability distribution fitted to data;
            it is formatted by ``_format_significance_test_goodness_of_fit``
            when the criterion is a significance test
            and returned as computed by ``_compute_measure`` otherwise.
        """
        goodness_of_fit = self._compute_measure(distribution, criterion, level)
        # Only the significance tests need a formatting of the raw result.
        if criterion in {t.value for t in self.SignificanceTest}:
            return self._format_significance_test_goodness_of_fit(
                goodness_of_fit, level
            )

        return goodness_of_fit

    @abstractmethod
    def _compute_measure(
        self,
        distribution: _DistributionT | DistributionName,  # noqa: F821
        criterion: FittingCriterion,  # noqa: F821
        level: float,
    ) -> Any:
        """Compute a goodness-of-fit of a probability distribution fitted to data.

        This method does not format the result,
        unlike its caller :meth:`.compute_measure`.

        Args:
            distribution: Either a |g| probability distribution fitted to :attr:`.data`
                or the name of a probability distribution in the UQ library.
            criterion: The name of the fitting criterion
                to measure the goodness-of-fit of the probability distribution.
            level: A test level,
                i.e. the risk of committing a Type 1 error,
                that is an incorrect rejection of a true null hypothesis,
                for criteria based on test hypothesis.

        Returns:
            The unformatted goodness-of-fit
            of the probability distribution fitted to data.
        """

    @staticmethod
    def _format_significance_test_goodness_of_fit(
        goodness_of_fit: Any, level: float
    ) -> FittingTestResultType:
        """Format a goodness-of-fit measured according to a fitting criterion.

        NOTE(review): this base implementation has no body and thus returns ``None``;
        subclasses defining significance tests presumably override it.

        Args:
            goodness_of_fit: The goodness-of-fit
                measured according to a fitting criterion.
            level: A test level,
                i.e. the risk of committing a Type 1 error.

        Returns:
            First,
            whether the null hypothesis is accepted,
            then,
            a dictionary whose keys are "p-value", "statistics" and "level".
        """

    def select(
        self,
        distributions: Iterable[_DistributionT | DistributionName],  # noqa: F821
        fitting_criterion: FittingCriterion,  # noqa: F821
        level: float = 0.05,
        selection_criterion: SelectionCriterion = SelectionCriterion.BEST,
    ) -> _DistributionT:
        """Select the best probability distribution according to a fitting criterion.

        The collection passed as ``distributions`` is left untouched.

        Args:
            distributions: A collection of |g| probability distributions
                fitted to :attr:`.data`
                or names of probability distributions in the UQ library.
            fitting_criterion: The name of the fitting criterion
                to measure the goodness-of-fit of the probability distribution.
            level: A test level,
                i.e. the risk of committing a Type 1 error,
                that is an incorrect rejection of a true null hypothesis,
                for criteria based on test hypothesis.
            selection_criterion: The name of the selection criterion.

        Returns:
            The best probability distribution
            according to the fitting criterion and the selection criterion.

        Raises:
            ValueError: When ``distributions`` is empty.
        """
        distribution_names = self.DistributionName.__members__
        # The fitted distributions are stored in a local list
        # rather than written back into the argument,
        # so that the caller's collection is not modified
        # and any iterable (e.g. a tuple) can be passed.
        fitted_distributions = []
        measures = []
        for distribution in distributions:
            if distribution in distribution_names:
                distribution = self.fit(distribution)

            fitted_distributions.append(distribution)
            measures.append(
                self.compute_measure(distribution, fitting_criterion, level)
            )

        best_distribution_index = self.select_from_measures(
            measures, fitting_criterion, level, selection_criterion
        )
        return fitted_distributions[best_distribution_index]

    @classmethod
    def select_from_measures(
        cls,
        measures: MutableSequence[MeasureType],
        fitting_criterion: FittingCriterion,  # noqa: F821
        level: float = 0.05,
        selection_criterion: SelectionCriterion = SelectionCriterion.BEST,
    ) -> int:
        """Select the best probability distribution according to a fitting criterion.

        Args:
            measures: The goodness-of-fit measures.
            fitting_criterion: The name of the fitting criterion
                to measure the goodness-of-fit of the probability distribution.
            level: A test level,
                i.e. the risk of committing a Type 1 error,
                that is an incorrect rejection of a true null hypothesis,
                for criteria based on test hypothesis.
            selection_criterion: The name of the selection criterion.

        Returns:
            The index of the best probability distribution
            according to the fitting criterion and the selection criterion.

        Raises:
            ValueError: When ``measures`` is empty.
        """
        is_significance_test = fitting_criterion in cls.SignificanceTest.__members__
        if is_significance_test:
            # A significance test result is a pair whose second item is a mapping;
            # the selection is based on its p-value only.
            values = [measure[1]["p-value"] for measure in measures]
        else:
            values = list(measures)

        if not values:
            # Fail early with an explicit message
            # instead of logging a misleading warning
            # and letting min()/max() complain about an empty sequence.
            msg = "At least one goodness-of-fit measure is required."
            raise ValueError(msg)

        if is_significance_test and all(p_value < level for p_value in values):
            LOGGER.warning(
                "All criteria values are lower than the significance level %s.",
                level,
            )

        if (
            selection_criterion == cls.SelectionCriterion.BEST
            or not is_significance_test
        ):
            return cls.__compute_index(fitting_criterion, values)

        # FIRST selection criterion applied to a significance test:
        # return the first distribution whose p-value reaches the level.
        for index, p_value in enumerate(values):
            if p_value >= level:
                return index

        # No distribution passes the test: fall back to the best one.
        return cls.__compute_index(fitting_criterion, values)

    @classmethod
    def __compute_index(
        cls,
        fitting_criterion: FittingCriterion,  # noqa: F821
        measures: Iterable[MeasureType],
    ) -> int:
        """Compute the best distribution index according to a fitting criterion.

        Args:
            fitting_criterion: The name of the fitting criterion
                to measure the goodness-of-fit of the probability distribution.
            measures: The goodness-of-fit measures.

        Returns:
            The index of the best probability distribution
            according to a fitting criterion;
            the first one in case of equality.

        Raises:
            ValueError: When ``measures`` is empty.
        """
        # Materialize the measures as they are traversed twice
        # and ``index`` is not available on an arbitrary iterable.
        measures = list(measures)
        op = min if fitting_criterion in cls._FITTING_CRITERIA_TO_MINIMIZE else max
        return measures.index(op(measures))

    @property
    def available_distributions(self) -> list[str]:
        """The available probability distributions."""
        return sorted({t.value for t in self.DistributionName})

    @property
    def available_criteria(self) -> list[str]:
        """The available fitting criteria."""
        return sorted({t.value for t in self.FittingCriterion})

    @property
    def available_significance_tests(self) -> list[str]:
        """The significance tests."""
        return sorted({t.value for t in self.SignificanceTest})