Source code for gemseo.uncertainty.distributions.base_distribution_fitter

# Copyright 2021 IRT Saint Exupéry, https://www.irt-saintexupery.com
#
# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU Lesser General Public
# License version 3 as published by the Free Software Foundation.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
# Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public License
# along with this program; if not, write to the Free Software Foundation,
# Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
"""Fitting a probability distribution to data using a UQ library."""

from __future__ import annotations

import logging
from abc import abstractmethod
from collections.abc import Mapping
from typing import TYPE_CHECKING
from typing import Any
from typing import ClassVar
from typing import Generic
from typing import TypeVar

from strenum import StrEnum

from gemseo.utils.metaclasses import ABCGoogleDocstringInheritanceMeta

if TYPE_CHECKING:
    from collections.abc import Iterable
    from collections.abc import MutableSequence

    from gemseo.typing import RealArray
    from gemseo.typing import StrKeyMapping

LOGGER = logging.getLogger(__name__)

FittingTestResultType = tuple[bool, Mapping[str, float]]
MeasureType = FittingTestResultType | float
_DistributionT = TypeVar("_DistributionT")



[docs]
class BaseDistributionFitter(
    Generic[_DistributionT], metaclass=ABCGoogleDocstringInheritanceMeta
):
    """Base class to fit a probability distribution from data using a UQ library."""

    _data: RealArray
    """The data array."""

    _samples: Any
    """The samples."""

    variable: str
    """The name of the variable."""

    _CRITERIA_TO_WRAPPED_OBJECTS: ClassVar[StrKeyMapping]
    """Fitting criteria to objects of the UQ library."""

    DistributionName: ClassVar[StrEnum]
    """The names of the probability distributions in the UQ library."""

    FittingCriterion: ClassVar[StrEnum]
    """The names of the fitting criteria."""

    default_fitting_criterion: ClassVar[BaseDistributionFitter.FittingCriterion]
    """The name of the default fitting criterion."""

    SignificanceTest: ClassVar[StrEnum]
    """The names of the fitting criteria that are statistical significance tests."""


[docs]
    class SelectionCriterion(StrEnum):
        """The selection criteria.

        A selection criterion tells :meth:`.select_from_measures`
        how to pick a distribution from the goodness-of-fit measures.
        """

        # For a significance test, ``select_from_measures`` returns the first
        # index whose p-value is greater than or equal to the test level and
        # falls back to the best one when there is none.
        # For any other fitting criterion, it behaves like ``BEST``.
        FIRST = "first"
        """Select the first distribution satisfying a fitting criterion."""

        # The distribution whose measure is extremal: minimal for the criteria
        # listed in ``_FITTING_CRITERIA_TO_MINIMIZE``, maximal otherwise.
        BEST = "best"
        """Select the distribution that best satisfies a fitting criterion"""


    _FITTING_CRITERIA_TO_MINIMIZE: ClassVar[set[str]] = set()
    """The fitting criteria to minimize (the others are to be maximized)."""

    def __init__(self, variable: str, data: RealArray) -> None:
        # TODO: API: rename variable to variable_name or remove it because useless.
        """
        Args:
            variable: The name of the variable.
            data: A data array.
                It is stored through the :attr:`.data` property,
                whose setter also stores ``data.ravel()`` in ``_samples``.
        """  # noqa: D205,D212,D415
        # Go through the property setter so that ``_samples`` is set as well.
        self.data = data
        self.variable = variable

    @property
    def data(self) -> RealArray:
        """The data array.

        Setting this property also refreshes ``_samples``
        with the flattened version of the new array.
        """
        return self._data

    @data.setter
    def data(self, data_: RealArray) -> None:
        # Keep the original array as is ...
        self._data = data_
        # ... and store its flattened counterpart (``ravel``) alongside it.
        self._samples = data_.ravel()


[docs]
    @abstractmethod
    def fit(
        self,
        distribution: DistributionName,  # noqa: F821
    ) -> _DistributionT:
        """Fit a probability distribution to the data.

        This method is abstract and has to be implemented by the subclasses.

        Args:
            distribution: The name of a probability distribution in the UQ library.

        Returns:
            The probability distribution fitted to the data.
        """



[docs]
    def compute_measure(
        self,
        distribution: _DistributionT | DistributionName,  # noqa: F821
        criterion: FittingCriterion,  # noqa: F821
        level: float = 0.05,
    ) -> MeasureType:
        """Evaluate how well a probability distribution fits the data.

        The raw measure is delegated to :meth:`._compute_measure`.
        When the criterion is a statistical significance test,
        this raw measure is then formatted
        by :meth:`._format_significance_test_goodness_of_fit`;
        otherwise, it is returned unchanged.

        Args:
            distribution: Either a |g| probability distribution fitted to :attr:`.data`
                or the name of a probability distribution in the UQ library.
            criterion: The name of the fitting criterion
                to measure the goodness-of-fit of the probability distribution.
            level: A test level,
                i.e. the risk of committing a Type 1 error,
                that is an incorrect rejection of a true null hypothesis,
                for criteria based on test hypothesis.

        Returns:
            The goodness-of-fit of the probability distribution fitted to data.
        """
        raw_measure = self._compute_measure(distribution, criterion, level)
        significance_test_names = {test.value for test in self.SignificanceTest}
        if criterion not in significance_test_names:
            # Not a significance test: the raw measure is the final result.
            return raw_measure

        return self._format_significance_test_goodness_of_fit(raw_measure, level)


    @abstractmethod
    def _compute_measure(
        self,
        distribution: _DistributionT | DistributionName,  # noqa: F821
        criterion: FittingCriterion,  # noqa: F821
        level: float,
    ) -> Any:
        """Compute a goodness-of-fit of a probability distribution fitted to data.

        This method does not format the result,
        unlike its caller :meth:`.compute_measure`.
        It is abstract and has to be implemented by the subclasses.

        Args:
            distribution: Either a |g| probability distribution fitted to :attr:`.data`
                or the name of a probability distribution in the UQ library.
            criterion: The name of the fitting criterion
                to measure the goodness-of-fit of the probability distribution.
            level: A test level,
                i.e. the risk of committing a Type 1 error,
                that is an incorrect rejection of a true null hypothesis,
                for criteria based on hypothesis testing.

        Returns:
            The unformatted goodness-of-fit
            of the probability distribution fitted to data.
        """

    @staticmethod
    def _format_significance_test_goodness_of_fit(
        goodness_of_fit: Any, level: float
    ) -> FittingTestResultType:
        """Format a goodness-of-fit measured according to a fitting criterion.

        :meth:`.compute_measure` calls this method
        when the fitting criterion is a significance test.

        This base implementation has no body and therefore returns ``None``.
        NOTE(review): presumably meant to be overridden by the subclasses;
        confirm whether it should be declared abstract.

        Args:
            goodness_of_fit: The goodness-of-fit
                measured according to a fitting criterion.
            level: The test level passed to :meth:`.compute_measure`.

        Returns:
            First,
            whether the null hypothesis is accepted,
            then,
            a dictionary whose keys are "p-value", "statistics" and "level".
        """


[docs]
    def select(
        self,
        distributions: MutableSequence[_DistributionT | DistributionName],  # noqa: F821
        fitting_criterion: FittingCriterion,  # noqa: F821
        level: float = 0.05,
        selection_criterion: SelectionCriterion = SelectionCriterion.BEST,
    ) -> _DistributionT:
        """Select the best probability distribution according to a fitting criterion.

        Args:
            distributions: A collection of |g| probability distributions
                fitted to :attr:`.data`
                or names of probability distributions in the UQ library.
            fitting_criterion: The name of the fitting criterion
                to measure the goodness-of-fit of the probability distribution.
            level: A test level,
                i.e. the risk of committing a Type 1 error,
                that is an incorrect rejection of a true null hypothesis,
                for criteria based on test hypothesis.
            selection_criterion: The name of the selection criterion.

        Returns:
            The best probability distribution
            according to the fitting criterion and the selection criterion.
        """
        measures = []
        for index, distribution in enumerate(distributions):
            if distribution in set(self.DistributionName):
                distribution = self.fit(distribution)

            distributions[index] = distribution
            measures.append(
                self.compute_measure(distribution, fitting_criterion, level)
            )

        best_distribution_index = self.select_from_measures(
            measures, fitting_criterion, level, selection_criterion
        )
        return distributions[best_distribution_index]



[docs]
    @classmethod
    def select_from_measures(
        cls,
        measures: MutableSequence[MeasureType],
        fitting_criterion: FittingCriterion,  # noqa: F821
        level: float = 0.05,
        selection_criterion: SelectionCriterion = SelectionCriterion.BEST,
    ) -> int:
        """Select the best probability distribution according to a fitting criterion.

        Args:
            measures: The goodness-of-fit measures.
            fitting_criterion: The name of the fitting criterion
                to measure the goodness-of-fit of the probability distribution.
            level: A test level,
                i.e. the risk of committing a Type 1 error,
                that is an incorrect rejection of a true null hypothesis,
                for criteria based on test hypothesis.
            selection_criterion: The name of the selection criterion.

        Returns:
            The index of the best probability distribution
            according to the fitting criterion and the selection criterion.
        """
        is_significant_test = fitting_criterion in set(cls.SignificanceTest)
        if is_significant_test:
            measures = [measure[1]["p-value"] for index, measure in enumerate(measures)]
            if all(p_value < level for p_value in measures):
                LOGGER.warning(
                    "All criteria values are lower than the significance level %s.",
                    level,
                )

        if (
            selection_criterion == cls.SelectionCriterion.BEST
            or not is_significant_test
        ):
            return cls.__compute_index(fitting_criterion, measures)

        for index, measure in enumerate(measures):
            if measure >= level:
                return index

        return cls.__compute_index(fitting_criterion, measures)


    @classmethod
    def __compute_index(
        cls,
        fitting_criterion: FittingCriterion,  # noqa: F821
        measures: Iterable[MeasureType],
    ) -> int:
        """Compute the best distribution index according to a fitting criterion.

        Args:
            fitting_criterion: The name of the fitting criterion
                to measure the goodness-of-fit of the probability distribution.
            measures: The goodness-of-fit measures.

        Returns:
            The index of the best probability distribution
            according to a fitting criterion.
        """
        op = min if fitting_criterion in cls._FITTING_CRITERIA_TO_MINIMIZE else max
        return measures.index(op(measures))

    @property
    def available_distributions(self) -> list[str]:
        """The available probability distributions."""
        return sorted({t.value for t in self.DistributionName})

    @property
    def available_criteria(self) -> list[str]:
        """The available fitting criteria."""
        return sorted({t.value for t in self.FittingCriterion})

    @property
    def available_significance_tests(self) -> list[str]:
        """The significance tests."""
        return sorted({t.value for t in self.SignificanceTest})