Source code for gemseo.uncertainty.distributions.base_distribution_fitter

# Copyright 2021 IRT Saint Exupéry, https://www.irt-saintexupery.com
#
# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU Lesser General Public
# License version 3 as published by the Free Software Foundation.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
# Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public License
# along with this program; if not, write to the Free Software Foundation,
# Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
"""Fitting a probability distribution to data using a UQ library."""

from __future__ import annotations

import logging
from abc import abstractmethod
from collections.abc import Iterable
from collections.abc import Mapping
from collections.abc import MutableSequence
from typing import TYPE_CHECKING
from typing import Any
from typing import ClassVar
from typing import Generic
from typing import TypeVar
from typing import Union

from strenum import StrEnum

from gemseo.utils.metaclasses import ABCGoogleDocstringInheritanceMeta

if TYPE_CHECKING:
    from gemseo.typing import RealArray
    from gemseo.typing import StrKeyMapping

# Module-level logger, named after this module.
LOGGER = logging.getLogger(__name__)

# Formatted result of a significance test: a boolean followed by a mapping
# from string keys to floats.
FittingTestResultType = tuple[bool, Mapping[str, float]]
# A goodness-of-fit measure: either a significance test result or a float.
MeasureType = Union[FittingTestResultType, float]
# Type variable parametrizing the generic fitter with its distribution type.
_DistributionT = TypeVar("_DistributionT")


class BaseDistributionFitter(
    Generic[_DistributionT], metaclass=ABCGoogleDocstringInheritanceMeta
):
    """Base class to fit a probability distribution from data using a UQ library."""

    _data: RealArray
    """The data array."""

    _samples: Any
    """The samples."""

    _CRITERIA_TO_WRAPPED_OBJECTS: ClassVar[StrKeyMapping]
    """Fitting criteria to objects of the UQ library."""

    DistributionName: ClassVar[StrEnum]
    """The names of the probability distributions in the UQ library."""

    FittingCriterion: ClassVar[StrEnum]
    """The names of the fitting criteria."""

    SignificanceTest: ClassVar[StrEnum]
    """The names of the fitting criteria that are statistical significance tests."""

    class SelectionCriterion(StrEnum):
        """The selection criteria."""

        FIRST = "first"
        """Select the first distribution satisfying a fitting criterion."""

        BEST = "best"
        """Select the distribution that best satisfies a fitting criterion."""

    _FITTING_CRITERIA_TO_MINIMIZE: ClassVar[set[str]] = set()
    """The fitting criteria to minimize (the others are to be maximized)."""

    def __init__(self, data: RealArray) -> None:
        """
        Args:
            data: A data array.
        """  # noqa: D205,D212,D415
        self.data = data

    @property
    def data(self) -> RealArray:
        """The data array."""
        return self._data

    @data.setter
    def data(self, data_: RealArray) -> None:
        self._data = data_
        # The samples are the flattened view of the data array.
        self._samples = data_.ravel()

    @abstractmethod
    def fit(
        self,
        distribution: DistributionName,  # noqa: F821
    ) -> _DistributionT:
        """Fit a probability distribution to the data.

        Args:
            distribution: The name of a probability distribution in the UQ library.

        Returns:
            The probability distribution fitted to the data.
        """

    @staticmethod
    def _is_enum_member(enum_class: type[StrEnum], item: Any) -> bool:
        """Check whether an item designates a member of an enumeration.

        The item matches when it is a string
        equal to either the value or the name of a member.
        Using a single rule everywhere avoids
        that a criterion is seen as a significance test by one method
        and as a plain criterion by another.

        Args:
            enum_class: The enumeration.
            item: The item to look for; non-string items never match.

        Returns:
            Whether the item is the value or the name of a member.
        """
        if not isinstance(item, str):
            # E.g. an already fitted distribution; no need to hash it.
            return False

        if item in enum_class.__members__:
            return True

        return any(item == member.value for member in enum_class)

    def compute_measure(
        self,
        distribution: _DistributionT | DistributionName,  # noqa: F821
        criterion: FittingCriterion,  # noqa: F821
        level: float = 0.05,
    ) -> MeasureType:
        """Measure a goodness-of-fit of a probability distribution fitted to data.

        Args:
            distribution: Either a |g| probability distribution
                fitted to :attr:`.data`
                or the name of a probability distribution in the UQ library.
            criterion: The name of the fitting criterion
                to measure the goodness-of-fit of the probability distribution.
            level: A test level,
                i.e. the risk of committing a Type 1 error,
                that is an incorrect rejection of a true null hypothesis,
                for criteria based on test hypothesis.

        Returns:
            The goodness-of-fit of the probability distribution fitted to data.
        """
        goodness_of_fit = self._compute_measure(distribution, criterion, level)
        if self._is_enum_member(self.SignificanceTest, criterion):
            return self._format_significance_test_goodness_of_fit(
                goodness_of_fit, level
            )

        return goodness_of_fit

    @abstractmethod
    def _compute_measure(
        self,
        distribution: _DistributionT | DistributionName,  # noqa: F821
        criterion: FittingCriterion,  # noqa: F821
        level: float,
    ) -> Any:
        """Compute a goodness-of-fit of a probability distribution fitted to data.

        This method does not format the result,
        unlike its caller :meth:`.compute_measure`.

        Args:
            distribution: Either a |g| probability distribution
                fitted to :attr:`.data`
                or the name of a probability distribution in the UQ library.
            criterion: The name of the fitting criterion
                to measure the goodness-of-fit of the probability distribution.
            level: A test level,
                i.e. the risk of committing a Type 1 error,
                that is an incorrect rejection of a true null hypothesis,
                for criteria based on test hypothesis.

        Returns:
            The unformatted goodness-of-fit
            of the probability distribution fitted to data.
        """

    @staticmethod
    def _format_significance_test_goodness_of_fit(
        goodness_of_fit: Any, level: float
    ) -> FittingTestResultType:
        """Format a goodness-of-fit measured according to a fitting criterion.

        Args:
            goodness_of_fit: The goodness-of-fit
                measured according to a fitting criterion.
            level: The test level.

        Returns:
            First, whether the null hypothesis is accepted,
            then, a dictionary
            whose keys are "p-value", "statistics" and "level".
        """

    def select(
        self,
        distributions: MutableSequence[_DistributionT | DistributionName],  # noqa: F821
        fitting_criterion: FittingCriterion,  # noqa: F821
        level: float = 0.05,
        selection_criterion: SelectionCriterion = SelectionCriterion.BEST,
    ) -> _DistributionT:
        """Select the best probability distribution according to a fitting criterion.

        The names found in ``distributions`` are replaced in place
        by the corresponding probability distributions fitted to :attr:`.data`.

        Args:
            distributions: A collection of |g| probability distributions
                fitted to :attr:`.data`
                or names of probability distributions in the UQ library.
            fitting_criterion: The name of the fitting criterion
                to measure the goodness-of-fit of the probability distribution.
            level: A test level,
                i.e. the risk of committing a Type 1 error,
                that is an incorrect rejection of a true null hypothesis,
                for criteria based on test hypothesis.
            selection_criterion: The name of the selection criterion.

        Returns:
            The best probability distribution
            according to the fitting criterion and the selection criterion.

        Raises:
            ValueError: When ``distributions`` is empty.
        """
        measures = []
        for index, candidate in enumerate(distributions):
            fitted = candidate
            if self._is_enum_member(self.DistributionName, candidate):
                fitted = self.fit(candidate)
                # Store the fitted distribution so that the returned item
                # is a distribution and not a name.
                distributions[index] = fitted

            measures.append(self.compute_measure(fitted, fitting_criterion, level))

        best_distribution_index = self.select_from_measures(
            measures, fitting_criterion, level, selection_criterion
        )
        return distributions[best_distribution_index]

    @classmethod
    def select_from_measures(
        cls,
        measures: MutableSequence[MeasureType],
        fitting_criterion: FittingCriterion,  # noqa: F821
        level: float = 0.05,
        selection_criterion: SelectionCriterion = SelectionCriterion.BEST,
    ) -> int:
        """Select the best probability distribution according to a fitting criterion.

        Args:
            measures: The goodness-of-fit measures.
            fitting_criterion: The name of the fitting criterion
                to measure the goodness-of-fit of the probability distribution.
            level: A test level,
                i.e. the risk of committing a Type 1 error,
                that is an incorrect rejection of a true null hypothesis,
                for criteria based on test hypothesis.
            selection_criterion: The name of the selection criterion.

        Returns:
            The index of the best probability distribution
            according to the fitting criterion and the selection criterion.

        Raises:
            ValueError: When ``measures`` is empty.
        """
        if not measures:
            msg = (
                "At least one goodness-of-fit measure is required "
                "to select a distribution."
            )
            raise ValueError(msg)

        if not cls._is_enum_member(cls.SignificanceTest, fitting_criterion):
            # Plain criteria are compared directly,
            # whatever the selection criterion.
            return cls.__compute_index(fitting_criterion, measures)

        # A significance test result is (acceptance, details);
        # only the p-value is used to rank the distributions.
        p_values = [measure[1]["p-value"] for measure in measures]
        if all(p_value < level for p_value in p_values):
            LOGGER.warning(
                "All criteria values are lower than the significance level %s.",
                level,
            )

        if selection_criterion != cls.SelectionCriterion.BEST:
            for index, p_value in enumerate(p_values):
                if p_value >= level:
                    return index

        # Either the best distribution is requested
        # or no distribution passes the test.
        return cls.__compute_index(fitting_criterion, p_values)

    @classmethod
    def __compute_index(
        cls,
        fitting_criterion: FittingCriterion,  # noqa: F821
        measures: Iterable[MeasureType],
    ) -> int:
        """Compute the best distribution index according to a fitting criterion.

        Args:
            fitting_criterion: The name of the fitting criterion
                to measure the goodness-of-fit of the probability distribution.
            measures: The goodness-of-fit measures.

        Returns:
            The index of the best probability distribution
            according to a fitting criterion.
        """
        # Materialize once: the measures are traversed twice
        # and any iterable is accepted.
        values = list(measures)
        op = min if fitting_criterion in cls._FITTING_CRITERIA_TO_MINIMIZE else max
        return values.index(op(values))

    @property
    def available_distributions(self) -> list[str]:
        """The available probability distributions."""
        return sorted({t.value for t in self.DistributionName})

    @property
    def available_criteria(self) -> list[str]:
        """The available fitting criteria."""
        return sorted({t.value for t in self.FittingCriterion})

    @property
    def available_significance_tests(self) -> list[str]:
        """The significance tests."""
        return sorted({t.value for t in self.SignificanceTest})