# Source code for gemseo.uncertainty.distributions.base_distribution_fitter

# Copyright 2021 IRT Saint Exupéry, https://www.irt-saintexupery.com
#
# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU Lesser General Public
# License version 3 as published by the Free Software Foundation.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
# Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public License
# along with this program; if not, write to the Free Software Foundation,
# Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
"""Fitting a probability distribution to data using a UQ library."""

from __future__ import annotations

import logging
from abc import abstractmethod
from collections.abc import Mapping
from typing import TYPE_CHECKING
from typing import Any
from typing import ClassVar
from typing import Generic
from typing import TypeVar

from strenum import StrEnum

from gemseo.utils.metaclasses import ABCGoogleDocstringInheritanceMeta

if TYPE_CHECKING:
    from collections.abc import Iterable
    from collections.abc import MutableSequence

    from gemseo.typing import RealArray
    from gemseo.typing import StrKeyMapping

# Module-level logger, named after this module.
LOGGER = logging.getLogger(__name__)

# Result of a significance test: first, whether the null hypothesis is accepted,
# then a mapping from the names of the test details to their float values.
FittingTestResultType = tuple[bool, Mapping[str, float]]
# A goodness-of-fit measure: either a significance test result or a plain float.
MeasureType = FittingTestResultType | float
# Type of the probability distribution objects returned by a fitter.
_DistributionT = TypeVar("_DistributionT")


class BaseDistributionFitter(
    Generic[_DistributionT], metaclass=ABCGoogleDocstringInheritanceMeta
):
    """Base class to fit a probability distribution from data using a UQ library.

    Concrete fitters must implement :meth:`.fit` and :meth:`._compute_measure`.
    The enumerations :attr:`.DistributionName`, :attr:`.FittingCriterion`
    and :attr:`.SignificanceTest` are only declared here, not assigned.
    """

    _data: RealArray
    """The data array."""

    _samples: Any
    """The samples."""

    variable: str
    """The name of the variable."""

    _CRITERIA_TO_WRAPPED_OBJECTS: ClassVar[StrKeyMapping]
    """Fitting criteria to objects of the UQ library."""

    DistributionName: ClassVar[StrEnum]
    """The names of the probability distributions in the UQ library."""

    FittingCriterion: ClassVar[StrEnum]
    """The names of the fitting criteria."""

    default_fitting_criterion: ClassVar[BaseDistributionFitter.FittingCriterion]
    """The name of the default fitting criterion."""

    SignificanceTest: ClassVar[StrEnum]
    """The names of the fitting criteria that are statistical significance tests."""

    class SelectionCriterion(StrEnum):
        """The selection criteria."""

        FIRST = "first"
        """Select the first distribution satisfying a fitting criterion."""

        BEST = "best"
        """Select the distribution that best satisfies a fitting criterion."""

    _FITTING_CRITERIA_TO_MINIMIZE: ClassVar[set[str]] = set()
    """The fitting criteria to minimize (the others are to be maximized)."""

    def __init__(self, variable: str, data: RealArray) -> None:
        # TODO: API: rename variable to variable_name or remove it because useless.
        """
        Args:
            variable: The name of the variable.
            data: A data array.
        """  # noqa: D205,D212,D415
        # Going through the property setter also initializes ``_samples``.
        self.data = data
        self.variable = variable

    @property
    def data(self) -> RealArray:
        """The data array."""
        return self._data

    @data.setter
    def data(self, data_: RealArray) -> None:
        self._data = data_
        # Keep the flattened samples in sync with the data array.
        self._samples = data_.ravel()

    @abstractmethod
    def fit(
        self,
        distribution: DistributionName,  # noqa: F821
    ) -> _DistributionT:
        """Fit a probability distribution to the data.

        Args:
            distribution: The name of a probability distribution in the UQ library.

        Returns:
            The probability distribution fitted to the data.
        """

    def compute_measure(
        self,
        distribution: _DistributionT | DistributionName,  # noqa: F821
        criterion: FittingCriterion,  # noqa: F821
        level: float = 0.05,
    ) -> MeasureType:
        """Measure a goodness-of-fit of a probability distribution fitted to data.

        Args:
            distribution: Either a |g| probability distribution
                fitted to :attr:`.data`
                or the name of a probability distribution in the UQ library.
            criterion: The name of the fitting criterion
                to measure the goodness-of-fit of the probability distribution.
            level: A test level,
                i.e. the risk of committing a Type 1 error,
                that is an incorrect rejection of a true null hypothesis,
                for criteria based on test hypothesis.

        Returns:
            The goodness-of-fit of the probability distribution fitted to data.
        """
        goodness_of_fit = self._compute_measure(distribution, criterion, level)
        # Only the results of significance tests are post-processed;
        # the other criteria return the raw measure.
        if criterion in {t.value for t in self.SignificanceTest}:
            return self._format_significance_test_goodness_of_fit(
                goodness_of_fit, level
            )

        return goodness_of_fit

    @abstractmethod
    def _compute_measure(
        self,
        distribution: _DistributionT | DistributionName,  # noqa: F821
        criterion: FittingCriterion,  # noqa: F821
        level: float,
    ) -> Any:
        """Compute a goodness-of-fit of a probability distribution fitted to data.

        This method does not format the result,
        unlike its caller :meth:`.compute_measure`.

        Args:
            distribution: Either a |g| probability distribution
                fitted to :attr:`.data`
                or the name of a probability distribution in the UQ library.
            criterion: The name of the fitting criterion
                to measure the goodness-of-fit of the probability distribution.
            level: A test level,
                i.e. the risk of committing a Type 1 error,
                that is an incorrect rejection of a true null hypothesis,
                for criteria based on test hypothesis.

        Returns:
            The unformatted goodness-of-fit
            of the probability distribution fitted to data.
        """

    @staticmethod
    def _format_significance_test_goodness_of_fit(
        goodness_of_fit: Any, level: float
    ) -> FittingTestResultType:
        """Format a goodness-of-fit measured according to a fitting criterion.

        This base implementation has no body and thus returns ``None``;
        subclasses are presumably expected to override it.

        Args:
            goodness_of_fit: The goodness-of-fit
                measured according to a fitting criterion.
            level: The test level.

        Returns:
            First, whether the null hypothesis is accepted,
            then, a dictionary
            whose keys are "p-value", "statistics" and "level".
        """

    def select(
        self,
        distributions: MutableSequence[_DistributionT | DistributionName],  # noqa: F821
        fitting_criterion: FittingCriterion,  # noqa: F821
        level: float = 0.05,
        selection_criterion: SelectionCriterion = SelectionCriterion.BEST,
    ) -> _DistributionT:
        """Select the best probability distribution according to a fitting criterion.

        Args:
            distributions: A collection of |g| probability distributions
                fitted to :attr:`.data`
                or names of probability distributions in the UQ library.
                The names are replaced in place by the fitted distributions.
            fitting_criterion: The name of the fitting criterion
                to measure the goodness-of-fit of the probability distribution.
            level: A test level,
                i.e. the risk of committing a Type 1 error,
                that is an incorrect rejection of a true null hypothesis,
                for criteria based on test hypothesis.
            selection_criterion: The name of the selection criterion.

        Returns:
            The best probability distribution
            according to the fitting criterion and the selection criterion.

        Raises:
            ValueError: When ``distributions`` is empty.
        """
        # The set of names does not change during the loop: build it once.
        distribution_names = set(self.DistributionName)
        measures = []
        for index, distribution in enumerate(distributions):
            if distribution in distribution_names:
                # A name was given: fit the distribution and store it in place
                # so that the returned item is a fitted distribution.
                distribution = self.fit(distribution)
                distributions[index] = distribution

            measures.append(
                self.compute_measure(distribution, fitting_criterion, level)
            )

        best_distribution_index = self.select_from_measures(
            measures, fitting_criterion, level, selection_criterion
        )
        return distributions[best_distribution_index]

    @classmethod
    def select_from_measures(
        cls,
        measures: MutableSequence[MeasureType],
        fitting_criterion: FittingCriterion,  # noqa: F821
        level: float = 0.05,
        selection_criterion: SelectionCriterion = SelectionCriterion.BEST,
    ) -> int:
        """Select the best probability distribution according to a fitting criterion.

        For a significance test,
        the measures are compared through their p-values.
        With the selection criterion ``FIRST``,
        the index of the first p-value greater than or equal to ``level``
        is returned;
        if there is none, or in any other case,
        the index of the best measure is returned.

        Args:
            measures: The goodness-of-fit measures.
            fitting_criterion: The name of the fitting criterion
                to measure the goodness-of-fit of the probability distribution.
            level: A test level,
                i.e. the risk of committing a Type 1 error,
                that is an incorrect rejection of a true null hypothesis,
                for criteria based on test hypothesis.
            selection_criterion: The name of the selection criterion.

        Returns:
            The index of the best probability distribution
            according to the fitting criterion and the selection criterion.

        Raises:
            ValueError: When ``measures`` is empty.
        """
        is_significance_test = fitting_criterion in set(cls.SignificanceTest)
        if is_significance_test:
            # A significance test result is (accepted, details);
            # only the p-value is used for the selection.
            measures = [measure[1]["p-value"] for measure in measures]
            # ``all`` is vacuously true for an empty list: do not warn in that case.
            if measures and all(p_value < level for p_value in measures):
                LOGGER.warning(
                    "All criteria values are lower than the significance level %s.",
                    level,
                )

        if (
            selection_criterion == cls.SelectionCriterion.BEST
            or not is_significance_test
        ):
            return cls.__compute_index(fitting_criterion, measures)

        for index, p_value in enumerate(measures):
            if p_value >= level:
                return index

        # No p-value reaches the level: fall back to the best one.
        return cls.__compute_index(fitting_criterion, measures)

    @classmethod
    def __compute_index(
        cls,
        fitting_criterion: FittingCriterion,  # noqa: F821
        measures: Iterable[MeasureType],
    ) -> int:
        """Compute the best distribution index according to a fitting criterion.

        Args:
            fitting_criterion: The name of the fitting criterion
                to measure the goodness-of-fit of the probability distribution.
            measures: The goodness-of-fit measures.

        Returns:
            The index of the best probability distribution
            according to a fitting criterion;
            in case of ties, the first index is returned.

        Raises:
            ValueError: When ``measures`` is empty.
        """
        # Materialize the measures so that any iterable is supported,
        # as advertised by the type annotation.
        values = list(measures)
        if not values:
            msg = "Cannot select a distribution from an empty collection of measures."
            raise ValueError(msg)

        op = min if fitting_criterion in cls._FITTING_CRITERIA_TO_MINIMIZE else max
        return values.index(op(values))

    @property
    def available_distributions(self) -> list[str]:
        """The available probability distributions."""
        return sorted({t.value for t in self.DistributionName})

    @property
    def available_criteria(self) -> list[str]:
        """The available fitting criteria."""
        return sorted({t.value for t in self.FittingCriterion})

    @property
    def available_significance_tests(self) -> list[str]:
        """The significance tests."""
        return sorted({t.value for t in self.SignificanceTest})