# Copyright 2021 IRT Saint Exupéry, https://www.irt-saintexupery.com
#
# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU Lesser General Public
# License version 3 as published by the Free Software Foundation.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
# Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public License
# along with this program; if not, write to the Free Software Foundation,
# Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
"""Fitting a probability distribution to data using a UQ library."""
from __future__ import annotations
import logging
from abc import abstractmethod
from collections.abc import Mapping
from typing import TYPE_CHECKING
from typing import Any
from typing import ClassVar
from typing import Generic
from typing import TypeVar
from strenum import StrEnum
from gemseo.utils.metaclasses import ABCGoogleDocstringInheritanceMeta
if TYPE_CHECKING:
from collections.abc import Iterable
from collections.abc import MutableSequence
from gemseo.typing import RealArray
from gemseo.typing import StrKeyMapping
# Module-level logger, named after this module.
LOGGER = logging.getLogger(__name__)
# Result of a significance test: a boolean (whether the null hypothesis is
# accepted) followed by a mapping from names (e.g. "p-value") to float values.
FittingTestResultType = tuple[bool, Mapping[str, float]]
# A goodness-of-fit measure: either a significance test result or a plain float.
MeasureType = FittingTestResultType | float
# Type variable for the probability distribution type handled by a fitter.
_DistributionT = TypeVar("_DistributionT")
[docs]
class BaseDistributionFitter(
    Generic[_DistributionT], metaclass=ABCGoogleDocstringInheritanceMeta
):
    """Base class to fit a probability distribution from data using a UQ library."""

    _data: RealArray
    """The data array."""

    _samples: Any
    """The samples."""

    variable: str
    """The name of the variable."""

    _CRITERIA_TO_WRAPPED_OBJECTS: ClassVar[StrKeyMapping]
    """Fitting criteria to objects of the UQ library."""

    DistributionName: ClassVar[StrEnum]
    """The names of the probability distributions in the UQ library."""

    FittingCriterion: ClassVar[StrEnum]
    """The names of the fitting criteria."""

    default_fitting_criterion: ClassVar[BaseDistributionFitter.FittingCriterion]
    """The name of the default fitting criterion."""

    SignificanceTest: ClassVar[StrEnum]
    """The names of the fitting criteria that are statistical significance tests."""

    class SelectionCriterion(StrEnum):
        """The selection criteria."""

        FIRST = "first"
        """Select the first distribution satisfying a fitting criterion."""

        BEST = "best"
        """Select the distribution that best satisfies a fitting criterion."""

    _FITTING_CRITERIA_TO_MINIMIZE: ClassVar[set[str]] = set()
    """The fitting criteria to minimize (the others are to be maximized)."""

    def __init__(self, variable: str, data: RealArray) -> None:
        # TODO: API: rename variable to variable_name or remove it because useless.
        """
        Args:
            variable: The name of the variable.
            data: A data array.
        """  # noqa: D205,D212,D415
        # Go through the property setter so that ``_samples`` is set as well.
        self.data = data
        self.variable = variable

    @property
    def data(self) -> RealArray:
        """The data array."""
        return self._data

    @data.setter
    def data(self, data_: RealArray) -> None:
        self._data = data_
        # The samples are the flattened version of the data array.
        self._samples = data_.ravel()

    @abstractmethod
    def fit(
        self,
        distribution: DistributionName,  # noqa: F821
    ) -> _DistributionT:
        """Fit a probability distribution to the data.

        Args:
            distribution: The name of a probability distribution in the UQ library.

        Returns:
            The probability distribution fitted to the data.
        """

    def compute_measure(
        self,
        distribution: _DistributionT | DistributionName,  # noqa: F821
        criterion: FittingCriterion,  # noqa: F821
        level: float = 0.05,
    ) -> MeasureType:
        """Measure a goodness-of-fit of a probability distribution fitted to data.

        Args:
            distribution: Either a |g| probability distribution fitted to :attr:`.data`
                or the name of a probability distribution in the UQ library.
            criterion: The name of the fitting criterion
                to measure the goodness-of-fit of the probability distribution.
            level: A test level,
                i.e. the risk of committing a Type 1 error,
                that is an incorrect rejection of a true null hypothesis,
                for criteria based on test hypothesis.

        Returns:
            The goodness-of-fit of the probability distribution fitted to data.
        """
        goodness_of_fit = self._compute_measure(distribution, criterion, level)
        # Only the results of significance tests are post-processed;
        # the other measures are returned as computed.
        if criterion in {t.value for t in self.SignificanceTest}:
            return self._format_significance_test_goodness_of_fit(
                goodness_of_fit, level
            )

        return goodness_of_fit

    @abstractmethod
    def _compute_measure(
        self,
        distribution: _DistributionT | DistributionName,  # noqa: F821
        criterion: FittingCriterion,  # noqa: F821
        level: float,
    ) -> Any:
        """Compute a goodness-of-fit of a probability distribution fitted to data.

        This method does not format the result,
        unlike its caller :meth:`.compute_measure`.

        Args:
            distribution: Either a |g| probability distribution fitted to :attr:`.data`
                or the name of a probability distribution in the UQ library.
            criterion: The name of the fitting criterion
                to measure the goodness-of-fit of the probability distribution.
            level: A test level,
                i.e. the risk of committing a Type 1 error,
                that is an incorrect rejection of a true null hypothesis,
                for criteria based on test hypothesis.

        Returns:
            The unformatted goodness-of-fit
            of the probability distribution fitted to data.
        """

    @staticmethod
    def _format_significance_test_goodness_of_fit(
        goodness_of_fit: Any, level: float
    ) -> FittingTestResultType:
        """Format a goodness-of-fit measured according to a fitting criterion.

        This base implementation has no body and thus returns ``None``;
        NOTE(review): presumably meant to be overridden by the subclasses
        defining significance tests -- confirm.

        Args:
            goodness_of_fit: The goodness-of-fit
                measured according to a fitting criterion.
            level: A test level,
                i.e. the risk of committing a Type 1 error,
                that is an incorrect rejection of a true null hypothesis.

        Returns:
            First,
            whether the null hypothesis is accepted,
            then,
            a dictionary whose keys are "p-value", "statistics" and "level".
        """

    def select(
        self,
        distributions: MutableSequence[_DistributionT | DistributionName],  # noqa: F821
        fitting_criterion: FittingCriterion,  # noqa: F821
        level: float = 0.05,
        selection_criterion: SelectionCriterion = SelectionCriterion.BEST,
    ) -> _DistributionT:
        """Select the best probability distribution according to a fitting criterion.

        The names found in ``distributions`` are replaced in place
        by the corresponding probability distributions fitted to :attr:`.data`.

        Args:
            distributions: A collection of |g| probability distributions
                fitted to :attr:`.data`
                or names of probability distributions in the UQ library.
            fitting_criterion: The name of the fitting criterion
                to measure the goodness-of-fit of the probability distribution.
            level: A test level,
                i.e. the risk of committing a Type 1 error,
                that is an incorrect rejection of a true null hypothesis,
                for criteria based on test hypothesis.
            selection_criterion: The name of the selection criterion.

        Returns:
            The best probability distribution
            according to the fitting criterion and the selection criterion.

        Raises:
            ValueError: When ``distributions`` is empty.
        """
        # Build the set of known distribution names once, not at each iteration.
        distribution_names = set(self.DistributionName)
        measures = []
        for index, distribution in enumerate(distributions):
            if distribution in distribution_names:
                # A name was given: fit the distribution and store it in place
                # so that the returned item is a distribution, not a name.
                distribution = self.fit(distribution)
                distributions[index] = distribution

            measures.append(
                self.compute_measure(distribution, fitting_criterion, level)
            )

        best_distribution_index = self.select_from_measures(
            measures, fitting_criterion, level, selection_criterion
        )
        return distributions[best_distribution_index]

    @classmethod
    def select_from_measures(
        cls,
        measures: MutableSequence[MeasureType],
        fitting_criterion: FittingCriterion,  # noqa: F821
        level: float = 0.05,
        selection_criterion: SelectionCriterion = SelectionCriterion.BEST,
    ) -> int:
        """Select the best probability distribution according to a fitting criterion.

        Args:
            measures: The goodness-of-fit measures.
            fitting_criterion: The name of the fitting criterion
                to measure the goodness-of-fit of the probability distribution.
            level: A test level,
                i.e. the risk of committing a Type 1 error,
                that is an incorrect rejection of a true null hypothesis,
                for criteria based on test hypothesis.
            selection_criterion: The name of the selection criterion.

        Returns:
            The index of the best probability distribution
            according to the fitting criterion and the selection criterion.

        Raises:
            ValueError: When ``measures`` is empty.
        """
        is_significance_test = fitting_criterion in set(cls.SignificanceTest)
        if is_significance_test:
            # A significance test result is a pair (acceptance, details);
            # the distributions are compared through their p-values.
            values = [measure[1]["p-value"] for measure in measures]
        else:
            values = list(measures)

        if not values:
            # Fail early with an explicit message
            # rather than letting min/max fail on an empty sequence.
            msg = "Cannot select a distribution from an empty collection of measures."
            raise ValueError(msg)

        if is_significance_test and all(p_value < level for p_value in values):
            LOGGER.warning(
                "All criteria values are lower than the significance level %s.",
                level,
            )

        if (
            is_significance_test
            and selection_criterion != cls.SelectionCriterion.BEST
        ):
            # Return the first distribution passing the test, if any;
            # otherwise, fall back to the best one.
            for index, p_value in enumerate(values):
                if p_value >= level:
                    return index

        return cls.__compute_index(fitting_criterion, values)

    @classmethod
    def __compute_index(
        cls,
        fitting_criterion: FittingCriterion,  # noqa: F821
        measures: Iterable[MeasureType],
    ) -> int:
        """Compute the best distribution index according to a fitting criterion.

        Args:
            fitting_criterion: The name of the fitting criterion
                to measure the goodness-of-fit of the probability distribution.
            measures: The goodness-of-fit measures.

        Returns:
            The index of the best probability distribution
            according to a fitting criterion;
            the first one in case of ties.
        """
        # Materialize the measures so that any iterable is supported,
        # as ``index`` is only available on sequences.
        values = list(measures)
        find_best = (
            min if fitting_criterion in cls._FITTING_CRITERIA_TO_MINIMIZE else max
        )
        return values.index(find_best(values))

    @property
    def available_distributions(self) -> list[str]:
        """The available probability distributions."""
        return sorted({t.value for t in self.DistributionName})

    @property
    def available_criteria(self) -> list[str]:
        """The available fitting criteria."""
        return sorted({t.value for t in self.FittingCriterion})

    @property
    def available_significance_tests(self) -> list[str]:
        """The significance tests."""
        return sorted({t.value for t in self.SignificanceTest})