Source code for gemseo.uncertainty.distributions.ot_fdist

# -*- coding: utf-8 -*-
# Copyright 2021 IRT Saint Exupéry, https://www.irt-saintexupery.com
#
# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU Lesser General Public
# License version 3 as published by the Free Software Foundation.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
# Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public License
# along with this program; if not, write to the Free Software Foundation,
# Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.

# Contributors:
#    INITIAL AUTHORS - initial API and implementation and/or initial
#                           documentation
#        :author: Matthias De Lozzo
#    OTHER AUTHORS   - MACROSCOPIC CHANGES
"""
Fitting a distribution from data based on OpenTURNS
===================================================

Overview
--------

The :class:`.OTDistributionFitter` class considers several samples
of an uncertain variable, fits a user-defined probability distribution
from this dataset and returns a :class:`.OTDistribution`.
It can also return a goodness-of-fit measure
associated with this distribution,
e.g. Bayesian Information Criterion, Kolmogorov test or Chi Squared test,
or select an optimal distribution among a collection according to
a criterion with a threshold.

Construction
------------

The :class:`.OTDistributionFitter` of a given uncertain variable is built
from only two arguments:

- a variable name,
- a one-dimensional numpy array.

Capabilities
------------

Fit a distribution
~~~~~~~~~~~~~~~~~~

The :meth:`.OTDistributionFitter.fit` method takes a distribution name
recognized by OpenTURNS as argument (e.g. 'Normal', 'Uniform', 'Exponential',
...) as argument and returns an :class:`.OTDistribution`
whose underlying OpenTURNS distribution is the specified one fitted
from the dataset passed to the constructor.

Measure the goodness-of-fit
~~~~~~~~~~~~~~~~~~~~~~~~~~~

The :meth:`.OTDistributionFitter.measure` method has two mandatory arguments:

- a distribution which is a either a :class:`.OTDistribution`
  or a distribution name from which :meth:`!fit` method
  builds a :class:`.OTDistribution`,
- a fitting criterion name.

.. note::

   Use the :meth:`.OTDistributionFitter.get_available_criteria` method to get
   the complete list of available criteria
   and the :meth:`.OTDistributionFitter.get_significance_tests` method
   to get the list of available criteria which are significance tests.

The :meth:`.OTDistributionFitter.measure` method can also use a level
associated with the criterion.

The :meth:`.OTDistributionFitter.measure` methods returns a goodness-of-fit
measure whose nature is either a scalar
when the criterion is not a significance test
or a tuple when the criterion is a significance test. In that case,
the first component of the tuple is a boolean indicating if the measured
distribution is acceptable to model the data and the second one is
a dictionary containing the test statistics, the p-value and
the significance level.

Select an optimal distribution
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

The :meth:`.OTDistributionFitter.select` method select aims to select an
optimal distribution among a collection. It uses two mandatory arguments:

- a list of distribution, either a list of distributions names
  or a list of :class:`.OTDistribution`,
- a fitting criterion name.

The :meth:`.OTDistributionFitter.select` method can also use a level
associated with the criterion and a criterion selection:

- 'best': select the distribution minimizing (or maximizing, depending
  on the criterion) the criterion,
- 'first': Select the first distribution for which the criterion is
  greater (or lower, depending on the criterion) than the level.
"""
from __future__ import absolute_import, division, unicode_literals

import openturns as ots
from future import standard_library
from numpy import ndarray
from past.builtins import basestring

from gemseo.uncertainty.distributions.ot_dist import OTDistribution

standard_library.install_aliases()

from gemseo import LOGGER


[docs]class OTDistributionFitter(object): """ OpenTURNS distribution fitter. """ AVAILABLE_FACTORIES = {} for factory in ots.DistributionFactory.GetContinuousUniVariateFactories(): factory_class_name = factory.getImplementation().getClassName() dist_name = factory_class_name.split("Factory")[0] AVAILABLE_FACTORIES[dist_name] = getattr(ots, factory_class_name) AVAILABLE_FITTING_TESTS = { "BIC": ots.FittingTest.BIC, "Kolmogorov": ots.FittingTest.Kolmogorov, "ChiSquared": ots.FittingTest.ChiSquared, } SIGNIFICANCE_TESTS = ["Kolmogorov", "ChiSquared"] CRITERIA_TO_MAXIMIZE = [] CRITERIA_TO_MINIMIZE = ["BIC"] def __init__(self, variable, data): """Constructor. :param str variable: variable name. :param array data: data. """ self.variable = variable try: isinstance(data, ndarray) self.data = ots.Sample(data.reshape((-1, 1))) except AttributeError: raise TypeError("data must be a numpy array") def _get_factory(self, distribution): """Get distribution factory. :param str distribution: distribution name. """ try: distribution_factory = self.AVAILABLE_FACTORIES[distribution] except KeyError: distributions = ", ".join(list(self.AVAILABLE_FACTORIES.keys())) raise ValueError( "{} is not a name of " "distribution available for fitting. " "Available ones are: {}.".format(distribution, distributions) ) return distribution_factory def _get_fitting_test(self, criterion): """Get fitting test. :param str criterion: criterion name. """ try: fitting_test = self.AVAILABLE_FITTING_TESTS[criterion] except KeyError: tests = ", ".join(list(self.AVAILABLE_FITTING_TESTS.keys())) raise ValueError( criterion + " is not a name of fitting test. " "Available ones are: " + tests + "." ) return fitting_test
[docs] def fit(self, distribution): """Fit a distribution. :param str distribution: distribution name. """ factory = self._get_factory(distribution) fitted_distribution = factory().build(self.data) parameters = fitted_distribution.getParameter() distribution = OTDistribution(self.variable, distribution, parameters) return distribution
[docs] def measure(self, distribution, criterion, level=0.05): """Measure the goodness-of-fit of a distribution to data. :param distribution: distribution. :type distribution: OTDistribution or str :param str fitting_criterion: goodness-of-fit criterion. :param float level: risk of committing a Type 1 error, that is an incorrect rejection of a true null hypothesis, for criteria based on test hypothesis. Default: 0.05. """ if isinstance(distribution, basestring): distribution = self.fit(distribution) if distribution.dimension > 1: raise TypeError("A 1D distribution is required.") distribution = distribution.marginals[0] fitting_test = self._get_fitting_test(criterion) if criterion in self.SIGNIFICANCE_TESTS: result = fitting_test(self.data, distribution, level) details = { "p-value": result.getPValue(), "statistics": result.getStatistic(), "level": level, } result = (result.getBinaryQualityMeasure(), details) else: result = fitting_test(self.data, distribution) return result
[docs] def select( self, distributions, fitting_criterion, level=0.05, selection_criterion="best" ): """Select the best distribution. :param distributions: list of distributions. :type distribution: list(OTDistribution) or list(str) :param str fitting_criterion: goodness-of-fit criterion. :param float level: significance level. For hypothesis tests, this is the risk of committing a Type 1 error, that is an incorrect rejection of a true null hypothesis. For other tests, this is a threshold. Default: 0.05. :param str selection_criterion: selection criterion """ results = [] for index, distribution in enumerate(distributions): if isinstance(distribution, basestring): distribution = self.fit(distribution) results.append(self.measure(distribution, fitting_criterion, level)) distributions[index] = distribution index = self.select_from_results( results, fitting_criterion, level, selection_criterion ) return distributions[index]
[docs] @classmethod def select_from_results( cls, results, fitting_criterion, level=0.05, selection_criterion="best" ): """Select the best distribution from results :param list results: results :param str fitting_criterion: goodness-of-fit criterion. :param float level: significance level. For hypothesis tests, this is the risk of committing a Type 1 error, that is an incorrect rejection of a true null hypothesis. For other tests, this is a threshold. Default: 0.05. :param str selection_criterion: selection criterion """ if fitting_criterion in cls.SIGNIFICANCE_TESTS: for index, _ in enumerate(results): results[index] = results[index][1]["p-value"] if sum([pval > level for pval in results]) == 0: LOGGER.warning( "All criteria values are lower than the " "significance level %s.", level, ) if selection_criterion == "best" or level is None: index = cls.__find_opt_distribution(results, fitting_criterion) else: index = cls.__apply_first_strategy(results, fitting_criterion, level) return index
@classmethod def __apply_first_strategy(cls, results, fitting_criterion, level=0.05): """Select the index of the best distribution from results by applying the "first" strategy. :param dict results: results :param str fitting_criterion: goodness-of-fit criterion. :param float level: significance level. For hypothesis tests, this is the risk of committing a Type 1 error, that is an incorrect rejection of a true null hypothesis. For other tests, this is a threshold. Default: 0.05. """ select = False index = 0 for result in results: select = result >= level if select: break index += 1 if not select: index = cls.__find_opt_distribution(results, fitting_criterion) return index @classmethod def __find_opt_distribution(cls, results, fitting_criterion): """ Returns the index of the optimum distribution.""" if fitting_criterion in cls.CRITERIA_TO_MINIMIZE: index = results.index(min(results)) else: index = results.index(max(results)) return index
[docs] def get_available_distributions(self): """ Get available distributions. """ return sorted(self.AVAILABLE_FACTORIES.keys())
[docs] def get_available_criteria(self): """ Get available goodness-of-fit criteria. """ return sorted(self.AVAILABLE_FITTING_TESTS.keys())
[docs] def get_significance_tests(self): """ Get significance tests. """ return sorted(self.SIGNIFICANCE_TESTS)