Source code for gemseo.uncertainty.statistics.parametric

# -*- coding: utf-8 -*-
# Copyright 2021 IRT Saint Exupéry, https://www.irt-saintexupery.com
#
# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU Lesser General Public
# License version 3 as published by the Free Software Foundation.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
# Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public License
# along with this program; if not, write to the Free Software Foundation,
# Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.

# Contributors:
#    INITIAL AUTHORS - initial API and implementation and/or initial
#                           documentation
#        :author: Matthias De Lozzo
#    OTHER AUTHORS   - MACROSCOPIC CHANGES

"""Class for the parametric estimation of statistics from a dataset.

Overview
--------

The :class:`.ParametricStatistics` class inherits
from the abstract :class:`.Statistics` class
and aims to estimate statistics from a :class:`.Dataset`,
based on candidate parametric distributions calibrated from this :class:`.Dataset`.

For each variable,

1. the parameters of these distributions are calibrated from the :class:`.Dataset`,
2. the fitted parametric :class:`.Distribution` that is optimal,
   in the sense of a goodness-of-fit criterion and a selection criterion,
   is selected to estimate the statistics related to this variable.

The :class:`.ParametricStatistics` class relies on the OpenTURNS library
through the :class:`.OTDistribution` and :class:`.OTDistributionFitter` classes.
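
For illustration,
here is a minimal sketch of this fitting machinery used directly,
based on the calls made internally by :class:`.ParametricStatistics`
(the variable name ``"x"`` and the sample are illustrative placeholders):

>>> from numpy.random import normal
>>> from gemseo.uncertainty.distributions.openturns.fitting import (
...     OTDistributionFitter,
... )
>>> sample = normal(size=100)
>>> fitter = OTDistributionFitter("x", sample)
>>> fitted_distribution = fitter.fit("Normal")
>>> measure = fitter.compute_measure(fitted_distribution, "BIC", 0.05)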

Construction
------------

A :class:`.ParametricStatistics` object is built from two mandatory arguments:

- a dataset,
- a list of distribution names,

and can consider optional arguments (see the construction sketch below):

- a subset of variable names
  (by default, statistics are computed for all variables),
- a fitting criterion name
  (by default, BIC is used;
  see :attr:`.AVAILABLE_CRITERIA`
  and :attr:`.AVAILABLE_SIGNIFICANCE_TESTS`
  for more information),
- a level associated with the fitting criterion,
- a selection criterion:

  - 'best':
    select the distribution minimizing
    (or maximizing, depending on the criterion)
    the criterion,
  - 'first':
    select the first distribution
    for which the criterion is greater
    (or lower, depending on the criterion)
    than the level,

- a name for the :class:`.ParametricStatistics` object
  (by default, the name is the concatenation of 'ParametricStatistics'
  and the name of the :class:`.Dataset`).
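
For instance,
here is a minimal construction sketch using these arguments,
assuming an existing :class:`.Dataset` named ``dataset``
with a variable ``"y1"``
(the variable and criterion names are illustrative placeholders):

>>> statistics = ParametricStatistics(
...     dataset,
...     ["Normal", "Uniform", "Triangular"],
...     variables_names=["y1"],
...     fitting_criterion="Kolmogorov",
...     level=0.05,
...     selection_criterion="first",
... )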

Capabilities
------------

By inheritance,
a :class:`.ParametricStatistics` object has
the same capabilities as :class:`.Statistics`.
Additional ones, illustrated in the sketch below, are:

- :meth:`.get_fitting_matrix`:
  this method returns the values of the fitting criterion
  for the different variables
  and candidate probability distributions,
  as well as the selected probability distribution,
- :meth:`.plot_criteria`:
  this method plots the criterion values for a given variable.
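
For instance,
assuming a :class:`.ParametricStatistics` instance ``statistics``
as in the construction sketch above:

>>> print(statistics.get_fitting_matrix())
>>> statistics.plot_criteria("y1", save=False, show=True)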
"""

from __future__ import division, unicode_literals

import logging
import os
from typing import Dict, Iterable, Mapping, Optional, Sequence, Tuple, Union

import matplotlib.pyplot as plt
from numpy import array, linspace, ndarray

from gemseo.core.dataset import Dataset
from gemseo.third_party.prettytable.prettytable import PrettyTable
from gemseo.uncertainty.distributions.openturns.distribution import OTDistribution
from gemseo.uncertainty.distributions.openturns.fitting import (
    MeasureType,
    OTDistributionFitter,
)
from gemseo.uncertainty.statistics.statistics import Statistics
from gemseo.uncertainty.statistics.tolerance_interval.distribution import (
    ToleranceIntervalFactory,
    ToleranceIntervalSide,
)

LOGGER = logging.getLogger(__name__)


class ParametricStatistics(Statistics):
    """Parametric estimation of statistics.

    Attributes:
        fitting_criterion (str): The name of the goodness-of-fit criterion,
            measuring how the distribution fits the data.
        level (float): The test level,
            i.e. the risk of committing a Type 1 error,
            that is an incorrect rejection of a true null hypothesis,
            for criteria based on test hypothesis.
        selection_criterion (str): The name of the selection criterion
            to select a distribution from a list of candidates.
        distributions (dict(str, dict(str, OTDistribution))): The probability
            distributions of the random variables.

    Examples:
        >>> from gemseo.api import (
        ...     create_discipline,
        ...     create_parameter_space,
        ...     create_scenario,
        ... )
        >>> from gemseo.uncertainty.statistics.parametric import ParametricStatistics
        >>>
        >>> expressions = {"y1": "x1+2*x2", "y2": "x1-3*x2"}
        >>> discipline = create_discipline(
        ...     "AnalyticDiscipline", expressions_dict=expressions
        ... )
        >>> discipline.set_cache_policy(discipline.MEMORY_FULL_CACHE)
        >>>
        >>> parameter_space = create_parameter_space()
        >>> parameter_space.add_random_variable(
        ...     "x1", "OTUniformDistribution", minimum=-1, maximum=1
        ... )
        >>> parameter_space.add_random_variable(
        ...     "x2", "OTNormalDistribution", mu=0.5, sigma=2
        ... )
        >>>
        >>> scenario = create_scenario(
        ...     [discipline],
        ...     "DisciplinaryOpt",
        ...     "y1",
        ...     parameter_space,
        ...     scenario_type="DOE",
        ... )
        >>> scenario.execute({"algo": "OT_MONTE_CARLO", "n_samples": 100})
        >>>
        >>> dataset = discipline.cache.export_to_dataset()
        >>>
        >>> statistics = ParametricStatistics(
        ...     dataset, ["Normal", "Uniform", "Triangular"]
        ... )
        >>> fitting_matrix = statistics.get_fitting_matrix()
        >>> mean = statistics.compute_mean()
    """

    AVAILABLE_DISTRIBUTIONS = sorted(
        OTDistributionFitter._AVAILABLE_DISTRIBUTIONS.keys()
    )
    AVAILABLE_CRITERIA = sorted(OTDistributionFitter._AVAILABLE_FITTING_TESTS.keys())
    AVAILABLE_SIGNIFICANCE_TESTS = sorted(OTDistributionFitter.SIGNIFICANCE_TESTS)

    def __init__(
        self,
        dataset,  # type: Dataset
        distributions,  # type: Sequence[str]
        variables_names=None,  # type: Optional[Iterable[str]]
        fitting_criterion="BIC",  # type: str
        level=0.05,  # type: float
        selection_criterion="best",  # type: str
        name=None,  # type: Optional[str]
    ):  # type: (...) -> None  # noqa: D205,D212,D415
        """
        Args:
            distributions: The names of the distributions.
            fitting_criterion: The name of the goodness-of-fit criterion,
                measuring how the distribution fits the data.
                Use :meth:`.ParametricStatistics.get_criteria`
                to get the available criteria.
            level: A test level,
                i.e. the risk of committing a Type 1 error,
                that is an incorrect rejection of a true null hypothesis,
                for criteria based on test hypothesis.
            selection_criterion: The name of the selection criterion
                to select a distribution from a list of candidates.
                Either 'first' or 'best'.
        """
        super(ParametricStatistics, self).__init__(dataset, variables_names, name)
        significance_tests = OTDistributionFitter.SIGNIFICANCE_TESTS
        self.fitting_criterion = fitting_criterion
        self.selection_criterion = selection_criterion
        LOGGER.info("| Set goodness-of-fit criterion: %s.", fitting_criterion)
        if self.fitting_criterion in significance_tests:
            self.level = level
            LOGGER.info("| Set significance level of hypothesis test: %s.", level)
        else:
            self.level = None
        self._all_distributions = None
        self.distributions = None
        self._build_distributions(distributions)

    def _build_distributions(
        self,
        distributions,  # type: Sequence[str]
    ):  # type: (...) -> None
        """Build the distributions from their names.

        Args:
            distributions: The names of the distributions.
        """
        self._all_distributions = self._fit_distributions(distributions)
        self.distributions = self._select_best_distributions(distributions)
    def get_fitting_matrix(self):  # type: (...) -> str
        """Get the fitting matrix.

        This matrix contains goodness-of-fit measures
        for each pair < variable, distribution >.

        Returns:
            The printable fitting matrix.
        """
        variables = sorted(self._all_distributions.keys())
        distributions = list(self._all_distributions[variables[0]].keys())
        table = PrettyTable(["Variable"] + distributions + ["Selection"])
        for variable in variables:
            row, _ = self.get_criteria(variable)
            row = [variable] + [row[distribution] for distribution in distributions]
            row = row + [self.distributions[variable]["name"]]
            table.add_row(row)
        return str(table)
    def get_criteria(
        self,
        variable,  # type: str
    ):  # type: (...) -> Tuple[Dict[str,float],bool]
        """Get the fitting criteria for a given variable and the candidate distributions.

        Args:
            variable: The name of the variable.

        Returns:
            The criterion value for each distribution,
            and an indicator equal to True if the criterion is a p-value.
        """
        all_distributions = self._all_distributions[variable]
        criteria = {
            distribution: result["criterion"]
            for distribution, result in all_distributions.items()
        }
        is_p_value = False
        significance_tests = OTDistributionFitter.SIGNIFICANCE_TESTS
        if self.fitting_criterion in significance_tests:
            criteria = {
                distribution: result[1]["p-value"]
                for distribution, result in criteria.items()
            }
            is_p_value = True
        return criteria, is_p_value
    def plot_criteria(
        self,
        variable,  # type: str
        title=None,  # type: Optional[str]
        save=False,  # type: bool
        show=True,  # type: bool
        n_legend_cols=4,  # type: int
        directory=".",  # type: str
    ):  # type: (...) -> None
        """Plot the criteria for a given variable name.

        Args:
            variable: The name of the variable.
            title: A plot title.
            save: If True, save the plot on the disk.
            show: If True, show the plot.
            n_legend_cols: The number of text columns in the upper legend.
            directory: The directory path, either absolute or relative.

        Raises:
            ValueError: If the variable is missing from the dataset.
        """
        if variable not in self.names:
            raise ValueError(
                "The variable '{}' is missing from the dataset. "
                "Available ones are: {}.".format(variable, ", ".join(self.names))
            )
        criteria, is_p_value = self.get_criteria(variable)
        x_values = []
        y_values = []
        labels = []
        x_value = 0
        for distribution, criterion in criteria.items():
            x_value += 1
            x_values.append(x_value)
            y_values.append(criterion)
            labels.append(distribution)

        # Left panel: the criterion value per candidate distribution.
        plt.subplot(121)
        plt.bar(x_values, y_values, tick_label=labels, align="center")
        if is_p_value:
            plt.ylabel("p-value from {} test".format(self.fitting_criterion))
            plt.axhline(self.level, color="r", linewidth=2.0)
        plt.grid(True, "both")

        # Right panel: the data histogram with the fitted PDFs.
        plt.subplot(122)
        data = array(self.dataset[variable])
        data_min = min(data)
        data_max = max(data)
        x_values = linspace(data_min, data_max, 1000)
        distributions = self._all_distributions[variable]
        try:
            plt.hist(data, density=True)
        except AttributeError:
            # Older Matplotlib versions use the deprecated keyword 'normed'.
            plt.hist(data, normed=True)
        for dist_name, dist_value in distributions.items():
            pdf = dist_value["fitted_distribution"].distribution.computePDF
            y_values = [pdf([x_value])[0] for x_value in x_values]
            plt.plot(x_values, y_values, label=dist_name, linewidth=2.0)
        plt.legend(
            bbox_to_anchor=(0.0, 1.02, 1.0, 0.102),
            loc="lower left",
            ncol=n_legend_cols,
            mode="expand",
            borderaxespad=0.0,
        )
        plt.grid(True, "both")
        if title is not None:
            plt.suptitle(title)
        filename = os.path.join(directory, "criteria.pdf")
        if save:
            plt.savefig(filename)
        if show:
            plt.show()
        plt.close()
    def _select_best_distributions(
        self,
        distributions_names,  # type: Sequence[str]
    ):  # type: (...) -> Dict[str,Dict[str,Union[str,OTDistribution]]]
        """Select the best distribution for each variable.

        Args:
            distributions_names: The names of the distributions.

        Returns:
            The best distributions for the different variables.
        """
        LOGGER.info("Select the best distribution for each variable.")
        distributions = {}
        for variable in self.names:
            all_distributions = self._all_distributions[variable]
            criteria = [
                all_distributions[distribution]["criterion"]
                for distribution in distributions_names
            ]
            select_from_measures = OTDistributionFitter.select_from_measures
            index = select_from_measures(
                criteria, self.fitting_criterion, self.level, self.selection_criterion
            )
            name = distributions_names[index]
            value = all_distributions[name]["fitted_distribution"]
            distributions[variable] = {"name": name, "value": value}
            LOGGER.info("| The best distribution for %s is %s.", variable, value)
        return distributions

    def _fit_distributions(
        self,
        distributions,  # type: Iterable[str]
    ):  # type: (...) -> Dict[str,Dict[str,Dict[str,Union[OTDistribution,MeasureType]]]]
        """Fit the different distributions for the different marginals.

        Args:
            distributions: The names of the distributions.

        Returns:
            The distributions for the different variables.
        """
        dist_list = ", ".join(distributions)
        LOGGER.info(
            "Fit different distributions (%s) per variable "
            "and compute the goodness-of-fit criterion.",
            dist_list,
        )
        results = {}
        for variable in self.names:
            LOGGER.info("| Fit different distributions for %s.", variable)
            dataset = self.dataset[variable]
            results[variable] = self._fit_marginal_distributions(
                variable, dataset, distributions
            )
        return results

    def _fit_marginal_distributions(
        self,
        variable,  # type: str
        sample,  # type: ndarray
        distributions,  # type: Iterable[str]
    ):  # type: (...) -> Dict[str,Dict[str,Union[OTDistribution,MeasureType]]]
        """Fit the different distributions for a given dataset marginal.

        Args:
            variable: The name of the variable.
            sample: A data array.
            distributions: The names of the distributions.

        Returns:
            The fitted distributions and criteria for the given variable.
        """
        result = {}
        factory = OTDistributionFitter(variable, sample)
        for distribution in distributions:
            fitted_distribution = factory.fit(distribution)
            test_result = factory.compute_measure(
                fitted_distribution, self.fitting_criterion, self.level
            )
            result[distribution] = {}
            result[distribution]["fitted_distribution"] = fitted_distribution
            result[distribution]["criterion"] = test_result
        return result
    def compute_maximum(self):  # type: (...) -> Dict[str, ndarray]  # noqa: D102
        result = {
            name: self.distributions[name]["value"].math_upper_bound
            for name in self.names
        }
        return result
    def compute_mean(self):  # type: (...) -> Dict[str, ndarray]  # noqa: D102
        result = {name: self.distributions[name]["value"].mean for name in self.names}
        return result
    def compute_minimum(self):  # type: (...) -> Dict[str, ndarray]  # noqa: D102
        result = {
            name: self.distributions[name]["value"].math_lower_bound
            for name in self.names
        }
        return result
    def compute_probability(
        self,
        thresh,  # type: Mapping[str,float]
        greater=True,  # type: bool
    ):  # type: (...) -> Dict[str, ndarray]  # noqa: D102
        dist = self.distributions
        if greater:
            result = {
                name: 1 - dist[name]["value"].compute_cdf(thresh[name])[0]
                for name in self.names
            }
        else:
            result = {
                name: dist[name]["value"].compute_cdf(thresh[name])[0]
                for name in self.names
            }
        return result
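    # A hedged usage sketch for compute_probability
    # (the variable name and threshold value are illustrative placeholders);
    # the method expects one threshold per variable:
    #
    #     probabilities = statistics.compute_probability({"y1": 0.0}, greater=True)
    #
    # which returns, for each variable, P[y1 > 0.0] under the selected
    # fitted distribution.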
    def compute_tolerance_interval(
        self,
        coverage,  # type: float
        confidence=0.95,  # type: float
        side=ToleranceIntervalSide.BOTH,  # type: ToleranceIntervalSide
    ):  # type: (...) -> Dict[str, Tuple[ndarray,ndarray]]  # noqa: D102 D205 D212 D415
        if not 0.0 <= coverage <= 1.0:
            raise ValueError("The argument 'coverage' must be a number in [0,1].")
        if not 0.0 <= confidence <= 1.0:
            raise ValueError("The argument 'confidence' must be a number in [0,1].")
        limits = {}
        factory = ToleranceIntervalFactory()
        for variable in self.names:
            distribution = self.distributions[variable]
            cls = factory.get_class(distribution["name"])
            parameters = distribution["value"].marginals[0].getParameter()
            tolerance_interval = cls(self.n_samples, *parameters)
            limits[variable] = tolerance_interval.compute(coverage, confidence, side)
        return limits
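    # A hedged usage sketch for compute_tolerance_interval
    # (the coverage and confidence values are illustrative placeholders):
    #
    #     limits = statistics.compute_tolerance_interval(0.9, confidence=0.99)
    #
    # returns, for each variable, the bounds of an interval containing 90%
    # of the population with a 99% confidence level.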
    def compute_quantile(
        self,
        prob,  # type: float
    ):  # type: (...) -> Dict[str,ndarray]  # noqa: D102
        prob = array([prob])
        result = {
            name: self.distributions[name]["value"].compute_inverse_cdf(prob)
            for name in self.names
        }
        return result
    def compute_standard_deviation(
        self,
    ):  # type: (...) -> Dict[str, ndarray]  # noqa: D102
        result = {
            name: self.distributions[name]["value"].standard_deviation
            for name in self.names
        }
        return result
    def compute_variance(self):  # type: (...) -> Dict[str, ndarray]  # noqa: D102
        result = {
            name: self.distributions[name]["value"].standard_deviation ** 2
            for name in self.names
        }
        return result
    def compute_moment(
        self,
        order,  # type: int
    ):  # type: (...) -> Dict[str, ndarray]  # noqa: D102
        dist = self.distributions
        # Return a dictionary keyed by variable name, to match the return
        # type annotation and the other statistics, rather than a bare list.
        result = {
            name: dist[name]["value"].distribution.getMoment(order)[0]
            for name in self.names
        }
        return result
    def compute_range(self):  # type: (...) -> Dict[str, ndarray]  # noqa: D102
        result = {}
        for name in self.names:
            dist = self.distributions[name]["value"]
            result[name] = dist.math_upper_bound - dist.math_lower_bound
        return result