# -*- coding: utf-8 -*-
# Copyright 2021 IRT Saint Exupéry, https://www.irt-saintexupery.com
#
# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU Lesser General Public
# License version 3 as published by the Free Software Foundation.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
# Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public License
# along with this program; if not, write to the Free Software Foundation,
# Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
# Contributors:
# INITIAL AUTHORS - initial API and implementation and/or initial
# documentation
# :author: Matthias De Lozzo
# OTHER AUTHORS - MACROSCOPIC CHANGES
"""Class for the parametric estimation of statistics from a dataset.
Overview
--------
The :class:`.ParametricStatistics` class inherits
from the abstract :class:`.Statistics` class
and aims to estimate statistics from a :class:`.Dataset`,
based on candidate parametric distributions calibrated from this :class:`.Dataset`.
For each variable,
1. the parameters of these distributions are calibrated from the :class:`.Dataset`,
2. the fitted parametric :class:`.Distribution` which is optimal
in the sense of a goodness-of-fit criterion and a selection criterion
is selected to estimate the statistics related to this variable.
The :class:`.ParametricStatistics` relies on the OpenTURNS library
through the :class:`.OTDistribution` and :class:`.OTDistributionFitter` classes.

Construction
------------

The :class:`.ParametricStatistics` is built from two mandatory arguments:

- a dataset,
- a list of distribution names,

and can consider optional arguments:

- a subset of variable names
  (by default, statistics are computed for all variables),
- a fitting criterion name
  (by default, BIC is used;
  see :attr:`.AVAILABLE_CRITERIA`
  and :attr:`.AVAILABLE_SIGNIFICANCE_TESTS`
  for more information),
- a level associated with the fitting criterion,
- a selection criterion:

  - 'best':
    select the distribution minimizing
    (or maximizing, depending on the criterion)
    the criterion,
  - 'first':
    select the first distribution
    for which the criterion is greater
    (or lower, depending on the criterion)
    than the level,

- a name for the :class:`.ParametricStatistics` object
  (by default, the name is the concatenation of 'ParametricStatistics'
  and the name of the :class:`.Dataset`).
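
For instance,
assuming that ``dataset`` is an existing :class:`.Dataset`,
a minimal sketch of the construction could be
(the candidate distribution names are illustrative only):

>>> statistics = ParametricStatistics(
...     dataset,
...     ["Normal", "Uniform", "Triangular"],
...     fitting_criterion="BIC",
...     selection_criterion="best",
... )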

Capabilities
------------

By inheritance,
a :class:`.ParametricStatistics` object has
the same capabilities as :class:`.Statistics`.
Additional ones are:

- :meth:`.get_fitting_matrix`:
  this method displays the values of the fitting criterion
  for the different variables
  and candidate probability distributions,
  as well as the selected probability distribution,
- :meth:`.plot_criteria`:
  this method plots the criterion values for a given variable.
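
For instance,
assuming that ``statistics`` is such an object
and ``"y1"`` is one of its variables (an illustrative name):

>>> print(statistics.get_fitting_matrix())
>>> statistics.plot_criteria("y1", save=True, show=False)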
"""
from __future__ import division, unicode_literals
import logging
import os
from typing import Dict, Iterable, Optional, Sequence, Tuple, Union
import matplotlib.pyplot as plt
from numpy import array, linspace, ndarray
from gemseo.core.dataset import Dataset
from gemseo.third_party.prettytable.prettytable import PrettyTable
from gemseo.uncertainty.distributions.openturns.distribution import OTDistribution
from gemseo.uncertainty.distributions.openturns.fitting import (
MeasureType,
OTDistributionFitter,
)
from gemseo.uncertainty.statistics.statistics import Statistics
from gemseo.uncertainty.statistics.tolerance_interval.distribution import (
ToleranceIntervalFactory,
ToleranceIntervalSide,
)
LOGGER = logging.getLogger(__name__)
class ParametricStatistics(Statistics):
"""Parametric estimation of statistics.
Attributes:
fitting_criterion (str): The name of the goodness-of-fit criterion,
measuring how the distribution fits the data.
level (float): The test level,
            i.e. the risk of committing a Type 1 error,
            that is an incorrect rejection of a true null hypothesis,
            for criteria based on hypothesis testing.
selection_criterion (str): The name of the selection criterion
to select a distribution from a list of candidates.
distributions (dict(str, dict(str, OTDistribution))): The probability
            distributions of the random variables.

    Examples:
>>> from gemseo.api import (
... create_discipline,
... create_parameter_space,
... create_scenario
... )
>>> from gemseo.uncertainty.statistics.parametric import ParametricStatistics
>>>
>>> expressions = {"y1": "x1+2*x2", "y2": "x1-3*x2"}
>>> discipline = create_discipline(
... "AnalyticDiscipline", expressions_dict=expressions
... )
>>> discipline.set_cache_policy(discipline.MEMORY_FULL_CACHE)
>>>
>>> parameter_space = create_parameter_space()
>>> parameter_space.add_random_variable(
... "x1", "OTUniformDistribution", minimum=-1, maximum=1
... )
>>> parameter_space.add_random_variable(
... "x2", "OTNormalDistribution", mu=0.5, sigma=2
... )
>>>
>>> scenario = create_scenario(
... [discipline],
... "DisciplinaryOpt",
... "y1", parameter_space, scenario_type="DOE"
... )
>>> scenario.execute({'algo': 'OT_MONTE_CARLO', 'n_samples': 100})
>>>
>>> dataset = discipline.cache.export_to_dataset()
>>>
>>> statistics = ParametricStatistics(
... dataset, ['Normal', 'Uniform', 'Triangular']
... )
>>> fitting_matrix = statistics.get_fitting_matrix()
        >>> mean = statistics.compute_mean()
"""
AVAILABLE_DISTRIBUTIONS = sorted(
OTDistributionFitter._AVAILABLE_DISTRIBUTIONS.keys()
)
AVAILABLE_CRITERIA = sorted(OTDistributionFitter._AVAILABLE_FITTING_TESTS.keys())
AVAILABLE_SIGNIFICANCE_TESTS = sorted(OTDistributionFitter.SIGNIFICANCE_TESTS)
def __init__(
self,
dataset, # type: Dataset
distributions, # type: Sequence[str]
variables_names=None, # type: Optional[Iterable[str]]
fitting_criterion="BIC", # type: str
level=0.05, # type: float
selection_criterion="best", # type: str
name=None, # type: Optional[str]
): # type: (...) -> None # noqa: D205,D212,D415
"""
Args:
distributions: The names of the distributions.
fitting_criterion: The name of
the goodness-of-fit criterion,
measuring how the distribution fits the data.
Use :meth:`.ParametricStatistics.get_criteria`
to get the available criteria.
level: A test level,
i.e. the risk of committing a Type 1 error,
that is an incorrect rejection of a true null hypothesis,
                for criteria based on hypothesis testing.
selection_criterion: The name of the selection criterion
to select a distribution from a list of candidates.
Either 'first' or 'best'.
"""
super(ParametricStatistics, self).__init__(dataset, variables_names, name)
significance_tests = OTDistributionFitter.SIGNIFICANCE_TESTS
self.fitting_criterion = fitting_criterion
self.selection_criterion = selection_criterion
LOGGER.info("| Set goodness-of-fit criterion: %s.", fitting_criterion)
if self.fitting_criterion in significance_tests:
self.level = level
LOGGER.info("| Set significance level of hypothesis test: %s.", level)
else:
self.level = None
self._all_distributions = None
self.distributions = None
self._build_distributions(distributions)
def _build_distributions(
self,
distributions, # type: Sequence[str]
): # type: (...) -> None
"""Build distributions from distributions names.
Args:
distributions: The names of the distributions.
"""
self._all_distributions = self._fit_distributions(distributions)
self.distributions = self._select_best_distributions(distributions)
    def get_fitting_matrix(self): # type: (...) -> str
"""Get the fitting matrix.
This matrix contains goodness-of-fit measures
for each pair < variable, distribution >.
Returns:
The printable fitting matrix.
"""
variables = sorted(self._all_distributions.keys())
distributions = list(self._all_distributions[variables[0]].keys())
table = PrettyTable(["Variable"] + distributions + ["Selection"])
for variable in variables:
row, _ = self.get_criteria(variable)
row = [variable] + [row[distribution] for distribution in distributions]
row = row + [self.distributions[variable]["name"]]
table.add_row(row)
return str(table)
    def get_criteria(
self,
variable, # type:str
): # type: (...) -> Tuple[Dict[str,float],bool]
"""Get criteria for a given variable name and the different distributions.
Args:
variable: The name of the variable.
Returns:
The criterion for the different distributions.
and an indicator equal to True is the criterion is a p-value.
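
        Examples:
            A minimal sketch,
            assuming that ``statistics`` is a :class:`.ParametricStatistics`
            instance with a variable named ``"y1"`` (an illustrative name):

            >>> criteria, is_p_value = statistics.get_criteria("y1")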
"""
all_distributions = self._all_distributions[variable]
criteria = {
distribution: result["criterion"]
for distribution, result in all_distributions.items()
}
is_p_value = False
significance_tests = OTDistributionFitter.SIGNIFICANCE_TESTS
if self.fitting_criterion in significance_tests:
criteria = {
distribution: result[1]["p-value"]
for distribution, result in criteria.items()
}
is_p_value = True
return criteria, is_p_value
    def plot_criteria(
self,
variable, # type: str
title=None, # type: Optional[str]
save=False, # type:bool
show=True, # type: bool
n_legend_cols=4, # type: int
directory=".", # type:str
): # type: (...) -> None
"""Plot criteria for a given variable name.
        Args:
            variable: The name of the variable.
            title: A plot title.
            save: If True, save the plot to the disk.
            show: If True, show the plot.
            n_legend_cols: The number of text columns in the upper legend.
            directory: The directory path, either absolute or relative.

        Raises:
            ValueError: If the variable is missing from the dataset.
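
        Examples:
            A usage sketch,
            assuming that ``statistics`` is a :class:`.ParametricStatistics`
            instance with a variable named ``"y1"`` (an illustrative name):

            >>> statistics.plot_criteria("y1", save=True, show=False)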
"""
if variable not in self.names:
raise ValueError(
"The variable '{}' is missing from the dataset."
"Available ones are: {}.".format(variable, ", ".join(self.names))
)
criteria, is_p_value = self.get_criteria(variable)
x_values = []
y_values = []
labels = []
x_value = 0
for distribution, criterion in criteria.items():
x_value += 1
x_values.append(x_value)
y_values.append(criterion)
labels.append(distribution)
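        # Left panel: bar chart of the criterion value of each candidate distribution.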
plt.subplot(121)
plt.bar(x_values, y_values, tick_label=labels, align="center")
if is_p_value:
plt.ylabel("p-value from {} test".format(self.fitting_criterion))
plt.axhline(self.level, color="r", linewidth=2.0)
plt.grid(True, "both")
plt.subplot(122)
data = array(self.dataset[variable])
data_min = min(data)
data_max = max(data)
x_values = linspace(data_min, data_max, 1000)
distributions = self._all_distributions[variable]
try:
plt.hist(data, density=True)
except AttributeError:
plt.hist(data, normed=True)
for dist_name, dist_value in distributions.items():
pdf = dist_value["fitted_distribution"].distribution.computePDF
y_values = [pdf([x_value])[0] for x_value in x_values]
plt.plot(x_values, y_values, label=dist_name, linewidth=2.0)
plt.legend(
bbox_to_anchor=(0.0, 1.02, 1.0, 0.102),
loc="lower left",
ncol=n_legend_cols,
mode="expand",
borderaxespad=0.0,
)
plt.grid(True, "both")
if title is not None:
plt.suptitle(title)
filename = os.path.join(directory, "criteria.pdf")
if save:
plt.savefig(filename)
if show:
plt.show()
plt.close()
def _select_best_distributions(
self, distributions_names # type: Sequence[str]
): # type: (...) -> Dict[str,Dict[str,Union[str,OTDistribution]]]
"""Select the best distributions for the different variables.
Args:
distributions_names: The names of the distributions.
Returns:
The best distributions for the different variables.
"""
LOGGER.info("Select the best distribution for each variable.")
distributions = {}
for variable in self.names:
all_distributions = self._all_distributions[variable]
criteria = [
all_distributions[distribution]["criterion"]
for distribution in distributions_names
]
select_from_measures = OTDistributionFitter.select_from_measures
index = select_from_measures(
criteria, self.fitting_criterion, self.level, self.selection_criterion
)
name = distributions_names[index]
value = all_distributions[name]["fitted_distribution"]
distributions[variable] = {"name": name, "value": value}
LOGGER.info("| The best distribution for %s is %s.", variable, value)
return distributions
def _fit_distributions(
self,
distributions, # type: Iterable[str]
): # type: (...) -> Dict[str,Dict[str,Dict[str,Union[OTDistribution,MeasureType]]]]
"""Fit different distributions for the different marginals.
Args:
distributions: The distributions names.
Returns:
dict(str, dict): The distributions for the different variables.
"""
dist_list = ", ".join(distributions)
LOGGER.info(
"Fit different distributions (%s) per variable "
"and compute the goodness-of-fit criterion.",
dist_list,
)
results = {}
for variable in self.names:
LOGGER.info("| Fit different distributions for %s.", variable)
dataset = self.dataset[variable]
results[variable] = self._fit_marginal_distributions(
variable, dataset, distributions
)
return results
def _fit_marginal_distributions(
self,
variable, # type: str
sample, # type: ndarray
distributions, # type: Iterable[str]
): # type: (...) -> Dict[str,Dict[str,Union[OTDistribution,MeasureType]]]
"""Fit different distributions for a given dataset marginal.
Args:
variable: A variable name.
sample: A data array.
distributions: The names of the distributions.
Returns:
            The fitted distributions and criterion values for the given marginal.
"""
result = {}
factory = OTDistributionFitter(variable, sample)
for distribution in distributions:
fitted_distribution = factory.fit(distribution)
test_result = factory.compute_measure(
fitted_distribution, self.fitting_criterion, self.level
)
result[distribution] = {}
result[distribution]["fitted_distribution"] = fitted_distribution
result[distribution]["criterion"] = test_result
return result
    def compute_maximum(self): # type: (...) -> Dict[str, ndarray] # noqa: D102
result = {
name: self.distributions[name]["value"].math_upper_bound
for name in self.names
}
return result
    def compute_mean(self): # type: (...) -> Dict[str, ndarray] # noqa: D102
result = {name: self.distributions[name]["value"].mean for name in self.names}
return result
    def compute_minimum(self): # type: (...) -> Dict[str, ndarray] # noqa: D102
result = {
name: self.distributions[name]["value"].math_lower_bound
for name in self.names
}
return result
    def compute_probability(
self,
thresh, # type: float
greater=True, # type: bool
): # type: (...) -> Dict[str, ndarray] # noqa: D102
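        # Estimate P[X > thresh] (or P[X <= thresh]) for each variable
        # from the CDF of its selected fitted distribution.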
dist = self.distributions
if greater:
result = {
name: 1 - dist[name]["value"].compute_cdf(thresh[name])[0]
for name in self.names
}
else:
result = {
name: dist[name]["value"].compute_cdf(thresh[name])[0]
for name in self.names
}
return result
    def compute_tolerance_interval(
self,
coverage, # type: float
confidence=0.95, # type: float
side=ToleranceIntervalSide.BOTH, # type: ToleranceIntervalSide
): # type: (...) -> Dict[str, Tuple[ndarray,ndarray]]
# noqa: D102 D205 D212 D415
if not 0.0 <= coverage <= 1.0:
raise ValueError("The argument 'coverage' must be number in [0,1].")
if not 0.0 <= confidence <= 1.0:
raise ValueError("The argument 'confidence' must be number in [0,1].")
limits = {}
factory = ToleranceIntervalFactory()
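        # For each variable,
        # instantiate the ToleranceInterval matching the selected distribution family
        # from the sample size and the fitted distribution parameters,
        # then compute the lower and upper limits.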
for variable in self.names:
distribution = self.distributions[variable]
cls = factory.get_class(distribution["name"])
parameters = distribution["value"].marginals[0].getParameter()
tolerance_interval = cls(self.n_samples, *parameters)
limits[variable] = tolerance_interval.compute(coverage, confidence, side)
return limits
    def compute_quantile(
self,
prob, # type:float
): # type: (...) -> Dict[str,ndarray] # noqa: D102
prob = array([prob])
result = {
name: self.distributions[name]["value"].compute_inverse_cdf(prob)
for name in self.names
}
return result
    def compute_standard_deviation(
self,
): # type: (...) -> Dict[str, ndarray] # noqa: D102
result = {
name: self.distributions[name]["value"].standard_deviation
for name in self.names
}
return result
    def compute_variance(self): # type: (...) -> Dict[str, ndarray] # noqa: D102
result = {
name: self.distributions[name]["value"].standard_deviation ** 2
for name in self.names
}
return result
    def compute_moment(
self,
order, # type: int
): # type: (...) -> Dict[str, ndarray] # noqa: D102
dist = self.distributions
        result = {
            name: dist[name]["value"].distribution.getMoment(order)[0]
            for name in self.names
        }
return result
    def compute_range(self): # type: (...) -> Dict[str, ndarray] # noqa: D102
result = {}
for name in self.names:
dist = self.distributions[name]["value"]
result[name] = dist.math_upper_bound - dist.math_lower_bound
return result