Source code for gemseo.uncertainty.statistics.parametric

# Copyright 2021 IRT Saint Exupéry, https://www.irt-saintexupery.com
#
# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU Lesser General Public
# License version 3 as published by the Free Software Foundation.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
# Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public License
# along with this program; if not, write to the Free Software Foundation,
# Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
# Contributors:
#    INITIAL AUTHORS - initial API and implementation and/or initial
#                           documentation
#        :author: Matthias De Lozzo
#    OTHER AUTHORS   - MACROSCOPIC CHANGES
"""Class for the parametric estimation of statistics from a dataset.

Overview
--------

The :class:`.ParametricStatistics` class inherits
from the abstract :class:`.Statistics` class
and aims to estimate statistics from a :class:`.Dataset`,
based on candidate parametric distributions calibrated from this :class:`.Dataset`.

For each variable,

1. the parameters of these distributions are calibrated from the :class:`.Dataset`,
2. the fitted parametric :class:`.Distribution` which is optimal
   in the sense of a goodness-of-fit criterion and a selection criterion
   is selected to estimate the statistics related to this variable.

The :class:`.ParametricStatistics` relies on the OpenTURNS library
through the :class:`.OTDistribution` and :class:`.OTDistributionFitter` classes.

Construction
------------

The :class:`.ParametricStatistics` is built from two mandatory arguments:

- a dataset,
- a list of distributions names,

and can consider optional arguments:

- a subset of variables names
  (by default, statistics are computed for all variables),
- a fitting criterion name
  (by default, BIC is used;
  see :attr:`.AVAILABLE_CRITERIA`
  and :attr:`.AVAILABLE_SIGNIFICANCE_TESTS`
  for more information),
- a level associated with the fitting criterion,
- a selection criterion:

  - 'best':
    select the distribution minimizing
    (or maximizing, depending on the criterion)
    the criterion,
  - 'first':
    select the first distribution
    for which the criterion is greater
    (or lower, depending on the criterion)
    than the level,

- a name for the :class:`.ParametricStatistics` object
  (by default, the name is the concatenation of 'ParametricStatistics'
  and the name of the :class:`.Dataset`).

Capabilities
------------

By inheritance,
a :class:`.ParametricStatistics` object has
the same capabilities as :class:`.Statistics`.
Additional ones are:

- :meth:`.get_fitting_matrix`:
  this method displays the values of the fitting criterion
  for the different variables
  and candidate probability distributions
  as well as the select probability distribution,
- :meth:`.plot_criteria`:
  this method plots the criterion values for a given variable.
"""
from __future__ import annotations

import logging
import os
from typing import Iterable
from typing import Sequence

import matplotlib.pyplot as plt
from numpy import array
from numpy import linspace
from numpy import ndarray

from gemseo.core.dataset import Dataset
from gemseo.third_party.prettytable.prettytable import PrettyTable
from gemseo.uncertainty.distributions.openturns.distribution import OTDistribution
from gemseo.uncertainty.distributions.openturns.fitting import MeasureType
from gemseo.uncertainty.distributions.openturns.fitting import OTDistributionFitter
from gemseo.uncertainty.statistics.statistics import Statistics
from gemseo.uncertainty.statistics.tolerance_interval.distribution import (
    ToleranceIntervalFactory,
)
from gemseo.uncertainty.statistics.tolerance_interval.distribution import (
    ToleranceIntervalSide,
)

LOGGER = logging.getLogger(__name__)


[docs]class ParametricStatistics(Statistics):
    """Parametric estimation of statistics.

    Examples:
        >>> from gemseo.api import (
        ...     create_discipline,
        ...     create_parameter_space,
        ...     create_scenario
        ... )
        >>> from gemseo.uncertainty.statistics.parametric import ParametricStatistics
        >>>
        >>> expressions = {"y1": "x1+2*x2", "y2": "x1-3*x2"}
        >>> discipline = create_discipline(
        ...     "AnalyticDiscipline", expressions=expressions
        ... )
        >>>
        >>> parameter_space = create_parameter_space()
        >>> parameter_space.add_random_variable(
        ...     "x1", "OTUniformDistribution", minimum=-1, maximum=1
        ... )
        >>> parameter_space.add_random_variable(
        ...     "x2", "OTNormalDistribution", mu=0.5, sigma=2
        ... )
        >>>
        >>> scenario = create_scenario(
        ...     [discipline],
        ...     "DisciplinaryOpt",
        ...     "y1", parameter_space, scenario_type="DOE"
        ... )
        >>> scenario.execute({'algo': 'OT_MONTE_CARLO', 'n_samples': 100})
        >>>
        >>> dataset = scenario.export_to_dataset(opt_naming=False)
        >>>
        >>> statistics = ParametricStatistics(
        ...     dataset, ['Normal', 'Uniform', 'Triangular']
        ... )
        >>> fitting_matrix = statistics.get_fitting_matrix()
        >>> mean = statistics.mean()
    """

    fitting_criterion: str
    """The name of the goodness-of-fit criterion,
    measuring how the distribution fits the data."""

    level: float
    """The test level,
    i.e. risk of committing a Type 1 error,
    that is an incorrect rejection of a true null hypothesis,
    for criteria based on test hypothesis."""

    selection_criterion: str
    """The name of the selection criterion
    to select a distribution from a list of candidates."""

    distributions: dict[str, dict[str, OTDistribution]]
    """The probability distributions of the random variables."""

    AVAILABLE_DISTRIBUTIONS = sorted(
        OTDistributionFitter._AVAILABLE_DISTRIBUTIONS.keys()
    )

    AVAILABLE_CRITERIA = sorted(OTDistributionFitter._AVAILABLE_FITTING_TESTS.keys())
    AVAILABLE_SIGNIFICANCE_TESTS = sorted(OTDistributionFitter.SIGNIFICANCE_TESTS)

    def __init__(
        self,
        dataset: Dataset,
        distributions: Sequence[str],
        variables_names: Iterable[str] | None = None,
        fitting_criterion: str = "BIC",
        level: float = 0.05,
        selection_criterion: str = "best",
        name: str | None = None,
    ) -> None:
        """.. # noqa: D205,D212,D415
        Args:
            distributions: The names of the distributions.
            fitting_criterion: The name of
                the goodness-of-fit criterion,
                measuring how the distribution fits the data.
                Use :meth:`.ParametricStatistics.get_criteria`
                to get the available criteria.
            level: A test level,
                i.e. the risk of committing a Type 1 error,
                that is an incorrect rejection of a true null hypothesis,
                for criteria based on test hypothesis.
            selection_criterion: The name of the selection criterion
                to select a distribution from a list of candidates.
                Either 'first' or 'best'.
        """
        super().__init__(dataset, variables_names, name)
        significance_tests = OTDistributionFitter.SIGNIFICANCE_TESTS
        self.fitting_criterion = fitting_criterion
        self.selection_criterion = selection_criterion
        LOGGER.info("| Set goodness-of-fit criterion: %s.", fitting_criterion)
        if self.fitting_criterion in significance_tests:
            self.level = level
            LOGGER.info("| Set significance level of hypothesis test: %s.", level)
        else:
            self.level = None
        self._all_distributions = None
        self.distributions = None
        self._build_distributions(distributions)

    def _build_distributions(
        self,
        distributions: Sequence[str],
    ) -> None:
        """Build distributions from distributions names.

        Args:
            distributions: The names of the distributions.
        """
        self._all_distributions = self._fit_distributions(distributions)
        self.distributions = self._select_best_distributions(distributions)

[docs]    def get_fitting_matrix(self) -> str:
        """Get the fitting matrix.

        This matrix contains goodness-of-fit measures
        for each pair < variable, distribution >.

        Returns:
            The printable fitting matrix.
        """
        variables = sorted(self._all_distributions.keys())
        distributions = list(self._all_distributions[variables[0]].keys())
        table = PrettyTable(["Variable"] + distributions + ["Selection"])
        for variable in variables:
            row, _ = self.get_criteria(variable)
            row = [variable] + [row[distribution] for distribution in distributions]
            row = row + [self.distributions[variable]["name"]]
            table.add_row(row)
        return str(table)

[docs]    def get_criteria(
        self,
        variable: str,
    ) -> tuple[dict[str, float], bool]:
        """Get criteria for a given variable name and the different distributions.

        Args:
            variable: The name of the variable.

        Returns:
            The criterion for the different distributions.
            and an indicator equal to True is the criterion is a p-value.
        """
        all_distributions = self._all_distributions[variable]
        criteria = {
            distribution: result["criterion"]
            for distribution, result in all_distributions.items()
        }
        is_p_value = False
        significance_tests = OTDistributionFitter.SIGNIFICANCE_TESTS
        if self.fitting_criterion in significance_tests:
            criteria = {
                distribution: result[1]["p-value"]
                for distribution, result in criteria.items()
            }
            is_p_value = True
        return criteria, is_p_value

[docs]    def plot_criteria(
        self,
        variable: str,
        title: str | None = None,
        save: bool = False,
        show: bool = True,
        n_legend_cols: int = 4,
        directory: str = ".",
    ) -> None:
        """Plot criteria for a given variable name.

        Args:
            variable: The name of the variable.
            title: A plot title.
            save: If True, save the plot on the disk.
            show: If True, show the plot.
            n_legend_cols: The number of text columns in the upper legend.
            directory: The directory path, either absolute or relative.

        Raises:
            ValueError: If the variable is missing from the dataset.
        """
        if variable not in self.names:
            raise ValueError(
                "The variable '{}' is missing from the dataset."
                "Available ones are: {}.".format(variable, ", ".join(self.names))
            )
        criteria, is_p_value = self.get_criteria(variable)
        x_values = []
        y_values = []
        labels = []
        x_value = 0
        for distribution, criterion in criteria.items():
            x_value += 1
            x_values.append(x_value)
            y_values.append(criterion)
            labels.append(distribution)
        plt.subplot(121)
        plt.bar(x_values, y_values, tick_label=labels, align="center")
        if is_p_value:
            plt.ylabel(f"p-value from {self.fitting_criterion} test")
            plt.axhline(self.level, color="r", linewidth=2.0)
        plt.grid(True, "both")
        plt.subplot(122)
        data = array(self.dataset[variable])
        data_min = min(data)
        data_max = max(data)
        x_values = linspace(data_min, data_max, 1000)
        distributions = self._all_distributions[variable]
        try:
            plt.hist(data, density=True)
        except AttributeError:
            plt.hist(data, normed=True)
        for dist_name, dist_value in distributions.items():
            pdf = dist_value["fitted_distribution"].distribution.computePDF
            y_values = [pdf([x_value])[0] for x_value in x_values]
            plt.plot(x_values, y_values, label=dist_name, linewidth=2.0)
        plt.legend(
            bbox_to_anchor=(0.0, 1.02, 1.0, 0.102),
            loc="lower left",
            ncol=n_legend_cols,
            mode="expand",
            borderaxespad=0.0,
        )
        plt.grid(True, "both")
        if title is not None:
            plt.suptitle(title)
        filename = os.path.join(directory, "criteria.pdf")
        if save:
            plt.savefig(filename)
        if show:
            plt.show()
        plt.close()

    def _select_best_distributions(
        self, distributions_names: Sequence[str]
    ) -> dict[str, dict[str, str | OTDistribution]]:
        """Select the best distributions for the different variables.

        Args:
            distributions_names: The names of the distributions.

        Returns:
            The best distributions for the different variables.
        """
        LOGGER.info("Select the best distribution for each variable.")
        distributions = {}
        for variable in self.names:
            all_distributions = self._all_distributions[variable]
            criteria = [
                all_distributions[distribution]["criterion"]
                for distribution in distributions_names
            ]
            select_from_measures = OTDistributionFitter.select_from_measures
            index = select_from_measures(
                criteria, self.fitting_criterion, self.level, self.selection_criterion
            )
            name = distributions_names[index]
            value = all_distributions[name]["fitted_distribution"]
            distributions[variable] = {"name": name, "value": value}
            LOGGER.info("| The best distribution for %s is %s.", variable, value)
        return distributions

    def _fit_distributions(
        self,
        distributions: Iterable[str],
    ) -> dict[str, dict[str, dict[str, OTDistribution | MeasureType]]]:
        """Fit different distributions for the different marginals.

        Args:
            distributions: The distributions names.

        Returns:
            The distributions for the different variables.
        """
        LOGGER.info(
            "Fit different distributions (%s) per variable "
            "and compute the goodness-of-fit criterion.",
            ", ".join(distributions),
        )
        results = {}
        for variable in self.names:
            LOGGER.info("| Fit different distributions for %s.", variable)
            dataset = self.dataset[variable]
            results[variable] = self._fit_marginal_distributions(
                variable, dataset, distributions
            )
        return results

    def _fit_marginal_distributions(
        self,
        variable: str,
        sample: ndarray,
        distributions: Iterable[str],
    ) -> dict[str, dict[str, OTDistribution | MeasureType]]:
        """Fit different distributions for a given dataset marginal.

        Args:
            variable: A variable name.
            sample: A data array.
            distributions: The names of the distributions.

        Returns:
            The distributions for the different variables.
        """
        result = {}
        factory = OTDistributionFitter(variable, sample)
        for distribution in distributions:
            fitted_distribution = factory.fit(distribution)
            test_result = factory.compute_measure(
                fitted_distribution, self.fitting_criterion, self.level
            )
            result[distribution] = {}
            result[distribution]["fitted_distribution"] = fitted_distribution
            result[distribution]["criterion"] = test_result
        return result

[docs]    def compute_maximum(self) -> dict[str, ndarray]:  # noqa: D102
        result = {
            name: self.distributions[name]["value"].math_upper_bound
            for name in self.names
        }
        return result

[docs]    def compute_mean(self) -> dict[str, ndarray]:  # noqa: D102
        result = {name: self.distributions[name]["value"].mean for name in self.names}
        return result

[docs]    def compute_minimum(self) -> dict[str, ndarray]:  # noqa: D102
        result = {
            name: self.distributions[name]["value"].math_lower_bound
            for name in self.names
        }
        return result

[docs]    def compute_probability(  # noqa: D102
        self,
        thresh: float,
        greater: bool = True,
    ) -> dict[str, ndarray]:
        dist = self.distributions
        if greater:
            result = {
                name: 1 - dist[name]["value"].compute_cdf(thresh[name])[0]
                for name in self.names
            }
        else:
            result = {
                name: dist[name]["value"].compute_cdf(thresh[name])[0]
                for name in self.names
            }
        return result

[docs]    def compute_tolerance_interval(  # noqa: D102
        self,
        coverage: float,
        confidence: float = 0.95,
        side: ToleranceIntervalSide = ToleranceIntervalSide.BOTH,
    ) -> dict[str, tuple[ndarray, ndarray]]:

        if not 0.0 <= coverage <= 1.0:
            raise ValueError("The argument 'coverage' must be number in [0,1].")
        if not 0.0 <= confidence <= 1.0:
            raise ValueError("The argument 'confidence' must be number in [0,1].")
        limits = {}
        factory = ToleranceIntervalFactory()
        for variable in self.names:
            distribution = self.distributions[variable]
            cls = factory.get_class(distribution["name"])
            parameters = distribution["value"].marginals[0].getParameter()
            tolerance_interval = cls(self.n_samples, *parameters)
            limits[variable] = tolerance_interval.compute(coverage, confidence, side)
        return limits

[docs]    def compute_quantile(  # noqa: D102
        self,
        prob: float,
    ) -> dict[str, ndarray]:
        prob = array([prob])
        result = {
            name: self.distributions[name]["value"].compute_inverse_cdf(prob)
            for name in self.names
        }
        return result

[docs]    def compute_standard_deviation(  # noqa: D102
        self,
    ) -> dict[str, ndarray]:
        result = {
            name: self.distributions[name]["value"].standard_deviation
            for name in self.names
        }
        return result

[docs]    def compute_variance(self) -> dict[str, ndarray]:  # noqa: D102
        result = {
            name: self.distributions[name]["value"].standard_deviation ** 2
            for name in self.names
        }
        return result

[docs]    def compute_moment(  # noqa: D102
        self,
        order: int,
    ) -> dict[str, ndarray]:
        dist = self.distributions
        result = [
            dist[name]["value"].distribution.getMoment(order)[0] for name in self.names
        ]
        return result

[docs]    def compute_range(self) -> dict[str, ndarray]:  # noqa: D102
        result = {}
        for name in self.names:
            dist = self.distributions[name]["value"]
            result[name] = dist.math_upper_bound - dist.math_lower_bound
        return result