Source code for gemseo.uncertainty.statistics.parametric

# Copyright 2021 IRT Saint Exupéry, https://www.irt-saintexupery.com
#
# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU Lesser General Public
# License version 3 as published by the Free Software Foundation.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
# Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public License
# along with this program; if not, write to the Free Software Foundation,
# Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
# Contributors:
#    INITIAL AUTHORS - initial API and implementation and/or initial
#                           documentation
#        :author: Matthias De Lozzo
#    OTHER AUTHORS   - MACROSCOPIC CHANGES
"""Class for the parametric estimation of statistics from a dataset.

Overview
--------

The :class:`.ParametricStatistics` class inherits
from the abstract :class:`.BaseStatistics` class
and aims to estimate statistics from a :class:`.Dataset`,
based on candidate parametric distributions calibrated from this :class:`.Dataset`.

For each variable,

1. the parameters of these distributions are calibrated from the :class:`.Dataset`,
2. the fitted parametric :class:`.BaseDistribution` which is optimal
   in the sense of a goodness-of-fit criterion and a selection criterion
   is selected to estimate the statistics related to this variable.

The :class:`.ParametricStatistics` relies on the OpenTURNS library
through the :class:`.OTDistribution` and :class:`.OTDistributionFitter` classes.

Construction
------------

The :class:`.ParametricStatistics` is built from two mandatory arguments:

- a dataset,
- a list of distribution names,

and can consider optional arguments:

- a subset of variable names
  (by default, statistics are computed for all variables),
- a fitting criterion name
  (by default, BIC is used;
  see :attr:`.FittingCriterion`
  and :attr:`.SignificanceTest`
  for more information),
- a level associated with the fitting criterion,
- a selection criterion:

  - 'best':
    select the distribution minimizing
    (or maximizing, depending on the criterion)
    the criterion,
  - 'first':
    select the first distribution
    for which the criterion is greater
    (or lower, depending on the criterion)
    than the level,

- a name for the :class:`.ParametricStatistics` object
  (by default, the name is the concatenation of 'ParametricStatistics'
  and the name of the :class:`.Dataset`).

Capabilities
------------

By inheritance,
a :class:`.ParametricStatistics` object has
the same capabilities as :class:`.BaseStatistics`.
Additional ones are:

- :meth:`.get_fitting_matrix`:
  this method displays the values of the fitting criterion
  for the different variables
  and candidate probability distributions
  as well as the selected probability distribution,
- :meth:`.plot_criteria`:
  this method plots the criterion values for a given variable.
"""

from __future__ import annotations

import logging
from itertools import starmap
from pathlib import Path
from typing import TYPE_CHECKING
from typing import NamedTuple
from typing import Union

import matplotlib.pyplot as plt
from numpy import array
from numpy import linspace
from numpy import ndarray

from gemseo.third_party.prettytable.prettytable import PrettyTable
from gemseo.uncertainty.distributions.openturns.distribution import OTDistribution
from gemseo.uncertainty.distributions.openturns.fitting import MeasureType
from gemseo.uncertainty.distributions.openturns.fitting import OTDistributionFitter
from gemseo.uncertainty.statistics.base_statistics import BaseStatistics
from gemseo.uncertainty.statistics.tolerance_interval.distribution import (
    BaseToleranceInterval,
)
from gemseo.uncertainty.statistics.tolerance_interval.factory import (
    ToleranceIntervalFactory,
)
from gemseo.utils.matplotlib_figure import FigSizeType
from gemseo.utils.matplotlib_figure import save_show_figure
from gemseo.utils.string_tools import pretty_str
from gemseo.utils.string_tools import repr_variable

if TYPE_CHECKING:
    from collections.abc import Iterable
    from collections.abc import Mapping
    from collections.abc import Sequence

    from matplotlib.figure import Figure

    from gemseo.datasets.dataset import Dataset

LOGGER = logging.getLogger(__name__)

DistributionType = dict[str, Union[str, OTDistribution]]


class _Distribution(NamedTuple):
    """A fitted probability distribution, paired with its distribution name."""

    name: str
    """The name of the probability distribution."""

    value: OTDistribution
    """The probability distribution."""

class ParametricStatistics(BaseStatistics):
    """A toolbox to compute statistics based on probability distribution-fitting.

    Unless otherwise stated,
    the statistics are computed *variable-wise* and *component-wise*,
    i.e. variable-by-variable and component-by-component.
    So, for the sake of readability,
    the methods named as :meth:`compute_statistic` return ``dict[str, ndarray]``
    objects whose keys are the names of the variables
    and the values are the statistics estimated for the different components.

    Examples:
        >>> from gemseo import (
        ...     create_discipline,
        ...     create_parameter_space,
        ...     create_scenario,
        ... )
        >>> from gemseo.uncertainty.statistics.parametric import ParametricStatistics
        >>>
        >>> expressions = {"y1": "x1+2*x2", "y2": "x1-3*x2"}
        >>> discipline = create_discipline(
        ...     "AnalyticDiscipline", expressions=expressions
        ... )
        >>>
        >>> parameter_space = create_parameter_space()
        >>> parameter_space.add_random_variable(
        ...     "x1", "OTUniformDistribution", minimum=-1, maximum=1
        ... )
        >>> parameter_space.add_random_variable(
        ...     "x2", "OTNormalDistribution", mu=0.5, sigma=2
        ... )
        >>>
        >>> scenario = create_scenario(
        ...     [discipline],
        ...     "DisciplinaryOpt",
        ...     "y1",
        ...     parameter_space,
        ...     scenario_type="DOE",
        ... )
        >>> scenario.execute({"algo": "OT_MONTE_CARLO", "n_samples": 100})
        >>>
        >>> dataset = scenario.to_dataset(opt_naming=False)
        >>>
        >>> statistics = ParametricStatistics(
        ...     dataset, ["Normal", "Uniform", "Triangular"]
        ... )
        >>> fitting_matrix = statistics.get_fitting_matrix()
        >>> mean = statistics.compute_mean()
    """

    # Enumerations re-exported from the OpenTURNS-based distribution fitter.
    DistributionName = OTDistributionFitter.DistributionName

    FittingCriterion = OTDistributionFitter.FittingCriterion

    SignificanceTest = OTDistributionFitter.SignificanceTest

    SelectionCriterion = OTDistributionFitter.SelectionCriterion

    fitting_criterion: FittingCriterion
    """The goodness-of-fit criterion, measuring how the distribution fits the data."""

    level: float
    """The test level used by the selection criteria that are significance tests.

    In statistical hypothesis testing,
    the test level corresponds to the risk of committing a type 1 error,
    that is an incorrect rejection of the null hypothesis.
    """

    selection_criterion: SelectionCriterion
    """The selection criterion to select a distribution from a list of candidates."""

    distributions: dict[str, DistributionType | list[DistributionType]]
    """The probability distributions of the random variables.

    When a random variable is a random vector,
    its probability distribution is expressed as a list of marginal distributions.
    Otherwise,
    its probability distribution is expressed as the unique marginal distribution.
    """

    def __init__(
        self,
        dataset: Dataset,
        distributions: Sequence[DistributionName],
        variable_names: Iterable[str] = (),
        fitting_criterion: FittingCriterion = FittingCriterion.BIC,
        level: float = 0.05,
        selection_criterion: SelectionCriterion = SelectionCriterion.BEST,
        name: str = "",
    ) -> None:
        """
        Args:
            distributions: The names of the distributions.
            fitting_criterion: The name of the goodness-of-fit criterion,
                measuring how the distribution fits the data.
                Use :meth:`.ParametricStatistics.get_criteria`
                to get the available criteria.
            level: A test level,
                i.e. the risk of committing a Type 1 error,
                that is an incorrect rejection of a true null hypothesis,
                for criteria based on test hypothesis.
            selection_criterion: The selection criterion
                to select a distribution from a list of candidates.
        """  # noqa: D205,D212,D415
        super().__init__(dataset, variable_names, name)
        self.fitting_criterion = fitting_criterion
        self.selection_criterion = selection_criterion
        LOGGER.info("| Set goodness-of-fit criterion: %s.", fitting_criterion)
        self.level = level
        # The level is only meaningful when the criterion is a significance test.
        if self.fitting_criterion in self.SignificanceTest.__members__:
            LOGGER.info("| Set significance level of hypothesis test: %s.", level)
        # Fit every candidate distribution per variable component,
        # then keep the best one according to the selection criterion.
        self._all_distributions = self._fit_distributions(distributions)
        # Private view: variable name -> list of _Distribution (one per component);
        # populated by _select_best_distributions.
        self.__distributions = {}
        self.distributions = self._select_best_distributions(distributions)
    def get_fitting_matrix(self) -> str:
        """Get the fitting matrix.

        This matrix contains goodness-of-fit measures
        for each pair < variable, distribution >.

        Returns:
            The printable fitting matrix.
        """
        variables = sorted(self._all_distributions.keys())
        # All components of a variable share the same candidate distributions;
        # read the column names from the first component of the first variable.
        distributions = list(self._all_distributions[variables[0]][0].keys())
        table = PrettyTable(["Variable", *distributions, "Selection"])
        for variable in variables:
            # One row per component of the variable.
            for index in range(self.dataset.variable_names_to_n_components[variable]):
                row = (
                    [variable]
                    + [
                        str(self.get_criteria(variable, index)[0][distribution])
                        for distribution in distributions
                    ]
                    + [self.__distributions[variable][index].name]
                )
                table.add_row(row)

        return str(table)
    def get_criteria(
        self, variable: str, index: int = 0
    ) -> tuple[dict[str, float], bool]:
        """Get the value of the fitting criterion for the different distributions.

        Args:
            variable: The name of the variable.
            index: The component of the variable.

        Returns:
            The value of the fitting criterion
            for the given variable name and component
            and the different distributions,
            as well as whether this fitting criterion is a statistical test
            and so this value a p-value.
        """
        distribution_names_to_criterion_values = {
            name: result["criterion"]
            for name, result in self._all_distributions[variable][index].items()
        }
        criterion_value_is_p_value = False
        if self.fitting_criterion in self.SignificanceTest.__members__:
            # For a significance test, the stored "criterion" is a pair whose
            # second item is a dictionary of test details; expose its p-value.
            distribution_names_to_criterion_values = {
                name: result[1]["p-value"]
                for name, result in distribution_names_to_criterion_values.items()
            }
            criterion_value_is_p_value = True

        return distribution_names_to_criterion_values, criterion_value_is_p_value
    def plot_criteria(
        self,
        variable: str,
        title: str = "",
        save: bool = False,
        show: bool = True,
        directory: str | Path = ".",
        index: int = 0,
        fig_size: FigSizeType = (6.4, 3.2),
    ) -> Figure:
        """Plot criteria for a given variable name.

        Args:
            variable: The name of the variable.
            title: The title of the plot, if any.
            save: If ``True``, save the plot on the disk.
            show: If ``True``, show the plot.
            directory: The directory path, either absolute or relative.
            index: The index of the component of the variable.
            fig_size: The width and height of the figure in inches, e.g. ``(w, h)``.

        Raises:
            ValueError: If the variable is missing from the dataset.
        """
        if variable not in self.names:
            msg = (
                f"The variable '{variable}' is missing from the dataset; "
                f"available ones are: {pretty_str(self.names)}."
            )
            raise ValueError(msg)

        criteria, is_p_value = self.get_criteria(variable, index)
        # Build one bar per candidate distribution (left subplot).
        x_values = []
        y_values = []
        labels = []
        x_value = 0
        for distribution, criterion in criteria.items():
            x_value += 1  # noqa: SIM113
            x_values.append(x_value)
            y_values.append(criterion)
            labels.append(distribution)

        fig, (ax1, ax2) = plt.subplots(1, 2, figsize=fig_size)
        ax1.bar(x_values, y_values, tick_label=labels)
        if is_p_value:
            ax1.set_title(f"{self.fitting_criterion} (p-value)")
            # Horizontal line marking the significance level of the test.
            ax1.axhline(self.level, color="r", linewidth=2.0)
        else:
            ax1.set_title(self.fitting_criterion)

        ax1.grid(True, "both")
        ax1.set_box_aspect(1)
        ax1.set_xlabel("Probability distributions")

        # Right subplot: data histogram overlaid with the fitted PDFs.
        data = self.dataset.get_view(variable_names=variable).to_numpy()
        data_min = min(data)
        data_max = max(data)
        x_values = linspace(data_min, data_max, 1000)
        distributions = self._all_distributions[variable][index]
        ax2.hist(data, density=True)
        for dist_name, dist_value in distributions.items():
            pdf = dist_value["fitted_distribution"].distribution.computePDF
            y_values = [pdf([x_value])[0] for x_value in x_values]
            ax2.plot(x_values, y_values, label=dist_name, linewidth=2.0)

        ax2.set_box_aspect(1)
        ax2.legend()
        ax2.grid(True, "both")
        ax2.set_title("Probability density function")
        ax2.set_xlabel(
            repr_variable(
                variable, index, self.dataset.variable_names_to_n_components[variable]
            )
        )
        if title:
            plt.suptitle(title)

        save_show_figure(fig, show, Path(directory) / "criteria.pdf" if save else "")
        return fig
    def _select_best_distributions(
        self, distribution_names: Sequence[DistributionName]
    ) -> dict[str, DistributionType | list[DistributionType]]:
        """Select the best distributions for the different variables.

        Args:
            distribution_names: The distribution names.

        Returns:
            The best distributions for the different variables.
        """
        LOGGER.info("Select the best distribution for each variable.")
        distributions = {}
        select_from_measures = OTDistributionFitter.select_from_measures
        for variable in self.names:
            selected_distribution_names = []
            marginal_distributions = []
            # One selection per component of the (possibly vector) variable.
            for component, all_distributions in enumerate(
                self._all_distributions[variable]
            ):
                # select_from_measures returns the index of the best candidate
                # given the criterion values, the criterion, the level
                # and the selection strategy (best/first).
                distribution_name = distribution_names[
                    select_from_measures(
                        [
                            all_distributions[distribution]["criterion"]
                            for distribution in distribution_names
                        ],
                        self.fitting_criterion,
                        self.level,
                        self.selection_criterion,
                    )
                ]
                best_dist = all_distributions[distribution_name]["fitted_distribution"]
                selected_distribution_names.append(distribution_name)
                marginal_distributions.append(best_dist)
                LOGGER.info(
                    "| The best distribution for %s[%s] is %s.",
                    variable,
                    component,
                    best_dist,
                )

            self.__distributions[variable] = list(
                starmap(
                    _Distribution,
                    zip(selected_distribution_names, marginal_distributions),
                )
            )
            # Scalar variables are exposed as a single distribution,
            # vector variables as a list of marginal distributions.
            if len(marginal_distributions) == 1:
                distributions[variable] = self.__distributions[variable][0]
            else:
                distributions[variable] = self.__distributions[variable]

        return distributions

    def _fit_distributions(
        self,
        distributions: Iterable[DistributionName],
    ) -> dict[str, list[dict[str, dict[str, OTDistribution | MeasureType]]]]:
        """Fit different distributions for the different marginals.

        Args:
            distributions: The distributions names.

        Returns:
            The distributions for the different variables.
        """
        LOGGER.info(
            "Fit different distributions (%s) per variable "
            "and compute the goodness-of-fit criterion.",
            ", ".join(distributions),
        )
        results = {}
        for name in self.names:
            LOGGER.info("| Fit different distributions for %s.", name)
            dataset_values = self.dataset.get_view(variable_names=name).to_numpy()
            size = self.dataset.variable_names_to_n_components[name]
            # One fitting per component, i.e. per column of the data matrix.
            results[name] = [
                self._fit_marginal_distributions(
                    repr_variable(name, index, size), column, distributions
                )
                for index, column in enumerate(dataset_values.T)
            ]

        return results

    def _fit_marginal_distributions(
        self,
        variable: str,
        sample: ndarray,
        distributions: Iterable[DistributionName],
    ) -> dict[str, dict[str, OTDistribution | MeasureType]]:
        """Fit different distributions for a given dataset marginal.

        Args:
            variable: A variable name.
            sample: A data array.
            distributions: The names of the distributions.

        Returns:
            The distributions for the different variables.
        """
        factory = OTDistributionFitter(variable, sample)
        result = {}
        for distribution in distributions:
            fitted_distribution = factory.fit(distribution)
            result[distribution] = {
                "fitted_distribution": fitted_distribution,
                # Goodness-of-fit value of this candidate for the current data.
                "criterion": factory.compute_measure(
                    fitted_distribution, self.fitting_criterion, self.level
                ),
            }

        return result
[docs] def compute_maximum(self) -> dict[str, ndarray]: # noqa: D102 return { name: array([ distribution.value.math_upper_bound[0] for distribution in self.__distributions[name] ]) for name in self.names }
[docs] def compute_mean(self) -> dict[str, ndarray]: # noqa: D102 return { name: array([ distribution.value.mean[0] for distribution in self.__distributions[name] ]) for name in self.names }
[docs] def compute_minimum(self) -> dict[str, ndarray]: # noqa: D102 return { name: array([ distribution.value.math_lower_bound[0] for distribution in self.__distributions[name] ]) for name in self.names }
[docs] def compute_probability( # noqa: D102 self, thresh: Mapping[str, float | ndarray], greater: bool = True ) -> dict[str, ndarray]: func = lambda x: 1 - x if greater else x # noqa: E731 new_thresh = {} for name, value in thresh.items(): if isinstance(value, float): new_thresh[name] = [ value ] * self.dataset.variable_names_to_n_components[name] elif len(value) == 1: new_thresh[name] = [ value[0] ] * self.dataset.variable_names_to_n_components[name] else: new_thresh[name] = value return { name: array([ func(distribution.value.compute_cdf([new_thresh[name][index]])[0]) for index, distribution in enumerate(self.__distributions[name]) ]) for name in self.names }
    def compute_joint_probability(  # noqa: D102
        self, thresh: Mapping[str, float | ndarray], greater: bool = True
    ) -> dict[str, float]:
        # A joint probability would require the multivariate distribution of the
        # components, while this class only fits marginal distributions.
        raise NotImplementedError
    def compute_tolerance_interval(  # noqa: D102
        self,
        coverage: float,
        confidence: float = 0.95,
        side: BaseToleranceInterval.ToleranceIntervalSide = BaseToleranceInterval.ToleranceIntervalSide.BOTH,  # noqa:E501
    ) -> dict[str, list[BaseToleranceInterval.Bounds]]:
        # Both the coverage and the confidence are probabilities in [0, 1].
        if not 0.0 <= coverage <= 1.0:
            msg = "The argument 'coverage' must be a number in [0,1]."
            raise ValueError(msg)

        if not 0.0 <= confidence <= 1.0:
            msg = "The argument 'confidence' must be a number in [0,1]."
            raise ValueError(msg)

        tolerance_interval_factory = ToleranceIntervalFactory()
        # For each component, instantiate the tolerance-interval class matching
        # the selected distribution, parameterized by the sample size and the
        # parameters of the fitted OpenTURNS marginal.
        return {
            name: [
                tolerance_interval_factory.get_class(distribution.name)(
                    self.n_samples,
                    *distribution.value.marginals[0].getParameter(),
                ).compute(coverage, confidence, side)
                for distribution in self.__distributions[name]
            ]
            for name in self.names
        }
[docs] def compute_quantile(self, prob: float) -> dict[str, ndarray]: # noqa: D102 prob = array([prob]) return { name: array([ distribution.value.compute_inverse_cdf(prob)[0] for distribution in self.__distributions[name] ]) for name in self.names }
[docs] def compute_standard_deviation(self) -> dict[str, ndarray]: # noqa: D102 return { name: array([ distribution.value.standard_deviation[0] for distribution in self.__distributions[name] ]) for name in self.names }
[docs] def compute_variance(self) -> dict[str, ndarray]: # noqa: D102 return { name: array([ distribution.value.standard_deviation[0] ** 2 for distribution in self.__distributions[name] ]) for name in self.names }
[docs] def compute_moment(self, order: int) -> dict[str, ndarray]: # noqa: D102 return { name: array([ distribution.value.distribution.getMoment(order)[0] for distribution in self.__distributions[name] ]) for name in self.names }
[docs] def compute_range(self) -> dict[str, ndarray]: # noqa: D102 return { name: array([ distribution.value.math_upper_bound[0] - distribution.value.math_lower_bound[0] for distribution in self.__distributions[name] ]) for name in self.names }
[docs] def plot( self, save: bool = False, show: bool = True, directory_path: str | Path = "", file_format: str = "png", ) -> dict[str, Figure]: """Visualize the cumulative distribution and probability density functions. Args: save: Whether to save the figures. show: Whether to show the figures. directory_path: The path to save the figures. file_format: The file extension. Returns: The cumulative distribution and probability density functions for each variable. """ plots = {} for name in self.names: size = self.dataset.variable_names_to_n_components[name] for index, distribution in enumerate(self.__distributions[name]): plots[repr_variable(name, index, size)] = distribution.value.plot( save=save, show=show, directory_path=directory_path, file_extension=file_format, ) return plots