Source code for gemseo.uncertainty.statistics.statistics

# -*- coding: utf-8 -*-
# Copyright 2021 IRT Saint Exupéry,
# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU Lesser General Public
# License version 3 as published by the Free Software Foundation.
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# Lesser General Public License for more details.
# You should have received a copy of the GNU Lesser General Public License
# along with this program; if not, write to the Free Software Foundation,
# Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.

# Contributors:
#    INITIAL AUTHORS - API and implementation and/or documentation
#       :author: Matthias De Lozzo
Estimation of statistics from a dataset


The abstract :class:`.Statistics` class implements the concept of
statistics library. It is enriched by concrete classes
such as :class:`.EmpiricalStatistics` and :class:`.ParametricStatistics`.


A :class:`.Statistics` is built from a :class:`.Dataset` and optionally
a list of variables names. In this case, statistics are only computed
for these variables. Otherwise, statistics are computed for all variables.
Lastly, the user can name its :class:`.Statistics`. By default,
the name is the concatenation of the name of the class
overloading :class:`.Statistics`
and the name of the :class:`.Dataset`.


A :class:`.Statistics` returns standard descriptive and statistical measures
for the different variables:

- :meth:`.Statistics.minimum`: the minimum value,
- :meth:`.Statistics.maximum`: the maximum value,
- :meth:`.Statistics.range`: the difference between minimum and maximum values,
- :meth:`.Statistics.mean`: the expectation, a.k.a. mean value,
- :meth:`.Statistics.moment`: the central moment which is a the expected value
  of a specified integer power of the deviation from the mean,
- :meth:`.Statistics.variance`: the variance, which is the mean squared
  variation around the mean value,
- :meth:`.Statistics.standard_deviation`: the standard deviation, which is the
  square root of the variance,
- :meth:`.Statistics.quantile`: the quantile associated with a probability,
  which is the cut point diving the range into a first continuous interval
  with this given probability and a second continuous interval
  with the complementary probability; common *q*-quantiles dividing
  the range into *q* continuous interval with equal probabilities are also

    - :meth:`.Statistics.median` which implements the 2-quantile (50%).
    - :meth:`.Statistics.quartile` whose order (1, 2 or 3) implements
      the 4-quantiles (respectively 25%, 50% and 75%),
    - :meth:`.Statistics.percentile` whose order (1, 2, ..., 99) implements
      the 100-quantiles (1%, 2%, ..., 99%),

- :meth:`.Statistics.probability`: the probability that the random variable
  is larger or smaller than a certain threshold,
- :meth:`.Statistics.tolerance_interval`: the left-sided, right-sided or
  both-sided tolerance interval associated with a given coverage level
  and a given confidence level, which is
  a statistical interval within which, with some confidence level,
  a specified proportion of the random variable realizations falls
  (this proportion is the coverage level)

    - :meth:`.Statistics.a_value`: the A-value, which is the lower bound of the
      left-sided tolerance interval associated with a coverage level
      equal to 99% and a confidence level equal to 95%,
    - :meth:`.Statistics.b_value`: the B-value, which is the lower bound of the
      left-sided tolerance interval associated with a coverage level equal
      to 90% and a confidence level equal to 95%,

from __future__ import absolute_import, division, unicode_literals

from future import standard_library


from gemseo import LOGGER

[docs]class Statistics(object): """Abstract class for Statistics library interface.""" def __init__(self, dataset, variables_names=None, name=None): """ Constructor :param Dataset dataset: dataset :param list(str) variables_names: list of variables names. If None, the method considers all variables from loaded dataset. Default: None. :param str name: name of the object. If None, use the concatenation of class and dataset names. Default: None. """ default_name = self.__class__.__name__ + "_" + = name or default_name msg = 'Create "' + + '"' msg += ", a " + self.__class__.__name__ + " library." self.dataset = dataset.get_all_data(by_group=False, as_dict=True) self.n_samples = dataset.n_samples self.names = variables_names or dataset.variables self.n_variables = dataset.n_variables def __str__(self): msg = + "\n" msg += "| n_samples: " + str(self.n_samples) + "\n" msg += "| n_variables: " + str(self.n_variables) + "\n" msg += "| variables: " + str(self.names) + "\n" return msg
[docs] def tolerance_interval(self, coverage, confidence=0.95, side="both"): """Compute the tolerance interval (TI) for a given minimum percentage of the population and a given confidence level. :param float coverage: minimum percentage of belonging to the TI. :param float confidence: level of confidence in [0,1]. Default: 0.95. :param str side: kind of interval: 'lower' for lower-sided TI, 'upper' for upper-sided TI and 'both for both-sided TI. :return: tolerance limits """ raise NotImplementedError
[docs] def a_value(self): """Compute the b-value. :return: b-value """ result = self.tolerance_interval(1 - 0.1, 0.99, "lower") result = {name: value[0] for name, value in result.items()} return result
[docs] def b_value(self): """Compute the b-value. :return: b-value """ result = self.tolerance_interval(1 - 0.1, 0.95, "lower") result = {name: value[0] for name, value in result.items()} return result
[docs] def maximum(self): """Compute the maximum. :return: maximum """ raise NotImplementedError
[docs] def mean(self): """Compute the mean. :return: mean """ raise NotImplementedError
[docs] def minimum(self): """Compute the minimum. :return: minimum """ raise NotImplementedError
[docs] def median(self): """Compute the median. :param options: options :return: median """ result = self.quantile(0.5) return result
[docs] def percentile(self, order): """Compute the percentile. :param int order: percentile order, e.g. 4. :return: percentile """ if not isinstance(order, int) or order > 100 or order < 0: raise TypeError( "Percentile order must be an integer " "between 0 and 100 inclusive." ) prob = order / 100.0 result = self.quantile(prob) return result
[docs] def probability(self, thresh, greater): """Compute a probability associated to a threshold. :param float thresh: threshold :param bool greater: if True, compute the probability the probability of exceeding the threshold, if False, compute the reverse. :return: probability """ raise NotImplementedError
[docs] def quantile(self, prob): """Compute a quantile associated to a probability. :param float prob: probability between 0 and 1 :return: quantile """ raise NotImplementedError
[docs] def quartile(self, order): """Compute a quartile. :param int order: quartile order in [1,2,3] :return: quartile """ quartiles = [0.25, 0.5, 0.75] if order not in [1, 2, 3]: raise ValueError("Quartile order must be in [1,2,3]") prob = quartiles[order - 1] result = self.quantile(prob) return result
[docs] def range(self): """Compute the range :return: range """ raise NotImplementedError
[docs] def standard_deviation(self): """Compute a standard_deviation. :return: standard deviation """ raise NotImplementedError
[docs] def variance(self): """Compute a variance. :return: variance """ raise NotImplementedError
[docs] def moment(self, order): """Compute the moment for a given order. :param int order: moment index :return: moment """ raise NotImplementedError