# -*- coding: utf-8 -*-
# Copyright 2021 IRT Saint Exupéry, https://www.irt-saintexupery.com
#
# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU Lesser General Public
# License version 3 as published by the Free Software Foundation.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
# Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public License
# along with this program; if not, write to the Free Software Foundation,
# Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
# Contributors:
# INITIAL AUTHORS - initial API and implementation and/or initial
# documentation
# :author: Syver Doving Agdestein
# OTHER AUTHORS - MACROSCOPIC CHANGES
"""Here is the baseclass to measure the quality of machine learning algorithms.
The concept of quality measure is implemented with the :class:`.MLQualityMeasure` class.
"""
from __future__ import division, unicode_literals
from typing import List, NoReturn, Optional, Sequence, Tuple, Union
import six
from custom_inherit import DocInheritMeta
from numpy import arange, array, array_split, ndarray
from numpy.random import shuffle
from gemseo.core.dataset import Dataset
from gemseo.core.factory import Factory
from gemseo.mlearning.core.ml_algo import MLAlgo
OptionType = Optional[Union[Sequence[int], bool, int, Dataset]]
@six.add_metaclass(
    DocInheritMeta(
        abstract_base_class=True,
        style="google_with_merge",
    )
)
class MLQualityMeasure(object):
    """An abstract quality measure for machine learning algorithms.

    Attributes:
        algo (MLAlgo): The machine learning algorithm.
    """

    # Names of the available resampling-based estimation methods.
    LEARN = "learn"
    TEST = "test"
    LOO = "loo"
    KFOLDS = "kfolds"
    BOOTSTRAP = "bootstrap"

    # Whether a smaller value of the measure means a better quality
    # (e.g. True for MSE, False for R2).
    # To be overwritten in inheriting classes.
    SMALLER_IS_BETTER = True

    def __init__(
        self,
        algo,  # type: MLAlgo
    ):  # type: (...) -> None
        """
        Args:
            algo: A machine learning algorithm.
        """
        self.algo = algo

    def evaluate(
        self,
        method=LEARN,  # type: str
        samples=None,  # type: Optional[Sequence[int]]
        **options  # type: Optional[OptionType]
    ):  # type: (...) -> Union[float,ndarray]
        """Evaluate the quality measure.

        Args:
            method: The name of the method
                to evaluate the quality measure.
            samples: The indices of the learning samples.
                If None, use the whole learning dataset.
            **options: The options of the estimation method (e.g. 'test_data' for
                the 'test' method, 'n_replicates' for the bootstrap one, ...)

        Returns:
            The value of the quality measure.

        Raises:
            ValueError: If the name of the method is unknown.
        """
        # Dispatch to the dedicated evaluation method.
        if method == self.LEARN:
            evaluation = self.evaluate_learn(samples=samples, **options)
        elif method == self.TEST:
            evaluation = self.evaluate_test(samples=samples, **options)
        elif method == self.LOO:
            evaluation = self.evaluate_loo(samples=samples, **options)
        elif method == self.KFOLDS:
            evaluation = self.evaluate_kfolds(samples=samples, **options)
        elif method == self.BOOTSTRAP:
            evaluation = self.evaluate_bootstrap(samples=samples, **options)
        else:
            raise ValueError("The method '{}' is not available.".format(method))
        return evaluation

    def evaluate_learn(
        self,
        samples=None,  # type: Optional[Sequence[int]]
        multioutput=True,  # type: bool
    ):  # type: (...) -> NoReturn
        """Evaluate the quality measure using the learning dataset.

        Args:
            samples: The indices of the learning samples.
                If None, use the whole learning dataset.
            multioutput: Whether to return the quality measure
                for each output component. If not, average these measures.

        Returns:
            The value of the quality measure.
        """
        raise NotImplementedError

    def evaluate_test(
        self,
        test_data,  # type:Dataset
        samples=None,  # type: Optional[Sequence[int]]
        multioutput=True,  # type: bool
    ):  # type: (...) -> NoReturn
        """Evaluate the quality measure using a test dataset.

        Args:
            test_data: The test dataset.
            samples: The indices of the learning samples.
                If None, use the whole learning dataset.
            multioutput: If True, return the quality measure for each
                output component. Otherwise, average these measures.

        Returns:
            The value of the quality measure.
        """
        raise NotImplementedError

    def evaluate_loo(
        self,
        samples=None,  # type: Optional[Sequence[int]]
        multioutput=True,  # type: bool
    ):  # type: (...) -> Union[float,ndarray]
        """Evaluate the quality measure using the leave-one-out technique.

        Args:
            samples: The indices of the learning samples.
                If None, use the whole learning dataset.
            multioutput: If True, return the quality measure for each
                output component. Otherwise, average these measures.

        Returns:
            The value of the quality measure.
        """
        # Leave-one-out is k-folds with as many folds as learning samples.
        n_samples = self.algo.learning_set.n_samples
        return self.evaluate_kfolds(
            samples=samples, n_folds=n_samples, multioutput=multioutput
        )

    def evaluate_kfolds(
        self,
        n_folds=5,  # type: int
        samples=None,  # type: Optional[Sequence[int]]
        multioutput=True,  # type: bool
        randomize=False,  # type:bool
    ):  # type: (...) -> NoReturn
        """Evaluate the quality measure using the k-folds technique.

        Args:
            n_folds: The number of folds.
            samples: The indices of the learning samples.
                If None, use the whole learning dataset.
            multioutput: If True, return the quality measure for each
                output component. Otherwise, average these measures.
            randomize: Whether to shuffle the samples before dividing them in folds.

        Returns:
            The value of the quality measure.
        """
        raise NotImplementedError

    def evaluate_bootstrap(
        self,
        n_replicates=100,  # type: int
        samples=None,  # type: Optional[Sequence[int]]
        multioutput=True,  # type: bool
    ):  # type: (...) -> NoReturn
        """Evaluate the quality measure using the bootstrap technique.

        Args:
            n_replicates: The number of bootstrap replicates.
            samples: The indices of the learning samples.
                If None, use the whole learning dataset.
            multioutput: If True, return the quality measure for each
                output component. Otherwise, average these measures.

        Returns:
            The value of the quality measure.
        """
        raise NotImplementedError

    @classmethod
    def is_better(
        cls,
        val1,  # type: float
        val2,  # type: float
    ):  # type: (...) -> bool
        """Compare the quality between two values.

        This methods returns True if the first one is better than the second one.

        For most measures, a smaller value is "better" than a larger one (MSE
        etc.). But for some, like an R2-measure, higher values are better than
        smaller ones. This comparison method correctly handles this,
        regardless of the type of measure.

        Args:
            val1: The value of the first quality measure.
            val2: The value of the second quality measure.

        Returns:
            Whether val1 is of better quality than val2.
        """
        if cls.SMALLER_IS_BETTER:
            result = val1 < val2
        else:
            result = val1 > val2
        return result

    def _assure_samples(
        self,
        samples,  # type: Optional[Sequence[int]]
    ):  # type: (...) -> ndarray
        """Get the list of all samples if samples is None.

        Args:
            samples: The list of samples. Can also be None.

        Returns:
            The samples.
        """
        if samples is None:
            return arange(self.algo.learning_set.n_samples)
        else:
            return array(samples)

    def _compute_folds(
        self,
        samples,  # type: Optional[Sequence[int]]
        n_folds,  # type: int
        randomize,  # type: bool
    ):  # type: (...) -> Tuple[List[ndarray],ndarray]
        """Divide the elements into folds.

        Args:
            samples: The samples to be split into folds.
                If None, use all the samples.
            n_folds: The number of folds.
            randomize: Whether to shuffle the elements before splitting them.

        Returns:
            * The folds defined as sub-sets of `samples`.
            * The original samples.
        """
        # _assure_samples returns a fresh ndarray, so the in-place shuffle
        # below does not mutate the caller's sequence.
        samples = self._assure_samples(samples)
        if randomize:
            shuffle(samples)
        return array_split(samples, n_folds), samples
class MLQualityMeasureFactory(Factory):
    """A factory of :class:`.MLQualityMeasure`."""

    def __init__(self):  # type: (...) -> None
        """Search for subclasses of :class:`.MLQualityMeasure`
        in the ``gemseo.mlearning.qual_measure`` package."""
        super(MLQualityMeasureFactory, self).__init__(
            MLQualityMeasure, ("gemseo.mlearning.qual_measure",)
        )