Source code for gemseo.mlearning.core.supervised

# -*- coding: utf-8 -*-
# Copyright 2021 IRT Saint Exupéry,
# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU Lesser General Public
# License version 3 as published by the Free Software Foundation.
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
# Lesser General Public License for more details.
# You should have received a copy of the GNU Lesser General Public License
# along with this program; if not, write to the Free Software Foundation,
# Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.

# Contributors:
#    INITIAL AUTHORS - initial API and implementation and/or initial
#                         documentation
#        :author: Matthias De Lozzo
#        :author: Syver Doving Agdestein
Supervised machine learning algorithm

Supervised machine learning is a task of learning relationships
between input and output variables based on an input-output dataset. One
usually distinguishes between two types of supervised machine learning
algorithms, based on the nature of the outputs. For a continuous output
variable, a regression is performed, while for a discrete output variable,
a classification is performed.

Given a set of input variables
:math:`x \\in \\mathbb{R}^{n_{\\text{samples}}\\times n_{\\text{inputs}}}` and
a set of output variables
:math:`y\\in \\mathbb{K}^{n_{\\text{samples}}\\times n_{\\text{outputs}}}`,
where :math:`n_{\\text{inputs}}` is the dimension of the input variable,
:math:`n_{\\text{outputs}}` is the dimension of the output variable,
:math:`n_{\\text{samples}}` is the number of training samples and
:math:`\\mathbb{K}` is either :math:`\\mathbb{R}` or :math:`\\mathbb{N}` for
regression and classification tasks respectively, a supervised learning
algorithm seeks to find a function
:math:`f: \\mathbb{R}^{n_{\\text{inputs}}} \\to
\\mathbb{K}^{n_{\\text{outputs}}}` such that :math:`y=f(x)`.

In addition, we often want to impose some additional constraints on the
function :math:`f`, mainly to ensure that it has a generalization capacity
beyond the training data, i.e. it is able to correctly predict output values of
new input values. This is called regularization. Assuming :math:`f` is
parametrized by a set of parameters :math:`\\theta`, and denoting
:math:`f_\\theta` the parametrized function, one typically seeks to minimize
a function of the form

.. math::

    \\mu(y, f_\\theta(x)) + \\Omega(\\theta),

where :math:`\\mu` is a distance-like measure, typically a mean squared error
or a cross entropy in the case of a regression, or a probability to be
maximized in the case of a classification, and :math:`\\Omega` is a
regularization term that limits the parameters from overfitting, typically some
norm of its argument.

The :mod:`~gemseo.mlearning.core.supervised` module implements this concept
through the :class:`.MLSupervisedAlgo` class based on a :class:`.Dataset`.
from __future__ import absolute_import, division, unicode_literals

from functools import wraps

from future import standard_library
from numpy import atleast_2d

from gemseo.core.dataset import Dataset
from gemseo.mlearning.core.ml_algo import MLAlgo
from gemseo.mlearning.transform.dimension_reduction.dimension_reduction import (
    DimensionReduction,
)
from gemseo.utils.data_conversion import DataConversion


class MLSupervisedAlgo(MLAlgo):
    """Supervised machine learning algorithm.

    Inheriting classes should overload the :meth:`!MLSupervisedAlgo._fit` and
    :meth:`!MLSupervisedAlgo._predict` methods.
    """

    ABBR = "MLSupervisedAlgo"

    def __init__(
        self, data, transformer=None, input_names=None, output_names=None, **parameters
    ):
        """Constructor.

        :param Dataset data: learning dataset.
        :param transformer: transformation strategy for data groups.
            If None, do not scale data. Default: None.
        :type transformer: dict(str)
        :param input_names: names of the input variables.
            If None or empty, the names of the input group of ``data`` are used.
        :type input_names: list(str)
        :param output_names: names of the output variables.
            If None or empty, the names of the output group of ``data`` are used.
        :type output_names: list(str)
        :param parameters: algorithm parameters.
        """
        super(MLSupervisedAlgo, self).__init__(
            data, transformer=transformer, **parameters
        )
        self.input_names = input_names or data.get_names(data.INPUT_GROUP)
        self.output_names = output_names or data.get_names(data.OUTPUT_GROUP)

    class DataFormatters(MLAlgo.DataFormatters):
        """Decorators for supervised algorithms."""

        @staticmethod
        def _array_to_dict(data_array, data_names, data_sizes):
            """Convert an array into a dict by slicing its last axis.

            The slices are taken consecutively, in the order of ``data_names``,
            each one being ``data_sizes[name]`` wide.

            :param data_array: the array
            :param data_names: list of names (keys of the resulting dict)
            :param data_sizes: dict of (name, size)
            :returns: a dict
            :rtype: dict
            """
            current_position = 0
            array_dict = {}
            for name in data_names:
                # The ellipsis keeps every leading axis, so both a single
                # sample (1D) and a batch of samples (2D) are supported.
                array_dict[name] = data_array[
                    ..., current_position : current_position + data_sizes[name]
                ]
                current_position += data_sizes[name]
            return array_dict

        @classmethod
        def format_dict(cls, predict):
            """If input_data is passed as a dictionary, then convert it to
            ndarray, and convert output_data to dictionary. Else, do nothing.

            :param predict: Method whose input_data and output_data are to be
                formatted.
            :return: the wrapped method.
            """

            @wraps(predict)
            def wrapper(self, input_data, *args, **kwargs):
                """Format data before and after applying predictor."""
                as_dict = isinstance(input_data, dict)
                if as_dict:
                    input_data = DataConversion.dict_to_array(
                        input_data, self.input_names
                    )
                output_data = predict(self, input_data, *args, **kwargs)
                if as_dict:
                    varsizes = self.learning_set.sizes
                    output_data = cls._array_to_dict(
                        output_data, self.output_names, varsizes
                    )
                return output_data

            return wrapper

        @classmethod
        def format_samples(cls, predict):
            """If input_data has shape (n_inputs,), reshape input_data to
            (1, n_inputs), and then reshape output data from (1, n_outputs) to
            (n_outputs,). If input_data has shape (n_samples, n_inputs), then
            do nothing.

            :param predict: Method whose input_data and output_data are to be
                formatted.
            :return: the wrapped method.
            """

            @wraps(predict)
            def wrapper(self, input_data, *args, **kwargs):
                """Format data before and after applying predictor."""
                # Remember the original rank before promoting to 2D, so that
                # the extra sample axis can be removed from the output.
                single_sample = input_data.ndim == 1
                input_data = atleast_2d(input_data)
                output_data = predict(self, input_data, *args, **kwargs)
                if single_sample:
                    output_data = output_data[0]
                return output_data

            return wrapper

        @classmethod
        def format_transform(cls, transform_inputs=True, transform_outputs=True):
            """Apply transform to inputs, and inverse transform to outputs.

            :param bool transform_inputs: Indicates whether to transform inputs.
            :param bool transform_outputs: Indicates whether to transform
                outputs.
            :return: a decorator for prediction methods.
            """

            def format_transform_(predict):
                """Apply transform to inputs, and inverse transform to outputs.

                :param predict: Method whose input_data and output_data are to
                    be formatted.
                :return: the wrapped method.
                """

                @wraps(predict)
                def wrapper(self, input_data, *args, **kwargs):
                    """Wrapped version of predict function."""
                    inputs = self.learning_set.INPUT_GROUP
                    if transform_inputs and inputs in self.transformer:
                        input_data = self.transformer[inputs].transform(input_data)
                    output_data = predict(self, input_data, *args, **kwargs)
                    outputs = self.learning_set.OUTPUT_GROUP
                    if transform_outputs and outputs in self.transformer:
                        output_data = self.transformer[outputs].inverse_transform(
                            output_data
                        )
                    return output_data

                return wrapper

            return format_transform_

        @classmethod
        def format_input_output(cls, predict):
            """Format dict, samples and transform successively.

            :param predict: Method whose input_data and output_data are to be
                formatted.
            :return: the wrapped method.
            """

            # Decorators apply bottom-up: at call time the dict conversion runs
            # first, then the sample reshaping, then the transformation.
            @cls.format_dict
            @cls.format_samples
            @cls.format_transform()
            @wraps(predict)
            def wrapper(self, input_data, *args, **kwargs):
                return predict(self, input_data, *args, **kwargs)

            return wrapper

    def learn(self, samples=None):
        """Train machine learning algorithm on learning set, possibly filtered
        using the given parameters.

        The transformers registered for the input and output groups, if any,
        are fitted on the (possibly filtered) data and applied before fitting.

        :param list(int) samples: indices of training samples.
            If None, use all the samples.
        """
        input_grp = self.learning_set.INPUT_GROUP
        output_grp = self.learning_set.OUTPUT_GROUP
        # NOTE(review): the second argument (False) presumably requests an
        # array rather than a dict -- confirm against Dataset.get_data_by_names.
        input_data = self.learning_set.get_data_by_names(self.input_names, False)
        output_data = self.learning_set.get_data_by_names(self.output_names, False)

        if samples is not None:
            input_data = input_data[samples]
            output_data = output_data[samples]

        if input_grp in self.transformer:
            input_data = self.transformer[input_grp].fit_transform(input_data)
        if output_grp in self.transformer:
            output_data = self.transformer[output_grp].fit_transform(output_data)

        self._fit(input_data, output_data)
        self._trained = True

    def _fit(self, input_data, output_data):
        """Fit input-output relationship from learning data.

        To be overloaded by inheriting classes.

        :param ndarray input_data: input data (2D).
        :param ndarray output_data: output data (2D).
        :raises NotImplementedError: always, in this base class.
        """
        raise NotImplementedError

    @DataFormatters.format_input_output
    def predict(self, input_data):
        """Predict output data from input data.

        :param input_data: input data (n_inputs,) or (n_samples, n_inputs).
        :type input_data: dict(ndarray) or ndarray
        :return: predicted output data (n_outputs,) or (n_samples, n_outputs).
        :rtype: dict(ndarray) or ndarray
        """
        return self._predict(input_data)

    def _predict(self, input_data):
        """Predict output data from input data.

        To be overloaded by inheriting classes.

        :param ndarray input_data: input data (n_samples, n_inputs).
        :return: output data (n_samples, n_outputs).
        :rtype: ndarray
        :raises NotImplementedError: always, in this base class.
        """
        raise NotImplementedError

    def _get_raw_shapes(self):
        """Get raw input and output shapes.

        The raw dimensions are the shapes of input and output variables after
        applying transformers: the number of components of the transformer
        when it is a dimension reduction, the original dimension otherwise.

        :return: raw input shape, raw output shape
        :rtype: tuple(int)
        """
        reduce_inputs = Dataset.INPUT_GROUP in self.transformer and isinstance(
            self.transformer[Dataset.INPUT_GROUP], DimensionReduction
        )
        if reduce_inputs:
            input_shape = self.transformer[Dataset.INPUT_GROUP].n_components
        else:
            input_shape = self.input_shape

        reduce_outputs = Dataset.OUTPUT_GROUP in self.transformer and isinstance(
            self.transformer[Dataset.OUTPUT_GROUP], DimensionReduction
        )
        if reduce_outputs:
            output_shape = self.transformer[Dataset.OUTPUT_GROUP].n_components
        else:
            output_shape = self.output_shape

        return input_shape, output_shape

    @property
    def input_shape(self):
        """Dimension of input variables before applying transformers."""
        sizes = [self.learning_set.sizes[name] for name in self.input_names]
        return sum(sizes)

    @property
    def output_shape(self):
        """Dimension of output variables before applying transformers."""
        sizes = [self.learning_set.sizes[name] for name in self.output_names]
        return sum(sizes)

    def _get_objects_to_save(self):
        """Get objects to save.

        Adds the input and output names to the objects collected by the
        parent class.

        :return: objects to save.
        :rtype: dict
        """
        objects = super(MLSupervisedAlgo, self)._get_objects_to_save()
        objects["input_names"] = self.input_names
        objects["output_names"] = self.output_names
        return objects