# -*- coding: utf-8 -*-
# Copyright 2021 IRT Saint Exupéry, https://www.irt-saintexupery.com
#
# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU Lesser General Public
# License version 3 as published by the Free Software Foundation.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
# Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public License
# along with this program; if not, write to the Free Software Foundation,
# Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
# Contributors:
# INITIAL AUTHORS - initial API and implementation and/or initial
# documentation
# :author: Matthias De Lozzo
# :author: Syver Doving Agdestein
# OTHER AUTHORS - MACROSCOPIC CHANGES
"""
Supervised machine learning algorithm
=====================================
Supervised machine learning is a task of learning relationships
between input and output variables based on an input-output dataset. One
usually distinguishes between two types of supervised machine learning
algorithms, based on the nature of the outputs. For a continuous output
variable, a regression is performed, while for a discrete output variable,
a classification is performed.
Given a set of input variables
:math:`x \\in \\mathbb{R}^{n_{\\text{samples}}\\times n_{\\text{inputs}}}` and
a set of output variables
:math:`y\\in \\mathbb{K}^{n_{\\text{samples}}\\times n_{\\text{outputs}}}`,
where :math:`n_{\\text{inputs}}` is the dimension of the input variable,
:math:`n_{\\text{outputs}}` is the dimension of the output variable,
:math:`n_{\\text{samples}}` is the number of training samples and
:math:`\\mathbb{K}` is either :math:`\\mathbb{R}` or :math:`\\mathbb{N}` for
regression and classification tasks respectively, a supervised learning
algorithm seeks to find a function
:math:`f: \\mathbb{R}^{n_{\\text{inputs}}} \\to
\\mathbb{K}^{n_{\\text{outputs}}}` such that :math:`y=f(x)`.
In addition, we often want to impose some additional constraints on the
function :math:`f`, mainly to ensure that it has a generalization capacity
beyond the training data, i.e. it is able to correctly predict output values of
new input values. This is called regularization. Assuming :math:`f` is
parametrized by a set of parameters :math:`\\theta`, and denoting
:math:`f_\\theta` the parametrized function, one typically seeks to minimize
a function of the form

.. math::

    \\mu(y, f_\\theta(x)) + \\Omega(\\theta),

where :math:`\\mu` is a distance-like measure, typically a mean squared error
or a cross entropy in the case of a regression, or a probability to be
maximized in the case of a classification, and :math:`\\Omega` is a
regularization term that limits the parameters from overfitting, typically some
norm of its argument.
The :mod:`~gemseo.mlearning.core.supervised` module implements this concept
through the :class:`.MLSupervisedAlgo` class based on a :class:`.Dataset`.
"""
from __future__ import absolute_import, division, unicode_literals
from future import standard_library
from numpy import atleast_2d
from gemseo.core.dataset import Dataset
from gemseo.mlearning.core.ml_algo import MLAlgo
from gemseo.mlearning.transform.dimension_reduction.dimension_reduction import (
DimensionReduction,
)
from gemseo.utils.data_conversion import DataConversion
standard_library.install_aliases()
class MLSupervisedAlgo(MLAlgo):
    """Supervised machine learning algorithm.

    Inheriting classes should overload the :meth:`!MLSupervisedAlgo._fit` and
    :meth:`!MLSupervisedAlgo._predict` methods.
    """

    ABBR = "MLSupervisedAlgo"

    def __init__(
        self, data, transformer=None, input_names=None, output_names=None, **parameters
    ):
        """Constructor.

        :param Dataset data: learning dataset.
        :param transformer: transformation strategy for data groups.
            If None, do not scale data. Default: None.
        :type transformer: dict(str)
        :param input_names: names of the input variables.
            If None or empty, use the names of the input group of ``data``.
            Default: None.
        :type input_names: list(str)
        :param output_names: names of the output variables.
            If None or empty, use the names of the output group of ``data``.
            Default: None.
        :type output_names: list(str)
        :param parameters: algorithm parameters.
        """
        super(MLSupervisedAlgo, self).__init__(
            data, transformer=transformer, **parameters
        )
        # Fall back on the variables of the dataset groups when no names are given.
        self.input_names = input_names or data.get_names(data.INPUT_GROUP)
        self.output_names = output_names or data.get_names(data.OUTPUT_GROUP)

    def learn(self, samples=None):
        """Train the machine learning algorithm on the learning set.

        The learning data can be restricted to a subset of samples.
        When a transformer is registered for the input (resp. output) group,
        it is fitted on the selected input (resp. output) data and applied to
        them before calling :meth:`!MLSupervisedAlgo._fit`.
        The algorithm is flagged as trained once the fit is done.

        :param list(int) samples: indices of training samples.
            If None, use all the samples of the learning set. Default: None.
        """
        input_grp = self.learning_set.INPUT_GROUP
        output_grp = self.learning_set.OUTPUT_GROUP
        # NOTE(review): the second positional argument (False) presumably asks
        # for an array rather than a dictionary -- confirm against Dataset.
        input_data = self.learning_set.get_data_by_names(self.input_names, False)
        output_data = self.learning_set.get_data_by_names(self.output_names, False)

        if samples is not None:
            input_data = input_data[samples]
            output_data = output_data[samples]

        if input_grp in self.transformer:
            input_data = self.transformer[input_grp].fit_transform(input_data)
        if output_grp in self.transformer:
            output_data = self.transformer[output_grp].fit_transform(output_data)

        self._fit(input_data, output_data)
        self._trained = True

    def _fit(self, input_data, output_data):
        """Fit the input-output relationship from the learning data.

        To be overloaded by inheriting classes.

        :param ndarray input_data: input data (2D).
        :param ndarray output_data: output data (2D).
        :raises NotImplementedError: if not overloaded.
        """
        raise NotImplementedError

    # NOTE(review): ``DataFormatters`` is neither imported nor defined in the
    # visible part of this module; presumably a nested class of decorators
    # belongs to this class -- confirm it is in scope before relying on it.
    @DataFormatters.format_input_output
    def predict(self, input_data):
        """Predict output data from input data.

        The prediction itself is delegated to
        :meth:`!MLSupervisedAlgo._predict`.

        :param input_data: input data (n_inputs,) or (n_samples, n_inputs).
        :type input_data: dict(ndarray) or ndarray
        :return: predicted output data (n_outputs,) or (n_samples, n_outputs).
        :rtype: dict(ndarray) or ndarray(int)
        """
        return self._predict(input_data)

    def _predict(self, input_data):
        """Predict output data from input data.

        To be overloaded by inheriting classes.

        :param ndarray input_data: input data (n_samples, n_inputs).
        :return: output data (n_samples, n_outputs).
        :rtype: ndarray
        :raises NotImplementedError: if not overloaded.
        """
        raise NotImplementedError

    def _get_raw_shapes(self):
        """Get raw input and output shapes.

        The raw dimensions are the shapes of input and output variables after
        applying transformers: when the transformer of a group is a
        :class:`.DimensionReduction`, its number of components is used,
        otherwise the dimension of the original variables is used.

        :return: raw input shape, raw output shape.
        :rtype: tuple(int)
        """
        transformers = self.transformer

        def reduces_dimension(group):
            """Tell whether a dimension reduction is attached to the group."""
            return group in transformers and isinstance(
                transformers[group], DimensionReduction
            )

        if reduces_dimension(Dataset.INPUT_GROUP):
            input_shape = transformers[Dataset.INPUT_GROUP].n_components
        else:
            input_shape = self.input_shape

        if reduces_dimension(Dataset.OUTPUT_GROUP):
            output_shape = transformers[Dataset.OUTPUT_GROUP].n_components
        else:
            output_shape = self.output_shape

        return input_shape, output_shape

    @property
    def input_shape(self):
        """Dimension of input variables before applying transformers."""
        sizes = self.learning_set.sizes
        return sum(sizes[name] for name in self.input_names)

    @property
    def output_shape(self):
        """Dimension of output variables before applying transformers."""
        sizes = self.learning_set.sizes
        return sum(sizes[name] for name in self.output_names)

    def _get_objects_to_save(self):
        """Get objects to save.

        Extend the objects of the parent class with the names of the input
        and output variables.

        :return: objects to save.
        :rtype: dict
        """
        objects = super(MLSupervisedAlgo, self)._get_objects_to_save()
        objects["input_names"] = self.input_names
        objects["output_names"] = self.output_names
        return objects