# Source code for gemseo.post.dataset.scatter_plot_matrix

# -*- coding: utf-8 -*-
# Copyright 2021 IRT Saint Exupéry, https://www.irt-saintexupery.com
#
# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU Lesser General Public
# License version 3 as published by the Free Software Foundation.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
# Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public License
# along with this program; if not, write to the Free Software Foundation,
# Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.

# Contributors:
#    INITIAL AUTHORS - initial API and implementation and/or initial
#                           documentation
#        :author: Matthias De Lozzo
#    OTHER AUTHORS   - MACROSCOPIC CHANGES
r"""Draw a scatter matrix from a :class:`.Dataset`.

The :class:`.ScatterMatrix` class implements the scatter plot matrix,
which is a way to visualize :math:`n` samples of a
multi-dimensional vector

.. math::

   x=(x_1,x_2,\ldots,x_d)\in\mathbb{R}^d

in several 2D subplots where the (i,j) subplot represents the cloud
of points

.. math::

   \left(x_i^{(k)},x_j^{(k)}\right)_{1\leq k \leq n}

while the (i,i) subplot represents the empirical distribution of the samples

.. math::

   x_i^{(1)},\ldots,x_i^{(n)}

by means of a histogram or a kernel density estimator.

A variable name can be passed to the :meth:`.DatasetPlot.execute` method
by means of the :code:`classifier` keyword in order to color the points
according to the value of this variable. This is useful when the data is
labeled.
"""
from __future__ import division, unicode_literals

from typing import List, Mapping, Optional

import matplotlib.pyplot as plt
from matplotlib.figure import Figure
from pandas import DataFrame

from gemseo.post.dataset.dataset_plot import DatasetPlot

try:
    from pandas.plotting import scatter_matrix
except ImportError:
    from pandas import scatter_matrix


class ScatterMatrix(DatasetPlot):
    """Scatter plot matrix.

    The off-diagonal subplots show the variables of the dataset against each
    other, while the diagonal subplots show the distribution of each variable,
    either as a histogram or as a kernel density estimate.
    """

    def _plot(
        self,
        properties,  # type: Mapping
        classifier=None,  # type: Optional[str]
        kde=False,  # type: bool
        size=25,  # type: int
        marker="o",  # type: str
    ):  # type: (...) -> List[Figure]
        """Draw the scatter plot matrix of the dataset.

        Args:
            properties: The general properties of the plot.
                Only the figure sizes are read here;
                each one falls back to 10 when missing or falsy.
            classifier: The name of the variable to build the cluster.
                If None, the points are not colored by group.
            kde: The type of the distribution representation.
                If True, plot kernel-density estimator on the diagonal.
                Otherwise, use histograms.
            size: The size of the points.
            marker: The marker for the points.

        Returns:
            A list whose single item is the current matplotlib figure.

        Raises:
            ValueError: If the classifier is not a variable of the dataset.
        """
        figsize_x = properties.get(self.FIGSIZE_X) or 10
        figsize_y = properties.get(self.FIGSIZE_Y) or 10
        if classifier is not None and classifier not in self.dataset.variables:
            raise ValueError(
                "Classifier must be one of these names: "
                + ", ".join(self.dataset.variables)
            )
        diagonal = "kde" if kde else "hist"
        dataframe = self.dataset.export_to_dataframe()
        if classifier is None:
            self._scatter_matrix(
                dataframe, diagonal, size, marker, figsize_x, figsize_y
            )
        else:
            self._scatter_matrix_for_group(
                classifier, dataframe, diagonal, size, marker, figsize_x, figsize_y
            )
        # The helpers draw through pandas and return nothing,
        # so the figure they created is retrieved as the current one.
        return [plt.gcf()]

    def _scatter_matrix_for_group(
        self,
        classifier,  # type: str
        dataframe,  # type: DataFrame
        diagonal,  # type: str
        size,  # type: int
        marker,  # type: str
        figsize_x,  # type: int
        figsize_y,  # type: int
    ):  # type: (...) -> None
        """Draw the scatter plot matrix with points colored by group.

        The classifier column is used to choose the color of each point
        and is removed from the plotted variables.

        Args:
            classifier: The name of the variable to group the data.
            dataframe: The data to plot.
            diagonal: The type of distribution representation, either "kde" or "hist".
            size: The size of the points.
            marker: The marker for the points.
            figsize_x: The size of the figure in horizontal direction (inches).
            figsize_y: The size of the figure in vertical direction (inches).
        """
        # The group values index the palette cyclically.
        # NOTE(review): this presumably expects integer-valued group labels;
        # any other value would miss the integer keys of the palette.
        palette = dict(enumerate("bgrcmyk"))
        groups = self.dataset.get_data_by_names([classifier], False)[:, 0:1]
        colors = [palette[group[0] % len(palette)] for group in groups]
        _, varname = self._get_label(classifier)
        # ``axis`` is passed by keyword: the positional form was deprecated in
        # pandas 1.3 and removed in pandas 2.0.
        dataframe = dataframe.drop(varname, axis=1)
        dataframe.columns = self._get_variables_names(dataframe)
        scatter_matrix(
            dataframe,
            diagonal=diagonal,
            color=colors,
            s=size,
            marker=marker,
            figsize=(figsize_x, figsize_y),
        )

    def _scatter_matrix(
        self,
        dataframe,  # type: DataFrame
        diagonal,  # type: str
        size,  # type: int
        marker,  # type: str
        figsize_x,  # type: int
        figsize_y,  # type: int
    ):  # type: (...) -> None
        """Draw the scatter plot matrix without grouping the points.

        The columns of ``dataframe`` are renamed in place before plotting.

        Args:
            dataframe: The data to plot.
            diagonal: The type of distribution representation, either "kde" or "hist".
            size: The size of the points.
            marker: The marker for the points.
            figsize_x: The size of the figure in horizontal direction (inches).
            figsize_y: The size of the figure in vertical direction (inches).
        """
        dataframe.columns = self._get_variables_names(dataframe)
        scatter_matrix(
            dataframe,
            diagonal=diagonal,
            figsize=(figsize_x, figsize_y),
            s=size,
            marker=marker,
        )