# Source code for gemseo.problems.dataset.iris

# Copyright 2021 IRT Saint Exupéry, https://www.irt-saintexupery.com
#
# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU Lesser General Public
# License version 3 as published by the Free Software Foundation.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
# Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public License
# along with this program; if not, write to the Free Software Foundation,
# Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
# Contributors:
#    INITIAL AUTHORS - initial API and implementation and/or initial
#                           documentation
#        :author: Matthias De Lozzo
#    OTHER AUTHORS   - MACROSCOPIC CHANGES
"""Iris dataset.

This :class:`.Dataset` is one of the best known
to be found in the machine learning literature.

It was introduced by the statistician Ronald Fisher
in his 1936 paper "The use of multiple measurements in taxonomic problems",
Annals of Eugenics. 7 (2): 179-188.

It contains 150 instances of iris plants:

- 50 Iris Setosa,
- 50 Iris Versicolour,
- 50 Iris Virginica.

Each instance is characterized by:

- its sepal length in cm,
- its sepal width in cm,
- its petal length in cm,
- its petal width in cm.

This :class:`.Dataset` can be used for either clustering purposes
or classification ones.

`More information about the Iris dataset
<https://en.wikipedia.org/wiki/Iris_flower_data_set>`_
"""

from __future__ import annotations

from pathlib import Path

from numpy import int64 as np_int64
from pandas import factorize

from gemseo.datasets.dataset import Dataset
from gemseo.datasets.io_dataset import IODataset


def create_iris_dataset(
    as_io: bool = False,
    as_numeric: bool = True,
) -> Dataset:
    """Create the Iris dataset from the ``iris.data`` file next to this module.

    Args:
        as_io: Whether to build an :class:`.IODataset`
            and rename the ``"parameters"`` and ``"labels"`` groups
            to the input and output group names respectively.
            Otherwise, a plain :class:`.Dataset` is built
            and these group names are left as they are.
        as_numeric: Whether to consider a numeric label instead of a string one.
            If ``True``, the values of the ``"specy"`` variable
            are replaced by integer codes
            and the original labels are stored
            in ``dataset.misc["labels"]["specy"]``,
            where the label at position ``i`` corresponds to the code ``i``.

    Returns:
        The Iris dataset.
    """
    file_path = Path(__file__).parent / "iris.data"

    cls = IODataset if as_io else Dataset
    dataset = cls.from_csv(file_path, first_column_as_index=False)
    dataset.name = "Iris"

    if as_numeric:
        # factorize() maps each distinct label to an integer code
        # and returns the distinct labels in code order.
        numeric_data, numeric_meaning = factorize(
            dataset.get_view(variable_names="specy").to_numpy().T[0]
        )
        dataset.update_data(numeric_data, variable_names="specy")
        # Cast the label column to integers now that it holds the codes.
        dataset = dataset.astype({("labels", "specy", 0): np_int64})
        # Keep the code -> label correspondence so the codes can be decoded.
        dataset.misc["labels"] = {"specy": numeric_meaning}

    if as_io:
        groups = {
            "parameters": IODataset.INPUT_GROUP,
            "labels": IODataset.OUTPUT_GROUP,
        }
        for group, new_group in groups.items():
            dataset.rename_group(group_name=group, new_group_name=new_group)

    return dataset