# Source code for gemseo.problems.dataset.iris
# Copyright 2021 IRT Saint Exupéry, https://www.irt-saintexupery.com
#
# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU Lesser General Public
# License version 3 as published by the Free Software Foundation.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
# Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public License
# along with this program; if not, write to the Free Software Foundation,
# Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
# Contributors:
# INITIAL AUTHORS - initial API and implementation and/or initial
# documentation
# :author: Matthias De Lozzo
# OTHER AUTHORS - MACROSCOPIC CHANGES
"""Iris dataset.

This is one of the best known :class:`.Dataset`
to be found in the machine learning literature.

It was introduced by the statistician Ronald Fisher
in his 1936 paper "The use of multiple measurements in taxonomic problems",
Annals of Eugenics. 7 (2): 179-188.

It contains 150 instances of iris plants:

- 50 Iris Setosa,
- 50 Iris Versicolour,
- 50 Iris Virginica.

Each instance is characterized by:

- its sepal length in cm,
- its sepal width in cm,
- its petal length in cm,
- its petal width in cm.

This :class:`.Dataset` can be used for either clustering purposes
or classification ones.

`More information about the Iris dataset
<https://en.wikipedia.org/wiki/Iris_flower_data_set>`_
"""
from __future__ import annotations
from pathlib import Path
from numpy import int64 as np_int64
from pandas import factorize
from gemseo.datasets.dataset import Dataset
from gemseo.datasets.io_dataset import IODataset
[docs]
def create_iris_dataset(
    as_io: bool = False,
    as_numeric: bool = True,
) -> Dataset:
    """Build the Iris dataset from the ``iris.data`` file next to this module.

    Args:
        as_io: Whether to load the data as an :class:`.IODataset`
            and rename the ``"parameters"`` and ``"labels"`` groups
            to its input and output group names.
        as_numeric: Whether to replace the string values of ``"specy"``
            by integer codes;
            the original values are then kept in
            ``dataset.misc["labels"]["specy"]``.

    Returns:
        The Iris dataset.
    """
    dataset_class = IODataset if as_io else Dataset
    iris = dataset_class.from_csv(
        Path(__file__).parent / "iris.data",
        first_column_as_index=False,
    )
    iris.name = "Iris"

    if as_numeric:
        # Encode each distinct species name as an integer code
        # and remember which name each code stands for.
        species = iris.get_view(variable_names="specy").to_numpy().T[0]
        codes, species_names = factorize(species)
        iris.update_data(codes, variable_names="specy")
        iris = iris.astype({("labels", "specy", 0): np_int64})
        iris.misc["labels"] = {"specy": species_names}

    if as_io:
        # Same renaming order as before: "parameters" first, then "labels".
        renamings = (
            ("parameters", IODataset.INPUT_GROUP),
            ("labels", IODataset.OUTPUT_GROUP),
        )
        for old_name, new_name in renamings:
            iris.rename_group(group_name=old_name, new_group_name=new_name)

    return iris