# Source code for gemseo.utils.study_analyses.xls_study_parser

# Copyright 2021 IRT Saint Exupéry, https://www.irt-saintexupery.com
#
# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU Lesser General Public
# License version 3 as published by the Free Software Foundation.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
# Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public License
# along with this program; if not, write to the Free Software Foundation,
# Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
# Contributors:
#    INITIAL AUTHORS - initial API and implementation and/or
#                      initial documentation
#        :author:  Francois Gallard
#    OTHER AUTHORS   - MACROSCOPIC CHANGES
"""Excel file parser for the study analyses."""

from __future__ import annotations

import logging
from typing import TYPE_CHECKING
from typing import Final

from pandas import DataFrame
from pandas import read_excel

from gemseo import get_available_formulations
from gemseo.core.discipline import MDODiscipline
from gemseo.utils.string_tools import MultiLineString
from gemseo.utils.string_tools import pretty_str

if TYPE_CHECKING:
    from collections.abc import Iterable

LOGGER = logging.getLogger(__name__)



# [docs]
class XLSStudyParser:
    """A study specification based on an Excel file.

    The Excel file shall contain one sheet per discipline:

    - the name of the sheet shall have the discipline name,
    - the sheet shall define the input names of the discipline
      as a vertical succession of cells starting with ``"Inputs"``:

        .. table:: Inputs

            +--------------+
            | Inputs       |
            +--------------+
            | input_name_1 |
            +--------------+
            | ...          |
            +--------------+
            | input_name_N |
            +--------------+

    - the sheet shall define the output names of the discipline
      as a vertical succession of cells starting with ``"Outputs"``:

        .. table:: Outputs

            +---------------+
            | Outputs       |
            +---------------+
            | output_name_1 |
            +---------------+
            | ...           |
            +---------------+
            | output_name_N |
            +---------------+

    - the empty lines of the series ``Inputs`` and ``Outputs`` are ignored,
    - the sheet may contain other data, but these will not be taken into account.

    If ``has_scenario`` is ``True``,
    the Excel file shall contain one sheet per scenario
    with a name starting with ``Scenario``.
    Distributed formulations shall contain one sheet for the main scenario
    and one sheet per sub-scenario.

    A scenario sheet shall have the following columns:

    .. table:: Scenario1

        +------------------+--------------------+-------------+-------------+-------------+---------------+----------------+
        | Design variables | Objective function | Constraints | Disciplines | Formulation |    Options    | Options values |
        +==================+====================+=============+=============+=============+===============+================+
        |       in1        |       out1         |    out2     |    Disc1    |    MDF      |   tolerance   |       0.1      |
        +------------------+--------------------+-------------+-------------+-------------+---------------+----------------+
        |                  |                    |             |    Disc2    |             | main_mda_name |   MDAJacobi    |
        +------------------+--------------------+-------------+-------------+-------------+---------------+----------------+

    These columns must satisfy some constraints:

    - all of them are mandatory,
      except ``Options`` and ``Options values`` which may be omitted;
      the ``Constraints`` column may be empty,
    - ``Options`` and ``Options values`` must have the same number of non-empty cells,
    - their order does not matter,
    - one and only one formulation must be declared,
    - at least one objective must be provided,
    - at least one design variable must be provided,
    - all the objective functions and constraints must be outputs of a discipline,
      not necessarily the one of the current sheet,
    - all the design variables must be inputs of a discipline,
      not necessarily the one of the current sheet.

    The columns ``Options`` and ``Options values`` are used
    to pass the formulation options.
    Note that for string type ``Options values``,
    the value can be written with or without the ``""`` characters.

    To use multi-level MDO formulations,
    create multiple scenarios,
    and add the name of the sub-scenarios
    in the list of disciplines of the main (system) scenario.

    An arbitrary number of levels can be generated this way
    (three, four, ..., n, level formulations).
    """  # noqa: E501

    xls_study_path: str
    """The path to the Excel file."""

    worksheets: dict[str, DataFrame]
    """The worksheets of the Excel file."""

    disciplines: dict[str, MDODiscipline]
    """The non-executable disciplines."""

    scenarios: dict[str, dict[str, str | list[str]]]
    """The descriptions of the scenarios."""

    inputs: set[str]
    """The names of the input variables."""

    outputs: set[str]
    """The names of the output variables."""

    # A sheet whose name starts with this prefix defines a scenario;
    # any other sheet defines a discipline.
    SCENARIO_PREFIX: Final[str] = "Scenario"
    # NOTE(review): not referenced by the methods of this class as visible here;
    # presumably used by callers -- confirm before removing.
    DISCIPLINE: Final[str] = "Discipline"
    # The headers of the columns of a scenario sheet.
    # They are also used as the keys of the scenario descriptions
    # stored in ``scenarios``.
    DISCIPLINES: Final[str] = "Disciplines"
    OBJECTIVE_FUNCTION: Final[str] = "Objective function"
    CONSTRAINTS: Final[str] = "Constraints"
    DESIGN_VARIABLES: Final[str] = "Design variables"
    FORMULATION: Final[str] = "Formulation"
    OPTIONS: Final[str] = "Options"
    OPTION_VALUES: Final[str] = "Options values"
    # The headers of the columns of a discipline sheet.
    __INPUTS: Final[str] = "Inputs"
    __OUTPUTS: Final[str] = "Outputs"
    # NOTE(review): not referenced by the methods of this class as visible here.
    __SPACE: Final[str] = MultiLineString.INDENTATION

    def __init__(self, xls_study_path: str, has_scenario: bool = True) -> None:
        """Args:
            xls_study_path: The path to the Excel file describing the study.
            has_scenario: Whether the Excel file must define
                at least one scenario sheet.

        Raises:
            OSError: If the Excel file cannot be opened.
            ValueError: If ``has_scenario`` is ``True``
                and no scenario sheet has been found in the Excel file.
        """  # noqa: D205 D212 D415
        self.xls_study_path = xls_study_path
        try:
            # ``sheet_name=None`` asks pandas for all the sheets of the workbook,
            # returned as a mapping from the sheet names to the data frames.
            sheets = read_excel(xls_study_path, sheet_name=None, engine="openpyxl")
        except OSError:
            # Log with the traceback, then let the caller handle the error.
            LOGGER.exception("Failed to open the study file: %s", xls_study_path)
            raise

        self.worksheets = sheets
        self.__log_number_objects_detected(True)

        # Start from empty containers; the two steps below fill them.
        self.disciplines, self.scenarios = {}, {}
        self.inputs, self.outputs = set(), set()
        self._init_disciplines()
        self.__set_scenario_descriptions()

        scenario_is_missing = has_scenario and not self.scenarios
        if scenario_is_missing:
            msg = "No scenario found in the XLS file."
            raise ValueError(msg)

    def _init_disciplines(self) -> None:
        """Create the disciplines from the discipline sheets.

        Every sheet whose name does not start with :attr:`.SCENARIO_PREFIX`
        is considered as a discipline sheet.
        For each of them,
        a discipline named after the sheet is created,
        its input and output grammars are updated
        from the names found in the ``Inputs`` and ``Outputs`` columns
        and it is stored in :attr:`.disciplines`.
        Finally,
        :attr:`.inputs` and :attr:`.outputs` are set
        to the names gathered over all the disciplines
        and a summary is logged.

        Raises:
            ValueError: If a discipline sheet has no input column or no output column.
        """
        summary = MultiLineString()
        summary.indent()
        input_names = set()
        output_names = set()
        for name, frame in self.worksheets.items():
            if name.startswith(self.SCENARIO_PREFIX):
                # Scenario sheets are not disciplines.
                continue

            # The name is passed as a format argument
            # rather than as the format string itself,
            # so that special characters in it, e.g. "Discipline{1}",
            # do not cause problems.
            summary.add("{}", name)

            # Read the inputs first, then the outputs;
            # a missing column is reported with the name of the sheet.
            names = {}
            for column in (self.__INPUTS, self.__OUTPUTS):
                try:
                    names[column] = self.__get_series(frame, column)
                except ValueError:
                    msg = (
                        f"The sheet of the discipline '{name}' "
                        f"must have a column '{column}'"
                    )
                    raise ValueError(msg) from None

            inputs = names[self.__INPUTS]
            outputs = names[self.__OUTPUTS]
            input_names.update(inputs)
            output_names.update(outputs)

            discipline = MDODiscipline(name)
            discipline.input_grammar.update_from_names(inputs)
            discipline.output_grammar.update_from_names(outputs)
            self.disciplines[name] = discipline

            # The input and output names are logged one level below the name.
            summary.indent()
            summary.add("{}: {}", self.__INPUTS, pretty_str(inputs))
            summary.add("{}: {}", self.__OUTPUTS, pretty_str(outputs))
            summary.dedent()

        LOGGER.info("%s", summary)
        self.inputs = input_names
        self.outputs = output_names

    @staticmethod
    def __get_series(
        frame: DataFrame, series_name: str, raise_error: bool = True
    ) -> list[str]:
        """Return the non-empty values of a named column.

        Args:
            frame: The pandas frame of the sheet.
            series_name: The name of the series, i.e. the header of the column.
            raise_error: Whether to raise a ``ValueError``
                when the series does not exist;
                otherwise, return an empty list.

        Returns:
            The values of the series without the empty cells,
            in the order of the sheet,
            or an empty list if the series does not exist
            and ``raise_error`` is ``False``.
            The values are returned as read by pandas,
            so they are not necessarily strings
            (e.g. a numerical option value).

        Raises:
            ValueError: If the sheet has no series named ``series_name``
                and ``raise_error`` is ``True``.
        """
        series = frame.get(series_name)
        if series is None:
            if raise_error:
                msg = f"The sheet has no series named '{series_name}'."
                raise ValueError(msg)
            return []

        # Let pandas drop the missing values (NaN, NaT, None, NA):
        # the self-comparison idiom ``val == val`` only filters NaN-like values,
        # keeps None and fails on ``pandas.NA`` whose truth value is ambiguous.
        return series.dropna().tolist()

    def __set_scenario_descriptions(self) -> None:
        """Define the descriptions of the different scenarios.

        Each scenario sheet is turned into a dictionary
        stored in :attr:`.scenarios` under the name of the sheet;
        it contains the disciplines, the objective functions, the constraints,
        the design variables, the formulation,
        the options and the option values.
        Once all the scenarios are described,
        the consistency of each description is checked.

        Raises:
            ValueError: If a scenario sheet has no column for
                the disciplines, the design variables, the objective functions,
                the constraints or the formulation,
                if a scenario does not have exactly one formulation
                or if a scenario does not have as many options as option values.
        """
        self.scenarios = {}
        # The mandatory columns, in the order in which they are looked for.
        mandatory_columns = (
            self.DISCIPLINES,
            self.DESIGN_VARIABLES,
            self.OBJECTIVE_FUNCTION,
            self.CONSTRAINTS,
            self.FORMULATION,
        )
        scenario_sheets = self.__log_number_objects_detected(False)
        for sheet_name, sheet in scenario_sheets.items():
            columns = {}
            for column in mandatory_columns:
                try:
                    columns[column] = self.__get_series(sheet, column)
                except ValueError:
                    msg = f"Scenario {sheet_name} has no {column} column."
                    raise ValueError(msg) from None

            # The options are optional: a missing column gives an empty list.
            options = self.__get_series(sheet, self.OPTIONS, False)
            option_values = self.__get_series(sheet, self.OPTION_VALUES, False)

            formulations = columns[self.FORMULATION]
            if len(formulations) != 1:
                msg = (
                    f"Scenario {sheet_name!s} must have one "
                    f"{self.FORMULATION} value."
                )
                raise ValueError(msg) from None

            if len(options) != len(option_values):
                msg = (
                    f"Options {options} and Options values {option_values} "
                    "must have the same length."
                )
                raise ValueError(msg) from None

            self.scenarios[sheet_name] = {
                self.DISCIPLINES: columns[self.DISCIPLINES],
                self.OBJECTIVE_FUNCTION: columns[self.OBJECTIVE_FUNCTION],
                self.CONSTRAINTS: columns[self.CONSTRAINTS],
                self.DESIGN_VARIABLES: columns[self.DESIGN_VARIABLES],
                self.FORMULATION: formulations[0],
                self.OPTIONS: options,
                self.OPTION_VALUES: option_values,
            }

        # The checks are done once all the scenarios are known,
        # because the disciplines of a scenario can include other scenarios.
        for name, description in self.scenarios.items():
            self.__check_scenario_description(
                objectives=description[self.OBJECTIVE_FUNCTION],
                constraints=description[self.CONSTRAINTS],
                disciplines=description[self.DISCIPLINES],
                design_variables=description[self.DESIGN_VARIABLES],
                formulation=description[self.FORMULATION],
                scenario_name=name,
            )

    def __log_number_objects_detected(
        self, is_discipline: bool
    ) -> dict[str | int, DataFrame]:
        """Select the worksheets of a given type and log their number.

        A worksheet whose name starts with :attr:`.SCENARIO_PREFIX`
        is a scenario worksheet; any other one is a discipline worksheet.
        Nothing is logged when no worksheet matches.

        Args:
            is_discipline: Whether to select the worksheets defining disciplines;
                otherwise, select those defining scenarios.

        Returns:
            The worksheets defining a discipline if ``is_discipline`` is ``True``;
            otherwise the others that are supposed to define scenarios.
        """
        selection = {}
        for name, frame in self.worksheets.items():
            is_scenario = name.startswith(self.SCENARIO_PREFIX)
            if is_scenario is not is_discipline:
                selection[name] = frame

        count = len(selection)
        if count:
            kind = "discipline" if is_discipline else "scenario"
            plural = "s" if count > 1 else ""
            LOGGER.info("%s %s%s detected", count, kind, plural)

        return selection

    def __check_scenario_description(
        self,
        objectives: Iterable[str],
        constraints: Iterable[str],
        disciplines: Iterable[str],
        design_variables: Iterable[str],
        formulation: str,
        scenario_name: str,
    ) -> None:
        """Log the description of a scenario and check its consistency.

        The description is checked against the inputs, outputs and disciplines
        read from the discipline sheets, the other scenarios
        and the available formulations.

        Args:
            objectives: The names of the objectives.
            constraints: The names of the constraints.
            disciplines: The names of the disciplines.
            design_variables: The names of the design variables.
            formulation: The name of the MDO formulation.
            scenario_name: The name of the scenario.

        Raises:
            ValueError: If at least one of the following situations happens:
                * some design variables are not inputs of any discipline,
                * some disciplines are neither disciplines nor scenarios of the study,
                * some constraints are not outputs of any discipline,
                * some objectives are not outputs of any discipline,
                * no objective is defined,
                * the formulation is unknown.
        """
        summary = MultiLineString()
        summary.indent()
        # The name is passed as a format argument
        # rather than as the format string itself,
        # so that special characters in it, e.g. "Scenario{1}",
        # do not cause problems.
        summary.add("{}", scenario_name)
        summary.indent()
        for template, value in (
            ("Objectives: {}", pretty_str(objectives)),
            ("Disciplines: {}", pretty_str(disciplines)),
            ("Constraints: {}", pretty_str(constraints)),
            ("Design variables: {}", pretty_str(design_variables)),
            ("Formulation: {}", formulation),
        ):
            summary.add(template, value)
        LOGGER.info("%s", summary)

        # NOTE(review): an empty collection of design variables passes this check;
        # confirm whether at least one design variable should be required.
        if unknown_variables := set(design_variables) - self.inputs:
            msg = (
                f"{scenario_name}: some design variables are not "
                f"the inputs of any discipline: {unknown_variables}."
            )
            raise ValueError(msg)

        # A scenario can use both disciplines and other scenarios (sub-scenarios).
        known_disciplines = set(self.disciplines) | set(self.scenarios)
        if unknown_disciplines := set(disciplines) - known_disciplines:
            msg = f"{scenario_name}: some disciplines don't exist: {unknown_disciplines}."
            raise ValueError(msg)

        if unknown_constraints := set(constraints) - self.outputs:
            msg = (
                f"{scenario_name}: some constraints are not "
                f"the outputs of any discipline: {unknown_constraints}."
            )
            raise ValueError(msg)

        if unknown_objectives := set(objectives) - self.outputs:
            msg = (
                f"{scenario_name}: some objectives are not "
                f"the outputs of any discipline: {unknown_objectives}."
            )
            raise ValueError(msg)

        if not objectives:
            msg = f"{scenario_name}: no objectives are defined"
            raise ValueError(msg)

        if formulation not in get_available_formulations():
            msg = (
                f"{scenario_name}: unknown formulation '{formulation}'; "
                f"use one of: {get_available_formulations()}"
            )
            raise ValueError(msg)