Source code for gemseo.utils.study_analyses.xls_study_parser

# Copyright 2021 IRT Saint Exupéry, https://www.irt-saintexupery.com
#
# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU Lesser General Public
# License version 3 as published by the Free Software Foundation.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
# Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public License
# along with this program; if not, write to the Free Software Foundation,
# Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
# Contributors:
#    INITIAL AUTHORS - initial API and implementation and/or
#                      initial documentation
#        :author:  Francois Gallard
#    OTHER AUTHORS   - MACROSCOPIC CHANGES
"""Excel file parser for the study analyses."""

from __future__ import annotations

import logging
from typing import TYPE_CHECKING
from typing import Final

from pandas import DataFrame
from pandas import read_excel

from gemseo import get_available_formulations
from gemseo.core.discipline import MDODiscipline
from gemseo.utils.string_tools import MultiLineString
from gemseo.utils.string_tools import pretty_str

if TYPE_CHECKING:
    from collections.abc import Iterable

LOGGER = logging.getLogger(__name__)


class XLSStudyParser:
    """A study specification based on an Excel file.

    The Excel file shall contain one sheet per discipline:

    - the name of the sheet shall have the discipline name,
    - the sheet shall define the input names of the discipline
      as a vertical succession of cells starting with ``"Inputs"``:

      .. table:: Inputs

            +--------------+
            | Inputs       |
            +--------------+
            | input_name_1 |
            +--------------+
            | ...          |
            +--------------+
            | input_name_N |
            +--------------+

    - the sheet shall define the output names of the discipline
      as a vertical succession of cells starting with ``"Outputs"``:

      .. table:: Outputs

            +---------------+
            | Outputs       |
            +---------------+
            | output_name_1 |
            +---------------+
            | ...           |
            +---------------+
            | output_name_N |
            +---------------+

    - the empty lines of the series ``Inputs`` and ``Outputs`` are ignored,
    - the sheet may contain other data, but these will not be taken into account.

    If ``has_scenario`` is ``True``,
    the Excel file shall contain one sheet per scenario
    with a name starting by ``Scenario``.
    Distributed formulations shall contain
    one sheet for the main scenario and one sheet per sub-scenario.

    A scenario sheet shall have the following columns:

    .. table:: Scenario1

        +------------------+--------------------+-------------+-------------+-------------+---------------+----------------+
        | Design variables | Objective function | Constraints | Disciplines | Formulation | Options       | Options values |
        +==================+====================+=============+=============+=============+===============+================+
        | in1              | out1               | out2        | Disc1       | MDF         | tolerance     | 0.1            |
        +------------------+--------------------+-------------+-------------+-------------+---------------+----------------+
        |                  |                    |             | Disc2       |             | main_mda_name | MDAJacobi      |
        +------------------+--------------------+-------------+-------------+-------------+---------------+----------------+

    These columns must satisfy some constraints:

    - all of them are mandatory, even if empty for the constraints,
    - their order does not matter,
    - one and only one formulation must be declared,
    - at least one objective must be provided,
    - at least one design variable must be provided,
    - all the objective functions and constraints must be outputs of a discipline,
      not necessarily the one of the current sheet,
    - all the design variables must be inputs of a discipline,
      not necessarily the one of the current sheet.

    The columns ``Options`` and ``Options values`` are used
    to pass the formulation options.
    Note that for string type ``Option values``,
    the value can be written with or without the ``""`` characters.

    To use multi-level MDO formulations,
    create multiple scenarios,
    and add the name of the sub-scenarios
    in the list of disciplines of the main (system) scenario.

    An arbitrary number of levels can be generated this way
    (three, four, ..., n, level formulations).
    """  # noqa: E501

    xls_study_path: str
    """The path to the Excel file."""

    worksheets: dict[str, DataFrame]
    """The worksheets of the Excel file."""

    disciplines: dict[str, MDODiscipline]
    """The non-executable disciplines."""

    scenarios: dict[str, dict[str, str | list[str]]]
    """The descriptions of the scenarios."""

    inputs: set[str]
    """The names of the input variables."""

    outputs: set[str]
    """The names of the output variables."""

    SCENARIO_PREFIX: Final[str] = "Scenario"
    DISCIPLINE: Final[str] = "Discipline"
    DISCIPLINES: Final[str] = "Disciplines"
    OBJECTIVE_FUNCTION: Final[str] = "Objective function"
    CONSTRAINTS: Final[str] = "Constraints"
    DESIGN_VARIABLES: Final[str] = "Design variables"
    FORMULATION: Final[str] = "Formulation"
    OPTIONS: Final[str] = "Options"
    OPTION_VALUES: Final[str] = "Options values"
    __INPUTS: Final[str] = "Inputs"
    __OUTPUTS: Final[str] = "Outputs"
    __SPACE: Final[str] = MultiLineString.INDENTATION

    def __init__(self, xls_study_path: str, has_scenario: bool = True) -> None:
        """Args:
            xls_study_path: The path to the Excel file describing the study.
            has_scenario: Whether the Excel file has a scenario sheet.

        Raises:
            IOError: If the Excel file cannot be opened.
            ValueError: If no scenario has been found in Excel file
                while the study is an MDO one.
        """  # noqa: D205 D212 D415
        self.xls_study_path = xls_study_path
        try:
            # sheet_name=None loads every sheet into a name -> frame mapping.
            self.worksheets = read_excel(
                xls_study_path, sheet_name=None, engine="openpyxl"
            )
        except OSError:
            LOGGER.exception("Failed to open the study file: %s", xls_study_path)
            raise

        # Called here only for its logging side effect; the result is unused.
        self.__log_number_objects_detected(True)
        self.disciplines = {}
        self.scenarios = {}
        self.inputs = set()
        self.outputs = set()
        self._init_disciplines()
        self.__set_scenario_descriptions()
        if has_scenario and not self.scenarios:
            msg = "No scenario found in the XLS file."
            raise ValueError(msg)

    def _init_disciplines(self) -> None:
        """Initialize the disciplines.

        Every sheet whose name does not start with :attr:`SCENARIO_PREFIX`
        is turned into a discipline
        whose grammars are filled from the ``Inputs`` and ``Outputs`` series.

        Raises:
            ValueError: If the discipline has no input column or output column.
        """
        all_inputs = set()
        all_outputs = set()
        string = MultiLineString()
        string.indent()
        missing_column_msg = "The sheet of the discipline '{}' must have a column '{}'"
        for sheet_name, sheet_value in self.worksheets.items():
            if sheet_name.startswith(self.SCENARIO_PREFIX):
                continue

            # We use add("{}", sheet_name) rather than add(sheet_name)
            # to prevent problems with special characters in disc_name,
            # e.g. "Discipline{1}".
            string.add("{}", sheet_name)
            inputs = self.__get_mandatory_series(
                sheet_value, self.__INPUTS, missing_column_msg, sheet_name
            )
            all_inputs.update(inputs)
            outputs = self.__get_mandatory_series(
                sheet_value, self.__OUTPUTS, missing_column_msg, sheet_name
            )
            all_outputs.update(outputs)

            discipline = MDODiscipline(sheet_name)
            discipline.input_grammar.update_from_names(inputs)
            discipline.output_grammar.update_from_names(outputs)
            string.indent()
            string.add("{}: {}", self.__INPUTS, pretty_str(inputs))
            string.add("{}: {}", self.__OUTPUTS, pretty_str(outputs))
            string.dedent()
            self.disciplines[sheet_name] = discipline

        LOGGER.info("%s", string)
        self.inputs = all_inputs
        self.outputs = all_outputs

    @staticmethod
    def __get_series(
        frame: DataFrame, series_name: str, raise_error: bool = True
    ) -> list[str]:
        """Return the data of a named column.

        Removes empty data.

        Args:
            frame: The pandas frame of the sheet.
            series_name: The name of the series.
            raise_error: Whether to raise a ``ValueError``
                when the series does not exist;
                otherwise, return an empty list.

        Returns:
            The non-empty values of the series, in sheet order,
            or an empty list
            if the series does not exist and ``raise_error`` is ``False``.

        Raises:
            ValueError: If the sheet has no series named ``series_name``
                and ``raise_error`` is ``True``.
        """
        series = frame.get(series_name)
        if series is None:
            if raise_error:
                msg = f"The sheet has no series named '{series_name}'."
                raise ValueError(msg)
            return []

        # Remove empty data: a missing cell is NaN, which never equals itself.
        return [value for value in series.tolist() if value == value]  # noqa: PLR0124

    @classmethod
    def __get_mandatory_series(
        cls,
        frame: DataFrame,
        series_name: str,
        message_template: str,
        sheet_name: str,
    ) -> list[str]:
        """Return the data of a mandatory column.

        Args:
            frame: The pandas frame of the sheet.
            series_name: The name of the series.
            message_template: The template of the error message,
                formatted with ``sheet_name`` and ``series_name`` in this order.
            sheet_name: The name of the sheet, used in the error message.

        Returns:
            The non-empty values of the series.

        Raises:
            ValueError: If the sheet has no series named ``series_name``.
        """
        try:
            return cls.__get_series(frame, series_name)
        except ValueError:
            # Replace the generic error with one naming the faulty sheet.
            raise ValueError(
                message_template.format(sheet_name, series_name)
            ) from None

    def __set_scenario_descriptions(self) -> None:
        """Define the descriptions of the different scenarios.

        In terms of objective function, the constraints and the design variables.

        Raises:
            ValueError: If a scenario sheet misses one of the columns
                ``Disciplines``, ``Design variables``, ``Objective function``,
                ``Constraints`` or ``Formulation``,
                if a scenario does not have exactly one formulation
                or if a scenario has a number of options
                different from its number of option values.
        """
        self.scenarios = {}
        worksheets = self.__log_number_objects_detected(False)
        missing_column_msg = "Scenario {} has no {} column."
        for frame_name, frame in worksheets.items():
            # The mandatory columns are read in a fixed order
            # so that the first missing one is always reported the same way.
            disciplines, design_variables, objectives, constraints, formulation = (
                self.__get_mandatory_series(
                    frame, series_name, missing_column_msg, frame_name
                )
                for series_name in (
                    self.DISCIPLINES,
                    self.DESIGN_VARIABLES,
                    self.OBJECTIVE_FUNCTION,
                    self.CONSTRAINTS,
                    self.FORMULATION,
                )
            )

            # The option columns are optional: a missing one yields an empty list.
            options = self.__get_series(frame, self.OPTIONS, False)
            option_values = self.__get_series(frame, self.OPTION_VALUES, False)

            if len(formulation) != 1:
                msg = f"Scenario {frame_name!s} must have one {self.FORMULATION} value."
                raise ValueError(msg)

            if len(options) != len(option_values):
                msg = (
                    f"Options {options} and Options values {option_values} "
                    "must have the same length."
                )
                raise ValueError(msg)

            self.scenarios[frame_name] = {
                self.DISCIPLINES: disciplines,
                self.OBJECTIVE_FUNCTION: objectives,
                self.CONSTRAINTS: constraints,
                self.DESIGN_VARIABLES: design_variables,
                self.FORMULATION: formulation[0],
                self.OPTIONS: options,
                self.OPTION_VALUES: option_values,
            }

        # The checks are run once all the scenarios are registered
        # because a scenario may list another scenario among its disciplines.
        for scenario_name, scenario_description in self.scenarios.items():
            self.__check_scenario_description(
                scenario_description[self.OBJECTIVE_FUNCTION],
                scenario_description[self.CONSTRAINTS],
                scenario_description[self.DISCIPLINES],
                scenario_description[self.DESIGN_VARIABLES],
                scenario_description[self.FORMULATION],
                scenario_name,
            )

    def __log_number_objects_detected(
        self, is_discipline: bool
    ) -> dict[str | int, DataFrame]:
        """Log the number of worksheets matching a given type.

        Nothing is logged when no worksheet matches.

        Args:
            is_discipline: Whether the worksheet defines a discipline;
                otherwise, a scenario.

        Returns:
            The worksheets defining a discipline if ``is_discipline`` is ``True``;
            otherwise the others that are supposed to define scenarios.
        """
        # A sheet is a scenario when its name starts with the scenario prefix,
        # hence it is kept when this differs from ``is_discipline``.
        worksheets = {
            sheet_name: sheet_value
            for sheet_name, sheet_value in self.worksheets.items()
            if sheet_name.startswith(self.SCENARIO_PREFIX) is not is_discipline
        }
        if worksheets:
            n_worksheets = len(worksheets)
            LOGGER.info(
                "%s %s%s detected",
                n_worksheets,
                "discipline" if is_discipline else "scenario",
                "s" if n_worksheets > 1 else "",
            )

        return worksheets

    def __check_scenario_description(
        self,
        objectives: Iterable[str],
        constraints: Iterable[str],
        disciplines: Iterable[str],
        design_variables: Iterable[str],
        formulation: str,
        scenario_name: str,
    ) -> None:
        """Check the optimization problem consistency.

        The description of the scenario is logged before being checked.

        Args:
            objectives: The names of the objectives.
            constraints: The names of the constraints.
            disciplines: The names of the disciplines.
            design_variables: The names of the design variables.
            formulation: The name of the MDO formulation.
            scenario_name: The name of the scenario.

        Raises:
            ValueError: If at least one of following situation happens:

                * some design variables are not inputs of any discipline,
                * some disciplines are neither a discipline nor a scenario
                  of the study,
                * some constraints are not outputs of any discipline,
                * some objectives are not outputs of any discipline,
                * no objective is defined,
                * the formulation is unknown.
        """
        string = MultiLineString()
        string.indent()
        # We use add("{}", scn_name) rather than add(scn_name)
        # to prevent problems with special characters in scn_name, e.g. "Scenario{1}".
        string.add("{}", scenario_name)
        string.indent()
        string.add("Objectives: {}", pretty_str(objectives))
        string.add("Disciplines: {}", pretty_str(disciplines))
        string.add("Constraints: {}", pretty_str(constraints))
        string.add("Design variables: {}", pretty_str(design_variables))
        string.add("Formulation: {}", formulation)
        LOGGER.info("%s", string)

        missing = set(design_variables) - self.inputs
        if missing:
            msg = (
                f"{scenario_name}: some design variables are not "
                f"the inputs of any discipline: {missing}."
            )
            raise ValueError(msg)

        # A sub-scenario can be used as a discipline of a system scenario.
        missing = set(disciplines) - set(self.disciplines) - set(self.scenarios)
        if missing:
            msg = f"{scenario_name}: some disciplines don't exist: {missing}."
            raise ValueError(msg)

        missing = set(constraints) - self.outputs
        if missing:
            msg = (
                f"{scenario_name}: some constraints are not "
                f"the outputs of any discipline: {missing}."
            )
            raise ValueError(msg)

        missing = set(objectives) - self.outputs
        if missing:
            msg = (
                f"{scenario_name}: some objectives are not "
                f"the outputs of any discipline: {missing}."
            )
            raise ValueError(msg)

        if not objectives:
            msg = f"{scenario_name}: no objectives are defined"
            raise ValueError(msg)

        available_formulations = get_available_formulations()
        if formulation not in available_formulations:
            msg = (
                f"{scenario_name}: unknown formulation '{formulation}'; "
                f"use one of: {available_formulations}"
            )
            raise ValueError(msg)