Note
Go to the end to download the full example code.
Create a discipline that uses pandas DataFrames#
from __future__ import annotations
from typing import TYPE_CHECKING
import pandera as pa
from pandas import DataFrame
from pandera.typing import DataFrame as DataFrameType
from pandera.typing import Series # noqa: TC002
from pydantic import BaseModel
from gemseo.core.data_converters.pydantic import PydanticGrammarDataConverter
from gemseo.core.discipline import Discipline
from gemseo.core.grammars.pydantic_grammar import PydanticGrammar
if TYPE_CHECKING:
from gemseo.typing import StrKeyMapping
Import#
Create a discipline that uses a DataFrame#
We will create a class for a simple discipline that computes an output
variable y = 1 - 0.2 * x where x is an input variable.
For whatever reason, the business logic of this discipline uses a pandas DataFrame
to store the input and output values outside GEMSEO.
Although GEMSEO disciplines only handle input and output variables that are NumPy arrays,
their local data and default input values can use DataFrame objects.
The input and output grammars of the discipline shall use a naming convention
to access the names of the columns of a DataFrame.
The naming convention is built with the name of the input or output,
the character ~ (this can be changed) and
the name of the DataFrame column.
The code executed by the discipline is in the _run method,
where self.data, i.e. the local data, has automatically been initialized
with the default inputs and updated with the inputs passed to the discipline.
A DataFrame can be retrieved by querying the corresponding key, e.g. df,
in the local data and then changes can be made to this DataFrame, e.g.
discipline.data["df"]["x"] = value.
The default inputs and local data are instances of DisciplineData.
See also
DisciplineData has more information about how DataFrames are handled.
class InputDataFrameModel(pa.DataFrameModel):
x: Series[float] = pa.Field(unique=True)
class OutputDataFrameModel(pa.DataFrameModel):
y: Series[float] = pa.Field(unique=True)
class InputGrammarModel(BaseModel):
df: DataFrameType[InputDataFrameModel]
class OutputGrammarModel(BaseModel):
df: DataFrameType[OutputDataFrameModel]
class DataConverter(PydanticGrammarDataConverter):
"""A data converter where some coupling variables are 2D NumPy arrays."""
def convert_value_to_array(self, name, value):
if name == "df":
return value.to_numpy().flatten()
return super().convert_value_to_array(name, value)
def convert_array_to_value(self, name, array_):
if name == "df":
return DataFrame({"x": [array_[0]], "y": [array_[1]]})
return super().convert_array_to_value(name, array_)
PydanticGrammar.DATA_CONVERTER_CLASS = DataConverter
class DataFrameDiscipline(Discipline):
default_grammar_type = Discipline.GrammarType.PYDANTIC
def __init__(self) -> None:
super().__init__()
self.input_grammar = PydanticGrammar("inputs", model=InputGrammarModel)
self.output_grammar = PydanticGrammar("outputs", model=OutputGrammarModel)
self.default_input_data = {"df": DataFrame(data={"x": [0.0]})}
def _run(self, input_data: StrKeyMapping) -> StrKeyMapping | None:
df = self.local_data["df"]
df["y"] = 1.0 - 0.2 * df["x"]
Instantiate the discipline#
discipline = DataFrameDiscipline()
Execute the discipline#
Then, we can execute it easily, either considering default inputs:
discipline.execute()
{'df': x y
0 0.0 1.0}
or using new inputs:
discipline.execute({"df": DataFrame(data={"x": [1.0]})})
{'df': x y
0 1.0 0.8}
Total running time of the script: (0 minutes 0.095 seconds)