Source code for endogen.config

# from pydantic.dataclasses import dataclass, Field
# from pydantic import validator
from dataclasses import dataclass, field
from typing import List, Optional, Literal, Any, Mapping, Tuple

from .variables import (
    VariableTransform,
    VariableDifference,
    VariableRolling,
    VariableLag,
)


@dataclass(eq=True, frozen=True)
class Transform:
    """Schema for a formula-based variable transform.

    Also acts as a factory for `variables.VariableTransform` variables. Can
    be used separately, or in conjunction with .yaml files and
    `hydra.initialize`, `hydra.compose`, and `hydra.utils.instantiate`.

    Parameters
    ----------
    output_var : str
        Name of the output variable.
    input_vars : list[str]
        Input variables needed to create the output variable.
    formula : str
        A Wilkinson formula supported by `formulae`. See
        https://bambinos.github.io/formulae/notebooks/getting_started.html#User-guide.
    after_forecast : bool
        Endogenous simulation requires that all variables are fully
        specified in a circular fashion. At the same time, there cannot be
        any circular definitions in the transformation step, nor in the
        forecast step. If `after_forecast` is True, the variable is
        estimated/calculated in the forecast step.
    """

    output_var: str
    input_vars: List[str]
    formula: str
    after_forecast: bool = False

    def get_variables(self) -> VariableTransform:
        """Build the `VariableTransform` described by this schema.

        Returns
        -------
        VariableTransform
            Created with ``subset=1`` when `after_forecast` is truthy and
            ``subset=0`` otherwise.
        """
        return VariableTransform(
            output_var=self.output_var,
            input_vars=self.input_vars,
            formula=self.formula,
            subset=1 if self.after_forecast else 0,
        )
[docs] @dataclass(eq=True, frozen=True) class Differences: """A schema for describing a growth variables and a factory for `variables.VariableDifference` variables. Can be used separately, or in conjunction with .yaml files and `hydra.initialize`, `hydra.compose`, and `hydra.utils.instantiate`. Will return output variables named the same as the input variables, only with "_gr" suffixed. Parameters ---------- type : Literal["growth] Growth is currently the only difference function available. Note that dividing with zero is a possibility. input_vars : list[str] List of input variables to transform. """ type: Literal["growth"] input_vars: List[str] def get_output_name(self, input_var: str) -> str: match self.type: case "growth": suffix = "_gr" case _: raise ValueError(f"Unknown Differences.type: {self.type}") return f"{input_var}{suffix}" def get_variables(self) -> List[VariableDifference]: [ VariableDifference( output_var=self.get_output_name(input_var), input_var=input_var, type=self.type, subset=0, ) for input_var in self.input_vars ]
[docs] @dataclass(eq=True, frozen=True) class Rolling: """A schema for describing variables with "rolling" transformations and a factory for `variables.VariableRolling` variables. Can be used separately, or in conjunction with .yaml files and `hydra.initialize`, `hydra.compose`, and `hydra.utils.instantiate`. Will return output variables named the same as the input variables, only with a suffix according to this scheme: {input_var}_{window_type_suffix}{fun_suffix}{window} where: =========== ======= window_type suffix =========== ======= normal \_r span \_rsp com \_rc halflife \_hl alpha \_ral =========== ======= ==== ====== fun suffix ==== ====== mean m sum s ==== ====== Parameters ---------- window : int The window size in time-units. funs : list[Literal["mean", "sum]] List of aggregation functions (rolling mean or rolling sum). input_vars : list[str] List of input variables to transform. window_type : Literal["normal", "span", "com", "halflife", "alpha"] "normal" is equally weighted. See `pandas.DataFrame.ewm` for details on the rest. 
""" window: int funs: List[Literal["mean", "sum"]] input_vars: List[str] window_type: Literal["normal", "span", "com", "halflife", "alpha"] = "normal" def get_output_name(self, input_var: str, fun: str) -> str: match self.window_type: case "normal": suffix = "_r" case "span": suffix = "_rsp" case "com": suffix = "_rc" case "halflife": suffix = "_hl" case "alpha": suffix = "_ral" case _: raise ValueError(f"Unknown Rolling.window_type: {self.window_type}") match fun: case "mean": suffix = suffix + "m" case "sum": suffix = suffix + "s" case _: raise ValueError(f"Unknown function: {fun}, in Rolling.funs") return f"{input_var}{suffix}{self.window}" def get_variables(self) -> List[VariableRolling]: out = [] for input_var in self.input_vars: for fun in self.funs: obj = VariableRolling( output_var=self.get_output_name(input_var, fun), input_var=input_var, window=self.window, fun=fun, window_type=self.window_type, subset=0, ) out.append(obj) return out
@dataclass(eq=True, frozen=True)
class Lags:
    """Schema for lagged variables.

    Also acts as a factory for `variables.VariableLag` variables. Can be
    used separately, or in conjunction with .yaml files and
    `hydra.initialize`, `hydra.compose`, and `hydra.utils.instantiate`.

    Will return output variables named the same as the input variables, only
    with "_l{num_lag}" suffixed.

    Parameters
    ----------
    num_lag : int
        How many time-units to offset. E.g., 1 would lag a time-series 1
        time-unit compared to the input_var.
    input_vars : list[str]
        List of input variables to transform.
    """

    num_lag: int
    input_vars: List[str]

    def get_output_names(self, input_var: str) -> str:
        """Return the lagged name of `input_var`: "{input_var}_l{num_lag}"."""
        return f"{input_var}_l{self.num_lag}"

    def get_variables(self) -> List[VariableLag]:
        """Create one `VariableLag` per entry in `input_vars`.

        Returns
        -------
        List[VariableLag]
            One variable per input variable, in the order of `input_vars`,
            each created with ``subset=0``.
        """
        lagged = []
        for name in self.input_vars:
            variable = VariableLag(
                output_var=self.get_output_names(name),
                input_var=name,
                num_lag=self.num_lag,
                subset=0,
            )
            lagged.append(variable)
        return lagged
@dataclass(eq=True, frozen=True)
class InputModel:
    """Configuration schema for the statistical model of one variable.

    Used in endogenous simulation. Can be used separately, or in conjunction
    with .yaml files and `hydra.initialize`, `hydra.compose`, and
    `hydra.utils.instantiate`.

    Endogenous simulation requires not only knowledge of the statistical
    model, but also of any other variable input in the model. Some of these
    might be statistical models on their own (e.g., an `InputModel`), whilst
    other variables might be variable transforms of various types (see the
    `variables` module). These models must be fully specified here.

    Note the naming conventions for variable transforms in the various
    `config` schemas. E.g., for referencing a 1-year lagged variable as
    input_var, you can put the non-lagged variable "var1" in
    `InputModel.lags`, and "var1_l1" in `InputModel.input_vars`. The
    `endogen.ModelController` will make sure variables are calculated in the
    correct sequence.

    Parameters
    ----------
    stage : Literal["writing", "evaluating", "production"]
        The development stage the InputModel can be said to live in. Can be
        useful in larger production settings.
    output_var : str
        The name of the output variable in question.
    input_vars : list[str]
        List of input variables the output variable needs in its model.
    model : Any
        Any supported statistical (or otherwise) model class that can
        produce numerical output (forecasts) based on input data. Currently,
        that means any sklearn.base.BaseEstimator subclass or
        mlforecast.forecast.MLForecast.
    lags : list[Lags]
        List of `config.Lags` necessary to build the `input_vars`.
    rolling : list[Rolling]
        List of `config.Rolling` necessary to build the `input_vars`.
    differences : list[Differences]
        List of `config.Differences` necessary to build the `input_vars`.
    transforms : list[Transform]
        List of `config.Transform` necessary to build the `input_vars`.
    subset : int
        Endogenous simulation requires that all variables are fully
        specified in a circular fashion. At the same time, there cannot be
        any circular definitions in the transformation step, nor in the
        forecast step. If the variable is subset == 0, it is
        estimated/calculated in the transformation step, if it is 1, it is
        in the forecast step.
    """

    stage: Literal["writing", "evaluating", "production"]
    output_var: str
    input_vars: List[str]
    model: Any
    # Each list defaults to a fresh empty list per instance.
    lags: Optional[List[Lags]] = field(default_factory=list)
    rolling: Optional[List[Rolling]] = field(default_factory=list)
    differences: Optional[List[Differences]] = field(default_factory=list)
    transforms: Optional[List[Transform]] = field(default_factory=list)
    subset: int = 1

    @property
    def node(self) -> Tuple[str, Mapping[str, Any]]:
        """A node representation that interfaces well with NetworkX graphs.

        Returns
        -------
        Tuple[str, Mapping[str, Any]]
            The output variable name ("node") followed by a dictionary of
            node data holding the "model" and "subset" values.
        """
        node_data = {"model": self.model, "subset": self.subset}
        return (self.output_var, node_data)

    @property
    def edges(self) -> List[Tuple[str, str]]:
        """Graph edges from each input variable to the output variable.

        Returns
        -------
        List[Tuple[str, str]]
            One ``(input_var, output_var)`` pair per entry in `input_vars`,
            in a form that interfaces well with NetworkX graphs.
        """
        links = []
        for source in self.input_vars:
            links.append((source, self.output_var))
        return links
@dataclass(eq=True, frozen=True)
class ExogenModel:
    """Configuration schema for an exogenous variable in the simulation.

    Can be used separately, or in conjunction with .yaml files and
    `hydra.initialize`, `hydra.compose`, and `hydra.utils.instantiate`.

    An ExogenModel variable is forecast data coming from somewhere else.
    Currently only supports deterministic exogenous variables. It has to be
    a complete set of data for all units in the simulation system, from the
    start date of simulation to the end date.

    Parameters
    ----------
    output_var : str
        The name of the output variable in question.
    exogen_data : str
        String path to .csv or .parquet file only including time_var,
        unit_var and output_var.
    subset : int
        This should just always be 1. Might remove this as an option.
    """

    output_var: str
    exogen_data: str
    subset: int = 1

    @property
    def node(self) -> Tuple[str, Mapping[str, Any]]:
        """A node representation that interfaces well with NetworkX graphs.

        Returns
        -------
        Tuple[str, Mapping[str, Any]]
            The output variable name ("node") followed by a dictionary of
            node data holding the "subset" value.
        """
        node_data = {"subset": self.subset}
        return (self.output_var, node_data)
@dataclass
class GlobalSimConfig:
    """Configuration schema for global simulation options.

    Can be used separately, or in conjunction with .yaml files and
    `hydra.initialize`, `hydra.compose`, and `hydra.utils.instantiate`.

    Parameters
    ----------
    input_data : str
        Path to input data in either .csv or .parquet file format. Used both
        for training and as initial values in simulation.
    time_var : str
        Name of the variable in the input_data indicating the time
        dimension. The variable must be integer type.
    unit_var : str
        Name of the variable in the input_data indicating the unit/spatial
        dimension. The variable must be integer type.
    nsim : int
        Number of independent simulations to run.
    end : int
        The time-unit to end simulation. Since these are fully described
        endogenous simulations, they can go indefinitely.
    include_past_n : int
        How much of the past to include when fitting statistical models.
    start : int
        The time-unit to start simulation. Must be an integer value found in
        the time_var series in the input_data.
    vars : list[str]
        The subset of variables in the input_data to include.
    """

    # Required options.
    input_data: str
    time_var: str
    unit_var: str
    nsim: int
    end: int
    include_past_n: int

    # Optional options; `vars` gets a fresh empty list per instance.
    start: Optional[int] = None
    vars: Optional[List[str]] = field(default_factory=list)
@dataclass
class Config:
    """Global configuration schema for the endogenous simulation.

    Can be used separately, or in conjunction with .yaml files and
    `hydra.initialize`, `hydra.compose`, and `hydra.utils.instantiate`.

    Parameters
    ----------
    global_config : GlobalSimConfig
        Global simulation configuration options.
    variables : list[InputModel | ExogenModel]
        List of configuration schema for the models to include in the
        endogenous simulation. Note restrictions on circularity, etc.
    """

    # Declared first and without a default, so it is a required init
    # argument. NOTE(review): its expected contents are not described in the
    # visible code -- confirm against callers.
    _variable_dict: dict
    global_config: GlobalSimConfig
    variables: List[InputModel | ExogenModel] = field(default_factory=list)