diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index c8c1c82..5dd1f8d 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -7,7 +7,7 @@ files: | )/.*\.py$ repos: - repo: https://github.com/pre-commit/pre-commit-hooks - rev: v4.4.0 + rev: v4.5.0 hooks: - id: check-added-large-files files: ".*" @@ -26,7 +26,7 @@ repos: - id: debug-statements files: '^src/.*\.py$' - repo: https://github.com/python-jsonschema/check-jsonschema - rev: 0.23.3 + rev: 0.27.1 hooks: - id: check-github-workflows files: '^github/workflows/.*\.ya?ml$' @@ -34,7 +34,7 @@ repos: - id: check-dependabot files: '^\.github/dependabot\.ya?ml$' - repo: https://github.com/ambv/black - rev: 23.7.0 + rev: 23.11.0 hooks: - id: black name: black formatter mfpbench @@ -43,7 +43,7 @@ repos: name: black formatter tests args: ["--config=pyproject.toml"] - repo: https://github.com/pre-commit/mirrors-mypy - rev: v1.5.1 + rev: v1.7.1 hooks: - id: mypy name: mypy @@ -55,7 +55,7 @@ repos: - "--show-traceback" - "--allow-untyped-decorators" # Test decorators are not properly typed - repo: https://github.com/charliermarsh/ruff-pre-commit - rev: v0.0.278 + rev: v0.1.6 hooks: - id: ruff args: [--fix, --exit-non-zero-on-fix, --no-cache] diff --git a/docs/quickstart.md b/docs/quickstart.md index e809ff8..630ed17 100644 --- a/docs/quickstart.md +++ b/docs/quickstart.md @@ -139,7 +139,7 @@ print("contains", "X_1" in config) print("len", len(config)) -print("dict", dict(config)) +print("dict", config.as_dict()) ``` ??? tip "How is that done?" @@ -150,16 +150,16 @@ print("dict", dict(config)) and other pythonic things! -=== "`dict()`/`from_dict()`" +=== "`as_dict()`/`from_dict()`" - [`Config.dict()`][mfpbench.Config.dict] returns a dictionary of the config. This is useful for + [`Config.as_dict()`][mfpbench.Config.as_dict] returns a dictionary of the config. This is useful for working with the config in other libraries. ```python exec="true" source="material-block" result="python" session="quickstart" config = benchmark.sample() print(config) - config_dict = config.dict() + config_dict = config.as_dict() print(config_dict) new_config = benchmark.Config.from_dict(config_dict) @@ -246,7 +246,7 @@ print("cost", result.cost) print(result) ``` -These share the [`dict()`][mfpbench.Result.dict] and [`from_dict()`][mfpbench.Result.from_dict] +These share the [`as_dict()`][mfpbench.Result.as_dict] and [`from_dict()`][mfpbench.Result.from_dict] methods as [`Config`][mfpbench.Config] objects but do not behave like dictionaries. The most notable property of [`Result`][mfpbench.Result] objects is that also have the @@ -278,7 +278,7 @@ identify the config in the table. **This is what's used to retrieve results from If this is missing when doing a [`query()`][mfpbench.Benchmark.query], we'll do our best to match the config to the table and get the correct id, but this is not guaranteed. -When using [`dict()`][mfpbench.TabularConfig.dict], this `id` is **not** included in the dictionary. +When using [`as_dict()`][mfpbench.TabularConfig.as_dict], this `id` is **not** included in the dictionary. In general you should either store the `config` object itself or at least `config.id`, that you can include back in before calling [`query()`][mfpbench.Benchmark.query]. 
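The quickstart changes above come down to one rename: `Config.dict()` is now `Config.as_dict()`, and for tabular configs the table `id` is only included when asked for. Below is a minimal sketch of that round-trip. It assumes the LCBench tabular data is already available locally and that `"lcbench_tabular"` with `task_id="adult"` is a valid key for `mfpbench.get()`; the method names (`as_dict`, `with_id=`, `from_dict`, `query`) are the ones introduced in this diff.

```python
import mfpbench

# Assumed setup: LCBench tabular files downloaded, key/task_id valid.
benchmark = mfpbench.get("lcbench_tabular", task_id="adult")

config = benchmark.sample()

# `as_dict()` replaces the old `dict()`; the table `id` is left out
# unless explicitly requested.
plain = config.as_dict()
with_id = config.as_dict(with_id=True)
print(plain)

# Keeping the `id` (or the original config object) around makes the table
# lookup exact when querying, as the quickstart advises above.
restored = benchmark.Config.from_dict(with_id)
result = benchmark.query(restored, at=benchmark.end)
print(result.error, result.cost)
```

The same `get()` call also accepts the new `value_metric=` and `cost_metric=` arguments introduced further down in this diff, which select which of the result's metrics `result.error`, `result.score`, and `result.cost` report.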
diff --git a/pyproject.toml b/pyproject.toml index d1e6ead..97793e8 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -10,12 +10,12 @@ dependencies = [ "more_itertools", "pyarrow" ] -version = "1.7.3" +version = "1.7.4" description = "A wrapper for multi-fidelity benchmarks with priors" authors = [{name = "Eddie Bergman", email="eddiebergmanhs@gmail.com"}] readme = "README.md" license = { file = "LICENSE.txt" } -requires-python = ">=3.7" +requires-python = ">=3.8" classifiers = [ 'Intended Audience :: Science/Research', 'Intended Audience :: Developers', @@ -61,7 +61,7 @@ dev = [ [tool.pytest.ini_options] testpaths = ["tests"] # path to the test directory -minversion = "3.7" +minversion = "3.8" # addopts = "--cov=mfpbench" # Should be package name [tool.coverage.run] diff --git a/src/mfpbench/__init__.py b/src/mfpbench/__init__.py index 3c18fb7..0bceeae 100644 --- a/src/mfpbench/__init__.py +++ b/src/mfpbench/__init__.py @@ -1,7 +1,7 @@ from __future__ import annotations from mfpbench.benchmark import Benchmark -from mfpbench.config import Config, GenericTabularConfig, TabularConfig +from mfpbench.config import Config, TabularConfig from mfpbench.get import _mapping, get from mfpbench.jahs import JAHSBenchmark from mfpbench.lcbench_tabular import ( @@ -9,6 +9,7 @@ LCBenchTabularConfig, LCBenchTabularResult, ) +from mfpbench.metric import Metric from mfpbench.pd1 import ( PD1Benchmark, PD1cifar100_wideresnet_2048, @@ -17,7 +18,7 @@ PD1translatewmt_xformer_64, PD1uniref50_transformer_128, ) -from mfpbench.result import GenericTabularResult, Result +from mfpbench.result import Result from mfpbench.synthetic.hartmann import ( MFHartmann3Benchmark, MFHartmann3BenchmarkBad, @@ -31,7 +32,7 @@ MFHartmann6BenchmarkTerrible, MFHartmannBenchmark, ) -from mfpbench.tabular import GenericTabularBenchmark, TabularBenchmark +from mfpbench.tabular import TabularBenchmark from mfpbench.yahpo import ( IAMLglmnetBenchmark, IAMLrangerBenchmark, @@ -58,11 +59,8 @@ "YAHPOBenchmark", "PD1Benchmark", "TabularBenchmark", - "GenericTabularBenchmark", "Config", "TabularConfig", - "GenericTabularConfig", - "GenericTabularResult", "MFHartmannBenchmark", "MFHartmann3Benchmark", "MFHartmann6Benchmark", @@ -97,5 +95,6 @@ "PD1lm1b_transformer_2048", "PD1translatewmt_xformer_64", "PD1uniref50_transformer_128", + "Metric", "_mapping", ] diff --git a/src/mfpbench/benchmark.py b/src/mfpbench/benchmark.py index 1b7a733..0665095 100644 --- a/src/mfpbench/benchmark.py +++ b/src/mfpbench/benchmark.py @@ -3,7 +3,17 @@ import copy from abc import ABC, abstractmethod from pathlib import Path -from typing import TYPE_CHECKING, Any, Generic, Iterator, Mapping, TypeVar, overload +from typing import ( + TYPE_CHECKING, + Any, + ClassVar, + Generic, + Iterable, + Iterator, + Mapping, + TypeVar, + overload, +) import numpy as np @@ -14,6 +24,8 @@ if TYPE_CHECKING: from ConfigSpace import ConfigurationSpace + from mfpbench.metric import Metric + HERE = Path(__file__).parent.parent PRIOR_DIR = HERE / "priors" @@ -30,50 +42,49 @@ class Benchmark(Generic[C, R, F], ABC): """Base class for a Benchmark.""" - fidelity_range: tuple[F, F, F] - """The fidelity range of this benchmark, (start, end, step)""" - - start: F - """The start of the fidelity range""" - - end: F - """The end of the fidelity range""" - - step: F - """The step of the fidelity range""" - - fidelity_name: str - """The name of the fidelity used in this benchmark""" - - space: ConfigurationSpace - """The configuration space used in this benchmark""" - - Config: type[C] - """The 
config type of this benchmark""" - - Result: type[R] - """The result type of this benchmark""" - - has_conditionals: bool = False - """Whether this benchmark has conditionals in it or not""" - - _default_prior_dir = PRIOR_DIR + _default_prior_dir: ClassVar[Path] = PRIOR_DIR """The default directory for priors""" - def __init__( + _result_renames: ClassVar[Mapping[str, str] | None] = None + """Any renaming to be done to raw result names before being passed + to the `Result` type. This can be useful if for example, the benchmark returns + a result named `valid-error-rate` but the `Result` type expects + `valid_error_rate`, as you can't have `-` in a python identifier. + """ + + _config_renames: ClassVar[Mapping[str, str] | None] = None + """Any renaming to be done to raw result names before being passed + to the `Config` type. This can be useful if for example, the benchmark returns + a result named `lambda` which is a reserved keyword in python but the `Config` + type expects `_lambda` as the key. + """ + + def __init__( # noqa: PLR0913 self, name: str, space: ConfigurationSpace, + config_type: type[C], + result_type: type[R], + fidelity_range: tuple[F, F, F], + fidelity_name: str, *, + has_conditionals: bool = False, seed: int | None = None, prior: str | Path | C | Mapping[str, Any] | None = None, perturb_prior: float | None = None, + value_metric: str | None = None, + cost_metric: str | None = None, ): """Initialize the benchmark. Args: name: The name of this benchmark space: The configuration space to use for the benchmark. + config_type: The type of config to use for the benchmark. + result_type: The type of result to use for the benchmark. + fidelity_name: The name of the fidelity to use for the benchmark. + fidelity_range: The range of fidelities to use for the benchmark. + has_conditionals: Whether this benchmark has conditionals in it or not. seed: The seed to use. prior: The prior to use for the benchmark. If None, no prior is used. If a str, will check the local location first for a prior @@ -84,13 +95,35 @@ def __init__( For numericals, this is interpreted as the standard deviation of a normal distribution while for categoricals, this is interpreted as the probability of swapping the value for a random one. + value_metric: The metric to use for this benchmark. Uses + the default metric from the Result if None. + cost_metric: The cost to use for this benchmark. Uses + the default cost from the Result if None. 
""" + if value_metric is None: + value_metric = result_type.default_value_metric + + if cost_metric is None: + cost_metric = result_type.default_cost_metric + self.name = name self.seed = seed self.space = space - self.start: F = self.fidelity_range[0] - self.end: F = self.fidelity_range[1] - self.step: F = self.fidelity_range[2] + self.value_metric = value_metric + self.cost_metric = cost_metric + self.fidelity_range: tuple[F, F, F] = fidelity_range + self.fidelity_name = fidelity_name + self.has_conditionals = has_conditionals + self.Config = config_type + self.Result = result_type + self.metric_optimums = { + metric_name: metric.optimum_value + for metric_name, metric in self.Result.metric_defs.items() + } + + if value_metric is None: + assert getattr(self.Result, "value_metric", None) is not None + value_metric = self.Result.value_metric self._prior_arg = prior @@ -108,7 +141,6 @@ def __init__( if prior is not None: self.prior = self._load_prior(prior, benchname=self.name) - self.prior.validate() else: self.prior = None @@ -123,18 +155,37 @@ def __init__( if self.prior is not None: self.prior.set_as_default_prior(space) - @classmethod + @property + def metrics(self) -> dict[str, Metric]: + """The metrics for this benchmark.""" + return dict(self.Result.metric_defs) + + @property + def start(self) -> F: + """The start of the fidelity range.""" + return self.fidelity_range[0] + + @property + def end(self) -> F: + """The end of the fidelity range.""" + return self.fidelity_range[1] + + @property + def step(self) -> F: + """The step of the fidelity range.""" + return self.fidelity_range[2] + def _load_prior( - cls, + self, prior: str | Path | Mapping[str, Any] | C, benchname: str | None = None, ) -> C: - Config: type[C] = cls.Config # Need to be a bit explicit here + Config: type[C] = self.Config # Need to be a bit explicit here if isinstance(prior, str): # It's a str, use as a key into available priors if benchname is not None: - assumed_path = cls._default_prior_dir / f"{benchname}-{prior}.yaml" + assumed_path = self._default_prior_dir / f"{benchname}-{prior}.yaml" if assumed_path.exists(): return Config.from_file(assumed_path) @@ -148,7 +199,7 @@ def _load_prior( return prior if isinstance(prior, Mapping): - return Config.from_dict(prior) + return Config.from_dict(prior, renames=self._config_renames) raise ValueError(f"Unknown prior type {type(prior)}") @@ -196,20 +247,23 @@ def load(self) -> None: def query( self, config: C | Mapping[str, Any], - at: F | None = None, *, - argmax: str | None = None, - argmin: str | None = None, + at: F | None = None, + value_metric: str | None = None, + cost_metric: str | None = None, ) -> R: """Submit a query and get a result. Args: config: The query to use at: The fidelity at which to query, defaults to None which means *maximum* - argmax: Whether to return the argmax up to the point `at`. Will be slower as - it has to get the entire trajectory. Uses the key from the Results. - argmin: Whether to return the argmin up to the point `at`. Will be slower as - it has to get the entire trajectory. Uses the key from the Results. + value_metric: The metric to use for this result. Uses + the value metric passed in to the constructor if not specified, + otherwise the default metric from the Result if None. + cost_metric: The metric to use for this result. Uses + the cost metric passed in to the constructor if not specified, + otherwise the default metric from the Result if None. 
+ Returns: The result of the query @@ -217,29 +271,27 @@ def query( at = at if at is not None else self.end assert self.start <= at <= self.end - if argmax is not None and argmin is not None: - raise ValueError("Can't have both argmax and argmin") - - if argmax is not None: - _argmax = argmax - return max( - self.trajectory(config, frm=self.start, to=at), - key=lambda r: getattr(r, _argmax), - ) - - if argmin is not None: - _argmin = argmin - return min( - self.trajectory(config, frm=self.start, to=at), - key=lambda r: getattr(r, _argmin), - ) - if not isinstance(config, self.Config): - _config = self.Config.from_dict(config) + _config = self.Config.from_dict(config, renames=self._config_renames) else: _config = config - return self._objective_function(_config, at=at) + __config = dict(_config) + if self._config_renames is not None: + _reverse_renames = {v: k for k, v in self._config_renames.items()} + __config = {k: __config.get(v, v) for k, v in _reverse_renames.items()} + + value_metric = value_metric if value_metric is not None else self.value_metric + cost_metric = cost_metric if cost_metric is not None else self.cost_metric + + return self.Result.from_dict( + config=config, + fidelity=at, + result=self._objective_function(__config, at=at), + value_metric=str(value_metric), + cost_metric=str(cost_metric), + renames=self._result_renames, + ) def trajectory( self, @@ -248,6 +300,8 @@ def trajectory( frm: F | None = None, to: F | None = None, step: F | None = None, + value_metric: str | None = None, + cost_metric: str | None = None, ) -> list[R]: """Get the full trajectory of a configuration. @@ -256,6 +310,12 @@ def trajectory( frm: Start of the curve, should default to the start to: End of the curve, should default to the total step: Step size, defaults to ``cls.default_step`` + value_metric: The metric to use for this result. Uses + the value metric passed in to the constructor if not specified, + otherwise the default metric from the Result if None. + cost_metric: The metric to use for this result. Uses + the cost metric passed in to the constructor if not specified, + otherwise the default metric from the Result if None. Returns: A list of the results for this config @@ -264,15 +324,38 @@ def trajectory( frm = frm if frm is not None else self.start step = step if step is not None else self.step - if not isinstance(config, self.Config): - _config = self.Config.from_dict(config) - else: - _config = config + __config = dict(config) + if self._config_renames is not None: + _reverse_renames = {v: k for k, v in self._config_renames.items()} + __config = {k: __config.get(v, v) for k, v in _reverse_renames.items()} - return self._trajectory(_config, frm=frm, to=to, step=step) + value_metric = value_metric if value_metric is not None else self.value_metric + cost_metric = cost_metric if cost_metric is not None else self.cost_metric + + return [ + self.Result.from_dict( + config=config, + fidelity=fidelity, + result=result, + value_metric=str(value_metric), + cost_metric=str(cost_metric), + renames=self._result_renames, + ) + for fidelity, result in self._trajectory( + __config, + frm=frm, + to=to, + step=step, + ) + ] @abstractmethod - def _objective_function(self, config: C, *, at: F) -> R: + def _objective_function( + self, + config: Mapping[str, Any], + *, + at: F, + ) -> Mapping[str, float]: """Get the value of the benchmark for a config at a fidelity. 
Args: @@ -280,11 +363,18 @@ def _objective_function(self, config: C, *, at: F) -> R: at: The fidelity to get the result at Returns: - The result of the config + The result of the config as key value pairs """ ... - def _trajectory(self, config: C, *, frm: F, to: F, step: F) -> list[R]: + def _trajectory( + self, + config: Mapping[str, Any], + *, + frm: F, + to: F, + step: F, + ) -> Iterable[tuple[F, Mapping[str, float]]]: """Get the trajectory of a config. By default this will just call the @@ -301,7 +391,7 @@ def _trajectory(self, config: C, *, frm: F, to: F, step: F) -> list[R]: A list of the results for this config """ return [ - self._objective_function(config, at=fidelity) + (fidelity, self._objective_function(config, at=fidelity)) for fidelity in self.iter_fidelities(frm=frm, to=to, step=step) ] @@ -347,23 +437,34 @@ def sample( """ space = copy.deepcopy(self.space) if isinstance(seed, np.random.RandomState): - rng = seed.randint(0, 2**32 - 1) + rng = seed.randint(0, 2**31 - 1) else: rng = ( seed if seed is not None - else np.random.default_rng().integers(0, 2**32 - 1) + else np.random.default_rng().integers(0, 2**31 - 1) ) space.seed(rng) if n is None: - return self.Config.from_dict(space.sample_configuration()) + return self.Config.from_dict( + space.sample_configuration(), + renames=self._config_renames, + ) # Just because of how configspace works if n == 1: - return [self.Config.from_dict(space.sample_configuration())] + return [ + self.Config.from_dict( + space.sample_configuration(), + renames=self._config_renames, + ), + ] - return [self.Config.from_dict(c) for c in space.sample_configuration(n)] + return [ + self.Config.from_dict(c, renames=self._config_renames) + for c in space.sample_configuration(n) + ] def frame(self) -> ResultFrame[C, F, R]: """Get an empty frame to record with.""" diff --git a/src/mfpbench/config.py b/src/mfpbench/config.py index 82caefe..175b723 100644 --- a/src/mfpbench/config.py +++ b/src/mfpbench/config.py @@ -1,10 +1,10 @@ from __future__ import annotations import json -from abc import ABC, abstractmethod +from abc import ABC from dataclasses import asdict, dataclass, field, fields, replace from pathlib import Path -from typing import TYPE_CHECKING, Any, Iterator, Mapping +from typing import Any, Iterator, Mapping from typing_extensions import Self, override import numpy as np @@ -18,9 +18,6 @@ from mfpbench.util import perturb -if TYPE_CHECKING: - import pandas as pd - @dataclass(frozen=True, eq=False, unsafe_hash=True) # type: ignore[misc] class Config(ABC, Mapping[str, Any]): @@ -35,20 +32,22 @@ class Config(ABC, Mapping[str, Any]): """ @classmethod - def from_dict(cls, d: Mapping[str, Any]) -> Self: + def from_dict( + cls, + d: Mapping[str, Any], + renames: Mapping[str, str] | None = None, + ) -> Self: """Create from a dict or mapping object.""" + if renames is not None: + d = {renames.get(k, k): v for k, v in d.items()} + field_names = {f.name for f in fields(cls)} if not field_names.issuperset(d.keys()): raise ValueError(f"Dict keys {d.keys()} must be a subset of {field_names}") return cls(**{f.name: d[f.name] for f in fields(cls) if f.name in d}) - @classmethod - def from_row(cls, row: pd.Series) -> Self: - """Create from a row of a dataframe.""" - return cls.from_dict(row.to_dict()) - - def dict(self) -> dict[str, Any]: + def as_dict(self) -> dict[str, Any]: """As a raw dictionary.""" return asdict(self) @@ -99,24 +98,15 @@ def perturb( return self.mutate(**new_values) - @abstractmethod - def validate(self) -> None: - """Validate the 
config, just useful early on while testing. - - Raises: - AssertionError: If the config is not valid - """ - ... - def __eq__(self, that: Any) -> bool: """Equality is defined in terms of their dictionary repr.""" - this = self.dict() + this = self.as_dict() if isinstance(that, dict): that = that.copy() elif isinstance(that, Configuration): that = dict(that) elif isinstance(that, self.__class__): - that = that.dict() + that = that.as_dict() else: return False @@ -129,13 +119,13 @@ def __eq__(self, that: Any) -> bool: return this == _that def __getitem__(self, key: str) -> Any: - return self.dict()[key] + return self.as_dict()[key] def __len__(self) -> int: - return len(self.dict()) + return len(self.as_dict()) def __iter__(self) -> Iterator[str]: - return self.dict().__iter__() + return self.as_dict().__iter__() def set_as_default_prior(self, configspace: ConfigurationSpace) -> None: """Apply this configuration as a prior on a configspace. @@ -144,7 +134,7 @@ def set_as_default_prior(self, configspace: ConfigurationSpace) -> None: configspace: The space to apply this config to """ # We convert to dict incase there's any special transformation that happen - d = self.dict() + d = self.as_dict() for k, v in d.items(): hp = configspace[k] # https://github.com/automl/ConfigSpace/issues/270 @@ -211,7 +201,7 @@ def save(self, path: str | Path, format: str | None = None) -> None: path: Where to save to. Will infer json or yaml based on filename format: The format to save as. Will use file suffix if not provided """ - d = self.dict() + d = self.as_dict() path = Path(path) if format is None: if path.suffix == "json": @@ -246,29 +236,31 @@ class TabularConfig(Config): an id key. """ - @classmethod - def from_row(cls, row: pd.Series) -> Self: - """Create from a row of a dataframe.""" - return cls.from_dict({"id": row.name, **row.to_dict()}) - @override - def dict(self, *, with_id: bool = False) -> Any: + def as_dict(self, *, with_id: bool = False) -> Any: """As a raw dictionary. Args: with_id: Whether to include the id key """ - d = {**super().dict()} + d = {**super().as_dict()} if not with_id: d.pop("id") return d @classmethod @override - def from_dict(cls, d: Mapping[str, Any]) -> Self: + def from_dict( + cls, + d: Mapping[str, Any], + renames: Mapping[str, str] | None = None, + ) -> Self: """Create from a dict or mapping object.""" - d = dict(d) + if renames is not None: + d = {renames.get(k, k): v for k, v in d.items()} + else: + d = dict(d) d.setdefault("id", None) return cls(**d) @@ -276,51 +268,3 @@ def from_dict(cls, d: Mapping[str, Any]) -> Self: def names(cls) -> list[str]: """The names of entries in this config.""" return [f.name for f in fields(cls) if f.name not in ("id",)] - - def validate(self) -> None: - """Validate the config, just useful early on while testing. - - !!! note "Not implemented" - - Does not do anything for Tabular Benchmarks - """ - - -@dataclass(frozen=True, eq=False) # type: ignore[misc] -class GenericTabularConfig(TabularConfig): - """A generic tabular config. - - This is useful for adhoc tabular benchmarks and is what they will return, i.e. - directly creating a benchmark from TabularBenchmark. - """ - - _values: dict[str, Any] - - def __hash__(self) -> int: - """Hash based on the dictionary repr.""" - return hash(self.id) ^ hash(tuple(self._values.items())) - - @override - def dict(self, *, with_id: bool = False) -> Any: - """As a raw dictionary. 
- - Args: - with_id: Whether to include the id key - """ - d = {**self._values} - if with_id: - d["id"] = self.id - return d - - # Make .property acces work - def __getattr__(self, __name: str) -> Any: - # To prevent recursion - return self._values[__name] - - @classmethod - @override - def from_dict(cls, d: Mapping[str, Any]) -> Self: - """Create from a dict or mapping object.""" - d = dict(d) - id = d.pop("id") - return cls(id=id, _values=d) diff --git a/src/mfpbench/get.py b/src/mfpbench/get.py index 1626448..878ba92 100644 --- a/src/mfpbench/get.py +++ b/src/mfpbench/get.py @@ -90,6 +90,8 @@ def get( name: str, *, + value_metric: str | None = None, + cost_metric: str | None = None, prior: str | Path | Config | None = None, preload: bool = False, **kwargs: Any, @@ -98,6 +100,10 @@ def get( Args: name: The name of the benchmark + value_metric: The value metric to use for the benchmark. If not specified, + the default value metric is used. + cost_metric: The cost metric to use for the benchmark. If not specified, + the default cost metric is used. prior: The prior to use for the benchmark. * str - If it ends in {.json} or {.yaml, .yml}, it will convert it to a path and @@ -190,7 +196,7 @@ def get( ): prior = Path(prior) - bench = b(prior=prior, **kwargs) + bench = b(prior=prior, cost_metric=cost_metric, value_metric=value_metric, **kwargs) if preload: bench.load() diff --git a/src/mfpbench/jahs/benchmark.py b/src/mfpbench/jahs/benchmark.py index ae60b1b..08b05d4 100644 --- a/src/mfpbench/jahs/benchmark.py +++ b/src/mfpbench/jahs/benchmark.py @@ -3,7 +3,7 @@ from abc import ABC from dataclasses import dataclass from pathlib import Path -from typing import TYPE_CHECKING, Any, Literal, Mapping +from typing import TYPE_CHECKING, Any, ClassVar, Iterable, Literal, Mapping from typing_extensions import override import numpy as np @@ -16,9 +16,9 @@ from mfpbench.benchmark import Benchmark from mfpbench.config import Config +from mfpbench.metric import Metric from mfpbench.result import Result from mfpbench.setup_benchmark import JAHSBenchSource -from mfpbench.util import rename if TYPE_CHECKING: import jahs_bench @@ -51,81 +51,40 @@ class JAHSConfig(Config): LearningRate: float WeightDecay: float - def validate(self) -> None: - """Validate this config incase required.""" - # Just being explicit to catch bugs easily, we can remove later - assert self.N in [1, 3, 5] - assert self.W in [4, 8, 16] - assert self.Op1 in [0, 1, 2, 3, 4, 5] - assert self.Op2 in [0, 1, 2, 3, 4, 5] - assert self.Op3 in [0, 1, 2, 3, 4, 5] - assert self.Op4 in [0, 1, 2, 3, 4, 5] - assert self.Op5 in [0, 1, 2, 3, 4, 5] - assert self.Op6 in [0, 1, 2, 3, 4, 5] - assert self.Resolution in [0.25, 0.5, 1.0] - assert isinstance(self.TrivialAugment, bool) - assert self.Activation in ["ReLU", "Hardswish", "Mish"] - assert self.Optimizer in ["SGD"] - assert 1e-3 <= self.LearningRate <= 1e0 - assert 1e-5 <= self.WeightDecay <= 1e-2 - @dataclass(frozen=True) # type: ignore[misc] class JAHSResult(Result[JAHSConfig, int]): + default_value_metric: ClassVar[str] = "valid_acc" + default_cost_metric: ClassVar[str] = "runtime" + + metric_defs: ClassVar[Mapping[str, Metric]] = { + "runtime": Metric(minimize=True, bounds=(0, np.inf)), + "valid_acc": Metric(minimize=False, bounds=(0, 100)), + "test_acc": Metric(minimize=False, bounds=(0, 100)), + } + # Info # size: float # remove # flops: float # remove # latency: float # unit? remove - runtime: float # unit? + runtime: Metric.Value # unit? 
# Scores (0 - 100) - valid_acc: float - test_acc: float + valid_acc: Metric.Value + test_acc: Metric.Value # train_acc: float # remove - @property - def score(self) -> float: - """The score of interest.""" - return self.valid_acc - - @property - def error(self) -> float: - """The error of interest.""" - return 100 - self.valid_acc - - @property - def test_score(self) -> float: - """The score on the test set.""" - return self.test_acc - - @property - def test_error(self) -> float: - """The error on the test set.""" - return 100 - self.test_acc - - @property - def val_score(self) -> float: - """The score on the validation set.""" - return self.valid_acc - - @property - def val_error(self) -> float: - """The error on the validation set.""" - return 100 - self.valid_acc - - @property - def cost(self) -> float: - """The time taken (assumed to be seconds).""" - return self.runtime - class JAHSBenchmark(Benchmark[JAHSConfig, JAHSResult, int], ABC): - Config = JAHSConfig - Result = JAHSResult - fidelity_name = "epoch" - fidelity_range = (3, 200, 1) # TODO: min budget plays a huge role in SH/HB algos + JAHS_FIDELITY_NAME: ClassVar[str] = "epoch" + JAHS_FIDELITY_RANGE: ClassVar[tuple[int, int, int]] = (3, 200, 1) + JAHS_METRICS_TO_ACTIVATE: ClassVar[tuple[str, ...]] = ( + "valid-acc", + "test-acc", + "runtime", + ) - task_ids: tuple[str, ...] = ( + task_ids: ClassVar[tuple[str, str, str]] = ( "CIFAR10", "ColorectalHistology", "FashionMNIST", @@ -137,14 +96,13 @@ class JAHSBenchmark(Benchmark[JAHSConfig, JAHSResult, int], ABC): ``` """ - _result_renames: Mapping[str, str] = { + _result_renames: ClassVar[Mapping[str, str]] = { "size_MB": "size", "FLOPS": "flops", "valid-acc": "valid_acc", "test-acc": "test_acc", "train-acc": "train_acc", } - _result_metrics_active: tuple[str, ...] = ("valid-acc", "test-acc", "runtime") def __init__( self, @@ -154,6 +112,8 @@ def __init__( seed: int | None = None, prior: str | Path | JAHSConfig | Mapping[str, Any] | None = None, perturb_prior: float | None = None, + value_metric: str | None = None, + cost_metric: str | None = None, ): """Initialize the benchmark. @@ -171,6 +131,10 @@ def __init__( perturb_prior: If given, will perturb the prior by this amount. Only used if `prior=` is given as a config. + value_metric: The metric to use for this benchmark. Uses + the default metric from the Result if None. + cost_metric: The cost to use for this benchmark. Uses + the default cost from the Result if None. 
""" cls = self.__class__ if datadir is None: @@ -193,9 +157,15 @@ def __init__( super().__init__( seed=seed, name=name, + config_type=JAHSConfig, + result_type=JAHSResult, + fidelity_name=self.JAHS_FIDELITY_NAME, + fidelity_range=self.JAHS_FIDELITY_RANGE, space=cls._jahs_configspace(name=name, seed=seed), prior=prior, perturb_prior=perturb_prior, + value_metric=value_metric, + cost_metric=cost_metric, ) # explicit overwrite @@ -231,52 +201,38 @@ def bench(self) -> jahs_bench.Benchmark: task=self.task_id, save_dir=self.datadir, download=False, - metrics=self._result_metrics_active, + metrics=self.JAHS_METRICS_TO_ACTIVATE, ) return self._bench @override - def _objective_function(self, config: JAHSConfig, at: int) -> JAHSResult: - query = config.dict() - + def _objective_function( + self, + config: Mapping[str, Any], + at: int, + ) -> dict[str, float]: + query = dict(config) results = self.bench.__call__(query, nepochs=at) - result = results[at] - - return self.Result.from_dict( - config=config, - result=rename(result, keys=self._result_renames), - fidelity=at, - ) + return results[at] @override def _trajectory( self, - config: JAHSConfig, + config: Mapping[str, Any], *, frm: int, to: int, step: int, - ) -> list[JAHSResult]: - query = config.dict() + ) -> Iterable[tuple[int, Mapping[str, float]]]: + query = dict(config) try: - results = self.bench.__call__(query, nepochs=to, full_trajectory=True) + return self.bench.__call__(query, nepochs=to, full_trajectory=True).items() except TypeError: # See: https://github.com/automl/jahs_bench_201/issues/5 - results = { - f: self.bench.__call__(query, nepochs=f)[f] - for f in self.iter_fidelities(frm=frm, to=to, step=step) - } - - return [ - self.Result.from_dict( - config=config, - fidelity=i, - result=rename(results[i], keys=self._result_renames), - ) - for i in self.iter_fidelities(frm=frm, to=to, step=step) - ] + # Revert back to calling individually, default behaviour + return super()._trajectory(config, frm=frm, to=to, step=step) @classmethod def _jahs_configspace( diff --git a/src/mfpbench/lcbench_tabular/benchmark.py b/src/mfpbench/lcbench_tabular/benchmark.py index 4e18337..8a39f2b 100644 --- a/src/mfpbench/lcbench_tabular/benchmark.py +++ b/src/mfpbench/lcbench_tabular/benchmark.py @@ -4,6 +4,7 @@ from pathlib import Path from typing import Any, ClassVar, Mapping +import numpy as np import pandas as pd from ConfigSpace import ConfigurationSpace from ConfigSpace.hyperparameters import ( @@ -13,6 +14,7 @@ ) from mfpbench.config import TabularConfig +from mfpbench.metric import Metric from mfpbench.result import Result from mfpbench.setup_benchmark import LCBenchTabularSource from mfpbench.tabular import TabularBenchmark @@ -136,55 +138,28 @@ class LCBenchTabularConfig(TabularConfig): @dataclass(frozen=True) # type: ignore[misc] class LCBenchTabularResult(Result[LCBenchTabularConfig, int]): - time: float - val_accuracy: float - val_cross_entropy: float - val_balanced_accuracy: float - test_accuracy: float - test_cross_entropy: float - test_balanced_accuracy: float - - @property - def score(self) -> float: - """The score of interest.""" - return self.val_score - - @property - def error(self) -> float: - """The error of interest.""" - return self.val_error - - @property - def val_score(self) -> float: - """The score on the validation set.""" - return self.val_accuracy / 100 - - @property - def val_error(self) -> float: - """The error on the validation set.""" - return (100 - self.val_accuracy) / 100 - - @property - def test_score(self) -> float: - 
"""The score on the test set.""" - return self.test_accuracy / 100 - - @property - def test_error(self) -> float: - """The error on the test set.""" - return (100 - self.test_accuracy) / 100 - - @property - def cost(self) -> float: - """The time to train the configuration (assumed to be seconds).""" - return self.time + metric_defs: ClassVar[Mapping[str, Metric]] = { + "val_accuracy": Metric(minimize=False, bounds=(0, 100)), + "val_balanced_accuracy": Metric(minimize=False, bounds=(0, 100)), + "val_cross_entropy": Metric(minimize=True, bounds=(0, np.inf)), + "test_accuracy": Metric(minimize=False, bounds=(0, 100)), + "test_balanced_accuracy": Metric(minimize=False, bounds=(0, 100)), + "test_cross_entropy": Metric(minimize=True, bounds=(0, np.inf)), + "time": Metric(minimize=True, bounds=(0, np.inf)), + } + default_value_metric: ClassVar[str] = "val_balanced_accuracy" + default_cost_metric: ClassVar[str] = "time" + + time: Metric.Value + val_accuracy: Metric.Value + test_accuracy: Metric.Value + val_balanced_accuracy: Metric.Value + test_balanced_accuracy: Metric.Value + val_cross_entropy: Metric.Value + test_cross_entropy: Metric.Value class LCBenchTabularBenchmark(TabularBenchmark): - Config = LCBenchTabularConfig - Result = LCBenchTabularResult - fidelity_name: str = "epoch" - task_ids: ClassVar[tuple[str, ...]] = ( "adult", "airlines", @@ -238,6 +213,8 @@ def __init__( seed: int | None = None, prior: str | Path | LCBenchTabularConfig | Mapping[str, Any] | None = None, perturb_prior: float | None = None, + value_metric: str | None = None, + cost_metric: str | None = None, ) -> None: """Initialize the benchmark. @@ -257,6 +234,10 @@ def __init__( For numericals, this is interpreted as the standard deviation of a normal distribution while for categoricals, this is interpreted as the probability of swapping the value for a random one. + value_metric: The metric to use for this benchmark. Uses + the default metric from the Result if None. + cost_metric: The cost to use for this benchmark. Uses + the default cost from the Result if None. """ cls = self.__class__ if task_id not in cls.task_ids: @@ -297,10 +278,11 @@ def __init__( table=table, # type: ignore name=benchmark_task_name, id_key="id", - fidelity_key=cls.fidelity_name, - result_keys=LCBenchTabularResult.names(), - config_keys=LCBenchTabularConfig.names(), - remove_constants=remove_constants, + fidelity_key="epoch", + result_type=LCBenchTabularResult, + config_type=LCBenchTabularConfig, + value_metric=value_metric, + cost_metric=cost_metric, space=space, seed=seed, prior=prior, diff --git a/src/mfpbench/metric.py b/src/mfpbench/metric.py new file mode 100644 index 0000000..40f2549 --- /dev/null +++ b/src/mfpbench/metric.py @@ -0,0 +1,134 @@ +from __future__ import annotations + +from dataclasses import dataclass, field + +import numpy as np + + +class OutOfBoundsError(ValueError): + """Raised when a value is outside of the bounds of a metric.""" + + +@dataclass(frozen=True) +class Metric: + """A metric to be used in the benchmark. + + It's main use is to convert a raw value into a value that can always be + minimized. 
+ """ + + minimize: bool + """Whether or not to minimize the metric.""" + + bounds: tuple[float, float] = field(default_factory=lambda: (-np.inf, np.inf)) + """The bounds of the metric.""" + + def __post_init__(self) -> None: + if self.bounds[0] >= self.bounds[1]: + raise ValueError( + f"bounds[0] must be less than bounds[1], got {self.bounds}", + ) + + def as_value(self, value: float) -> Metric.Value: + """Convert a raw value into a metric value. + + Args: + value: The raw value to convert. + + Returns: + The metric value. + """ + return Metric.Value(value=value, definition=self) + + @property + def optimum_value(self) -> Metric.Value: + """Get the optimum value for this metric. + + Returns: + The optimum value. + """ + if self.minimize: + return self.as_value(self.bounds[0]) + + return self.as_value(self.bounds[1]) + + @dataclass(frozen=True) + class Value: + """A value of a metric.""" + + value: float + definition: Metric = field(hash=False) + + def __post_init__(self) -> None: + if not self.definition.bounds[0] <= self.value <= self.definition.bounds[1]: + raise OutOfBoundsError( + f"Value {self.value} is outside of bounds {self.definition.bounds}", + ) + + @property + def error(self) -> float: + """Calculate a minimization value for the metric based on its raw value. + + The calculation is as follows: + + | direction | lower | upper | | error | + |-----------|-------|-------|-----|------------------------------------| + | minimize | inf | inf | | value | + | minimize | A | inf | | value | + | minimize | inf | B | | value | + | minimize | A | B | | abs(A - value) / abs(B - A) # 0-1 | + | --- | --- | --- | --- | --- | + | maximize | inf | inf | | -value | + | maximize | A | inf | | -value | + | maximize | inf | B | | -value | + | maximize | A | B | | abs(B - value) / abs(B - a) # 0 -1 | + + Returns: + The cost of the metric. + """ + value = self.value + lower, upper = self.definition.bounds + if self.definition.minimize: + if np.isinf(lower) or np.isinf(upper): + return value + + return abs(lower - value) / abs(upper - lower) + + if np.isinf(upper) or np.isinf(lower): + return -value + + return abs(upper - value) / abs(upper - lower) + + @property + def score(self) -> float: + """Calculate a minimization value for the metric based on its raw value. + + The calculation is as follows: + + | direction | lower | upper | | score | + |-----------|-------|-------|-----|------------------------------------| + | minimize | inf | inf | | -value | + | minimize | A | inf | | -value | + | minimize | inf | B | | -value | + | minimize | A | B | | abs(B - value) / abs(B - A) # 0-1 | + | --- | --- | --- | --- | --- | + | maximize | inf | inf | | value | + | maximize | A | inf | | value | + | maximize | inf | B | | value | + | maximize | A | B | | abs(A - value) / abs(B - A) # 0 -1 | + + Returns: + The cost of the metric. 
+ """ + value = self.value + lower, upper = self.definition.bounds + if self.definition.minimize: + if np.isinf(lower) or np.isinf(upper): + return -value + + return abs(upper - value) / abs(upper - lower) + + if np.isinf(upper) or np.isinf(lower): + return value + + return abs(lower - value) / abs(upper - lower) diff --git a/src/mfpbench/pd1/benchmark.py b/src/mfpbench/pd1/benchmark.py index dc98b65..acc080a 100644 --- a/src/mfpbench/pd1/benchmark.py +++ b/src/mfpbench/pd1/benchmark.py @@ -1,6 +1,5 @@ from __future__ import annotations -import warnings from abc import abstractmethod from dataclasses import dataclass from pathlib import Path @@ -12,6 +11,7 @@ from mfpbench.benchmark import Benchmark from mfpbench.config import Config +from mfpbench.metric import Metric from mfpbench.result import Result from mfpbench.setup_benchmark import PD1Source @@ -19,6 +19,8 @@ from ConfigSpace import ConfigurationSpace from xgboost import XGBRegressor +PD1_FIDELITY_NAME = "epoch" + @dataclass(frozen=True, eq=False, unsafe_hash=True) # type: ignore[misc] class PD1Config(Config): @@ -30,116 +32,60 @@ class PD1Config(Config): opt_momentum: float -C = TypeVar("C", bound=PD1Config) - - -@dataclass(frozen=True) # type: ignore[misc] -class PD1Result(Result[PD1Config, int]): - valid_error_rate: float # (0, 1) - train_cost: float # - - @property - def score(self) -> float: - """The score of interest.""" - return 1 - self.valid_error_rate - - @property - def error(self) -> float: - """The error of interest.""" - return self.valid_error_rate - - @property - def val_score(self) -> float: - """The score on the validation set.""" - return 1 - self.valid_error_rate - - @property - def val_error(self) -> float: - """The error on the validation set.""" - return self.valid_error_rate - - @property - def cost(self) -> float: - """The train cost of the model (asssumed to be seconds). - - Please double check with YAHPO. 
- """ - return self.train_cost - - @dataclass(frozen=True) # type: ignore[misc] -class PD1ResultSimple(PD1Result): +class PD1ResultSimple(Result[PD1Config, int]): """Used for all PD1 benchmarks, except imagenet, lm1b, translate_wmt, uniref50.""" - test_error_rate: float = np.inf + metric_defs: ClassVar[Mapping[str, Metric]] = { + "valid_error_rate": Metric(minimize=True, bounds=(0, np.inf)), + "test_error_rate": Metric(minimize=True, bounds=(0, np.inf)), + "train_cost": Metric(minimize=True, bounds=(0, np.inf)), + } + default_value_metric: ClassVar[str] = "valid_error_rate" + default_cost_metric: ClassVar[str] = "train_cost" - @property - def test_score(self) -> float: - """The score on the test set.""" - return 1 - self.test_error_rate - - @property - def test_error(self) -> float: - """The error on the test set.""" - return self.test_error_rate + valid_error_rate: Metric.Value + test_error_rate: Metric.Value + train_cost: Metric.Value @dataclass(frozen=True) -class PD1ResultTransformer(PD1Result): +class PD1ResultTransformer(Result[PD1Config, int]): """Imagenet, lm1b, translate_wmt, uniref50, cifar100 contains no test error.""" - @property - def test_score(self) -> float: - """The score on the test set.""" - warnings.warn( - "Using valid error rate as there is no test error rate", - UserWarning, - stacklevel=2, - ) - return 1 - self.valid_error_rate - - @property - def test_error(self) -> float: - """The error on the test set.""" - warnings.warn( - "Using valid error rate as there is no test error rate", - UserWarning, - stacklevel=2, - ) - return self.valid_error_rate - + metric_defs: ClassVar[Mapping[str, Metric]] = { + "valid_error_rate": Metric(minimize=True, bounds=(0, np.inf)), + "train_cost": Metric(minimize=True, bounds=(0, np.inf)), + } + default_value_metric: ClassVar[str] = "valid_error_rate" + default_cost_metric: ClassVar[str] = "train_cost" -R = TypeVar("R", PD1ResultTransformer, PD1ResultSimple) + valid_error_rate: Metric.Value + train_cost: Metric.Value -class PD1Benchmark(Benchmark[C, R, int]): - pd1_dataset: ClassVar[str] - """The dataset that this benchmark uses.""" +R = TypeVar("R", bound=Result) - pd1_model: ClassVar[str] - """The model that this benchmark uses.""" - pd1_batchsize: ClassVar[int] - """The batchsize that this benchmark uses.""" +class PD1Benchmark(Benchmark[PD1Config, R, int]): + pd1_fidelity_range: ClassVar[tuple[int, int, int]] + """The fidelity range for this benchmark.""" - pd1_metrics: ClassVar[tuple[str, ...]] - """The metrics that are available for this benchmark.""" + pd1_name: ClassVar[str] + """The name to access surrogates from.""" - Config: type[C] - """The config type for this benchmark.""" - - Result: type[R] + pd1_result_type: type[R] """The result type for this benchmark.""" - has_conditionals = False - def __init__( self, *, datadir: str | Path | None = None, seed: int | None = None, - prior: str | Path | C | Mapping[str, Any] | None = None, + prior: str | Path | PD1Config | Mapping[str, Any] | None = None, perturb_prior: float | None = None, + value_metric: str | None = None, + cost_metric: str | None = None, ): """Create a PD1 Benchmark. @@ -151,10 +97,13 @@ def __init__( is interpreted as the std of a normal from which to perturb numerical hyperparameters of the prior, and the raw probability of swapping a categorical value. + value_metric: The metric to use for this benchmark. Uses + the default metric from the Result if None. + cost_metric: The cost to use for this benchmark. Uses + the default cost from the Result if None. 
""" cls = self.__class__ space = cls._create_space(seed=seed) - name = f"{cls.pd1_dataset}-{cls.pd1_model}-{cls.pd1_batchsize}" if datadir is None: datadir = PD1Source.default_location() @@ -169,10 +118,16 @@ def __init__( super().__init__( seed=seed, - name=name, + name=self.pd1_name, + config_type=PD1Config, + fidelity_name=PD1_FIDELITY_NAME, + fidelity_range=cls.pd1_fidelity_range, + result_type=cls.pd1_result_type, prior=prior, perturb_prior=perturb_prior, space=space, + value_metric=value_metric, + cost_metric=cost_metric, ) def load(self) -> None: @@ -209,38 +164,47 @@ def surrogate_paths(self) -> dict[str, Path]: """The paths to the surrogates.""" return { metric: self.surrogate_dir / f"{self.name}-{metric}.json" - for metric in self.pd1_metrics + for metric in self.Result.metric_defs } @override - def _objective_function(self, config: C, at: int) -> R: + def _objective_function( + self, + config: Mapping[str, Any], + at: int, + ) -> dict[str, float]: return self._results_for(config, fidelities=[at])[0] @override - def _trajectory(self, config: C, *, frm: int, to: int, step: int) -> list[R]: - return self._results_for(config, fidelities=self.iter_fidelities(frm, to, step)) - - def _results_for(self, config: C, fidelities: Iterable[int]) -> list[R]: + def _trajectory( + self, + config: Mapping[str, Any], + *, + frm: int, + to: int, + step: int, + ) -> Iterable[tuple[int, Mapping[str, float]]]: + fidelities = list(self.iter_fidelities(frm, to, step)) + return zip(fidelities, self._results_for(config, fidelities)) + + def _results_for( + self, + config: Mapping[str, Any], + fidelities: Iterable[int], + ) -> list[dict[str, float]]: # Add the fidelities into the query and make a dataframe - c = config.dict() + c = dict(config) queries = [{**c, self.fidelity_name: f} for f in fidelities] xs = pd.DataFrame(queries) # Predict the metric for everything in the dataframe features = xs.columns for metric, surrogate in self.surrogates.items(): - xs[metric] = surrogate.predict(xs[features]) + # We clip as sometimes the surrogate produces negative values + xs[metric] = surrogate.predict(xs[features]).clip(min=0) metrics = list(self.surrogates.keys()) - - return [ - self.Result.from_dict( - config=config, # Our original config - fidelity=r[self.fidelity_name], # fidelity # type: ignore - result=r[metrics], # Grab the metrics # type: ignore - ) - for _, r in xs.iterrows() - ] + return [dict(r[metrics]) for _, r in xs.iterrows()] @classmethod @abstractmethod diff --git a/src/mfpbench/pd1/benchmarks/cifar100.py b/src/mfpbench/pd1/benchmarks/cifar100.py index 2482936..977772f 100644 --- a/src/mfpbench/pd1/benchmarks/cifar100.py +++ b/src/mfpbench/pd1/benchmarks/cifar100.py @@ -1,34 +1,14 @@ from __future__ import annotations -from dataclasses import dataclass -from typing_extensions import override - from ConfigSpace import ConfigurationSpace, UniformFloatHyperparameter -from mfpbench.pd1.benchmark import PD1Benchmark, PD1Config, PD1ResultTransformer - - -@dataclass(frozen=True, eq=False, unsafe_hash=True) -class PD1Config_cifar100_wideresnet_2048(PD1Config): - @override - def validate(self) -> None: - assert 0.010093 <= self.lr_decay_factor <= 0.989012 - assert 0.000010 <= self.lr_initial <= 9.779176 - assert 0.100708 <= self.lr_power <= 1.999376 - assert 0.000059 <= self.opt_momentum <= 0.998993 +from mfpbench.pd1.benchmark import PD1Benchmark, PD1ResultTransformer class PD1cifar100_wideresnet_2048(PD1Benchmark): - fidelity_name = "epoch" - fidelity_range = (45, 199, 1) - - Config = 
PD1Config_cifar100_wideresnet_2048 - Result = PD1ResultTransformer - - pd1_dataset = "cifar100" - pd1_model = "wide_resnet" - pd1_batchsize = 2048 - pd1_metrics = ("valid_error_rate", "train_cost") + pd1_fidelity_range = (45, 199, 1) + pd1_result_type = PD1ResultTransformer + pd1_name = "cifar100-wideresnet-2048" @classmethod def _create_space(cls, seed: int | None = None) -> ConfigurationSpace: diff --git a/src/mfpbench/pd1/benchmarks/imagenet.py b/src/mfpbench/pd1/benchmarks/imagenet.py index 8714669..cf1e818 100644 --- a/src/mfpbench/pd1/benchmarks/imagenet.py +++ b/src/mfpbench/pd1/benchmarks/imagenet.py @@ -1,33 +1,14 @@ from __future__ import annotations -from dataclasses import dataclass -from typing_extensions import override - from ConfigSpace import ConfigurationSpace, UniformFloatHyperparameter -from mfpbench.pd1.benchmark import PD1Benchmark, PD1Config, PD1ResultTransformer - - -@dataclass(frozen=True, eq=False, unsafe_hash=True) -class PD1Config_imagenet_resnet_512(PD1Config): - @override - def validate(self) -> None: - assert 0.010294 <= self.lr_decay_factor <= 0.989753 - assert 0.000010 <= self.lr_initial <= 9.774312 - assert 0.100225 <= self.lr_power <= 1.999326 - assert 0.000059 <= self.opt_momentum <= 0.998993 +from mfpbench.pd1.benchmark import PD1Benchmark, PD1ResultTransformer class PD1imagenet_resnet_512(PD1Benchmark): - fidelity_name = "epoch" - fidelity_range = (3, 99, 1) - Config = PD1Config_imagenet_resnet_512 - Result = PD1ResultTransformer - - pd1_dataset = "imagenet" - pd1_model = "resnet" - pd1_batchsize = 512 - pd1_metrics = ("valid_error_rate", "train_cost") + pd1_result_type = PD1ResultTransformer + pd1_fidelity_range = (3, 99, 1) + pd1_name = "imagenet-resnet-512" @classmethod def _create_space(cls, seed: int | None = None) -> ConfigurationSpace: diff --git a/src/mfpbench/pd1/benchmarks/lm1b.py b/src/mfpbench/pd1/benchmarks/lm1b.py index 6ac2b3a..7a9840d 100644 --- a/src/mfpbench/pd1/benchmarks/lm1b.py +++ b/src/mfpbench/pd1/benchmarks/lm1b.py @@ -1,37 +1,16 @@ from __future__ import annotations -from dataclasses import dataclass -from typing_extensions import override - from ConfigSpace import ConfigurationSpace, UniformFloatHyperparameter -from mfpbench.pd1.benchmark import PD1Benchmark, PD1Config, PD1ResultTransformer - - -@dataclass(frozen=True, eq=False, unsafe_hash=True) -class PD1Config_lm1b_transformer_2048(PD1Config): - @override - def validate(self) -> None: - assert 0.010543 <= self.lr_decay_factor <= 9.885653e-01 - assert 0.000010 <= self.lr_initial <= 9.986256e00 - assert 0.100811 <= self.lr_power <= 1.999659e00 - assert 0.000059 <= self.opt_momentum <= 9.989986e-01 +from mfpbench.pd1.benchmark import PD1Benchmark, PD1ResultTransformer class PD1lm1b_transformer_2048(PD1Benchmark): - fidelity_name = "epoch" - fidelity_range = (1, 74, 1) - - Config = PD1Config_lm1b_transformer_2048 - Result = PD1ResultTransformer - - pd1_dataset = "lm1b" - pd1_model = "transformer" - pd1_batchsize = 2048 - pd1_metrics = ("valid_error_rate", "train_cost") + pd1_fidelity_range = (1, 74, 1) + pd1_result_type = PD1ResultTransformer + pd1_name = "lm1b-transformer-2048" @classmethod - @override def _create_space(cls, seed: int | None = None) -> ConfigurationSpace: cs = ConfigurationSpace(seed=seed) cs.add_hyperparameters( diff --git a/src/mfpbench/pd1/benchmarks/translate_wmt.py b/src/mfpbench/pd1/benchmarks/translate_wmt.py index bf7adff..dd14bf8 100644 --- a/src/mfpbench/pd1/benchmarks/translate_wmt.py +++ b/src/mfpbench/pd1/benchmarks/translate_wmt.py @@ -1,34 
+1,14 @@ from __future__ import annotations -from dataclasses import dataclass -from typing_extensions import override - from ConfigSpace import ConfigurationSpace, UniformFloatHyperparameter -from mfpbench.pd1.benchmark import PD1Benchmark, PD1Config, PD1ResultTransformer - - -@dataclass(frozen=True, eq=False, unsafe_hash=True) -class PD1Config_translatewmt_xformer_64(PD1Config): - @override - def validate(self) -> None: - assert 0.0100221257 <= self.lr_decay_factor <= 0.988565263 - assert 1.00276e-05 <= self.lr_initial <= 9.8422475735 - assert 0.1004250993 <= self.lr_power <= 1.9985927056 - assert 5.86114e-05 <= self.opt_momentum <= 0.9989999746 +from mfpbench.pd1.benchmark import PD1Benchmark, PD1ResultTransformer class PD1translatewmt_xformer_64(PD1Benchmark): - fidelity_name = "epoch" - fidelity_range = (1, 19, 1) - - Config = PD1Config_translatewmt_xformer_64 - Result = PD1ResultTransformer - - pd1_dataset = "translate_wmt" - pd1_model = "xformer_translate" - pd1_batchsize = 64 - pd1_metrics = ("valid_error_rate", "train_cost") + pd1_fidelity_range = (1, 19, 1) + pd1_result_type = PD1ResultTransformer + pd1_name = "translate-wmt-xformer-64" @classmethod def _create_space(cls, seed: int | None = None) -> ConfigurationSpace: diff --git a/src/mfpbench/pd1/benchmarks/uniref50.py b/src/mfpbench/pd1/benchmarks/uniref50.py index 2111fab..8ca1943 100644 --- a/src/mfpbench/pd1/benchmarks/uniref50.py +++ b/src/mfpbench/pd1/benchmarks/uniref50.py @@ -1,34 +1,14 @@ from __future__ import annotations -from dataclasses import dataclass -from typing_extensions import override - from ConfigSpace import ConfigurationSpace, UniformFloatHyperparameter -from mfpbench.pd1.benchmark import PD1Benchmark, PD1Config, PD1ResultTransformer - - -@dataclass(frozen=True, eq=False, unsafe_hash=True) -class PD1Config_uniref50_transformer_128(PD1Config): - @override - def validate(self) -> None: - assert 0.0111588123 <= self.lr_decay_factor <= 0.9898713967 - assert 1.00564e-05 <= self.lr_initial <= 0.4429248972 - assert 0.1001570089 <= self.lr_power <= 1.9989163336 - assert 5.86114e-05 <= self.opt_momentum <= 0.9989940217 +from mfpbench.pd1.benchmark import PD1Benchmark, PD1ResultTransformer class PD1uniref50_transformer_128(PD1Benchmark): - fidelity_name = "epoch" - fidelity_range = (1, 22, 1) - - Config = PD1Config_uniref50_transformer_128 - Result = PD1ResultTransformer - - pd1_dataset = "uniref50" - pd1_model = "transformer" - pd1_batchsize = 128 - pd1_metrics = ("valid_error_rate", "train_cost") + pd1_fidelity_range = (1, 22, 1) + pd1_result_type = PD1ResultTransformer + pd1_name = "uniref50-transformer-128" @classmethod def _create_space(cls, seed: int | None = None) -> ConfigurationSpace: diff --git a/src/mfpbench/pd1/surrogate/train_xgboost.py b/src/mfpbench/pd1/surrogate/train_xgboost.py index e6ce503..7c9100e 100644 --- a/src/mfpbench/pd1/surrogate/train_xgboost.py +++ b/src/mfpbench/pd1/surrogate/train_xgboost.py @@ -55,7 +55,7 @@ def train_xgboost( if __name__ == "__main__": import argparse - from xgboost import XGBRegressor # noqa: F811 + from xgboost import XGBRegressor parser = argparse.ArgumentParser() parser.add_argument("--data", required=True, type=str) diff --git a/src/mfpbench/result.py b/src/mfpbench/result.py index e2903f2..20c63af 100644 --- a/src/mfpbench/result.py +++ b/src/mfpbench/result.py @@ -1,12 +1,15 @@ from __future__ import annotations -from abc import ABC, abstractmethod -from dataclasses import asdict, dataclass, field, fields -from typing import Any, Generic, Mapping, TypeVar 
-from typing_extensions import Self, override +from abc import ABC +from dataclasses import dataclass +from typing import TYPE_CHECKING, Any, ClassVar, Generic, Mapping, TypeVar +from typing_extensions import Self from mfpbench.config import Config +if TYPE_CHECKING: + from mfpbench.metric import Metric + # The Config kind C = TypeVar("C", bound=Config) @@ -18,194 +21,97 @@ class Result(ABC, Generic[C, F]): """Collect all results in a class for clarity.""" + metric_defs: ClassVar[Mapping[str, Metric]] + """The metric definitions of this result.""" + + default_value_metric: ClassVar[str] + """The default metric to use for this result.""" + + default_cost_metric: ClassVar[str] + """The default cost to use for this result.""" + fidelity: F """The fidelity of this result.""" - config: C = field(repr=False) + config: C """The config used to generate this result.""" + value_metric: str + """The metric to use for this result.""" + + cost_metric: str + """The cost to use for this result.""" + @classmethod def from_dict( cls, config: C, - result: Mapping[str, Any], fidelity: F, + result: Mapping[str, float], + *, + value_metric: str | None = None, + cost_metric: str | None = None, + renames: Mapping[str, str] | None = None, ) -> Self: """Create from a dict or mapping object.""" - fieldnames = set(cls.names()) - if not fieldnames.issubset(result.keys()): - raise ValueError( - f"Result dict is missing fields: {fieldnames - result.keys()}", + values = { + k: ( + metric.as_value(v) + if (metric := cls.metric_defs.get(k)) is not None + else v ) - # To help with serialization, we need to convert floats to... ehh floats - # This is due to some things returning an np.float -_- - result = { - k: float(v) if isinstance(v, float) else v for k, v in result.items() - if k in fieldnames } - return cls(config=config, fidelity=fidelity, **result) - - @classmethod - def names(cls) -> tuple[str, ...]: - """The names of the fields in this result.""" - return tuple( - f.name for f in fields(cls) if f.name not in ("config", "fidelity") + if renames is not None: + values = {renames.get(k, k): v for k, v in values.items()} + if value_metric is None: + value_metric = cls.default_value_metric + if cost_metric is None: + cost_metric = cls.default_cost_metric + + return cls( + config=config, + fidelity=fidelity, + value_metric=value_metric, + cost_metric=cost_metric, + **values, # type: ignore ) - @classmethod - def from_row( - cls, - config: C, - row: Mapping[str, Any], - fidelity: F, - ) -> Self: - """Create from a row of a dataframe.""" - return cls.from_dict(config, dict(row), fidelity) - - @property - @abstractmethod - def score(self) -> float: - """The score of interest.""" - ... - - @property - @abstractmethod - def error(self) -> float: - """The error of interest.""" - ... - - @property - @abstractmethod - def test_score(self) -> float: - """The score on the test set.""" - ... - - @property - @abstractmethod - def test_error(self) -> float: - """The error on the test set.""" - ... - - @property - @abstractmethod - def val_score(self) -> float: - """The score on the validation set.""" - ... + def as_dict(self) -> dict[str, Any]: + """As a raw dictionary.""" + return self.values - @property - @abstractmethod - def val_error(self) -> float: - """The score on the validation set.""" - ... 
+ def __getitem__(self, key: str) -> Metric.Value: + if key not in self.metric_defs: + raise KeyError(f"Metric {key} not in {self.metric_defs.keys()}") + return getattr(self, key) @property - @abstractmethod def cost(self) -> float: """The time cost for evaluting this config.""" - ... - - def dict(self) -> dict[str, Any]: - """Create a dict from this result.""" - d = asdict(self) - del d["config"] - del d["fidelity"] - return d - - -@dataclass(frozen=True, eq=False) # type: ignore[misc] -class GenericTabularResult(Result[C, F], Generic[C, F]): - """A generic tabular result. - - This is useful for adhoc tabular benchmarks. - """ - - _values: dict[str, Any] - - def __hash__(self) -> int: - """Hash based on the dictionary repr.""" - return ( - hash(self.config) ^ hash(self.fidelity) ^ hash(tuple(self._values.items())) - ) - - def dict(self) -> Any: - """As a raw dictionary.""" - return dict(self._values) - - def __getitem__(self, key: str) -> Any: - return self._values[key] - - # Make .property acces work - def __getattr__(self, __name: str) -> Any: - return self._values[__name] - - @override - @classmethod - def from_dict(cls, config: C, result: Mapping[str, Any], fidelity: F) -> Self: - """Create from a dict or mapping object.""" - return cls(config=config, _values=dict(result), fidelity=fidelity) - - @property - def score(self) -> float: - """The score of interest.""" - if "score" in self._values: - return float(self._values["score"]) - - raise KeyError("GenericTabularResult does not have a score") + return self[self.cost_metric].error @property def error(self) -> float: """The error of interest.""" - if "error" in self._values: - return float(self._values["error"]) - - raise KeyError("GenericTabularResult does not have an error") + return self[self.value_metric].error @property - def test_score(self) -> float: - """The score on the test set.""" - if "test_score" in self._values: - return float(self._values["test_score"]) - - raise KeyError("GenericTabularResult does not have a test_score") + def score(self) -> float: + """The score of interest.""" + return self[self.value_metric].score @property - def test_error(self) -> float: - """The error on the test set.""" - if "test_error" in self._values: - return float(self._values["test_error"]) - - raise KeyError("GenericTabularResult does not have a test_error") + def values(self) -> dict[str, Any]: + """Create a dict from this result with the raw values.""" + return {k: getattr(self, k).value for k in self.metric_defs} @property - def val_score(self) -> float: - """The score on the validation set.""" - if "val_score" in self._values: - return float(self._values["val_score"]) - - raise KeyError("GenericTabularResult does not have a val_score") + def errors(self) -> dict[str, float]: + """Create a dict from this result with the error values.""" + return {k: getattr(self, k).error for k in self.metric_defs} @property - def val_error(self) -> float: - """The score on the validation set.""" - if "val_error" in self._values: - return float(self._values["val_error"]) - - raise KeyError("GenericTabularResult does not have a val_error") - - @property - def cost(self) -> float: - """The time cost for evaluting this config.""" - if "cost" in self._values: - return float(self._values["cost"]) - - raise KeyError("GenericTabularResult does not have a cost") - - @classmethod - def names(cls) -> tuple[str, ...]: - """The names of the fields in this result.""" - return tuple( - f.name - for f in fields(cls) - if f.name not in ("config", "fidelity", 
"__values") - ) + def scores(self) -> dict[str, float]: + """Create a dict from this result with the score values.""" + return {k: getattr(self, k).score for k in self.metric_defs} diff --git a/src/mfpbench/setup_benchmark.py b/src/mfpbench/setup_benchmark.py index 1105b5f..7521477 100644 --- a/src/mfpbench/setup_benchmark.py +++ b/src/mfpbench/setup_benchmark.py @@ -263,7 +263,7 @@ def download_status(source: str, datadir: Path | None = None) -> bool: _source = BenchmarkSetup.source(source) source_path = datadir / _source.name return source_path.exists() and bool( - next(source_path.iterdir(), False), # noqa: FBT003 + next(source_path.iterdir(), False), ) @@ -366,7 +366,6 @@ def setup( print(f"Finished downloading to {source_path}") else: print(f"Already found something at {source_path}") - pass if install is not False: if install is True: diff --git a/src/mfpbench/synthetic/__init__.py b/src/mfpbench/synthetic/__init__.py index 82184f0..ac3b59e 100644 --- a/src/mfpbench/synthetic/__init__.py +++ b/src/mfpbench/synthetic/__init__.py @@ -15,7 +15,6 @@ MFHartmann6Config, MFHartmannBenchmark, MFHartmannGenerator, - MFHartmannResult, ) __all__ = [ @@ -35,5 +34,4 @@ "MFHartmann3", "MFHartmann6", "MFHartmannGenerator", - "MFHartmannResult", ] diff --git a/src/mfpbench/synthetic/hartmann/__init__.py b/src/mfpbench/synthetic/hartmann/__init__.py index 6850fc1..db1077e 100644 --- a/src/mfpbench/synthetic/hartmann/__init__.py +++ b/src/mfpbench/synthetic/hartmann/__init__.py @@ -12,7 +12,6 @@ MFHartmann6BenchmarkTerrible, MFHartmann6Config, MFHartmannBenchmark, - MFHartmannResult, ) from mfpbench.synthetic.hartmann.generators import ( MFHartmann3, @@ -37,5 +36,4 @@ "MFHartmann3", "MFHartmann6", "MFHartmannGenerator", - "MFHartmannResult", ] diff --git a/src/mfpbench/synthetic/hartmann/benchmark.py b/src/mfpbench/synthetic/hartmann/benchmark.py index 55458ea..3362feb 100644 --- a/src/mfpbench/synthetic/hartmann/benchmark.py +++ b/src/mfpbench/synthetic/hartmann/benchmark.py @@ -15,10 +15,12 @@ from typing import Any, ClassVar, Generic, Mapping, TypeVar from typing_extensions import override +import numpy as np from ConfigSpace import ConfigurationSpace, UniformFloatHyperparameter from mfpbench.benchmark import Benchmark from mfpbench.config import Config +from mfpbench.metric import Metric from mfpbench.result import Result from mfpbench.synthetic.hartmann.generators import ( MFHartmann3, @@ -26,6 +28,8 @@ MFHartmannGenerator, ) +G = TypeVar("G", bound=MFHartmannGenerator) + @dataclass(frozen=True, eq=False, unsafe_hash=True) # type: ignore[misc] class MFHartmann3Config(Config): @@ -33,12 +37,6 @@ class MFHartmann3Config(Config): X_1: float X_2: float - def validate(self) -> None: - """Validate this config.""" - assert 0.0 <= self.X_0 <= 1.0 - assert 0.0 <= self.X_1 <= 1.0 - assert 0.0 <= self.X_2 <= 1.0 - @dataclass(frozen=True, eq=False, unsafe_hash=True) # type: ignore[misc] class MFHartmann6Config(Config): @@ -49,93 +47,58 @@ class MFHartmann6Config(Config): X_4: float X_5: float - def validate(self) -> None: - """Validate this config.""" - assert 0.0 <= self.X_0 <= 1.0 - assert 0.0 <= self.X_1 <= 1.0 - assert 0.0 <= self.X_2 <= 1.0 - assert 0.0 <= self.X_3 <= 1.0 - assert 0.0 <= self.X_4 <= 1.0 - assert 0.0 <= self.X_5 <= 1.0 - - -C = TypeVar("C", MFHartmann3Config, MFHartmann6Config) - @dataclass(frozen=True) # type: ignore[misc] -class MFHartmannResult(Result[C, int]): - value: float - fid_cost: float +class MFHartmann3Result(Result[MFHartmann3Config, int]): + metric_defs: 
ClassVar[Mapping[str, Metric]] = { + # TODO: There's probably some analytical upper bound... + "value": Metric(minimize=True, bounds=(-3.86278, np.inf)), + "fid_cost": Metric(minimize=True, bounds=(0.05, 1)), + } + default_value_metric: ClassVar[str] = "value" + default_cost_metric: ClassVar[str] = "fid_cost" - @property - def score(self) -> float: - """The score of interest.""" - # TODO: what should be an appropriate score since flipping signs may not be - # adequate or meaningful. When is the property score used? - # Hartmann functions have multiple minimas with the global valued at < 0 - # The function evaluates to a y-value that needs to be minimized - # https://www.sfu.ca/~ssurjano/hart3.html - raise NotImplementedError("There's no meaninfgul score for Hartmann functions") + value: Metric.Value + fid_cost: Metric.Value - @property - def error(self) -> float: - """The score of interest.""" - # TODO: verify - # Hartmann functions have multiple minimas with the global valued at < 0 - # The function evaluates to a y-value that needs to be minimized - # https://www.sfu.ca/~ssurjano/hart3.html - return self.value - @property - def test_score(self) -> float: - """Just returns the score.""" - raise NotImplementedError("There's no meaninfgul score for Hartmann functions") +@dataclass(frozen=True) # type: ignore[misc] +class MFHartmann6Result(Result[MFHartmann6Config, int]): + metric_defs: ClassVar[Mapping[str, Metric]] = { + # TODO: There's probably some analytical upper bound... + "value": Metric(minimize=True, bounds=(-3.32237, np.inf)), + "fid_cost": Metric(minimize=True, bounds=(0.05, 1)), + } + default_value_metric: ClassVar[str] = "value" + default_cost_metric: ClassVar[str] = "fid_cost" - @property - def test_error(self) -> float: - """Just returns the error.""" - return self.error + value: Metric.Value + fid_cost: Metric.Value - @property - def val_score(self) -> float: - """Just returns the score.""" - raise NotImplementedError("There's no meaninfgul score for Hartmann functions") - @property - def val_error(self) -> float: - """Just returns the error.""" - return self.error +C = TypeVar("C", bound=Config) +R = TypeVar("R", bound=Result) - @property - def cost(self) -> float: - """Just retuns the fidelity.""" - # return self.fidelity - return self.fid_cost +class MFHartmannBenchmark(Benchmark[C, R, int], Generic[G, C, R]): + mfh_generator_type: type[G] + """The underlying mfhartmann function generator.""" -G = TypeVar("G", bound=MFHartmannGenerator) + mfh_config_type: type[C] + """The config type for this benchmark.""" + mfh_result_type: type[R] + """The result type for this benchmark.""" -class MFHartmannBenchmark(Benchmark, Generic[G, C]): mfh_dims: ClassVar[int] """How many dimensions there are to the Hartmann function.""" mfh_suffix: ClassVar[str] """Suffix for the benchmark name""" - Config: type[C] - """The Config type for this mfhartmann benchmark.""" - - Generator: type[G] - """The underlying mfhartmann function generator.""" - mfh_bias_noise: ClassVar[tuple[float, float]] = (0.5, 0.1) """The default bias and noise for mfhartmann benchmarks.""" - fidelity_name = "z" - fidelity_range = (3, 100, 1) - Result = MFHartmannResult - def __init__( self, *, @@ -144,6 +107,8 @@ def __init__( noise: float | None = None, prior: str | Path | C | Mapping[str, Any] | None = None, perturb_prior: float | None = None, + value_metric: str | None = None, + cost_metric: str | None = None, ): """Initialize the benchmark. 
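# ---- Editor's sketch (not part of the patch): the new Metric-based Result API ----
# The hunks above replace the old abstract `score`/`error`/`cost` properties with
# class-level `metric_defs` plus selectable `value_metric`/`cost_metric`. Below is a
# minimal sketch of how a result is now built from a raw dict of floats, assuming the
# classes exactly as added in this diff; the precise error/score computation lives in
# `mfpbench.metric` and is not asserted here.
from mfpbench.synthetic.hartmann.benchmark import (
    MFHartmann3Config,
    MFHartmann3Result,
)

config = MFHartmann3Config.from_dict({"X_0": 0.1, "X_1": 0.5, "X_2": 0.9})

# `_objective_function` now returns a plain dict of floats ...
raw = {"value": -1.23, "fid_cost": 0.42}

# ... which `Result.from_dict` converts to `Metric.Value`s via `metric_defs`.
# Both metric selections are optional and fall back to the class defaults.
result = MFHartmann3Result.from_dict(
    config=config,
    fidelity=50,
    result=raw,
    value_metric="value",    # same as MFHartmann3Result.default_value_metric
    cost_metric="fid_cost",  # same as MFHartmann3Result.default_cost_metric
)

print(result.error)      # error of the selected value metric ("value")
print(result.cost)       # error of the selected cost metric ("fid_cost")
print(result.as_dict())  # the raw per-metric values, e.g. {"value": -1.23, ...}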
@@ -160,12 +125,19 @@ def __init__( perturb_prior: If not None, will perturb the prior by this amount. For numericals, while for categoricals, this is interpreted as the probability of swapping the value for a random one. + value_metric: The metric to use for this benchmark. Uses + the default metric from the Result if None. + cost_metric: The cost to use for this benchmark. Uses + the default cost from the Result if None. """ cls = self.__class__ self.bias = bias if bias is not None else cls.mfh_bias_noise[0] self.noise = noise if noise is not None else cls.mfh_bias_noise[1] - self.mfh = cls.Generator( - n_fidelities=cls.fidelity_range[1], + + _max_fidelity = 100 + + self.mfh = cls.mfh_generator_type( + n_fidelities=_max_fidelity, fidelity_noise=self.noise, fidelity_bias=self.bias, seed=seed, @@ -185,27 +157,31 @@ def __init__( ) super().__init__( name=name, + config_type=self.mfh_config_type, + result_type=self.mfh_result_type, + fidelity_name="z", + fidelity_range=(3, _max_fidelity, 1), space=space, seed=seed, prior=prior, perturb_prior=perturb_prior, + value_metric=value_metric, + cost_metric=cost_metric, ) @override - def _objective_function(self, config: C, *, at: int) -> MFHartmannResult[C]: - query = config.dict() + def _objective_function( + self, + config: Mapping[str, Any], + *, + at: int, + ) -> dict[str, float]: + query = dict(config) # It's important here that we still have X_0, X_1, ..., X_n # We strip out the numerical part and sort by that Xs = tuple(query[s] for s in sorted(query, key=lambda k: int(k.split("_")[-1]))) - value = self.mfh(z=at, Xs=Xs) - cost = self._fidelity_cost(at) - - return self.Result.from_dict( - config=config, - fidelity=at, - result={"value": value, "fid_cost": cost}, - ) + return {"value": self.mfh(z=at, Xs=Xs), "fid_cost": self._fidelity_cost(at)} def _fidelity_cost(self, at: int) -> float: # λ(z) on Pg 18 from https://arxiv.org/pdf/1703.06240.pdf @@ -214,16 +190,23 @@ def _fidelity_cost(self, at: int) -> float: @property def optimum(self) -> C: """The optimum of the benchmark.""" - optimum = {f"X_{i}": x for i, x in enumerate(self.Generator.optimum)} + optimum = {f"X_{i}": x for i, x in enumerate(self.mfh_generator_type.optimum)} return self.Config.from_dict(optimum) # ----------- # MFHartmann3 # ----------- -class MFHartmann3Benchmark(MFHartmannBenchmark): - Generator = MFHartmann3 - Config = MFHartmann3Config +class MFHartmann3Benchmark( + MFHartmannBenchmark[ + MFHartmann3, + MFHartmann3Config, + MFHartmann3Result, + ], +): + mfh_generator_type = MFHartmann3 + mfh_config_type = MFHartmann3Config + mfh_result_type = MFHartmann3Result mfh_dims = MFHartmann3.dims mfh_suffix = "" @@ -251,9 +234,16 @@ class MFHartmann3BenchmarkGood(MFHartmann3Benchmark): # ----------- # MFHartmann6 # ----------- -class MFHartmann6Benchmark(MFHartmannBenchmark): - Generator = MFHartmann6 - Config = MFHartmann6Config +class MFHartmann6Benchmark( + MFHartmannBenchmark[ + MFHartmann6, + MFHartmann6Config, + MFHartmann6Result, + ], +): + mfh_generator_type = MFHartmann6 + mfh_config_type = MFHartmann6Config + mfh_result_type = MFHartmann6Result mfh_dims = MFHartmann6.dims mfh_suffix = "" diff --git a/src/mfpbench/tabular.py b/src/mfpbench/tabular.py index d95f309..14333d5 100644 --- a/src/mfpbench/tabular.py +++ b/src/mfpbench/tabular.py @@ -1,8 +1,7 @@ from __future__ import annotations -from datetime import datetime from pathlib import Path -from typing import Any, Callable, Mapping, Sequence, TypeVar, overload +from typing import TYPE_CHECKING, Any, Iterable, 
Mapping, TypeVar, overload from typing_extensions import override import numpy as np @@ -11,8 +10,12 @@ from more_itertools import first_true from mfpbench.benchmark import Benchmark -from mfpbench.config import GenericTabularConfig, TabularConfig -from mfpbench.result import GenericTabularResult, Result +from mfpbench.config import TabularConfig +from mfpbench.result import Result + +if TYPE_CHECKING: + from mfpbench.metric import Metric + # The kind of Config to the **tabular** benchmark CTabular = TypeVar("CTabular", bound=TabularConfig) @@ -25,31 +28,6 @@ class TabularBenchmark(Benchmark[CTabular, R, F]): - id_key: str - """The column in the table that contains the config id. Will be set to the index""" - - fidelity_key: str - """The name of the fidelity used in this benchmark""" - - config_keys: Sequence[str] - """The keys in the table that contain the config""" - - result_keys: Sequence[str] - """The keys in the table that contain the results""" - - table: pd.DataFrame - """The table of results used for this benchmark""" - - configs: Mapping[str, CTabular] - """The configs used in this benchmark""" - - # The config and result type of this benchmark - Config: type[CTabular] - Result: type[R] - - # Whether this benchmark has conditonals in it or not - has_conditionals: bool = False - def __init__( # noqa: PLR0913 self, name: str, @@ -57,9 +35,10 @@ def __init__( # noqa: PLR0913 *, id_key: str, fidelity_key: str, - result_keys: Sequence[str], - config_keys: Sequence[str], - remove_constants: bool = False, + result_type: type[R], + config_type: type[CTabular], + value_metric: str | None = None, + cost_metric: str | None = None, space: ConfigurationSpace | None = None, seed: int | None = None, prior: str | Path | CTabular | Mapping[str, Any] | None = None, @@ -72,9 +51,12 @@ def __init__( # noqa: PLR0913 table: The table to use for the benchmark. id_key: The column in the table that contains the config id fidelity_key: The column in the table that contains the fidelity - result_keys: The columns in the table that contain the results - config_keys: The columns in the table that contain the config values - remove_constants: Remove constant config columns from the data or not. + result_type: The result type for this benchmark. + config_type: The config type for this benchmark. + value_metric: The metric to use for this benchmark. Uses + the default metric from the Result if None. + cost_metric: The cost to use for this benchmark. Uses + the default cost from the Result if None. space: The configuration space to use for the benchmark. If None, will just be an empty space. prior: The prior to use for the benchmark. If None, no prior is used. @@ -87,8 +69,6 @@ def __init__( # noqa: PLR0913 probability of swapping the value for a random one. seed: The seed to use for the benchmark. """ - cls = self.__class__ - # Make sure we work with a clean slate, no issue with index. 
table = table.reset_index() @@ -99,9 +79,13 @@ def __init__( # noqa: PLR0913 if fidelity_key not in table.columns: raise ValueError(f"'{fidelity_key=}' not in columns {table.columns}") + result_keys: list[str] = list(result_type.metric_defs.keys()) if not all(key in table.columns for key in result_keys): - raise ValueError(f"{result_keys=} not in columns {table.columns}") + raise ValueError( + f"Not all {result_keys=} not in columns {table.columns}", + ) + config_keys: list[str] = config_type.names() if not all(key in table.columns for key in config_keys): raise ValueError(f"{config_keys=} not in columns {table.columns}") @@ -112,19 +96,6 @@ def __init__( # noqa: PLR0913 " Please drop it or rename it.", ) - # Remove constants from the table - if remove_constants: - - def is_constant(_s: pd.Series) -> bool: - _arr = _s.to_numpy() - return bool((_arr == _arr[0]).all()) - - constant_cols = [ - col for col in table.columns if is_constant(table[col]) # type: ignore - ] - table = table.drop(columns=constant_cols) # type: ignore - config_keys = [k for k in config_keys if k not in constant_cols] - # Remap their id column to `id` table = table.rename(columns={id_key: "id"}) @@ -169,7 +140,7 @@ def is_constant(_s: pd.Series) -> bool: # ... id_table = table.groupby(level="id").agg("first") configs = { - str(config_id): cls.Config.from_dict( + str(config_id): config_type.from_dict( { **row[config_keys].to_dict(), # type: ignore "id": str(config_id), @@ -184,27 +155,48 @@ def is_constant(_s: pd.Series) -> bool: self.table = table self.configs = configs - self.fidelity_key = fidelity_key self.id_key = id_key + self.fidelity_key = fidelity_key self.config_keys = sorted(config_keys) self.result_keys = sorted(result_keys) - self.fidelity_range = (start, end, step) # type: ignore super().__init__( name=name, seed=seed, + config_type=config_type, + result_type=result_type, + fidelity_name=fidelity_key, + fidelity_range=(start, end, step), space=space, prior=prior, perturb_prior=perturb_prior, + value_metric=value_metric, + cost_metric=cost_metric, ) + _raw_optimums = { + (k, metric): ( + float(table[k].min()) if metric.minimize else float(table[k].max()) + ) + for k, metric in self.Result.metric_defs.items() + } + self.table_optimums: dict[str, Metric.Value] = { + k: metric.as_value(v) for (k, metric), v in _raw_optimums.items() + } + + if self.value_metric not in self.result_keys: + raise ValueError(f"{self.value_metric=} not in {self.result_keys}") + + if self.cost_metric not in self.result_keys: + raise ValueError(f"{self.cost_metric=} not in {self.result_keys}") + def query( self, config: CTabular | Mapping[str, Any] | str, - at: F | None = None, *, - argmax: str | None = None, - argmin: str | None = None, + at: F | None = None, + value_metric: str | None = None, + cost_metric: str | None = None, ) -> R: """Submit a query and get a result. @@ -228,20 +220,36 @@ def query( Args: config: The query to use at: The fidelity at which to query, defaults to None which means *maximum* - argmax: Whether to return the argmax up to the point `at`. Will be slower as - it has to get the entire trajectory. Uses the key from the Results. - argmin: Whether to return the argmin up to the point `at`. Will be slower as - it has to get the entire trajectory. Uses the key from the Results. + value_metric: The metric to use for this result. Uses + the value metric passed in to the constructor if not specified, + otherwise the default metric from the Result if None. + cost_metric: The metric to use for this result. 
Uses + the cost metric passed in to the constructor if not specified, + otherwise the default metric from the Result if None. Returns: The result of the query """ _config = self._find_config(config) - return super().query( - _config, - at=at, # type: ignore - argmax=argmax, - argmin=argmin, + + at = at if at is not None else self.end + assert self.start <= at <= self.end + + __config = _config.as_dict(with_id=True) + if self._config_renames is not None: + _reverse_renames = {v: k for k, v in self._config_renames.items()} + __config = {k: __config.get(v, v) for k, v in _reverse_renames.items()} + + value_metric = value_metric if value_metric is not None else self.value_metric + cost_metric = cost_metric if cost_metric is not None else self.cost_metric + + return self.Result.from_dict( + config=config, + fidelity=at, + result=self._objective_function(__config, at=at), + value_metric=str(value_metric), + cost_metric=str(cost_metric), + renames=self._result_renames, ) @override @@ -252,6 +260,8 @@ def trajectory( frm: F | None = None, to: F | None = None, step: F | None = None, + value_metric: str | None = None, + cost_metric: str | None = None, ) -> list[R]: """Submit a query and get a result. @@ -277,12 +287,46 @@ def trajectory( frm: Start of the curve, should default to the start to: End of the curve, should default to the total step: Step size, defaults to ``cls.default_step`` + value_metric: The metric to use for this result. Uses + the value metric passed in to the constructor if not specified, + otherwise the default metric from the Result if None. + cost_metric: The metric to use for this result. Uses + the cost metric passed in to the constructor if not specified, + otherwise the default metric from the Result if None. Returns: The result of the query """ _config = self._find_config(config) - return super().trajectory(_config, frm=frm, to=to, step=step) # type: ignore + + to = to if to is not None else self.end + frm = frm if frm is not None else self.start + step = step if step is not None else self.step + + __config = _config.as_dict(with_id=True) + if self._config_renames is not None: + _reverse_renames = {v: k for k, v in self._config_renames.items()} + __config = {k: __config.get(v, v) for k, v in _reverse_renames.items()} + + value_metric = value_metric if value_metric is not None else self.value_metric + cost_metric = cost_metric if cost_metric is not None else self.cost_metric + + return [ + self.Result.from_dict( + config=config, + fidelity=fidelity, + result=result, + value_metric=str(value_metric), + cost_metric=str(cost_metric), + renames=self._result_renames, + ) + for fidelity, result in self._trajectory( + __config, + frm=frm, + to=to, + step=step, + ) + ] def _find_config( self, @@ -299,6 +343,10 @@ def _find_config( # If's a Config, that's fine if isinstance(config, self.Config): + if config.id not in self.configs: + raise ValueError( + f"Config {config.id} not in {self.configs.keys()}", + ) return config # At this point, we assume we're basically dealing with a dictionary @@ -319,7 +367,7 @@ def _find_config( # id that way match = first_true( self.configs.values(), - pred=lambda c: c == config, # type: ignore + pred=lambda c: c.as_dict(with_id=False) == config, # type: ignore default=None, ) if match is None: @@ -330,7 +378,12 @@ def _find_config( return match @override - def _objective_function(self, config: CTabular, at: F) -> R: + def _objective_function( + self, + config: Mapping[str, Any], + *, + at: F, + ) -> Mapping[str, float]: """Submit a query and get a 
result. Args: @@ -340,12 +393,46 @@ def _objective_function(self, config: CTabular, at: F) -> R: Returns: The result of the query """ - row = self.table.loc[(config.id, at)] + config = dict(config) + _id = config.pop("id") + row = self.table.loc[(_id, at)] + + row.name = _id + _config = dict(row[self.config_keys]) + if config != _config: + raise ValueError( + f"Config queried with is not equal to the one in the table with {_id=}." + f"\nconfig provided {config=}" + f"\nconfig in table {_config=}", + ) - row.name = config.id - config = self.Config.from_row(row[self.config_keys]) - results = row[self.result_keys] - return self.Result.from_row(config=config, row=results, fidelity=at) + return dict(row[self.result_keys]) + + @override + def _trajectory( + self, + config: Mapping[str, Any], + *, + frm: F, + to: F, + step: F, + ) -> Iterable[tuple[F, Mapping[str, float]]]: + config = dict(config) + _id = config.pop("id") + rows = self.table.loc[(_id, frm):(_id, to):step] # type: ignore + first_config = dict(rows.iloc[0][self.config_keys]) + + if config != first_config: + raise ValueError( + f"Config queried with is not equal to the one in the table with {_id=}." + f"\nconfig provided {config=}" + f"\nconfig in table {first_config=}", + ) + + return [ + (fidelity, dict(row[self.result_keys])) + for (_, fidelity), row in rows.iterrows() + ] # No number specified, just return one config @overload @@ -390,7 +477,7 @@ def sample( """ _seed: int | None if isinstance(seed, np.random.RandomState): - _seed = seed.random_integers(0, 2**32 - 1) + _seed = seed.random_integers(0, 2**31 - 1) else: _seed = seed @@ -413,133 +500,19 @@ def sample( return [config_items[i] for i in indices] -class GenericTabularBenchmark( - TabularBenchmark[ - GenericTabularConfig, - GenericTabularResult[GenericTabularConfig, F], - F, - ], -): - Result = GenericTabularResult - Config = GenericTabularConfig - - def __init__( # noqa: PLR0913 - self, - table: pd.DataFrame, - *, - name: str | None = None, - id_key: str, - fidelity_key: str, - result_keys: Sequence[str], - config_keys: Sequence[str], - result_mapping: (dict[str, str | Callable[[pd.DataFrame], Any]] | None) = None, - remove_constants: bool = False, - space: ConfigurationSpace | None = None, - seed: int | None = None, - prior: str | Path | GenericTabularConfig | Mapping[str, Any] | None = None, - perturb_prior: float | None = None, - ): - """Initialize the benchmark. - - Args: - table: The table to use for the benchmark - name: The name of the benchmark. If None, will be set to - `unknown-{datetime.now().isoformat()}` - id_key: The column in the table that contains the config id - fidelity_key: The column in the table that contains the fidelity - result_keys: The columns in the table that contain the results - config_keys: The columns in the table that contain the config values - result_mapping: A mapping from the result keys to the table keys. - If a string, will be used as the key in the table. If a callable, - will be called with the table and the result will be used as the value. - remove_constants: Remove constant config columns from the data or not. - space: The configuration space to use for the benchmark. If None, will - just be an empty space. - seed: The seed to use. - prior: The prior to use for the benchmark. If None, no prior is used. - If a str, will check the local location first for a prior - specific for this benchmark, otherwise assumes it to be a Path. - If a Path, will load the prior from the path. - If a Mapping, will be used directly. 
- perturb_prior: If not None, will perturb the prior by this amount. - For numericals, this is interpreted as the standard deviation of a - normal distribution while for categoricals, this is interpreted - as the probability of swapping the value for a random one. - """ - if name is None: - name = f"unknown-{datetime.now().isoformat()}" - - _result_mapping: dict = result_mapping if result_mapping is not None else {} - - # Remap the result keys so it works with the generic result types - if _result_mapping is not None: - for k, v in _result_mapping.items(): - if isinstance(v, str): - if v not in table.columns: - raise ValueError(f"{v} not in columns\n{table.columns}") - - table[k] = table[v] - elif callable(v): - table[k] = v(table) - else: - raise ValueError(f"Unknown result mapping {v} for {k}") - - super().__init__( - name=name, - table=table, - id_key=id_key, - fidelity_key=fidelity_key, - result_keys=[*result_keys, *_result_mapping.keys()], - config_keys=config_keys, - remove_constants=remove_constants, - space=space, - seed=seed, - prior=prior, - perturb_prior=perturb_prior, - ) - - if __name__ == "__main__": HERE = Path(__file__).parent path = HERE.parent.parent / "data" / "lcbench-tabular" / "adult.parquet" table = pd.read_parquet(path) - benchmark = GenericTabularBenchmark( - table=table, + from mfpbench.lcbench_tabular import LCBenchTabularConfig, LCBenchTabularResult + + benchmark = TabularBenchmark( + "toy", + table, id_key="id", fidelity_key="epoch", - result_keys=[ - "time", - "val_accuracy", - "val_cross_entropy", - "val_balanced_accuracy", - "test_accuracy", - "test_cross_entropy", - "test_balanced_accuracy", - ], - result_mapping={ - "error": lambda df: 1 - df["val_accuracy"], - "score": lambda df: df["val_accuracy"], - }, - config_keys=[ - "batch_size", - "loss", - "imputation_strategy", - "learning_rate_scheduler", - "network", - "max_dropout", - "normalization_strategy", - "optimizer", - "cosine_annealing_T_max", - "cosine_annealing_eta_min", - "activation", - "max_units", - "mlp_shape", - "num_layers", - "learning_rate", - "momentum", - "weight_decay", - ], - remove_constants=True, + result_type=LCBenchTabularResult, + config_type=LCBenchTabularConfig, ) # benchmark = LCBenchTabular(task="adult") all_configs = benchmark.configs # type: ignore @@ -550,7 +523,7 @@ def __init__( # noqa: PLR0913 config_id = config.id result = benchmark.query(config, at=1) - argmin_score = benchmark.query(config, at=42, argmin="error") + argmin_score = benchmark.query(config, at=42) trajectory = benchmark.trajectory(config, frm=1, to=10) diff --git a/src/mfpbench/util.py b/src/mfpbench/util.py index 3896e99..1919446 100644 --- a/src/mfpbench/util.py +++ b/src/mfpbench/util.py @@ -72,7 +72,7 @@ def remove_hyperparameter(name: str, space: ConfigurationSpace) -> Configuration hps = [copy(hp) for hp in space.get_hyperparameters() if hp.name != name] if isinstance(space.random, np.random.RandomState): - new_seed = space.random.randint(2**32 - 1) + new_seed = space.random.randint(2**31 - 1) else: new_seed = copy(space.random) diff --git a/src/mfpbench/yahpo/benchmark.py b/src/mfpbench/yahpo/benchmark.py index 5ce61f9..0e77b89 100644 --- a/src/mfpbench/yahpo/benchmark.py +++ b/src/mfpbench/yahpo/benchmark.py @@ -4,14 +4,12 @@ import tempfile import uuid from pathlib import Path -from typing import TYPE_CHECKING, Any, ClassVar, Mapping, Sequence, TypeVar +from typing import TYPE_CHECKING, Any, ClassVar, Iterable, Mapping, TypeVar from typing_extensions import override -from mfpbench.benchmark import 
Benchmark +from mfpbench.benchmark import Benchmark, Config, Result from mfpbench.setup_benchmark import YAHPOSource from mfpbench.util import remove_hyperparameter -from mfpbench.yahpo.config import YAHPOConfig -from mfpbench.yahpo.result import YAHPOResult if TYPE_CHECKING: import onnxruntime @@ -120,8 +118,8 @@ def _ensure_yahpo_config_set(datapath: Path) -> None: # A Yahpo Benchmark is parametrized by a YAHPOConfig, YAHPOResult and fidelity -C = TypeVar("C", bound=YAHPOConfig) -R = TypeVar("R", bound=YAHPOResult) +C = TypeVar("C", bound=Config) +R = TypeVar("R", bound=Result) F = TypeVar("F", int, float) @@ -129,26 +127,31 @@ class YAHPOBenchmark(Benchmark[C, R, F]): yahpo_base_benchmark_name: ClassVar[str] """Base name of the yahpo benchmark.""" - yahpo_instances: tuple[str, ...] | None + yahpo_config_type: type[C] + """The config type for this benchmark.""" + + yahpo_result_type: type[R] + """The result type for this benchmark.""" + + yahpo_fidelity_name: ClassVar[str] + """The name of the fidelity for this benchmark.""" + + yahpo_fidelity_range: tuple[F, F, F] + """The fidelity range for this benchmark.""" + + yahpo_has_conditionals: ClassVar[bool] = False + """Whether this benchmark has conditionals.""" + + yahpo_instances: ClassVar[tuple[str, ...] | None] = None """The instances available for this benchmark, if Any.""" - yahpo_task_id_name: ClassVar[str | None] + yahpo_task_id_name: ClassVar[str | None] = None """Name of hp used to indicate task.""" - yahpo_forced_remove_hps: Mapping[str, int | float | str] | None + yahpo_forced_remove_hps: ClassVar[Mapping[str, int | float | str] | None] = None """Any hyperparameters that should be forcefully deleted from the space but have default values filled in""" - yahpo_replacements_hps: Sequence[tuple[str, str]] | None - """Any replacements that need to be done in hyperparameters - [(dataclass_version, dict_version)]""" - - datadir: Path - """The path to where the data is stored.""" - - task_id: str - """The task id for this benchmark.""" - def __init__( # noqa: C901, PLR0912 self, task_id: str, @@ -158,6 +161,8 @@ def __init__( # noqa: C901, PLR0912 prior: str | Path | C | Mapping[str, Any] | None = None, perturb_prior: float | None = None, session: onnxruntime.InferenceSession | None = None, + value_metric: str | None = None, + cost_metric: str | None = None, ): """Initialize a Yahpo Benchmark. @@ -180,18 +185,22 @@ def __init__( # noqa: C901, PLR0912 This is only a backdoor for onnx compatibility issues with YahpoGym. You are advised not to use this unless you know what you are doing. + value_metric: The metric to use for this benchmark. Uses + the default metric from the Result if None. + cost_metric: The cost to use for this benchmark. Uses + the default cost from the Result if None. 
""" # Validation cls = self.__class__ # These errors are maintainers errors, not user errors - if cls.yahpo_forced_remove_hps is not None and cls.has_conditionals: + if cls.yahpo_forced_remove_hps is not None and cls.yahpo_has_conditionals: raise NotImplementedError( "Error setting up a YAHPO Benchmark with conditionals", " and forced hps", ) - if cls.yahpo_task_id_name is not None and cls.has_conditionals: + if cls.yahpo_task_id_name is not None and cls.yahpo_has_conditionals: raise NotImplementedError( f"{self.name} has conditionals, can't remove task_id from space", ) @@ -264,9 +273,16 @@ def __init__( # noqa: C901, PLR0912 super().__init__( name=name, seed=seed, + config_type=cls.yahpo_config_type, + result_type=cls.yahpo_result_type, + fidelity_name=cls.yahpo_fidelity_name, + fidelity_range=cls.yahpo_fidelity_range, # type: ignore + has_conditionals=cls.yahpo_has_conditionals, space=space, prior=prior, perturb_prior=perturb_prior, + value_metric=value_metric, + cost_metric=cost_metric, ) @property @@ -288,8 +304,15 @@ def load(self) -> None: _ = self.bench @override - def _objective_function(self, config: C, at: F) -> R: - query = config.dict() + def _trajectory( + self, + config: Mapping[str, Any], + *, + frm: F, + to: F, + step: F, + ) -> Iterable[tuple[F, Mapping[str, float]]]: + query = dict(config) if self.yahpo_forced_remove_hps is not None: query.update(self.yahpo_forced_remove_hps) @@ -297,24 +320,22 @@ def _objective_function(self, config: C, at: F) -> R: if self.task_id is not None and self.yahpo_task_id_name is not None: query[self.yahpo_task_id_name] = self.task_id - query[self.fidelity_name] = at + # Copy same config and insert fidelities for each + queries: list[dict] = [ + {**query, self.fidelity_name: f} + for f in self.iter_fidelities(frm=frm, to=to, step=step) + ] # NOTE: seed is allowed to be int | None results: list[dict] = self.bench.objective_function( - query, + queries, seed=self.seed, # type: ignore ) - result = results[0] - - return self.Result.from_dict( - config=config, - result=result, - fidelity=at, - ) + return zip(self.iter_fidelities(frm=frm, to=to, step=step), results) @override - def _trajectory(self, config: C, *, frm: F, to: F, step: F) -> list[R]: - query = config.dict() + def _objective_function(self, config: Mapping[str, Any], at: F) -> dict[str, float]: + query = dict(config) if self.yahpo_forced_remove_hps is not None: query.update(self.yahpo_forced_remove_hps) @@ -322,24 +343,11 @@ def _trajectory(self, config: C, *, frm: F, to: F, step: F) -> list[R]: if self.task_id is not None and self.yahpo_task_id_name is not None: query[self.yahpo_task_id_name] = self.task_id - # Copy same config and insert fidelities for each - queries: list[dict] = [ - {**query, self.fidelity_name: f} - for f in self.iter_fidelities(frm=frm, to=to, step=step) - ] + query[self.fidelity_name] = at # NOTE: seed is allowed to be int | None results: list[dict] = self.bench.objective_function( - queries, + query, seed=self.seed, # type: ignore ) - - return [ - self.Result.from_dict( - config=config, - result=result, - fidelity=query[self.fidelity_name], - ) - # We need to loop over q's for fidelity - for result, query in zip(results, queries) - ] + return results[0] diff --git a/src/mfpbench/yahpo/benchmarks/iaml/__init__.py b/src/mfpbench/yahpo/benchmarks/iaml/__init__.py index a61c505..fddc446 100644 --- a/src/mfpbench/yahpo/benchmarks/iaml/__init__.py +++ b/src/mfpbench/yahpo/benchmarks/iaml/__init__.py @@ -2,27 +2,22 @@ from mfpbench.yahpo.benchmarks.iaml.iaml_glmnet 
import ( IAMLglmnetBenchmark, IAMLglmnetConfig, - IAMLglmnetResult, ) from mfpbench.yahpo.benchmarks.iaml.iaml_ranger import ( IAMLrangerBenchmark, IAMLrangerConfig, - IAMLrangerResult, ) from mfpbench.yahpo.benchmarks.iaml.iaml_rpart import ( IAMLrpartBenchmark, IAMLrpartConfig, - IAMLrpartResult, ) from mfpbench.yahpo.benchmarks.iaml.iaml_super import ( IAMLSuperBenchmark, IAMLSuperConfig, - IAMLSuperResult, ) from mfpbench.yahpo.benchmarks.iaml.iaml_xgboost import ( IAMLxgboostBenchmark, IAMLxgboostConfig, - IAMLxgboostResult, ) __all__ = [ @@ -30,18 +25,13 @@ "IAMLConfig", "IAMLResult", "IAMLSuperBenchmark", - "IAMLSuperResult", "IAMLSuperConfig", "IAMLglmnetBenchmark", - "IAMLglmnetResult", "IAMLglmnetConfig", "IAMLrangerBenchmark", - "IAMLrangerResult", "IAMLrangerConfig", "IAMLrpartBenchmark", - "IAMLrpartResult", "IAMLrpartConfig", "IAMLxgboostBenchmark", - "IAMLxgboostResult", "IAMLxgboostConfig", ] diff --git a/src/mfpbench/yahpo/benchmarks/iaml/iaml.py b/src/mfpbench/yahpo/benchmarks/iaml/iaml.py index 898eaff..41b7836 100644 --- a/src/mfpbench/yahpo/benchmarks/iaml/iaml.py +++ b/src/mfpbench/yahpo/benchmarks/iaml/iaml.py @@ -1,91 +1,77 @@ from __future__ import annotations from dataclasses import asdict, dataclass -from typing import Any, Mapping, Sequence, TypeVar +from typing import Any, ClassVar, Mapping, TypeVar +from typing_extensions import Self +import numpy as np + +from mfpbench.benchmark import Config, Result +from mfpbench.metric import Metric from mfpbench.yahpo.benchmark import YAHPOBenchmark -from mfpbench.yahpo.config import YAHPOConfig -from mfpbench.yahpo.result import YAHPOResult C = TypeVar("C", bound="IAMLConfig") R = TypeVar("R", bound="IAMLResult") @dataclass(frozen=True, eq=False, unsafe_hash=True) # type: ignore[misc] -class IAMLConfig(YAHPOConfig): +class IAMLConfig(Config): @classmethod - def from_dict(cls: type[C], d: Mapping[str, Any]) -> C: + def from_dict( + cls, + d: Mapping[str, Any], + renames: Mapping[str, str] | None = None, + ) -> Self: """Create from a dict or mapping object.""" # We may have keys that are conditional and hence we need to flatten them config = {k.replace(".", "__"): v for k, v in d.items()} - return cls(**config) + return super().from_dict(config, renames) - def dict(self) -> dict[str, Any]: + def as_dict(self) -> dict[str, Any]: """Converts the config to a raw dictionary.""" d = asdict(self) return {k.replace("__", "."): v for k, v in d.items() if v is not None} @dataclass(frozen=True) # type: ignore[misc] -class IAMLResult(YAHPOResult[C, float]): - fidelity: float - - mmce: float - f1: float - auc: float - logloss: float - - timetrain: float - timepredict: float - - ramtrain: float - rammodel: float - rampredict: float - - mec: float - ias: float - nf: float - - @property - def score(self) -> float: - """The score of interest.""" - return self.f1 - - @property - def error(self) -> float: - """The error of interest.""" - return 1 - self.f1 - - @property - def test_score(self) -> float: - """The score on the test set.""" - return self.f1 - - @property - def test_error(self) -> float: - """The error on the test set.""" - return 1 - self.f1 - - @property - def val_score(self) -> float: - """The score on the validation set.""" - return self.score - - @property - def val_error(self) -> float: - """The error on the validation set.""" - return self.error - - @property - def cost(self) -> float: - """The time taken in seconds to train the config.""" - return self.timetrain - - -class IAMLBenchmark(YAHPOBenchmark): +class 
IAMLResult(Result[C, float]): + default_value_metric: ClassVar[str] = "f1" + default_cost_metric: ClassVar[str] = "timetrain" + metric_defs: ClassVar[Mapping[str, Metric]] = { + "mmce": Metric(minimize=True, bounds=(0, np.inf)), + "f1": Metric(minimize=False, bounds=(0, 1)), + "auc": Metric(minimize=False, bounds=(0, 1)), + "logloss": Metric(minimize=True, bounds=(0, np.inf)), + "timetrain": Metric(minimize=True, bounds=(0, np.inf)), + "timepredict": Metric(minimize=True, bounds=(0, np.inf)), + "ramtrain": Metric(minimize=True, bounds=(0, np.inf)), + "rammodel": Metric(minimize=True, bounds=(0, np.inf)), + "rampredict": Metric(minimize=True, bounds=(0, np.inf)), + } + + mmce: Metric.Value + f1: Metric.Value + auc: Metric.Value + logloss: Metric.Value + + timetrain: Metric.Value + timepredict: Metric.Value + + ramtrain: Metric.Value + rammodel: Metric.Value + rampredict: Metric.Value + + # Definitions taken from YAHPO-gym paper appendix + # Whether to minimize is not really fully relevant + # so these are not given a real Metric definition. + mec: float # main effect complexity of features + ias: float # Iteration stregth of features + nf: float # Number of features used + + +class IAMLBenchmark(YAHPOBenchmark[C, IAMLResult, float]): + yahpo_result_type = IAMLResult # IAML class of benchmarks share train size as fidelity - fidelity_range = (0.03, 1.0, 0.05) - fidelity_name = "trainsize" + yahpo_fidelity_range = (0.03, 1.0, 0.05) + yahpo_fidelity_name = "trainsize" yahpo_task_id_name = "task_id" - yahpo_replacements_hps: Sequence[tuple[str, str]] | None = None - yahpo_forced_remove_hps = None diff --git a/src/mfpbench/yahpo/benchmarks/iaml/iaml_glmnet.py b/src/mfpbench/yahpo/benchmarks/iaml/iaml_glmnet.py index 49537fe..360fdda 100644 --- a/src/mfpbench/yahpo/benchmarks/iaml/iaml_glmnet.py +++ b/src/mfpbench/yahpo/benchmarks/iaml/iaml_glmnet.py @@ -1,9 +1,8 @@ from __future__ import annotations from dataclasses import dataclass -from typing import no_type_check -from mfpbench.yahpo.benchmarks.iaml.iaml import IAMLBenchmark, IAMLConfig, IAMLResult +from mfpbench.yahpo.benchmarks.iaml.iaml import IAMLBenchmark, IAMLConfig @dataclass(frozen=True, eq=False, unsafe_hash=True) @@ -11,22 +10,9 @@ class IAMLglmnetConfig(IAMLConfig): alpha: float s: float # log - @no_type_check - def validate(self) -> None: - """Validate this config.""" - assert 0.0 <= self.alpha <= 1.0 - assert 0.00010000000000000009 <= self.s <= 999.9999999999998 - - -@dataclass(frozen=True) -class IAMLglmnetResult(IAMLResult): - config: IAMLglmnetConfig - - -class IAMLglmnetBenchmark(IAMLBenchmark): - Result = IAMLglmnetResult - Config = IAMLglmnetConfig - has_conditionals = False +class IAMLglmnetBenchmark(IAMLBenchmark[IAMLglmnetConfig]): + yahpo_config_type = IAMLglmnetConfig + yahpo_has_conditionals = False yahpo_base_benchmark_name = "iaml_glmnet" yahpo_instances = ("40981", "41146", "1489", "1067") diff --git a/src/mfpbench/yahpo/benchmarks/iaml/iaml_ranger.py b/src/mfpbench/yahpo/benchmarks/iaml/iaml_ranger.py index 332c69d..1ca27c7 100644 --- a/src/mfpbench/yahpo/benchmarks/iaml/iaml_ranger.py +++ b/src/mfpbench/yahpo/benchmarks/iaml/iaml_ranger.py @@ -1,10 +1,9 @@ from __future__ import annotations from dataclasses import dataclass -from typing import no_type_check from typing_extensions import Literal -from mfpbench.yahpo.benchmarks.iaml.iaml import IAMLBenchmark, IAMLConfig, IAMLResult +from mfpbench.yahpo.benchmarks.iaml.iaml import IAMLBenchmark, IAMLConfig @dataclass(frozen=True, eq=False, unsafe_hash=True) @@ -18,34 
+17,9 @@ class IAMLrangerConfig(IAMLConfig): num__random__splits: int | None = None - @no_type_check - def validate(self) -> None: - """Validate this config.""" - assert 1 <= self.min__node__size <= 100 - assert 0 <= self.mtry__power <= 1 - assert 1 <= self.num__trees <= 2000 - assert self.respect__unordered__factors in [ - "ignore", - "order", - "partition", - ] - assert 0.1 <= self.sample__fraction <= 1.0 - assert self.splitrule in ["gini", "extratrees"] - - if self.num__random__splits is not None: - assert self.splitrule == "extratrees" - assert 1 <= self.num__random__splits <= 100 - - -@dataclass(frozen=True) -class IAMLrangerResult(IAMLResult): - config: IAMLrangerConfig - - -class IAMLrangerBenchmark(IAMLBenchmark): - Result = IAMLrangerResult - Config = IAMLrangerConfig - has_conditionals = True +class IAMLrangerBenchmark(IAMLBenchmark[IAMLrangerConfig]): + yahpo_config_type = IAMLrangerConfig + yahpo_has_conditionals = True yahpo_base_benchmark_name = "iaml_ranger" yahpo_instances = ("40981", "41146", "1489", "1067") diff --git a/src/mfpbench/yahpo/benchmarks/iaml/iaml_rpart.py b/src/mfpbench/yahpo/benchmarks/iaml/iaml_rpart.py index 86d0051..c62a15f 100644 --- a/src/mfpbench/yahpo/benchmarks/iaml/iaml_rpart.py +++ b/src/mfpbench/yahpo/benchmarks/iaml/iaml_rpart.py @@ -1,9 +1,8 @@ from __future__ import annotations from dataclasses import dataclass -from typing import no_type_check -from mfpbench.yahpo.benchmarks.iaml.iaml import IAMLBenchmark, IAMLConfig, IAMLResult +from mfpbench.yahpo.benchmarks.iaml.iaml import IAMLBenchmark, IAMLConfig @dataclass(frozen=True, eq=False, unsafe_hash=True) @@ -13,24 +12,9 @@ class IAMLrpartConfig(IAMLConfig): minbucket: int minsplit: int - @no_type_check - def validate(self) -> None: - """Validate this config.""" - assert 0.00010000000000000009 <= self.cp <= 1.0 - assert 1 <= self.maxdepth <= 30 - assert 1 <= self.minbucket <= 100 - assert 1 <= self.minsplit <= 100 - - -@dataclass(frozen=True) -class IAMLrpartResult(IAMLResult): - config: IAMLrpartConfig - - -class IAMLrpartBenchmark(IAMLBenchmark): - Result = IAMLrpartResult - Config = IAMLrpartConfig - has_conditionals = False +class IAMLrpartBenchmark(IAMLBenchmark[IAMLrpartConfig]): + yahpo_config_type = IAMLrpartConfig + yahpo_has_conditionals = False yahpo_base_benchmark_name = "iaml_rpart" yahpo_instances = ("40981", "41146", "1489", "1067") diff --git a/src/mfpbench/yahpo/benchmarks/iaml/iaml_super.py b/src/mfpbench/yahpo/benchmarks/iaml/iaml_super.py index 4e14749..dca3aee 100644 --- a/src/mfpbench/yahpo/benchmarks/iaml/iaml_super.py +++ b/src/mfpbench/yahpo/benchmarks/iaml/iaml_super.py @@ -1,10 +1,9 @@ from __future__ import annotations from dataclasses import dataclass -from typing import no_type_check from typing_extensions import Literal -from mfpbench.yahpo.benchmarks.iaml.iaml import IAMLBenchmark, IAMLConfig, IAMLResult +from mfpbench.yahpo.benchmarks.iaml.iaml import IAMLBenchmark, IAMLConfig @dataclass(frozen=True, eq=False, unsafe_hash=True) @@ -48,128 +47,9 @@ class IAMLSuperConfig(IAMLConfig): xgboost__skip_drop: float | None = None xgboost__subsample: float | None = None - @no_type_check - def validate(self) -> None: # noqa: C901, PLR0915, PLR0912 - """Validate this config.""" - assert self.learner_id in ["glmnet", "ranger", "rpart", "xgboost"] - - # We do some conditional checking here - learner = self.learner_id - - # We filter out all attributes except for those that must always be contained - # or are the selected learner, ... 
- attrs = [ - attr - for attr in dir(self) - if not attr.startswith("__") - or not attr.startswith(learner) - or attr in ["learner_id"] - ] - - # ... the remaining must always have None set then - for attr in attrs: - assert attr is None - - if learner == "glmnet": - assert self.glmnet__alpha is not None - assert self.glmnet__s is not None - assert 0.0 <= self.glmnet__alpha <= 1.0 - assert 0.00010000000000000009 <= self.glmnet__s <= 999.9999999999998 - - elif learner == "rpart": - assert self.rpart__cp is not None - assert self.rpart__maxdepth is not None - assert self.rpart__minbucket is not None - assert self.rpart__minsplit is not None - assert 0.00010000000000000009 <= self.rpart__cp <= 1.0 - assert 1 <= self.rpart__maxdepth <= 30 - assert 1 <= self.rpart__minbucket <= 100 - assert 1 <= self.rpart__minsplit <= 100 - - elif learner == "ranger": - assert self.ranger__min__node__size is not None - assert self.ranger__mtry__power is not None - assert self.ranger__num__trees is not None - assert self.ranger__respect__unordered__factors is not None - assert self.ranger__sample__fraction is not None - assert 1 <= self.ranger__min__node__size <= 100 - assert 0 <= self.ranger__mtry__power <= 1 - assert 1 <= self.ranger__num__trees <= 2000 - assert self.ranger__respect__unordered__factors in [ - "ignore", - "order", - "partition", - ] - assert 0.1 <= self.ranger__sample__fraction <= 1.0 - assert self.ranger__splitrule in ["gini", "extratrees"] - - if self.ranger__num__random__splits is not None: - assert self.ranger__splitrule == "extratrees" - assert 1 <= self.ranger__num__random__splits <= 100 - - elif learner == "xgboost": - assert self.xgboost__alpha is not None - assert self.xgboost__lambda is not None - assert self.xgboost__nrounds is not None - assert self.xgboost__subsample is not None - assert self.xgboost__booster in ["gblinear", "gbtree", "dart"] - assert 0.00010000000000000009 <= self.xgboost__alpha <= 999.9999999999998 - assert 0.00010000000000000009 <= self.xgboost__lambda <= 999.9999999999998 - assert 7 <= self.xgboost__nrounds <= 2981 - assert 0.1 <= self.xgboost__subsample <= 1.0 - - if self.xgboost__colsample_bylevel is not None: - assert self.xgboost__booster in ["dart", "gbtree"] - assert 0.01 <= self.xgboost__colsample_bylevel <= 1.0 - - if self.xgboost__colsample_bytree is not None: - assert self.xgboost__booster in ["dart", "gbtree"] - assert 0.01 <= self.xgboost__colsample_bytree <= 1.0 - - if self.xgboost__eta is not None: - assert self.xgboost__booster in ["dart", "gbtree"] - assert 0.00010000000000000009 <= self.xgboost__eta <= 1.0 - - if self.xgboost__gamma is not None: - assert self.xgboost__booster in ["dart", "gbtree"] - assert ( - 0.00010000000000000009 <= self.xgboost__gamma <= 6.999999999999999 - ) - - if self.xgboost__max_depth is not None: - assert self.xgboost__booster in ["dart", "gbtree"] - assert 1 <= self.xgboost__max_depth <= 15 - - if self.xgboost__min_child_weight is not None: - assert self.xgboost__booster in ["dart", "gbtree"] - assert ( - 2.718281828459045 - <= self.xgboost__min_child_weight - <= 149.99999999999997 - ) - - if self.xgboost__rate_drop is not None: - assert self.xgboost__booster in ["dart"] - assert 0.0 <= self.xgboost__rate_drop <= 1.0 - - if self.xgboost__skip_drop is not None: - assert self.xgboost__booster in ["dart"] - assert 0.0 <= self.xgboost__skip_drop <= 1.0 - - else: - raise NotImplementedError() - - -@dataclass(frozen=True) -class IAMLSuperResult(IAMLResult): - config: IAMLSuperConfig - - -class 
IAMLSuperBenchmark(IAMLBenchmark): - Result = IAMLSuperResult - Config = IAMLSuperConfig - - has_conditionals = True +class IAMLSuperBenchmark(IAMLBenchmark[IAMLSuperConfig]): + yahpo_config_type = IAMLSuperConfig + yahpo_has_conditionals = True yahpo_base_benchmark_name = "iaml_super" yahpo_instances = ("40981", "41146", "1489", "1067") diff --git a/src/mfpbench/yahpo/benchmarks/iaml/iaml_xgboost.py b/src/mfpbench/yahpo/benchmarks/iaml/iaml_xgboost.py index 066ad47..47cfeec 100644 --- a/src/mfpbench/yahpo/benchmarks/iaml/iaml_xgboost.py +++ b/src/mfpbench/yahpo/benchmarks/iaml/iaml_xgboost.py @@ -1,10 +1,10 @@ from __future__ import annotations from dataclasses import dataclass -from typing import no_type_check +from typing import Mapping from typing_extensions import Literal -from mfpbench.yahpo.benchmarks.iaml.iaml import IAMLBenchmark, IAMLConfig, IAMLResult +from mfpbench.yahpo.benchmarks.iaml.iaml import IAMLBenchmark, IAMLConfig @dataclass(frozen=True, eq=False, unsafe_hash=True) @@ -24,58 +24,10 @@ class IAMLxgboostConfig(IAMLConfig): rate_drop: float | None = None skip_drop: float | None = None - @no_type_check - def validate(self) -> None: - """Validate this config.""" - assert self.booster in ["gblinear", "gbtree", "dart"] - assert 0.00010000000000000009 <= self.alpha <= 999.9999999999998 - assert 0.00010000000000000009 <= self._lambda <= 999.9999999999998 - assert 7 <= self.nrounds <= 2981 - assert 0.1 <= self.subsample <= 1.0 - - if self.colsample_bylevel is not None: - assert self.booster in ["dart", "gbtree"] - assert 0.01 <= self.colsample_bylevel <= 1.0 - - if self.colsample_bytree is not None: - assert self.booster in ["dart", "gbtree"] - assert 0.01 <= self.colsample_bytree <= 1.0 - - if self.eta is not None: - assert self.booster in ["dart", "gbtree"] - assert 0.00010000000000000009 <= self.eta <= 1.0 - - if self.gamma is not None: - assert self.booster in ["dart", "gbtree"] - assert 0.00010000000000000009 <= self.gamma <= 6.999999999999999 - - if self.max_depth is not None: - assert self.booster in ["dart", "gbtree"] - assert 1 <= self.max_depth <= 15 - - if self.min_child_weight is not None: - assert self.booster in ["dart", "gbtree"] - assert 2.718281828459045 <= self.min_child_weight <= 149.99999999999997 - - if self.rate_drop is not None: - assert self.booster in ["dart"] - assert 0.0 <= self.rate_drop <= 1.0 - - if self.skip_drop is not None: - assert self.booster in ["dart"] - assert 0.0 <= self.skip_drop <= 1.0 - - -@dataclass(frozen=True) -class IAMLxgboostResult(IAMLResult): - config: IAMLxgboostConfig - - -class IAMLxgboostBenchmark(IAMLBenchmark): - Result = IAMLxgboostResult - Config = IAMLxgboostConfig - has_conditionals = True +class IAMLxgboostBenchmark(IAMLBenchmark[IAMLxgboostConfig]): + _config_replacements: Mapping[str, str] = {"lambda": "_lambda"} + yahpo_config_type = IAMLxgboostConfig + yahpo_has_conditionals = True yahpo_base_benchmark_name = "iaml_xgboost" - yahpo_replacements_hps = (("_lambda", "lambda"),) yahpo_instances = ("40981", "41146", "1489", "1067") diff --git a/src/mfpbench/yahpo/benchmarks/lcbench.py b/src/mfpbench/yahpo/benchmarks/lcbench.py index 06e6954..13bad03 100644 --- a/src/mfpbench/yahpo/benchmarks/lcbench.py +++ b/src/mfpbench/yahpo/benchmarks/lcbench.py @@ -1,14 +1,16 @@ from __future__ import annotations from dataclasses import dataclass +from typing import ClassVar, Mapping -from mfpbench.yahpo.benchmark import YAHPOBenchmark -from mfpbench.yahpo.config import YAHPOConfig -from mfpbench.yahpo.result import YAHPOResult 
+import numpy as np + +from mfpbench.metric import Metric +from mfpbench.yahpo.benchmark import Config, Result, YAHPOBenchmark @dataclass(frozen=True, eq=False, unsafe_hash=True) -class LCBenchConfig(YAHPOConfig): +class LCBenchConfig(Config): """A LCBench Config. Note: @@ -25,74 +27,38 @@ class LCBenchConfig(YAHPOConfig): max_units: int # [64, 1024] int log max_dropout: float # [0.0, 1.0] float - def validate(self) -> None: - """Validate this is a correct config.""" - assert 16 <= self.batch_size <= 512 - assert 1e-04 <= self.learning_rate <= 0.1 - assert 0.1 <= self.momentum <= 0.99 - assert 1e-05 <= self.weight_decay <= 0.1 - assert 1 <= self.num_layers <= 5 - assert 64 <= self.max_units <= 1024 - assert 0.0 <= self.max_dropout <= 1.0 - @dataclass(frozen=True) # type: ignore[misc] -class LCBenchResult(YAHPOResult[LCBenchConfig, int]): - time: float # unit? - - val_accuracy: float - val_cross_entropy: float - val_balanced_accuracy: float - - test_cross_entropy: float - test_balanced_accuracy: float - - @property - def score(self) -> float: - """The score of interest.""" - return self.val_balanced_accuracy +class LCBenchResult(Result[LCBenchConfig, int]): + default_value_metric: ClassVar[str] = "val_balanced_accuracy" + default_cost_metric: ClassVar[str] = "time" + metric_defs: ClassVar[Mapping[str, Metric]] = { + "val_accuracy": Metric(minimize=False, bounds=(0, 100)), + "val_balanced_accuracy": Metric(minimize=False, bounds=(0, 100)), + "val_cross_entropy": Metric(minimize=True, bounds=(0, np.inf)), + "test_balanced_accuracy": Metric(minimize=False, bounds=(0, 100)), + "test_cross_entropy": Metric(minimize=True, bounds=(0, np.inf)), + "time": Metric(minimize=True, bounds=(0, np.inf)), + } - @property - def error(self) -> float: - """The error of interest.""" - return 1 - self.val_balanced_accuracy + val_accuracy: Metric.Value + val_cross_entropy: Metric.Value + val_balanced_accuracy: Metric.Value - @property - def test_score(self) -> float: - """The score on the test set.""" - return self.test_balanced_accuracy + test_cross_entropy: Metric.Value + test_balanced_accuracy: Metric.Value - @property - def test_error(self) -> float: - """The score on the test set.""" - return 1 - self.test_balanced_accuracy - - @property - def val_score(self) -> float: - """The score on the validation set.""" - return self.val_balanced_accuracy - - @property - def val_error(self) -> float: - """The score on the validation set.""" - return 1 - self.val_balanced_accuracy - - @property - def cost(self) -> float: - """Time taken in seconds to train the config (assumed to be seconds).""" - return self.time + time: Metric.Value # unit? 
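# ---- Editor's sketch (not part of the patch): switching metrics on LCBenchResult ----
# With `metric_defs` declared on `LCBenchResult`, what `.error`/`.score`/`.cost`
# report is no longer hard-coded; it follows the selected value/cost metric. A minimal
# sketch assuming the metric names and defaults shown above; the raw numbers are
# invented purely for illustration.
from mfpbench.yahpo.benchmarks.lcbench import LCBenchConfig, LCBenchResult

config = LCBenchConfig.from_dict(
    {
        "batch_size": 128,
        "learning_rate": 1e-3,
        "momentum": 0.9,
        "weight_decay": 1e-4,
        "num_layers": 3,
        "max_units": 256,
        "max_dropout": 0.2,
    },
)
raw = {
    "val_accuracy": 91.2,
    "val_balanced_accuracy": 90.1,
    "val_cross_entropy": 0.31,
    "test_balanced_accuracy": 88.7,
    "test_cross_entropy": 0.35,
    "time": 123.4,
}

# Defaults: value metric "val_balanced_accuracy", cost metric "time".
by_default = LCBenchResult.from_dict(config=config, fidelity=52, result=raw)

# Same raw row, but judged by validation cross-entropy instead.
by_ce = LCBenchResult.from_dict(
    config=config,
    fidelity=52,
    result=raw,
    value_metric="val_cross_entropy",
)
print(by_default.error, by_ce.error)  # derived from different metrics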
class LCBenchBenchmark(YAHPOBenchmark): - fidelity_name = "epoch" - fidelity_range = (1, 52, 1) - Config = LCBenchConfig - Result = LCBenchResult - + yahpo_fidelity_range = (1, 52, 1) + yahpo_fidelity_name = "epoch" + yahpo_config_type = LCBenchConfig + yahpo_result_type = LCBenchResult yahpo_base_benchmark_name = "lcbench" yahpo_task_id_name = "OpenML_task_id" - yahpo_replacements_hps = None - yahpo_forced_remove_hps = None + yahpo_has_conditionals = False yahpo_instances = ( "3945", "7593", diff --git a/src/mfpbench/yahpo/benchmarks/nb301.py b/src/mfpbench/yahpo/benchmarks/nb301.py index 53b37e6..138b97b 100644 --- a/src/mfpbench/yahpo/benchmarks/nb301.py +++ b/src/mfpbench/yahpo/benchmarks/nb301.py @@ -1,15 +1,14 @@ from __future__ import annotations from dataclasses import asdict, dataclass -from itertools import product -from typing import Any, Mapping, TypeVar, no_type_check -from typing_extensions import Literal +from typing import Any, ClassVar, Mapping +from typing_extensions import Literal, Self -from mfpbench.yahpo.benchmark import YAHPOBenchmark -from mfpbench.yahpo.config import YAHPOConfig -from mfpbench.yahpo.result import YAHPOResult +import numpy as np -Self = TypeVar("Self", bound="NB301Config") +from mfpbench.benchmark import Config, Result +from mfpbench.metric import Metric +from mfpbench.yahpo.benchmark import YAHPOBenchmark ChoicesT = Literal[ "max_pool_3x3", @@ -35,7 +34,7 @@ @dataclass(frozen=True, eq=False, unsafe_hash=True) -class NB301Config(YAHPOConfig): +class NB301Config(Config): edge_normal_0: ChoicesT edge_normal_1: ChoicesT @@ -98,52 +97,18 @@ class NB301Config(YAHPOConfig): edge_reduce_12: ChoicesT | None = None edge_reduce_13: ChoicesT | None = None - @no_type_check - def validate(self) -> None: - """Validate this is a correct config. - - Note: - ---- - We don't check conditionals validity - """ - nodes = list(range(13 + 1)) - cells = ["normal", "reduce"] - for i, cell in product(nodes, cells): - attr_name = f"edge_{cell}_{i}" - attr = getattr(self, attr_name) - assert attr is None or attr in Choices, attr_name - - choices_3 = ["0_1", "0_2", "1_2"] - choices_4 = ["0_1", "0_2", "0_3", "1_2", "1_3", "2_3"] - choices_5 = [ - "0_1", - "0_2", - "0_3", - "0_4", - "1_2", - "1_3", - "1_4", - "2_3", - "2_4", - "3_4", - ] - - nodes = list(range(3, 5 + 1)) - for i, choices in [(3, choices_3), (4, choices_4), (5, choices_5)]: - normal_node = f"inputs_node_normal_{i}" - assert getattr(self, normal_node) in choices - - reduce_node = f"inputs_node_reduce_{i}" - assert getattr(self, reduce_node) in choices - @classmethod - def from_dict(cls: type[Self], d: Mapping[str, Any]) -> Self: + def from_dict( + cls, + d: Mapping[str, Any], + renames: Mapping[str, str] | None = None, + ) -> Self: """Create from a dict or mapping object.""" - # We just flatten things because it's way too big of a name + # We may have keys that are conditional and hence we need to flatten them config = {k.replace(_hp_name_extension, ""): v for k, v in d.items()} - return cls(**config) + return super().from_dict(config, renames) - def dict(self) -> dict[str, Any]: + def as_dict(self) -> dict[str, Any]: """Converts the config to a raw dictionary.""" return { _hp_name_extension + k: v for k, v in asdict(self).items() if v is not None @@ -151,55 +116,23 @@ def dict(self) -> dict[str, Any]: @dataclass(frozen=True) # type: ignore[misc] -class NB301Result(YAHPOResult[NB301Config, int]): - runtime: float # unit? 
- val_accuracy: float - - @property - def score(self) -> float: - """The score of interest.""" - return self.val_accuracy - - @property - def error(self) -> float: - """The error of interest.""" - return 1 - self.val_accuracy - - @property - def test_score(self) -> float: - """The score on the test set.""" - return self.val_accuracy - - @property - def test_error(self) -> float: - """The score on the test set.""" - return 1 - self.val_accuracy - - @property - def val_score(self) -> float: - """The score on the validation set.""" - return self.val_accuracy - - @property - def val_error(self) -> float: - """The score on the validation set.""" - return 1 - self.val_accuracy - - @property - def cost(self) -> float: - """Time taken in seconds to train the config.""" - return self.runtime +class NB301Result(Result[NB301Config, int]): + default_value_metric: ClassVar[str] = "val_accuracy" + default_cost_metric: ClassVar[str] = "runtime" + metric_defs: ClassVar[Mapping[str, Metric]] = { + "runtime": Metric(minimize=True, bounds=(0, np.inf)), + "val_accuracy": Metric(minimize=False, bounds=(0, 1)), + } + runtime: Metric.Value # unit? + val_accuracy: Metric.Value -class NB301Benchmark(YAHPOBenchmark): - fidelity_name = "epoch" - fidelity_range = (1, 98, 1) - Config = NB301Config - Result = NB301Result - has_conditionals = True +class NB301Benchmark(YAHPOBenchmark): + yahpo_fidelity_name = "epoch" + yahpo_fidelity_range = (1, 98, 1) + yahpo_config_type = NB301Config + yahpo_result_type = NB301Result + yahpo_has_conditionals = True yahpo_base_benchmark_name = "nb301" - yahpo_task_id_name = None yahpo_instances = ("CIFAR10",) - yahpo_replacements_hps = None - yahpo_forced_remove_hps = None diff --git a/src/mfpbench/yahpo/benchmarks/rbv2/__init__.py b/src/mfpbench/yahpo/benchmarks/rbv2/__init__.py index 723afaa..5e8b435 100644 --- a/src/mfpbench/yahpo/benchmarks/rbv2/__init__.py +++ b/src/mfpbench/yahpo/benchmarks/rbv2/__init__.py @@ -2,37 +2,30 @@ from mfpbench.yahpo.benchmarks.rbv2.rbv2_aknn import ( RBV2aknnBenchmark, RBV2aknnConfig, - RBV2aknnResult, ) from mfpbench.yahpo.benchmarks.rbv2.rbv2_glmnet import ( RBV2glmnetBenchmark, RBV2glmnetConfig, - RBV2glmnetResult, ) from mfpbench.yahpo.benchmarks.rbv2.rbv2_ranger import ( RBV2rangerBenchmark, RBV2rangerConfig, - RBV2rangerResult, ) from mfpbench.yahpo.benchmarks.rbv2.rbv2_rpart import ( RBV2rpartBenchmark, RBV2rpartConfig, - RBV2rpartResult, ) from mfpbench.yahpo.benchmarks.rbv2.rbv2_super import ( RBV2SuperBenchmark, RBV2SuperConfig, - RBV2SuperResult, ) from mfpbench.yahpo.benchmarks.rbv2.rbv2_svm import ( RBV2svmBenchmark, RBV2svmConfig, - RBV2svmResult, ) from mfpbench.yahpo.benchmarks.rbv2.rbv2_xgboost import ( RBV2xgboostBenchmark, RBV2xgboostConfig, - RBV2xgboostResult, ) __all__ = [ @@ -40,24 +33,17 @@ "RBV2Config", "RBV2Result", "RBV2SuperBenchmark", - "RBV2SuperResult", "RBV2SuperConfig", "RBV2glmnetBenchmark", - "RBV2glmnetResult", "RBV2glmnetConfig", "RBV2rangerBenchmark", - "RBV2rangerResult", "RBV2rangerConfig", "RBV2rpartBenchmark", - "RBV2rpartResult", "RBV2rpartConfig", "RBV2svmBenchmark", - "RBV2svmResult", "RBV2svmConfig", "RBV2xgboostBenchmark", - "RBV2xgboostResult", "RBV2xgboostConfig", "RBV2aknnBenchmark", - "RBV2aknnResult", "RBV2aknnConfig", ] diff --git a/src/mfpbench/yahpo/benchmarks/rbv2/rbv2.py b/src/mfpbench/yahpo/benchmarks/rbv2/rbv2.py index 59f1a80..22a83b4 100644 --- a/src/mfpbench/yahpo/benchmarks/rbv2/rbv2.py +++ b/src/mfpbench/yahpo/benchmarks/rbv2/rbv2.py @@ -1,90 +1,74 @@ from __future__ import annotations 
from dataclasses import asdict, dataclass -from typing import Any, Mapping, Sequence, TypeVar +from typing import Any, ClassVar, Mapping, TypeVar +from typing_extensions import Self +import numpy as np + +from mfpbench.benchmark import Config, Result +from mfpbench.metric import Metric from mfpbench.yahpo.benchmark import YAHPOBenchmark -from mfpbench.yahpo.config import YAHPOConfig -from mfpbench.yahpo.result import YAHPOResult C = TypeVar("C", bound="RBV2Config") R = TypeVar("R", bound="RBV2Result") @dataclass(frozen=True, eq=False, unsafe_hash=True) # type: ignore[misc] -class RBV2Config(YAHPOConfig): +class RBV2Config(Config): @classmethod - def from_dict(cls: type[C], d: Mapping[str, Any]) -> C: + def from_dict( + cls, + d: Mapping[str, Any], + renames: Mapping[str, str] | None = None, + ) -> Self: """Create from a dict or mapping object.""" # We may have keys that are conditional and hence we need to flatten them config = {k.replace(".", "__"): v for k, v in d.items()} - return cls(**config) + return super().from_dict(config, renames) - def dict(self) -> dict[str, Any]: + def as_dict(self) -> dict[str, Any]: """Converts the config to a raw dictionary.""" d = asdict(self) return {k.replace("__", "."): v for k, v in d.items() if v is not None} @dataclass(frozen=True) # type: ignore[misc] -class RBV2Result(YAHPOResult[C, float]): - # Fidelity - fidelity: float - - acc: float - bac: float - auc: float - brier: float - f1: float - logloss: float - - timetrain: float - timepredict: float - - memory: float - - @property - def score(self) -> float: - """The score of interest.""" - return self.bac - - @property - def error(self) -> float: - """The error of interest.""" - return 1 - self.bac - - @property - def test_score(self) -> float: - """The score on the test set.""" - return self.score - - @property - def test_error(self) -> float: - """The error on the test set.""" - return self.error - - @property - def val_score(self) -> float: - """The score on the validation set.""" - return self.score - - @property - def val_error(self) -> float: - """The error on the validation set.""" - return self.error - - @property - def cost(self) -> float: - """The time taken in seconds to train the config.""" - return self.timetrain - - -class RBV2Benchmark(YAHPOBenchmark): +class RBV2Result(Result[C, float]): + default_value_metric: ClassVar[str] = "bac" + default_cost_metric: ClassVar[str] = "timetrain" + metric_defs: ClassVar[Mapping[str, Metric]] = { + "acc": Metric(minimize=False, bounds=(0, 1)), + "bac": Metric(minimize=False, bounds=(0, 1)), + "auc": Metric(minimize=False, bounds=(0, 1)), + "brier": Metric(minimize=True, bounds=(0, 1)), + "f1": Metric(minimize=False, bounds=(0, 1)), + "logloss": Metric(minimize=True, bounds=(0, np.inf)), + "timetrain": Metric(minimize=True, bounds=(0, np.inf)), + "timepredict": Metric(minimize=True, bounds=(0, np.inf)), + "memory": Metric(minimize=True, bounds=(0, np.inf)), + } + + acc: Metric.Value + bac: Metric.Value + auc: Metric.Value + brier: Metric.Value + f1: Metric.Value + logloss: Metric.Value + + timetrain: Metric.Value + timepredict: Metric.Value + + memory: Metric.Value + + +class RBV2Benchmark(YAHPOBenchmark[C, RBV2Result, float]): # RVB2 class of benchmarks share train size as fidelity - fidelity_range = (0.03, 1.0, 0.05) - fidelity_name = "trainsize" + yahpo_config_type: type[C] + yahpo_result_type = RBV2Result + yahpo_fidelity_range = (0.03, 1.0, 0.05) + yahpo_fidelity_name = "trainsize" yahpo_task_id_name = "task_id" # We have to specify a repl 
number, not sure what it is but YAHPO gym fix it to 10 - yahpo_forced_remove_hps: Mapping[str, int] = {"repl": 10} - yahpo_replacements_hps: Sequence[tuple[str, str]] | None = None + yahpo_forced_remove_hps: ClassVar[Mapping[str, int]] = {"repl": 10} diff --git a/src/mfpbench/yahpo/benchmarks/rbv2/rbv2_aknn.py b/src/mfpbench/yahpo/benchmarks/rbv2/rbv2_aknn.py index ffb6043..dfce2f3 100644 --- a/src/mfpbench/yahpo/benchmarks/rbv2/rbv2_aknn.py +++ b/src/mfpbench/yahpo/benchmarks/rbv2/rbv2_aknn.py @@ -1,10 +1,9 @@ from __future__ import annotations from dataclasses import dataclass -from typing import no_type_check from typing_extensions import Literal -from mfpbench.yahpo.benchmarks.rbv2.rbv2 import RBV2Benchmark, RBV2Config, RBV2Result +from mfpbench.yahpo.benchmarks.rbv2.rbv2 import RBV2Benchmark, RBV2Config @dataclass(frozen=True, eq=False, unsafe_hash=True) @@ -16,31 +15,10 @@ class RBV2aknnConfig(RBV2Config): ef_construction: int # (7, 1097), log k: int # (1, 50) - @no_type_check - def validate(self) -> None: - """Validate this config.""" - assert self.num__impute__selected__cpo in [ - "impute.mean", - "impute.median", - "impute.hist", - ] - assert 18 <= self.M <= 50 - assert self.distance in ["l2", "cosine", "ip"] - assert 7 <= self.ef <= 403 - assert 7 <= self.ef_construction <= 1097 - assert 1 <= self.k <= 50 - - -@dataclass(frozen=True) -class RBV2aknnResult(RBV2Result): - config: RBV2aknnConfig - - -class RBV2aknnBenchmark(RBV2Benchmark): - Result = RBV2aknnResult - Config = RBV2aknnConfig - has_conditionals = False +class RBV2aknnBenchmark(RBV2Benchmark[RBV2aknnConfig]): + yahpo_config_type = RBV2aknnConfig + yahpo_has_conditionals = False yahpo_base_benchmark_name = "rbv2_aknn" yahpo_instances = ( "41138", diff --git a/src/mfpbench/yahpo/benchmarks/rbv2/rbv2_glmnet.py b/src/mfpbench/yahpo/benchmarks/rbv2/rbv2_glmnet.py index c43d19b..6c57549 100644 --- a/src/mfpbench/yahpo/benchmarks/rbv2/rbv2_glmnet.py +++ b/src/mfpbench/yahpo/benchmarks/rbv2/rbv2_glmnet.py @@ -1,10 +1,9 @@ from __future__ import annotations from dataclasses import dataclass -from typing import no_type_check from typing_extensions import Literal -from mfpbench.yahpo.benchmarks.rbv2.rbv2 import RBV2Benchmark, RBV2Config, RBV2Result +from mfpbench.yahpo.benchmarks.rbv2.rbv2 import RBV2Benchmark, RBV2Config @dataclass(frozen=True, eq=False, unsafe_hash=True) @@ -14,28 +13,10 @@ class RBV2glmnetConfig(RBV2Config): alpha: float # (0.0, 1.0) s: float # (0.0009118819655545162, 1096.6331584284585), log - @no_type_check - def validate(self) -> None: - """Validate this config.""" - assert self.num__impute__selected__cpo in [ - "impute.mean", - "impute.median", - "impute.hist", - ] - assert 0.0 <= self.alpha <= 1.0 - assert 0.0009118819655545162 <= self.s <= 1096.6331584284585 - - -@dataclass(frozen=True) -class RBV2glmnetResult(RBV2Result): - config: RBV2glmnetConfig - - -class RBV2glmnetBenchmark(RBV2Benchmark): - Result = RBV2glmnetResult - Config = RBV2glmnetConfig - has_conditionals = False +class RBV2glmnetBenchmark(RBV2Benchmark[RBV2glmnetConfig]): + yahpo_config_type = RBV2glmnetConfig + yahpo_has_conditionals = False yahpo_base_benchmark_name = "rbv2_glmnet" yahpo_instances = ( "41138", diff --git a/src/mfpbench/yahpo/benchmarks/rbv2/rbv2_ranger.py b/src/mfpbench/yahpo/benchmarks/rbv2/rbv2_ranger.py index 4ffe055..c99251a 100644 --- a/src/mfpbench/yahpo/benchmarks/rbv2/rbv2_ranger.py +++ b/src/mfpbench/yahpo/benchmarks/rbv2/rbv2_ranger.py @@ -1,10 +1,9 @@ from __future__ import annotations from dataclasses import 
dataclass -from typing import no_type_check from typing_extensions import Literal -from mfpbench.yahpo.benchmarks.rbv2.rbv2 import RBV2Benchmark, RBV2Config, RBV2Result +from mfpbench.yahpo.benchmarks.rbv2.rbv2 import RBV2Benchmark, RBV2Config @dataclass(frozen=True, eq=False, unsafe_hash=True) @@ -20,41 +19,11 @@ class RBV2rangerConfig(RBV2Config): num__random__splits: int | None = None # (1, 100) - @no_type_check - def validate(self) -> None: - """Validate this config.""" - assert self.num__impute__selected__cpo in [ - "impute.mean", - "impute.median", - "impute.hist", - ] - assert 1 <= self.min__node__size <= 100 - assert 0 <= self.mtry__power <= 1 - assert 1 <= self.num__trees <= 2000 - assert self.respect__unordered__factors in [ - "ignore", - "order", - "partition", - ] - assert 0.1 <= self.sample__fraction <= 1.0 - assert self.splitrule in ["gini", "extratrees"] - if self.num__random__splits is not None: - assert self.splitrule == "extratrees" - assert 1 <= self.num__random__splits <= 100 - - -@dataclass(frozen=True) -class RBV2rangerResult(RBV2Result): - config: RBV2rangerConfig - - -class RBV2rangerBenchmark(RBV2Benchmark): +class RBV2rangerBenchmark(RBV2Benchmark[RBV2rangerConfig]): + yahpo_config_type = RBV2rangerConfig yahpo_base_benchmark_name = "rbv2_ranger" - Result = RBV2rangerResult - Config = RBV2rangerConfig - has_conditionals = True - + yahpo_has_conditionals = True yahpo_instances = ( "4135", "40981", diff --git a/src/mfpbench/yahpo/benchmarks/rbv2/rbv2_rpart.py b/src/mfpbench/yahpo/benchmarks/rbv2/rbv2_rpart.py index 4347b98..dc05da4 100644 --- a/src/mfpbench/yahpo/benchmarks/rbv2/rbv2_rpart.py +++ b/src/mfpbench/yahpo/benchmarks/rbv2/rbv2_rpart.py @@ -1,10 +1,9 @@ from __future__ import annotations from dataclasses import dataclass -from typing import no_type_check from typing_extensions import Literal -from mfpbench.yahpo.benchmarks.rbv2.rbv2 import RBV2Benchmark, RBV2Config, RBV2Result +from mfpbench.yahpo.benchmarks.rbv2.rbv2 import RBV2Benchmark, RBV2Config @dataclass(frozen=True, eq=False, unsafe_hash=True) @@ -16,30 +15,10 @@ class RBV2rpartConfig(RBV2Config): minbucket: int # (1, 100) minsplit: int # (1, 100) - @no_type_check - def validate(self) -> None: - """Validate this config.""" - assert self.num__impute__selected__cpo in [ - "impute.mean", - "impute.median", - "impute.hist", - ] - assert 0.0009118819655545162 <= self.cp <= 1.0 - assert 1 <= self.maxdepth <= 30 - assert 1 <= self.minbucket <= 100 - assert 1 <= self.minsplit <= 100 - - -@dataclass(frozen=True) -class RBV2rpartResult(RBV2Result): - config: RBV2rpartConfig - - -class RBV2rpartBenchmark(RBV2Benchmark): - Result = RBV2rpartResult - Config = RBV2rpartConfig - has_conditionals = False +class RBV2rpartBenchmark(RBV2Benchmark[RBV2rpartConfig]): + yahpo_config_type = RBV2rpartConfig + yahpo_has_conditionals = False yahpo_base_benchmark_name = "rbv2_rpart" yahpo_instances = ( "41138", diff --git a/src/mfpbench/yahpo/benchmarks/rbv2/rbv2_super.py b/src/mfpbench/yahpo/benchmarks/rbv2/rbv2_super.py index e3f3bac..fcd752e 100644 --- a/src/mfpbench/yahpo/benchmarks/rbv2/rbv2_super.py +++ b/src/mfpbench/yahpo/benchmarks/rbv2/rbv2_super.py @@ -1,10 +1,9 @@ from __future__ import annotations from dataclasses import dataclass -from typing import no_type_check from typing_extensions import Literal -from mfpbench.yahpo.benchmarks.rbv2.rbv2 import RBV2Benchmark, RBV2Config, RBV2Result +from mfpbench.yahpo.benchmarks.rbv2.rbv2 import RBV2Benchmark, RBV2Config @dataclass(frozen=True, eq=False, unsafe_hash=True) 
@@ -65,172 +64,10 @@ class RBV2SuperConfig(RBV2Config): xgboost__skip_drop: float | None = None # (0.0, 1.0) xgboost__subsample: float | None = None # (0.1, 1.0) - @no_type_check - def validate(self) -> None: # noqa: C901, PLR0915, PLR0912 - """Validate this config.""" - assert self.learner_id in [ - "aknn", - "glmnet", - "ranger", - "rpart", - "svm", - "xgboost", - ] - - assert self.num__impute__selected__cpo in [ - "impute.mean", - "impute.median", - "impute.hist", - ] - - # We do some conditional checking here - learner = self.learner_id - - # We filter out all attributes except for those that must always be contained - # or are the selected learner, ... - attrs = [ - attr - for attr in dir(self) - if not attr.startswith("__") - or not attr.startswith(learner) - or attr in ["learner_id", "num__impute__selected__cpo"] - ] - - # ... the remaining must always have None set then - for attr in attrs: - assert attr is None - - if learner == "aknn": - assert self.aknn__M is not None - assert self.aknn__ef is not None - assert self.aknn__ef_construction is not None - assert self.aknn__k is not None - assert 18 <= self.aknn__M <= 50 - assert self.aknn__distance in ["l2", "cosine", "ip"] - assert 7 <= self.aknn__ef <= 403 - assert 7 <= self.aknn__ef_construction <= 1097 - assert 1 <= self.aknn__k <= 50 - - elif learner == "glmnet": - assert self.glmnet__alpha is not None - assert self.glmnet__s is not None - assert 0.0 <= self.glmnet__alpha <= 1.0 - assert 0.0009118819655545162 <= self.glmnet__s <= 1096.6331584284585 - - elif learner == "rpart": - assert self.rpart__cp is not None - assert self.rpart__maxdepth is not None - assert self.rpart__minbucket is not None - assert self.rpart__minsplit is not None - assert 0.0009118819655545162 <= self.rpart__cp <= 1.0 - assert 1 <= self.rpart__maxdepth <= 30 - assert 1 <= self.rpart__minbucket <= 100 - assert 1 <= self.rpart__minsplit <= 100 - - elif learner == "ranger": - assert self.ranger__min__node__size is not None - assert self.ranger__mtry__power is not None - assert self.ranger__num__trees is not None - assert self.ranger__respect__unordered__factors is not None - assert self.ranger__sample__fraction is not None - assert 1 <= self.ranger__min__node__size <= 100 - assert 0 <= self.ranger__mtry__power <= 1 - assert 1 <= self.ranger__num__trees <= 2000 - assert self.ranger__respect__unordered__factors in [ - "ignore", - "order", - "partition", - ] - assert 0.1 <= self.ranger__sample__fraction <= 1.0 - assert self.ranger__splitrule in ["gini", "extratrees"] - - if self.ranger__num__random__splits is not None: - assert self.ranger__splitrule == "extratrees" - assert 1 <= self.ranger__num__random__splits <= 100 - - elif learner == "svm": - assert self.svm__cost is not None - assert self.svm__gamma is not None - assert self.svm__kernel is not None - assert self.svm__tolerance is not None - - assert 4.5399929762484854e-05 <= self.svm__cost <= 22026.465794806718 - assert 4.5399929762484854e-05 <= self.svm__gamma <= 22026.465794806718 - assert self.svm__kernel in ["linear", "polynomial", "radial"] - assert 4.5399929762484854e-05 <= self.svm__tolerance <= 2.0 - - if self.svm__degree is not None: - assert 2 <= self.svm__degree <= 5 - assert self.svm__kernel == "polynomial" - - if self.svm__gamma is not None: - assert 4.5399929762484854e-05 <= self.svm__gamma <= 22026.465794806718 - assert self.svm__kernel == "radial" - - elif learner == "xgboost": - assert self.xgboost__alpha is not None - assert self.xgboost__booster is not None - assert 
self.xgboost__lambda is not None
-            assert self.xgboost__nrounds is not None
-            assert self.xgboost__subsample is not None
-
-            assert self.xgboost__booster in ["gblinear", "gbtree", "dart"]
-            assert 0.0009118819655545162 <= self.xgboost__alpha <= 1096.6331584284585
-            assert 0.0009118819655545162 <= self.xgboost__lambda <= 1096.6331584284585
-            assert 7 <= self.xgboost__nrounds <= 2981
-            assert 0.1 <= self.xgboost__subsample <= 1.0
-
-            if self.xgboost__colsample_bylevel is not None:
-                assert self.xgboost__booster in ["dart", "gbtree"]
-                assert 0.01 <= self.xgboost__colsample_bylevel <= 1.0
-
-            if self.xgboost__colsample_bytree is not None:
-                assert self.xgboost__booster in ["dart", "gbtree"]
-                assert 0.01 <= self.xgboost__colsample_bytree <= 1.0
-
-            if self.xgboost__eta is not None:
-                assert self.xgboost__booster in ["dart", "gbtree"]
-                assert 0.0009118819655545162 <= self.xgboost__eta <= 1.0
-
-            if self.xgboost__gamma is not None:
-                assert self.xgboost__booster in ["dart", "gbtree"]
-                assert 4.5399929762484854e-05 <= self.xgboost__gamma <= 7.38905609893065
-
-            if self.xgboost__max_depth is not None:
-                assert self.xgboost__booster in ["dart", "gbtree"]
-                assert 1 <= self.xgboost__max_depth <= 15
-
-            if self.xgboost__min_child_weight is not None:
-                assert self.xgboost__booster in ["dart", "gbtree"]
-                assert (
-                    2.718281828459045
-                    <= self.xgboost__min_child_weight
-                    <= 148.4131591025766
-                )
-
-            if self.xgboost__rate_drop is not None:
-                assert self.xgboost__booster in ["dart"]
-                assert 0.0 <= self.xgboost__rate_drop <= 1.0
-
-            if self.xgboost__skip_drop is not None:
-                assert self.xgboost__booster in ["dart"]
-                assert 0.0 <= self.xgboost__skip_drop <= 1.0
-
-        else:
-            raise NotImplementedError()
-
-
-@dataclass(frozen=True)
-class RBV2SuperResult(RBV2Result):
-    config: RBV2SuperConfig
-
-
-class RBV2SuperBenchmark(RBV2Benchmark):
-    Result = RBV2SuperResult
-    Config = RBV2SuperConfig
-
-    has_conditionals = True
+class RBV2SuperBenchmark(RBV2Benchmark[RBV2SuperConfig]):
+    yahpo_config_type = RBV2SuperConfig
+    yahpo_has_conditionals = True
     yahpo_base_benchmark_name = "rbv2_super"
     yahpo_instances = (
         "41138",
diff --git a/src/mfpbench/yahpo/benchmarks/rbv2/rbv2_svm.py b/src/mfpbench/yahpo/benchmarks/rbv2/rbv2_svm.py
index 7c5801d..65865dd 100644
--- a/src/mfpbench/yahpo/benchmarks/rbv2/rbv2_svm.py
+++ b/src/mfpbench/yahpo/benchmarks/rbv2/rbv2_svm.py
@@ -1,10 +1,9 @@
 from __future__ import annotations
 
 from dataclasses import dataclass
-from typing import no_type_check
 from typing_extensions import Literal
 
-from mfpbench.yahpo.benchmarks.rbv2.rbv2 import RBV2Benchmark, RBV2Config, RBV2Result
+from mfpbench.yahpo.benchmarks.rbv2.rbv2 import RBV2Benchmark, RBV2Config
 
 
 @dataclass(frozen=True, eq=False, unsafe_hash=True)
@@ -17,39 +16,10 @@ class RBV2svmConfig(RBV2Config):
     tolerance: float  # (4.5399929762484854e-05, 2.0) log
     kernel: Literal["linear", "polynomial", "radial"] | None = None
 
-    @no_type_check
-    def validate(self) -> None:
-        """Validate this config."""
-        assert self.num__impute__selected__cpo in [
-            "impute.mean",
-            "impute.median",
-            "impute.hist",
-        ]
-
-        assert 4.5399929762484854e-05 <= self.cost <= 22026.465794806718
-        assert 4.5399929762484854e-05 <= self.gamma <= 22026.465794806718
-        assert self.kernel in ["linear", "polynomial", "radial"]
-        assert 4.5399929762484854e-05 <= self.tolerance <= 2.0
-
-        if self.degree is not None:
-            assert 2 <= self.degree <= 5
-            assert self.kernel == "polynomial"
-
-        if self.gamma is not None:
-            assert 4.5399929762484854e-05 <= self.gamma <= 22026.465794806718
- assert self.kernel == "radial" - - -@dataclass(frozen=True) -class RBV2svmResult(RBV2Result): - config: RBV2svmConfig - - -class RBV2svmBenchmark(RBV2Benchmark): - Result = RBV2svmResult - Config = RBV2svmConfig - has_conditionals = True +class RBV2svmBenchmark(RBV2Benchmark[RBV2svmConfig]): + yahpo_config_type = RBV2svmConfig + yahpo_has_conditionals = True yahpo_base_benchmark_name = "rbv2_svm" yahpo_instances = ( "41138", diff --git a/src/mfpbench/yahpo/benchmarks/rbv2/rbv2_xgboost.py b/src/mfpbench/yahpo/benchmarks/rbv2/rbv2_xgboost.py index dae6f3e..361e8c3 100644 --- a/src/mfpbench/yahpo/benchmarks/rbv2/rbv2_xgboost.py +++ b/src/mfpbench/yahpo/benchmarks/rbv2/rbv2_xgboost.py @@ -1,10 +1,10 @@ from __future__ import annotations from dataclasses import dataclass -from typing import no_type_check +from typing import ClassVar, Mapping from typing_extensions import Literal -from mfpbench.yahpo.benchmarks.rbv2.rbv2 import RBV2Benchmark, RBV2Config, RBV2Result +from mfpbench.yahpo.benchmarks.rbv2.rbv2 import RBV2Benchmark, RBV2Config @dataclass(frozen=True, eq=False, unsafe_hash=True) @@ -27,66 +27,12 @@ class RBV2xgboostConfig(RBV2Config): rate_drop: float | None = None # (0.0, 1.0) skip_drop: float | None = None # (0.0, 1.0) - @no_type_check - def validate(self) -> None: - """Validate this config.""" - assert self.booster in ["gblinear", "gbtree", "dart"] - assert 0.0009118819655545162 <= self.alpha <= 1096.6331584284585 - assert 0.0009118819655545162 <= self._lambda <= 1096.6331584284585 - assert 7 <= self.nrounds <= 2981 - assert 0.1 <= self.subsample <= 1.0 - - if self.colsample_bylevel is not None: - assert self.booster in ["dart", "gbtree"] - assert 0.01 <= self.colsample_bylevel <= 1.0 - - if self.colsample_bytree is not None: - assert self.booster in ["dart", "gbtree"] - assert 0.01 <= self.colsample_bytree <= 1.0 - - if self.eta is not None: - assert self.booster in ["dart", "gbtree"] - assert 0.0009118819655545162 <= self.eta <= 1.0 - - if self.gamma is not None: - assert self.booster in ["dart", "gbtree"] - assert 4.5399929762484854e-05 <= self.gamma <= 7.38905609893065 - - if self.max_depth is not None: - assert self.booster in ["dart", "gbtree"] - assert 1 <= self.max_depth <= 15 - - if self.min_child_weight is not None: - assert self.booster in ["dart", "gbtree"] - assert 2.718281828459045 <= self.min_child_weight <= 148.4131591025766 - - if self.rate_drop is not None: - assert self.booster in ["dart"] - assert 0.0 <= self.rate_drop <= 1.0 - - if self.skip_drop is not None: - assert self.booster in ["dart"] - assert 0.0 <= self.skip_drop <= 1.0 - - assert self.num__impute__selected__cpo in [ - "impute.mean", - "impute.median", - "impute.hist", - ] - - -@dataclass(frozen=True) -class RBV2xgboostResult(RBV2Result): - config: RBV2xgboostConfig - - -class RBV2xgboostBenchmark(RBV2Benchmark): - Result = RBV2xgboostResult - Config = RBV2xgboostConfig - has_conditionals = True +class RBV2xgboostBenchmark(RBV2Benchmark[RBV2xgboostConfig]): + _config_renames: ClassVar[Mapping[str, str]] = {"lambda": "_lambda"} + yahpo_config_type = RBV2xgboostConfig + yahpo_has_conditionals = True yahpo_base_benchmark_name = "rbv2_xgboost" - yahpo_replacements_hps = (("_lambda", "lambda"),) yahpo_instances = ( "16", "40923", diff --git a/src/mfpbench/yahpo/config.py b/src/mfpbench/yahpo/config.py deleted file mode 100644 index 540e012..0000000 --- a/src/mfpbench/yahpo/config.py +++ /dev/null @@ -1,13 +0,0 @@ -from __future__ import annotations - -from dataclasses import dataclass -from typing import 
TypeVar - -from mfpbench.config import Config - -Self = TypeVar("Self", bound="YAHPOConfig") - - -@dataclass(frozen=True, eq=False, unsafe_hash=True) # type: ignore[misc] -class YAHPOConfig(Config): - ... diff --git a/src/mfpbench/yahpo/result.py b/src/mfpbench/yahpo/result.py deleted file mode 100644 index 96caea3..0000000 --- a/src/mfpbench/yahpo/result.py +++ /dev/null @@ -1,15 +0,0 @@ -from __future__ import annotations - -from dataclasses import dataclass -from typing import TypeVar - -from mfpbench.result import Result -from mfpbench.yahpo.config import YAHPOConfig - -C = TypeVar("C", bound=YAHPOConfig) -F = TypeVar("F", int, float) - - -@dataclass(frozen=True) # type: ignore[misc] -class YAHPOResult(Result[C, F]): - ... diff --git a/tests/test_benchmarks.py b/tests/test_benchmarks.py index ebbb1c8..2a47ab5 100644 --- a/tests/test_benchmarks.py +++ b/tests/test_benchmarks.py @@ -3,8 +3,9 @@ from dataclasses import dataclass from itertools import product from pathlib import Path -from typing import Any +from typing import Any, ClassVar, Mapping +import numpy as np import pandas as pd import pytest from pytest_cases import ( @@ -18,9 +19,10 @@ import mfpbench from mfpbench import ( Benchmark, - GenericTabularBenchmark, - MFHartmannBenchmark, + Metric, + Result, TabularBenchmark, + TabularConfig, YAHPOBenchmark, ) from mfpbench.setup_benchmark import download_status @@ -85,7 +87,7 @@ def case_pd1() -> BenchmarkTest: download_status("lcbench-tabular") is False, reason="lcbench-tabular is not downloaded", ) -@case +@case(tags="tabular") def case_lcbench_tabular() -> BenchmarkTest: return BenchmarkTest("lcbench_tabular", kwargs={"task_id": "adult"}) @@ -95,7 +97,7 @@ def case_mfh() -> BenchmarkTest: return BenchmarkTest("mfh3_good", prior="good") -@case(tags="generic_tabular") +@case(tags="tabular") def case_generic_tabular() -> BenchmarkTest: ids = "abcdefghijklmnopqrstuvwxyz" colors = ["red", "green", "blue"] @@ -108,12 +110,12 @@ def case_generic_tabular() -> BenchmarkTest: pd.DataFrame( [ { - "config": k, + "id": k, "color": c, "shape": s, "animal": a, "number": n, - "float": f, + "ffloat": f, "balanced_accuracy": v, "fidelity": fid, } @@ -123,23 +125,35 @@ def case_generic_tabular() -> BenchmarkTest: for k, (c, s, a, n, f) in zip(ids, config_values) ] df = pd.concat(values, ignore_index=True) - benchmark = GenericTabularBenchmark( - df, + + @dataclass(frozen=True) + class MyResult(Result): + default_value_metric: ClassVar[str] = "balanced_accuracy" + default_cost_metric: ClassVar[str] = "ffloat" + metric_defs: ClassVar[Mapping[str, Metric]] = { + "balanced_accuracy": Metric(minimize=False, bounds=(0, 1)), + "ffloat": Metric(minimize=True, bounds=(0, np.inf)), + } + + balanced_accuracy: Metric.Value + ffloat: Metric.Value + + @dataclass(frozen=True, eq=False, unsafe_hash=True) + class MyConfig(TabularConfig): + id: str | None + color: str + shape: str + animal: str + number: int + + benchmark = TabularBenchmark( name="testdata", - id_key="config", + table=df, + id_key="id", fidelity_key="fidelity", - config_keys=["color", "shape"], - result_keys=["balanced_accuracy"], - result_mapping={ - "error": lambda df: 1 - df["balanced_accuracy"], - "val_error": lambda df: 1 - df["balanced_accuracy"], - "test_error": lambda df: 1 - df["balanced_accuracy"], - "score": lambda df: df["balanced_accuracy"], - "val_score": lambda df: df["balanced_accuracy"], - "test_score": lambda df: df["balanced_accuracy"], - "cost": lambda df: df["float"], - }, - remove_constants=True, + config_type=MyConfig, + 
result_type=MyResult, + seed=1, ) return BenchmarkTest(benchmark.name, benchmark=benchmark) @@ -159,19 +173,18 @@ def benchmark(item: BenchmarkTest) -> Benchmark: @parametrize("n_samples", [1, 2, 3]) -def test_benchmark_sampling(benchmark: Benchmark, n_samples: int) -> None: +def test_benchmark_sampling( + benchmark: Benchmark, + n_samples: int, +) -> None: config = benchmark.sample() assert isinstance(config, benchmark.Config) - config.validate() configs = benchmark.sample(n_samples) assert len(configs) == n_samples for config in configs: assert isinstance(config, benchmark.Config) - for config in configs: - config.validate() - def test_query_api_validity(benchmark: Benchmark) -> None: sample = benchmark.sample() @@ -179,7 +192,7 @@ def test_query_api_validity(benchmark: Benchmark) -> None: assert result.config == sample - sample_dict = sample.dict() + sample_dict = sample.as_dict() result = benchmark.query(sample_dict) assert result.config == sample_dict @@ -188,20 +201,14 @@ def test_result_api_validity(benchmark: Benchmark) -> None: sample = benchmark.sample() result = benchmark.query(sample) - # MFHartmanns don't have scores - if not isinstance(benchmark, MFHartmannBenchmark): - assert result.score is not None - assert result.test_score is not None - assert result.val_score is not None - assert result.error is not None - assert result.test_error is not None - assert result.val_error is not None assert result.fidelity is not None assert result.cost is not None -def test_query_through_entire_fidelity_range(benchmark: Benchmark) -> None: +def test_query_through_entire_fidelity_range( + benchmark: Benchmark, +) -> None: config = benchmark.sample() results = [benchmark.query(config, at=x) for x in benchmark.iter_fidelities()] @@ -221,6 +228,50 @@ def test_repeated_query(benchmark: Benchmark) -> None: assert r1 == r2, f"{r1}\n{r2}" +def test_metric_optimums(benchmark: Benchmark) -> None: + configs = benchmark.sample(20) + + for config in configs: + result = benchmark.query(config, at=benchmark.end) + for k in benchmark.Result.metric_defs: + assert result[k].score <= benchmark.metric_optimums[k].score + assert result[k].error >= benchmark.metric_optimums[k].error + + +@parametrize_with_cases("item", cases=case_generic_tabular) +def test_table_optimums(item: BenchmarkTest) -> None: + bench: TabularBenchmark = item.benchmark # type: ignore + assert bench is not None + table = bench.table + for k, metric in bench.metrics.items(): + values = [metric.as_value(v) for v in table[k]] + scores = np.array([v.score for v in values]) + errors = np.array([v.error for v in values]) + optimum_score = bench.metric_optimums[k].score + optimum_error = bench.metric_optimums[k].error + assert np.all(scores <= optimum_score) + assert np.all(errors >= optimum_error) + + +def test_with_different_value_metric( + benchmark: Benchmark, +) -> None: + result_type = benchmark.Result + + value_choices = list(result_type.metric_defs.keys()) + cost_choices = list(result_type.metric_defs.keys()) + + for value_metric, cost_metric in product(value_choices, cost_choices): + config = benchmark.sample() + result = benchmark.query( + config, + value_metric=value_metric, + cost_metric=cost_metric, + ) + assert result.value_metric == value_metric + assert result.cost_metric == cost_metric + + def test_repeated_trajectory(benchmark: Benchmark) -> None: configs = benchmark.sample(10) @@ -231,7 +282,9 @@ def test_repeated_trajectory(benchmark: Benchmark) -> None: assert r1 == r2, f"{r1}\n{r2}" -def 
test_query_default_is_max_fidelity(benchmark: Benchmark) -> None: +def test_query_default_is_max_fidelity( + benchmark: Benchmark, +) -> None: config = benchmark.sample() r1 = benchmark.query(config, at=benchmark.end) r2 = benchmark.query(config) @@ -239,7 +292,9 @@ def test_query_default_is_max_fidelity(benchmark: Benchmark) -> None: assert r1 == r2 -def test_query_same_as_trajectory(benchmark: Benchmark) -> None: +def test_query_same_as_trajectory( + benchmark: Benchmark, +) -> None: config = benchmark.sample() if isinstance(benchmark, YAHPOBenchmark): pytest.skip( @@ -254,7 +309,9 @@ def test_query_same_as_trajectory(benchmark: Benchmark) -> None: assert qr == tr, f"{qr}\n{tr}" -def test_trajectory_is_over_full_range_by_default(benchmark: Benchmark) -> None: +def test_trajectory_is_over_full_range_by_default( + benchmark: Benchmark, +) -> None: config = benchmark.sample() results = benchmark.trajectory(config) @@ -262,14 +319,18 @@ def test_trajectory_is_over_full_range_by_default(benchmark: Benchmark) -> None: assert r.fidelity == fidelity -def test_configs_hashable_and_unique(benchmark: Benchmark) -> None: +def test_configs_hashable_and_unique( + benchmark: Benchmark, +) -> None: configs = benchmark.sample(10) s = set(configs) assert len(s) == len(configs) -def test_results_hashable_and_unique(benchmark: Benchmark) -> None: +def test_results_hashable_and_unique( + benchmark: Benchmark, +) -> None: configs = benchmark.sample(10) results = [benchmark.query(c) for c in configs] @@ -277,27 +338,16 @@ def test_results_hashable_and_unique(benchmark: Benchmark) -> None: assert len(s) == len(results) -def test_argmin_query(benchmark: Benchmark) -> None: - # Get a random configuration - random_config = benchmark.sample() - - # Get the argmax - argmin_config = benchmark.query(random_config, argmin="error") - - # Get the trajectory - trajectory = benchmark.trajectory(random_config) - best_in_trajectory = min(trajectory, key=lambda x: x.error) - - assert argmin_config == best_in_trajectory - - -def test_config_with_same_content_hashes_correctly(benchmark: Benchmark) -> None: +def test_config_with_same_content_hashes_correctly( + benchmark: Benchmark, +) -> None: config = benchmark.sample() if isinstance(benchmark, TabularBenchmark): - config_dict = config.dict(with_id=True) + assert isinstance(config, TabularConfig) + config_dict = config.as_dict(with_id=True) else: - config_dict = config.dict() + config_dict = config.as_dict() # Turn it into a dict and back again new_config = benchmark.Config.from_dict(config_dict) @@ -305,7 +355,9 @@ def test_config_with_same_content_hashes_correctly(benchmark: Benchmark) -> None assert hash(config) == hash(new_config) -def test_result_with_same_content_hashes_correctly(benchmark: Benchmark) -> None: +def test_result_with_same_content_hashes_correctly( + benchmark: Benchmark, +) -> None: config = benchmark.sample() result = benchmark.query(config) @@ -313,7 +365,7 @@ def test_result_with_same_content_hashes_correctly(benchmark: Benchmark) -> None new_result = benchmark.Result.from_dict( config=config, fidelity=result.fidelity, - result=result.dict(), + result=result.as_dict(), ) assert hash(result) == hash(new_result) @@ -324,18 +376,21 @@ def test_result_same_value_but_different_fidelity_has_different_hash( ) -> None: config = benchmark.sample() result = benchmark.query(config) + result_dict = result.as_dict() # Turn it into a dict and back again new_result = benchmark.Result.from_dict( config=config, fidelity=result.fidelity - 1, - result=result.dict(), + 
result=result_dict, + value_metric=result.value_metric, + cost_metric=result.cost_metric, ) assert hash(result) != hash(new_result) -@parametrize_with_cases("item", cases=".", has_tag=~ft.has_tag("generic_tabular")) +@parametrize_with_cases("item", cases=".", has_tag=~ft.has_tag("tabular")) def test_prior_from_yaml_file(item: BenchmarkTest, tmp_path: Path) -> None: params = item.unpack() bench = mfpbench.get(**params) @@ -355,7 +410,7 @@ def test_prior_from_yaml_file(item: BenchmarkTest, tmp_path: Path) -> None: assert default == random_config, f"{random_config}, {default}" -@parametrize_with_cases("item", cases=".", has_tag=~ft.has_tag("generic_tabular")) +@parametrize_with_cases("item", cases=".", has_tag=~ft.has_tag("tabular")) def test_prior_from_json_file(item: BenchmarkTest, tmp_path: Path) -> None: params = item.unpack() bench = mfpbench.get(**params) @@ -375,7 +430,7 @@ def test_prior_from_json_file(item: BenchmarkTest, tmp_path: Path) -> None: assert default == random_config, f"{random_config}, {default}" -@parametrize_with_cases("item", cases=".", has_tag=~ft.has_tag("generic_tabular")) +@parametrize_with_cases("item", cases=".", has_tag=~ft.has_tag("tabular")) def test_prior_from_config(item: BenchmarkTest) -> None: params = item.unpack() bench = mfpbench.get(**params) @@ -392,7 +447,7 @@ def test_prior_from_config(item: BenchmarkTest) -> None: assert default == random_config, f"{random_config}, {default}" -@parametrize_with_cases("item", cases=".", has_tag=~ft.has_tag("generic_tabular")) +@parametrize_with_cases("item", cases=".", has_tag=~ft.has_tag("tabular")) def test_prior_from_dict(item: BenchmarkTest) -> None: params = item.unpack() bench = mfpbench.get(**params) @@ -400,7 +455,7 @@ def test_prior_from_dict(item: BenchmarkTest) -> None: # Get a random config random_config = bench.sample() # Use the path of the saved config as the prior config - prior_config = random_config.dict() + prior_config = random_config.as_dict() params["prior"] = prior_config @@ -412,3 +467,37 @@ def test_prior_from_dict(item: BenchmarkTest) -> None: # The default configuration for the benchmark should be the same as the prior default = bench.space.get_default_configuration() assert default == random_config, f"{random_config}, {default}" + + +@pytest.mark.skipif( + download_status("lcbench-tabular") is False, + reason="lcbench-tabular is not downloaded", +) +def explicit_test_with_different_value_metric() -> None: + lcbench_tabular_1 = mfpbench.get( + "lcbench_tabular", + task_id="adult", + cost_metric="time", + value_metric="val_accuracy", + ) + lcbench_tabular_2 = mfpbench.get( + "lcbench_tabular", + task_id="adult", + cost_metric="time", + value_metric="val_balanced_accuracy", + ) + + config_1 = lcbench_tabular_1.sample() + config_2 = lcbench_tabular_2.sample() + + result_1 = lcbench_tabular_1.query(config_1) + result_2 = lcbench_tabular_2.query(config_2) + + assert result_1.value_metric == "val_accuracy" + assert result_2.value_metric == "val_balanced_accuracy" + + assert result_1.error != result_2.error + assert result_1.score != result_2.score + + # Same cost metric, only has one + assert result_1.cost == result_2.cost diff --git a/tests/test_hartmann.py b/tests/test_hartmann.py index 2ab16ee..f20de93 100644 --- a/tests/test_hartmann.py +++ b/tests/test_hartmann.py @@ -56,7 +56,7 @@ def test_hartmann_priors_with_and_without_noise_added( assert isinstance(bench_no_noise._prior_arg, str) # All values different - for v1, v2 in zip(clean_prior.dict().values(), noisy_prior.dict().values()): + for 
v1, v2 in zip(clean_prior.as_dict().values(), noisy_prior.as_dict().values()): assert v1 != v2 # configspace seeded with these priors @@ -79,8 +79,7 @@ def test_hartmann_priors_noise_in_bounds( config = bench.prior assert config is not None - config.validate() - for x in config.dict().values(): + for x in config.as_dict().values(): assert 0 <= x <= 1 diff --git a/tests/test_metric.py b/tests/test_metric.py new file mode 100644 index 0000000..cdcd43f --- /dev/null +++ b/tests/test_metric.py @@ -0,0 +1,96 @@ +from __future__ import annotations + +import numpy as np +import pytest +from pytest_cases import case, parametrize_with_cases + +from mfpbench.metric import Metric, OutOfBoundsError + +# NOTE: Each case returns the Metric, the value to use and a tuple of (score, error) + + +# MINIMIZE +@case +def case_metric_minimize_unbounded() -> tuple[Metric, float, tuple[float, float]]: + metric = Metric(minimize=True) + return metric, 0.5, (-0.5, 0.5) + + +@case +def case_metric_minimize_lower_bounded() -> tuple[Metric, float, tuple[float, float]]: + metric = Metric(minimize=True, bounds=(-1, np.inf)) + return metric, 0.5, (-0.5, 0.5) + + +@case +def case_metric_minimize_upper_bounded() -> tuple[Metric, float, tuple[float, float]]: + metric = Metric(minimize=True, bounds=(-np.inf, 1)) + return metric, 0.5, (-0.5, 0.5) + + +@case +def case_metric_minimize_bounded() -> tuple[Metric, float, tuple[float, float]]: + metric = Metric(minimize=True, bounds=(-1, 1)) + return metric, 0.5, (0.25, 0.75) + + +# MAXIMIZE +@case +def case_metric_maximize_unbounded() -> tuple[Metric, float, tuple[float, float]]: + metric = Metric(minimize=False) + return metric, 0.5, (0.5, -0.5) + + +@case +def case_metric_maximize_lower_bounded() -> tuple[Metric, float, tuple[float, float]]: + metric = Metric(minimize=False, bounds=(-1, np.inf)) + return metric, 0.5, (0.5, -0.5) + + +@case +def case_metric_maximize_upper_bounded() -> tuple[Metric, float, tuple[float, float]]: + metric = Metric(minimize=False, bounds=(-np.inf, 1)) + return (metric, 0.25, (0.25, -0.25)) + + +@case +def case_metric_maximize_bounded() -> tuple[Metric, float, tuple[float, float]]: + metric = Metric(minimize=False, bounds=(-1, 1)) + return (metric, 0.5, (0.75, 0.25)) + + +@parametrize_with_cases("metric, value, expected", cases=".") +def test_metric_error( + metric: Metric, + value: float, + expected: tuple[float, float], +) -> None: + _, error = expected + assert metric.as_value(value).error == error + + +@parametrize_with_cases("metric, value, expected", cases=".") +def test_metric_score( + metric: Metric, + value: float, + expected: tuple[float, float], +) -> None: + score, _ = expected + assert metric.as_value(value).score == score + + +@parametrize_with_cases("metric, value, expected", cases=".") +def test_metric_value( + metric: Metric, + value: float, + expected: tuple[float, float], # noqa: ARG001 +) -> None: + assert metric.as_value(value).value == value + + +def test_metric_complains_if_out_of_bounds() -> None: + metric = Metric(minimize=True, bounds=(-1, 1)) + with pytest.raises(OutOfBoundsError): + metric.as_value(-2) + with pytest.raises(OutOfBoundsError): + metric.as_value(2)
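
The expected `(score, error)` pairs in `tests/test_metric.py` pin down a simple convention: fully bounded metrics are normalised into `[0, 1]` against the better bound, while half-bounded or unbounded metrics fall back to the raw value, with the sign flipped for whichever of score or error points the "wrong" way. A small standalone sketch of that convention, consistent with the cases above but not the actual `mfpbench.metric` implementation:

```python
import numpy as np


def score_and_error(
    value: float,
    *,
    minimize: bool,
    bounds: tuple[float, float] = (-np.inf, np.inf),
) -> tuple[float, float]:
    """Reproduce the (score, error) pairs expected in tests/test_metric.py."""
    lower, upper = bounds
    if np.isfinite(lower) and np.isfinite(upper):
        # Fully bounded: normalise into [0, 1]; error is the distance from the best bound.
        normed = (value - lower) / (upper - lower)
        error = normed if minimize else 1.0 - normed
        score = 1.0 - error
    else:
        # Half-bounded or unbounded: use the raw value, negated for the opposite view.
        error = value if minimize else -value
        score = -error
    return score, error


# Matches case_metric_minimize_bounded and case_metric_maximize_unbounded above.
assert score_and_error(0.5, minimize=True, bounds=(-1, 1)) == (0.25, 0.75)
assert score_and_error(0.5, minimize=False) == (0.5, -0.5)
```
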