From aca8ca478b51929cab85453853c34247bf6a768d Mon Sep 17 00:00:00 2001
From: eddiebergman
Date: Tue, 31 Oct 2023 17:36:42 +0100
Subject: [PATCH] fix(tabular-benchmark): Always just use "id" for tabular ids

---
 src/mfpbench/lcbench_tabular/benchmark.py |  27 ++--
 src/mfpbench/setup_benchmark.py           |   6 +-
 src/mfpbench/tabular.py                   | 154 ++++++++++++----------
 3 files changed, 104 insertions(+), 83 deletions(-)

diff --git a/src/mfpbench/lcbench_tabular/benchmark.py b/src/mfpbench/lcbench_tabular/benchmark.py
index 8d839e4..4e18337 100644
--- a/src/mfpbench/lcbench_tabular/benchmark.py
+++ b/src/mfpbench/lcbench_tabular/benchmark.py
@@ -115,27 +115,28 @@ def _get_raw_lcbench_space(
 @dataclass(frozen=True, eq=False, unsafe_hash=True)  # type: ignore[misc]
 class LCBenchTabularConfig(TabularConfig):
     batch_size: int
-    imputation_strategy: str
-    learning_rate_scheduler: str
-    network: str
     max_dropout: float
-    normalization_strategy: str
-    optimizer: str
-    cosine_annealing_T_max: int
-    cosine_annealing_eta_min: float
-    activation: str
     max_units: int
-    mlp_shape: str
     num_layers: int
     learning_rate: float
     momentum: float
     weight_decay: float
+    # All of these are constant and hence optional
+    loss: str | None = None  # This is the name of the loss function used, not a float
+    imputation_strategy: str | None = None
+    learning_rate_scheduler: str | None = None
+    network: str | None = None
+    normalization_strategy: str | None = None
+    optimizer: str | None = None
+    cosine_annealing_T_max: int | None = None
+    cosine_annealing_eta_min: float | None = None
+    activation: str | None = None
+    mlp_shape: str | None = None


 @dataclass(frozen=True)  # type: ignore[misc]
 class LCBenchTabularResult(Result[LCBenchTabularConfig, int]):
     time: float
-    loss: float
     val_accuracy: float
     val_cross_entropy: float
     val_balanced_accuracy: float
@@ -233,7 +234,7 @@ def __init__(
         task_id: str,
         datadir: str | Path | None = None,
         *,
-        remove_constants: bool = True,
+        remove_constants: bool = False,
         seed: int | None = None,
         prior: str | Path | LCBenchTabularConfig | Mapping[str, Any] | None = None,
         perturb_prior: float | None = None,
@@ -295,8 +296,8 @@ def __init__(
         super().__init__(
             table=table,  # type: ignore
             name=benchmark_task_name,
-            config_name="config_id",
-            fidelity_name=cls.fidelity_name,
+            id_key="id",
+            fidelity_key=cls.fidelity_name,
             result_keys=LCBenchTabularResult.names(),
             config_keys=LCBenchTabularConfig.names(),
             remove_constants=remove_constants,
diff --git a/src/mfpbench/setup_benchmark.py b/src/mfpbench/setup_benchmark.py
index 3c6323f..1105b5f 100644
--- a/src/mfpbench/setup_benchmark.py
+++ b/src/mfpbench/setup_benchmark.py
@@ -210,7 +210,7 @@ def _process(cls, path: Path) -> None:
             config: dict = config_data["config"]
             log_data: dict = config_data["log"]

-            loss: list[float] = log_data["Train/loss"]
+            loss: list[str] = log_data["Train/loss"]  # Name of the loss used, not a float
             val_ce: list[float] = log_data["Train/val_cross_entropy"]
             val_acc: list[float] = log_data["Train/val_accuracy"]
             val_bal_acc: list[float] = log_data["Train/val_balanced_accuracy"]
@@ -240,7 +240,7 @@ def _process(cls, path: Path) -> None:
             )
             # These are single values, but `assign` broadcasts each one down
             # the full column of the dataframe
-            df = df.assign(**{"config_id": config_id, **config})
+            df = df.assign(**{"id": config_id, **config})

             config_frames_for_dataset.append(df)

@@ -249,7 +249,7 @@ def _process(cls, path: Path) -> None:
         df_for_dataset = (
             pd.concat(config_frames_for_dataset, ignore_index=True)
             .convert_dtypes()
-            .set_index(["config_id", "epoch"])
+            .set_index(["id", "epoch"])
             .sort_index()
         )
         table_path = path / f"{dataset_name}.parquet"
diff --git a/src/mfpbench/tabular.py b/src/mfpbench/tabular.py
index 0567b68..6e750c2 100644
--- a/src/mfpbench/tabular.py
+++ b/src/mfpbench/tabular.py
@@ -1,7 +1,6 @@
 from __future__ import annotations

 from datetime import datetime
-from itertools import chain
 from pathlib import Path
 from typing import Any, Callable, Mapping, Sequence, TypeVar, overload
 from typing_extensions import override
@@ -26,10 +25,10 @@


 class TabularBenchmark(Benchmark[CTabular, R, F]):
-    config_name: str
+    id_key: str
     """The column in the table that contains the config id. Will be set to the index"""

-    fidelity_name: str
+    fidelity_key: str
     """The name of the fidelity used in this benchmark"""

     config_keys: Sequence[str]
@@ -41,6 +40,9 @@ class TabularBenchmark(Benchmark[CTabular, R, F]):
     table: pd.DataFrame
     """The table of results used for this benchmark"""

+    configs: Mapping[str, CTabular]
+    """The configs used in this benchmark"""
+
     # The config and result type of this benchmark
     Config: type[CTabular]
     Result: type[R]
@@ -48,13 +50,13 @@ class TabularBenchmark(Benchmark[CTabular, R, F]):
     # Whether this benchmark has conditionals in it or not
     has_conditionals: bool = False

-    def __init__(  # noqa: PLR0913, C901
+    def __init__(  # noqa: PLR0913
         self,
         name: str,
         table: pd.DataFrame,
         *,
-        config_name: str,
-        fidelity_name: str,
+        id_key: str,
+        fidelity_key: str,
         result_keys: Sequence[str],
         config_keys: Sequence[str],
         remove_constants: bool = False,
@@ -68,8 +70,8 @@ def __init__(  # noqa: PLR0913, C901
         Args:
             name: The name of this benchmark.
             table: The table to use for the benchmark.
-            config_name: The column in the table that contains the config id
-            fidelity_name: The column in the table that contains the fidelity
+            id_key: The column in the table that contains the config id
+            fidelity_key: The column in the table that contains the fidelity
             result_keys: The columns in the table that contain the results
             config_keys: The columns in the table that contain the config values
             remove_constants: Remove constant config columns from the data or not.
@@ -86,6 +88,31 @@ def __init__(  # noqa: PLR0913, C901
             seed: The seed to use for the benchmark.
         """
         cls = self.__class__
+
+        # Make sure we work with a clean slate, no issues with the index.
+        table = table.reset_index()
+
+        # Make sure all the specified keys exist
+        if id_key not in table.columns:
+            raise ValueError(f"{id_key=} not in columns {table.columns}")
+
+        if fidelity_key not in table.columns:
+            raise ValueError(f"{fidelity_key=} not in columns {table.columns}")
+
+        if not all(key in table.columns for key in result_keys):
+            raise ValueError(f"{result_keys=} not in columns {table.columns}")
+
+        if not all(key in table.columns for key in config_keys):
+            raise ValueError(f"{config_keys=} not in columns {table.columns}")
+
+        # Make sure the column `id` only exists if it is the `id_key`
+        if "id" in table.columns and id_key != "id":
+            raise ValueError(
+                f"Can't have `id` in the columns if it's not the {id_key=}."
+                " Please drop it or rename it.",
+            )
+
+        # Remove constants from the table
         if remove_constants:

             def is_constant(_s: pd.Series) -> bool:
@@ -98,45 +125,24 @@ def is_constant(_s: pd.Series) -> bool:
             table = table.drop(columns=constant_cols)  # type: ignore
             config_keys = [k for k in config_keys if k not in constant_cols]

-        # If the table isn't indexed, index it
-        index_cols = [config_name, fidelity_name]
-        if table.index.names != index_cols:
-            # Only drop the index if it's not relevant.
-            relevant_cols: list[str] = [  # type: ignore
-                *list(index_cols),  # type: ignore
-                *list(result_keys),
-                *list(config_keys),
-            ]
-            relevant = any(name in relevant_cols for name in table.index.names)
-            table = table.reset_index(drop=not relevant)
-
-        if config_name not in table.columns:
-            raise ValueError(f"{config_name=} not in columns {table.columns}")
-        if fidelity_name not in table.columns:
-            raise ValueError(f"{fidelity_name=} not in columns {table.columns}")
+        # Remap the given id column to the reserved name `id`
+        table = table.rename(columns={id_key: "id"})

-        table = table.set_index(index_cols)
-        table = table.sort_index()
+        # Index the table by (id, fidelity)
+        index_cols: list[str] = ["id", fidelity_key]

-        # Make sure all keys are in the table
-        for key in chain(result_keys, config_keys):
-            if key not in table.columns:
-                raise ValueError(f"{key=} not in columns {table.columns}")
-
-        # Make sure the keyword "id" is not in the columns as we use it to
-        # identify configs
-        if "id" in table.columns:
-            raise ValueError(f"{table.columns=} contains 'id'. Please rename it")
-
-        # Make sure we have equidistance fidelities for all configs
-        fidelity_values = table.index.get_level_values(fidelity_name)
-        fidelity_counts = fidelity_values.value_counts()
-        if not (fidelity_counts == fidelity_counts.iloc[0]).all():
-            raise ValueError(f"{fidelity_name=} not uniform. \n{fidelity_counts}")
+        # Drop all the columns that are not relevant
+        relevant_cols: list[str] = [
+            *index_cols,
+            *result_keys,
+            *config_keys,
+        ]
+        table = table[relevant_cols]  # type: ignore
+        table = table.set_index(index_cols).sort_index()

         # We now have the following table
         #
         #     id    fidelity | **metric, **config_values
         #     0         0    |
         #               1    |
         #               2    |
         #     1         0    |
         #               1    |
         #               2    |
         #     ...

+        # Make sure we have equidistant fidelities for all configs
+        fidelity_values = table.index.get_level_values(fidelity_key)
+        fidelity_counts = fidelity_values.value_counts()
+        if not (fidelity_counts == fidelity_counts.iloc[0]).all():
+            raise ValueError(f"{fidelity_key=} not uniform. \n{fidelity_counts}")
+
+        # We just assume equidistant fidelities
+        sorted_fids = sorted(fidelity_values.unique())
+        start = sorted_fids[0]
+        end = sorted_fids[-1]
+        step = sorted_fids[1] - sorted_fids[0]
+
         # Here we get all the unique configs
         #     id    fidelity | **metric, **config_values
         #     0         0    |
         #     1         0    |
         #     ...
-        config_id_table = table.groupby(level=config_name).agg("first")
+        # The index level was renamed to the reserved `id` above
+        id_table = table.groupby(level="id").agg("first")
         configs = {
             str(config_id): cls.Config.from_dict(
                 {
                     **row[config_keys].to_dict(),  # type: ignore
                     "id": str(config_id),
                 },
             )
-            for config_id, row in config_id_table.iterrows()
+            for config_id, row in id_table.iterrows()
         }

-        fidelity_values = table.index.get_level_values(fidelity_name).unique()
-
-        # We just assume equidistant fidelities
-        sorted_fids = sorted(fidelity_values)
-        start = sorted_fids[0]
-        end = sorted_fids[-1]
-        step = sorted_fids[1] - sorted_fids[0]
-
         # Create the configuration space
         if space is None:
             space = ConfigurationSpace(name, seed=seed)

         self.table = table
         self.configs = configs
-        self.fidelity_name = fidelity_name
-        self.config_name = config_name
+        self.fidelity_key = fidelity_key
+        self.id_key = id_key
         self.config_keys = sorted(config_keys)
         self.result_keys = sorted(result_keys)
         self.fidelity_range = (start, end, step)  # type: ignore
@@ -279,23 +288,35 @@ def _find_config(
         self,
         config: CTabular | Mapping[str, Any] | str | int,
     ) -> CTabular:
+        # It's an integer, but most likely meant to be a string id.
+        # We don't do any numeric-based lookups.
         if isinstance(config, int):
             config = str(config)

+        # It's a key into the self.configs dict
         if isinstance(config, str):
             return self.configs[config]

+        # If it's already a Config, we're done
         if isinstance(config, self.Config):
             return config

-        if self.config_name in config:
-            _id = config[self.config_name]
-            return self.configs[_id]
+        # At this point, we assume we're dealing with a dictionary
+        assert isinstance(config, Mapping)

+        # The original `id_key` may still be present in a raw dict; handle it
+        if self.id_key in config:
+            _real_config_id = str(config[self.id_key])
+            return self.configs[_real_config_id]
+
+        # Likewise for the reserved `id` key
+        if "id" in config:
+            _id = str(config["id"])
+            return self.configs[_id]

+        # Nothing direct worked; try to match the actual hyperparameter values
+        # against our known configs and recover the id that way
         match = first_true(
             self.configs.values(),
             pred=lambda c: c == config,  # type: ignore
@@ -407,8 +428,8 @@ def __init__(  # noqa: PLR0913
         table: pd.DataFrame,
         *,
         name: str | None = None,
-        fidelity_name: str,
-        config_name: str,
+        id_key: str,
+        fidelity_key: str,
         result_keys: Sequence[str],
         config_keys: Sequence[str],
         result_mapping: (dict[str, str | Callable[[pd.DataFrame], Any]] | None) = None,
@@ -424,9 +445,8 @@ def __init__(  # noqa: PLR0913
             table: The table to use for the benchmark
             name: The name of the benchmark. If None, will be set to
                 `unknown-{datetime.now().isoformat()}`
-
-            fidelity_name: The column in the table that contains the fidelity
-            config_name: The column in the table that contains the config id
+            id_key: The column in the table that contains the config id
+            fidelity_key: The column in the table that contains the fidelity
             result_keys: The columns in the table that contain the results
             config_keys: The columns in the table that contain the config values
             result_mapping: A mapping from the result keys to the table keys.
@@ -467,8 +487,8 @@ def __init__( # noqa: PLR0913 super().__init__( name=name, table=table, - config_name=config_name, - fidelity_name=fidelity_name, + id_key=id_key, + fidelity_key=fidelity_key, result_keys=[*result_keys, *_result_mapping.keys()], config_keys=config_keys, remove_constants=remove_constants, @@ -485,8 +505,8 @@ def __init__( # noqa: PLR0913 table = pd.read_parquet(path) benchmark = GenericTabularBenchmark( table=table, - fidelity_name="epoch", - config_name="config_id", + id_key="id", + fidelity_key="epoch", result_keys=[ "time", "val_accuracy",
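
For illustration, a minimal sketch of the renamed keyword arguments in use
after this change. It assumes `GenericTabularBenchmark` is importable from
`mfpbench.tabular` and that `Benchmark.query(config, at=...)` keeps its
current signature; the table, column names, and values below are invented
for the example and are not taken from the patch:

    import pandas as pd

    from mfpbench.tabular import GenericTabularBenchmark

    # Hypothetical results table: one row per (config id, fidelity) pair.
    # The id column is deliberately not named "id" to exercise the remap path.
    table = pd.DataFrame(
        {
            "my_id": ["a", "a", "b", "b"],
            "epoch": [1, 2, 1, 2],
            "score": [0.5, 0.6, 0.4, 0.7],
            "lr": [0.01, 0.01, 0.1, 0.1],
        },
    )

    benchmark = GenericTabularBenchmark(
        table=table,
        id_key="my_id",        # remapped internally to the reserved `id` column
        fidelity_key="epoch",  # must be equidistant per config (step of 1 here)
        result_keys=["score"],
        config_keys=["lr"],
    )

    # Configs are keyed by their string id after the remap
    config = benchmark.configs["a"]
    result = benchmark.query(config, at=2)

Note that a table which already contains an `id` column under a different
`id_key` now raises a ValueError rather than silently colliding with the
reserved name.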