From e16377c411d03dfe4b87d77ac0701578340e9faf Mon Sep 17 00:00:00 2001
From: iburakov
Date: Thu, 24 Aug 2023 13:33:31 +0000
Subject: [PATCH] Tweak default model hyperparams for better synthesis results

This is the result of a series of experiments conducted to improve ML
synthesis performance, as measured by the recently implemented "rating"
of all-experiments synthesis evaluation results.

Room for improvement was noticed thanks to the recent update to
validation dataset selection (it now consists of completely new
synthesis trees not used for training). A close look at the new
training history charts showed signs of serious overfitting hurting
*cross-tree* model generalization: training loss/MAE converged very
fast, while the new validation loss/MAE only got worse, so that was no
good.

Lowering the learning rate and adding L2 regularization and strong
dropout (approximating model ensembling) "slowed down" training
convergence a lot, making it possible to observe the behaviour of the
new validation loss in more detail. A clear minimum was spotted,
overfitting was confirmed, and the training epoch count, steps per
epoch, and batch size were adjusted accordingly.

The resulting model shows a synthesis evaluation rating of 76/100,
which is much better than the previous model's 60/100. For reference,
the default score + SOTA synthesis method has a rating of 77/100, so
this is a pretty strong result.

The new hyperparameters produce models that conduct ML synthesis much
better, so they should be chosen as the default.
---
 .../src/components/data_processing/dataset_creation.py   | 2 +-
 ml/synthesis/src/components/model_generation/models.py   | 7 ++++---
 ml/synthesis/src/components/model_generation/training.py | 4 ++--
 3 files changed, 7 insertions(+), 6 deletions(-)

diff --git a/ml/synthesis/src/components/data_processing/dataset_creation.py b/ml/synthesis/src/components/data_processing/dataset_creation.py
index 96f9bfd13..6e017fcaf 100644
--- a/ml/synthesis/src/components/data_processing/dataset_creation.py
+++ b/ml/synthesis/src/components/data_processing/dataset_creation.py
@@ -45,7 +45,7 @@ def create_datasets(main_df: pd.DataFrame, val_df: pd.DataFrame | None = None) -
     input_columns = train_df_inputs.columns.tolist()
     logger.info(f"Input columns: {input_columns}")
 
-    train_ds = _df_to_dataset(train_df_inputs, train_df_targets, batch_size=128, repeat=True)
+    train_ds = _df_to_dataset(train_df_inputs, train_df_targets, batch_size=64, repeat=True)
     val_ds = _df_to_dataset(val_df_inputs, val_df_targets)
 
     del train_df_inputs, val_df_inputs
diff --git a/ml/synthesis/src/components/model_generation/models.py b/ml/synthesis/src/components/model_generation/models.py
index 8a75cda6a..8b8aa76e0 100644
--- a/ml/synthesis/src/components/model_generation/models.py
+++ b/ml/synthesis/src/components/model_generation/models.py
@@ -5,16 +5,17 @@ def create_baseline_model(input_shape) -> Model:
     model = Sequential(
         [
             layers.InputLayer(input_shape=input_shape),
+            layers.Dense(256, activation="relu", kernel_regularizer="l2"),
+            layers.Dropout(0.7),
             layers.Dense(64, activation="relu", kernel_regularizer="l2"),
             layers.Dropout(0.5),
-            layers.Dense(16, activation="relu"),
-            layers.Dense(16, activation="relu"),
+            layers.Dense(32, activation="relu"),
             layers.Dense(1),
         ],
     )
 
     model.compile(
-        optimizer=optimizers.Adam(learning_rate=1e-4),
+        optimizer=optimizers.Adam(learning_rate=8e-5),
         loss="mse",
         metrics=["mae"],
     )
diff --git a/ml/synthesis/src/components/model_generation/training.py b/ml/synthesis/src/components/model_generation/training.py
index 58e53d333..d19efadca 100644
--- a/ml/synthesis/src/components/model_generation/training.py
+++ b/ml/synthesis/src/components/model_generation/training.py
@@ -31,8 +31,8 @@ def train_and_save_baseline_model(
     model = create_baseline_model(input_shape=sample.shape)
 
     effective_fitting_kwargs = dict(
-        epochs=45,
-        steps_per_epoch=3000,
+        epochs=30,
+        steps_per_epoch=1500,
     )
     if fitting_kwargs:
         effective_fitting_kwargs.update(fitting_kwargs)
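
Note (reviewer sketch, not part of the patch): the new epoch count above
hard-codes the position of the manually observed validation-loss minimum.
Assuming training.py ultimately calls model.fit() with the train_ds/val_ds
datasets built in dataset_creation.py, the same minimum could also be
captured automatically with a Keras early-stopping callback, roughly:

    # Hypothetical sketch, not taken from this patch: names such as model,
    # train_ds and val_ds are assumed to match those used in the repo.
    from tensorflow.keras import callbacks

    early_stopping = callbacks.EarlyStopping(
        monitor="val_loss",         # the metric whose minimum was spotted by hand
        patience=5,                 # epochs to wait past the minimum before stopping
        restore_best_weights=True,  # roll back to the best-validation weights
    )

    model.fit(
        train_ds,
        validation_data=val_ds,
        epochs=30,                  # upper bound; early stopping may end sooner
        steps_per_epoch=1500,
        callbacks=[early_stopping],
    )

With restore_best_weights=True the saved model keeps the weights from the
best validation epoch, so a too-generous epoch budget would no longer bake
overfitting into the default model.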