From 6b334d449bbc647908bcfd25f534a55f842bf994 Mon Sep 17 00:00:00 2001
From: Hanwen
Date: Thu, 29 Aug 2024 13:13:24 -0700
Subject: [PATCH] [integ-tests] Improve scaling tests

1. Use multiple AZs to get more capacity.
2. Use t3.medium for compute nodes, because t3.medium has more capacity than c5.large.
3. Use c5n.18xlarge (instead of c5.24xlarge) as the head node, because networking is the bottleneck.
4. Add more dynamic nodes to the clusters. As a result, the tests now exercise clusters with 150k dynamic compute nodes, in addition to scaling up/down with a maximum of 4000 nodes.

Signed-off-by: Hanwen
---
 .../configs/scaling_stress_test.yaml          |  8 +++----
 .../common/scaling/scaling_test_config.yaml   |  2 +-
 .../tests/performance_tests/test_scaling.py   |  9 ++++---
 .../pcluster.config.yaml                      | 24 +++++++++++++++++++
 .../pcluster.config.yaml                      | 19 +++++++++++++++
 5 files changed, 54 insertions(+), 8 deletions(-)

diff --git a/tests/integration-tests/configs/scaling_stress_test.yaml b/tests/integration-tests/configs/scaling_stress_test.yaml
index dff5613e6e..36a763420b 100644
--- a/tests/integration-tests/configs/scaling_stress_test.yaml
+++ b/tests/integration-tests/configs/scaling_stress_test.yaml
@@ -2,13 +2,13 @@ test-suites:
   performance_tests:
     test_scaling.py::test_scaling_stress_test:
       dimensions:
-        - regions: [ "use1-az6" ]
-          instances: [ "c5.large" ]
+        - regions: [ "us-east-1" ]
+          instances: [ "t3.medium" ]
           oss: [ "alinux2" ]
           schedulers: [ "slurm" ]
     test_scaling.py::test_static_scaling_stress_test:
       dimensions:
-        - regions: [ "use1-az6" ]
-          instances: [ "c5.large" ]
+        - regions: [ "us-east-1" ]
+          instances: [ "t3.medium" ]
           oss: [ "alinux2" ]
           schedulers: [ "slurm" ]
diff --git a/tests/integration-tests/tests/common/scaling/scaling_test_config.yaml b/tests/integration-tests/tests/common/scaling/scaling_test_config.yaml
index c5a8872274..4e72fea177 100644
--- a/tests/integration-tests/tests/common/scaling/scaling_test_config.yaml
+++ b/tests/integration-tests/tests/common/scaling/scaling_test_config.yaml
@@ -1,4 +1,4 @@
 MaxMonitoringTimeInMins: 20
 ScalingTargets: [1000, 2000, 3000, 4000]
 SharedHeadNodeStorageType: 'Efs'
-HeadNodeInstanceType: 'c5.24xlarge'
+HeadNodeInstanceType: 'c5n.18xlarge'
diff --git a/tests/integration-tests/tests/performance_tests/test_scaling.py b/tests/integration-tests/tests/performance_tests/test_scaling.py
index ada615eddb..10a04fc34f 100644
--- a/tests/integration-tests/tests/performance_tests/test_scaling.py
+++ b/tests/integration-tests/tests/performance_tests/test_scaling.py
@@ -13,6 +13,8 @@
 from tests.common.assertions import assert_no_msg_in_logs
 from tests.common.scaling_common import get_bootstrap_errors, get_scaling_metrics, validate_and_get_scaling_test_config
 
+MAX_QUEUE_SIZE = 50000
+
 
 @pytest.mark.parametrize(
     "max_nodes",
@@ -123,7 +125,7 @@ def test_scaling_stress_test(
     cluster_config = pcluster_config_reader(
         # Prevent nodes being set down before we start monitoring the scale down metrics
         scaledown_idletime=max_monitoring_time_in_mins,
-        max_cluster_size=max(scaling_targets),
+        max_cluster_size=MAX_QUEUE_SIZE,
         head_node_instance_type=head_node_instance_type,
         shared_headnode_storage_type=shared_headnode_storage_type,
         scaling_strategy=scaling_strategy,
@@ -137,6 +139,7 @@ def test_scaling_stress_test(
 
     with soft_assertions():
         for scaling_target in scaling_targets:
+            logging.info("Scaling to %d nodes", scaling_target)
             _scale_up_and_down(
                 cluster,
                 head_node_instance_type,
@@ -156,7 +159,7 @@ def test_scaling_stress_test(
             # ref https://docs.aws.amazon.com/AWSEC2/latest/APIReference/throttling.html
            if scaling_target != scaling_targets[-1]:
                 logging.info("Waiting for the RunInstances Resource Token Bucket to refill")
-                time.sleep(300)
+                time.sleep(500)
 
 
 @pytest.mark.usefixtures("scheduler")
@@ -212,7 +215,7 @@ def test_static_scaling_stress_test(
         shared_headnode_storage_type=shared_headnode_storage_type,
         scaling_strategy=scaling_strategy,
         min_cluster_size=scaling_target,
-        max_cluster_size=scaling_target,
+        max_cluster_size=MAX_QUEUE_SIZE,
         output_file=f"{scaling_target}-upscale-pcluster.config.yaml",
     )
     _scale_up_and_down(
diff --git a/tests/integration-tests/tests/performance_tests/test_scaling/test_scaling_stress_test/pcluster.config.yaml b/tests/integration-tests/tests/performance_tests/test_scaling/test_scaling_stress_test/pcluster.config.yaml
index e10418abc0..47dd6c590a 100644
--- a/tests/integration-tests/tests/performance_tests/test_scaling/test_scaling_stress_test/pcluster.config.yaml
+++ b/tests/integration-tests/tests/performance_tests/test_scaling/test_scaling_stress_test/pcluster.config.yaml
@@ -23,4 +23,28 @@ Scheduling:
           MaxCount: {{ max_cluster_size }}
       Networking:
         SubnetIds:
+          {% for private_subnet_id in private_subnet_ids %}
           - {{ private_subnet_id }}
+          {% endfor %}
+    - Name: queue-1
+      ComputeResources:
+        - Name: compute-resource-2
+          Instances:
+            - InstanceType: {{ instance }}
+          MaxCount: {{ max_cluster_size }}
+      Networking:
+        SubnetIds:
+          {% for private_subnet_id in private_subnet_ids %}
+          - {{ private_subnet_id }}
+          {% endfor %}
+    - Name: queue-2
+      ComputeResources:
+        - Name: compute-resource-2
+          Instances:
+            - InstanceType: {{ instance }}
+          MaxCount: {{ max_cluster_size }}
+      Networking:
+        SubnetIds:
+          {% for private_subnet_id in private_subnet_ids %}
+          - {{ private_subnet_id }}
+          {% endfor %}
diff --git a/tests/integration-tests/tests/performance_tests/test_scaling/test_static_scaling_stress_test/pcluster.config.yaml b/tests/integration-tests/tests/performance_tests/test_scaling/test_static_scaling_stress_test/pcluster.config.yaml
index 0151991381..7ff9714194 100644
--- a/tests/integration-tests/tests/performance_tests/test_scaling/test_static_scaling_stress_test/pcluster.config.yaml
+++ b/tests/integration-tests/tests/performance_tests/test_scaling/test_static_scaling_stress_test/pcluster.config.yaml
@@ -24,3 +24,22 @@ Scheduling:
       Networking:
         SubnetIds:
           - {{ private_subnet_id }}
+    - Name: dynamic-1
+      ComputeResources:
+        - Name: compute-resource-1
+          InstanceType: {{ instance }}
+          MinCount: 0
+          MaxCount: {{ max_cluster_size }}
+      Networking:
+        SubnetIds:
+          - {{ private_subnet_id }}
+    - Name: dynamic-2
+      ComputeResources:
+        - Name: compute-resource-2
+          InstanceType: {{ instance }}
+          MinCount: 0
+          MaxCount: {{ max_cluster_size }}
+      Networking:
+        SubnetIds:
+          - {{ private_subnet_id }}
+
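
Not part of the patch: a minimal sketch of what the new {% for private_subnet_id in private_subnet_ids %} blocks in the queue Networking sections expand to once the test fixture supplies one private subnet per AZ. The subnet IDs below are placeholders, and the Jinja options are chosen only for readable output; the integration tests render the real template through the pcluster_config_reader fixture, which may configure Jinja differently.

# Illustration only: expands a queue's SubnetIds list from a list of private
# subnets, mirroring the loop added to test_scaling_stress_test/pcluster.config.yaml.
from jinja2 import Template

QUEUE_NETWORKING = """\
      Networking:
        SubnetIds:
          {% for private_subnet_id in private_subnet_ids %}
          - {{ private_subnet_id }}
          {% endfor %}
"""

rendered = Template(QUEUE_NETWORKING, trim_blocks=True, lstrip_blocks=True).render(
    # Placeholder subnet IDs, one per Availability Zone.
    private_subnet_ids=["subnet-0aaa", "subnet-0bbb", "subnet-0ccc"]
)
print(rendered)
# One "- subnet-..." entry is emitted per subnet, so each queue can launch
# capacity in every configured AZ rather than being pinned to a single AZ.

With max_cluster_size rendered from the new MAX_QUEUE_SIZE (50000), the three queues in the stress-test config together allow up to 150k dynamic compute nodes, which is where the 150k figure in the commit message comes from.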