[integ-tests] Improve scaling tests

1. Use multiple AZs to get more capacity. 2. Use t3.medium for compute nodes, because t3.medium has more capacity than c5.large. 3. Use c5n.18xlarge (instead of c5.24xlarge) as the head node, because the bottleneck is the networking. 4. Add more dynamic nodes to the clusters. As a result, the tests exercise clusters with 160k dynamic compute nodes, in addition to scaling up / down with a maximum of 4000 nodes. Signed-off-by: Hanwen <[email protected]>
aws · Aug 29, 2024 · 7e01b3d · 7e01b3d
1 parent e59f3f5
commit 7e01b3d
Show file tree

Hide file tree

Showing 5 changed files with 53 additions and 8 deletions.
diff --git a/tests/integration-tests/configs/scaling_stress_test.yaml b/tests/integration-tests/configs/scaling_stress_test.yaml
@@ -2,13 +2,13 @@ test-suites:
   performance_tests:
     test_scaling.py::test_scaling_stress_test:
       dimensions:
-        - regions: [ "use1-az6" ]
-          instances: [ "c5.large" ]
+        - regions: [ "us-east-1" ]
+          instances: [ "t3.medium" ]
           oss: [ "alinux2" ]
           schedulers: [ "slurm" ]
     test_scaling.py::test_static_scaling_stress_test:
       dimensions:
-        - regions: [ "use1-az6" ]
-          instances: [ "c5.large" ]
+        - regions: [ "us-east-1" ]
+          instances: [ "t3.medium" ]
           oss: [ "alinux2" ]
           schedulers: [ "slurm" ]
diff --git a/tests/integration-tests/tests/common/scaling/scaling_test_config.yaml b/tests/integration-tests/tests/common/scaling/scaling_test_config.yaml
@@ -1,4 +1,4 @@
 MaxMonitoringTimeInMins: 20
 ScalingTargets: [1000, 2000, 3000, 4000]
 SharedHeadNodeStorageType: 'Efs'
-HeadNodeInstanceType: 'c5.24xlarge'
+HeadNodeInstanceType: 'c5n.18xlarge'
diff --git a/tests/integration-tests/tests/performance_tests/test_scaling.py b/tests/integration-tests/tests/performance_tests/test_scaling.py
@@ -13,6 +13,7 @@
 from tests.common.assertions import assert_no_msg_in_logs
 from tests.common.scaling_common import get_bootstrap_errors, get_scaling_metrics, validate_and_get_scaling_test_config
 
+MAX_QUEUE_SIZE = 50000
 
 @pytest.mark.parametrize(
     "max_nodes",
@@ -123,7 +124,7 @@ def test_scaling_stress_test(
     cluster_config = pcluster_config_reader(
         # Prevent nodes being set down before we start monitoring the scale down metrics
         scaledown_idletime=max_monitoring_time_in_mins,
-        max_cluster_size=max(scaling_targets),
+        max_cluster_size=MAX_QUEUE_SIZE,
         head_node_instance_type=head_node_instance_type,
         shared_headnode_storage_type=shared_headnode_storage_type,
         scaling_strategy=scaling_strategy,
@@ -137,6 +138,7 @@ def test_scaling_stress_test(
 
     with soft_assertions():
         for scaling_target in scaling_targets:
+            logging.info("Scaling to %d nodes", scaling_target)
             _scale_up_and_down(
                 cluster,
                 head_node_instance_type,
@@ -156,7 +158,7 @@ def test_scaling_stress_test(
             # ref https://docs.aws.amazon.com/AWSEC2/latest/APIReference/throttling.html
             if scaling_target != scaling_targets[-1]:
                 logging.info("Waiting for the RunInstances Resource Token Bucket to refill")
-                time.sleep(300)
+                time.sleep(500)
 
 
 @pytest.mark.usefixtures("scheduler")
@@ -212,7 +214,7 @@ def test_static_scaling_stress_test(
                 shared_headnode_storage_type=shared_headnode_storage_type,
                 scaling_strategy=scaling_strategy,
                 min_cluster_size=scaling_target,
-                max_cluster_size=scaling_target,
+                max_cluster_size=MAX_QUEUE_SIZE,
                 output_file=f"{scaling_target}-upscale-pcluster.config.yaml",
             )
             _scale_up_and_down(

diff --git a/...-tests/tests/performance_tests/test_scaling/test_scaling_stress_test/pcluster.config.yaml b/...-tests/tests/performance_tests/test_scaling/test_scaling_stress_test/pcluster.config.yaml
@@ -23,4 +23,28 @@ Scheduling:
           MaxCount: {{ max_cluster_size }}
       Networking:
         SubnetIds:
+          {% for private_subnet_id in private_subnet_ids %}
           - {{ private_subnet_id }}
+          {% endfor %}
+    - Name: queue-1
+      ComputeResources:
+        - Name: compute-resource-2
+          Instances:
+            - InstanceType: {{ instance }}
+          MaxCount: {{ max_cluster_size }}
+      Networking:
+        SubnetIds:
+          {% for private_subnet_id in private_subnet_ids %}
+          - {{ private_subnet_id }}
+          {% endfor %}
+    - Name: queue-2
+      ComputeResources:
+        - Name: compute-resource-2
+          Instances:
+            - InstanceType: {{ instance }}
+          MaxCount: {{ max_cluster_size }}
+      Networking:
+        SubnetIds:
+          {% for private_subnet_id in private_subnet_ids %}
+          - {{ private_subnet_id }}
+          {% endfor %}
diff --git a/...tests/performance_tests/test_scaling/test_static_scaling_stress_test/pcluster.config.yaml b/...tests/performance_tests/test_scaling/test_static_scaling_stress_test/pcluster.config.yaml
@@ -24,3 +24,22 @@ Scheduling:
       Networking:
         SubnetIds:
           - {{ private_subnet_id }}
+    - Name: dynamic-1
+      ComputeResources:
+        - Name: compute-resource-1
+          InstanceType: {{ instance }}
+          MinCount: 0
+          MaxCount: {{ max_cluster_size }}
+      Networking:
+        SubnetIds:
+          - {{ private_subnet_id }}
+    - Name: dynamic-2
+      ComputeResources:
+        - Name: compute-resource-2
+          InstanceType: {{ instance }}
+          MinCount: 0
+          MaxCount: {{ max_cluster_size }}
+      Networking:
+        SubnetIds:
+          - {{ private_subnet_id }}
+