From 6b334d449bbc647908bcfd25f534a55f842bf994 Mon Sep 17 00:00:00 2001
From: Hanwen
Date: Thu, 29 Aug 2024 13:13:24 -0700
Subject: [PATCH] [integ-tests] Improve scaling tests

1. Use multiple AZs to get more capacity.
2. Use t3.medium for compute nodes, because t3.medium has more capacity than c5.large.
3. Use c5n.18xlarge (instead of c5.24xlarge) as the head node, because networking is the bottleneck.
4. Add more dynamic nodes to the clusters. As a result, the tests now exercise clusters with 150k dynamic compute nodes, in addition to scaling up/down with a maximum of 4000 nodes.

Signed-off-by: Hanwen
---
 .../configs/scaling_stress_test.yaml          |  8 +++----
 .../common/scaling/scaling_test_config.yaml   |  2 +-
 .../tests/performance_tests/test_scaling.py   |  9 ++++---
 .../pcluster.config.yaml                      | 24 +++++++++++++++++++
 .../pcluster.config.yaml                      | 19 +++++++++++++++
 5 files changed, 54 insertions(+), 8 deletions(-)

diff --git a/tests/integration-tests/configs/scaling_stress_test.yaml b/tests/integration-tests/configs/scaling_stress_test.yaml
index dff5613e6e..36a763420b 100644
--- a/tests/integration-tests/configs/scaling_stress_test.yaml
+++ b/tests/integration-tests/configs/scaling_stress_test.yaml
@@ -2,13 +2,13 @@ test-suites:
   performance_tests:
     test_scaling.py::test_scaling_stress_test:
       dimensions:
-        - regions: [ "use1-az6" ]
-          instances: [ "c5.large" ]
+        - regions: [ "us-east-1" ]
+          instances: [ "t3.medium" ]
           oss: [ "alinux2" ]
           schedulers: [ "slurm" ]
     test_scaling.py::test_static_scaling_stress_test:
       dimensions:
-        - regions: [ "use1-az6" ]
-          instances: [ "c5.large" ]
+        - regions: [ "us-east-1" ]
+          instances: [ "t3.medium" ]
           oss: [ "alinux2" ]
           schedulers: [ "slurm" ]
diff --git a/tests/integration-tests/tests/common/scaling/scaling_test_config.yaml b/tests/integration-tests/tests/common/scaling/scaling_test_config.yaml
index c5a8872274..4e72fea177 100644
--- a/tests/integration-tests/tests/common/scaling/scaling_test_config.yaml
+++ b/tests/integration-tests/tests/common/scaling/scaling_test_config.yaml
@@ -1,4 +1,4 @@
 MaxMonitoringTimeInMins: 20
 ScalingTargets: [1000, 2000, 3000, 4000]
 SharedHeadNodeStorageType: 'Efs'
-HeadNodeInstanceType: 'c5.24xlarge'
+HeadNodeInstanceType: 'c5n.18xlarge'
diff --git a/tests/integration-tests/tests/performance_tests/test_scaling.py b/tests/integration-tests/tests/performance_tests/test_scaling.py
index ada615eddb..10a04fc34f 100644
--- a/tests/integration-tests/tests/performance_tests/test_scaling.py
+++ b/tests/integration-tests/tests/performance_tests/test_scaling.py
@@ -13,6 +13,8 @@
 from tests.common.assertions import assert_no_msg_in_logs
 from tests.common.scaling_common import get_bootstrap_errors, get_scaling_metrics, validate_and_get_scaling_test_config
 
+MAX_QUEUE_SIZE = 50000
+
 
 @pytest.mark.parametrize(
     "max_nodes",
@@ -123,7 +125,7 @@ def test_scaling_stress_test(
     cluster_config = pcluster_config_reader(
         # Prevent nodes being set down before we start monitoring the scale down metrics
         scaledown_idletime=max_monitoring_time_in_mins,
-        max_cluster_size=max(scaling_targets),
+        max_cluster_size=MAX_QUEUE_SIZE,
         head_node_instance_type=head_node_instance_type,
         shared_headnode_storage_type=shared_headnode_storage_type,
         scaling_strategy=scaling_strategy,
@@ -137,6 +139,7 @@ def test_scaling_stress_test(
 
     with soft_assertions():
         for scaling_target in scaling_targets:
+            logging.info("Scaling to %d nodes", scaling_target)
             _scale_up_and_down(
                 cluster,
                 head_node_instance_type,
@@ -156,7 +159,7 @@ def test_scaling_stress_test(
             # ref https://docs.aws.amazon.com/AWSEC2/latest/APIReference/throttling.html
            if scaling_target != scaling_targets[-1]:
                 logging.info("Waiting for the RunInstances Resource Token Bucket to refill")
-                time.sleep(300)
+                time.sleep(500)
 
 
 @pytest.mark.usefixtures("scheduler")
@@ -212,7 +215,7 @@ def test_static_scaling_stress_test(
         shared_headnode_storage_type=shared_headnode_storage_type,
         scaling_strategy=scaling_strategy,
         min_cluster_size=scaling_target,
-        max_cluster_size=scaling_target,
+        max_cluster_size=MAX_QUEUE_SIZE,
         output_file=f"{scaling_target}-upscale-pcluster.config.yaml",
     )
     _scale_up_and_down(
diff --git a/tests/integration-tests/tests/performance_tests/test_scaling/test_scaling_stress_test/pcluster.config.yaml b/tests/integration-tests/tests/performance_tests/test_scaling/test_scaling_stress_test/pcluster.config.yaml
index e10418abc0..47dd6c590a 100644
--- a/tests/integration-tests/tests/performance_tests/test_scaling/test_scaling_stress_test/pcluster.config.yaml
+++ b/tests/integration-tests/tests/performance_tests/test_scaling/test_scaling_stress_test/pcluster.config.yaml
@@ -23,4 +23,28 @@ Scheduling:
           MaxCount: {{ max_cluster_size }}
       Networking:
         SubnetIds:
+          {% for private_subnet_id in private_subnet_ids %}
           - {{ private_subnet_id }}
+          {% endfor %}
+    - Name: queue-1
+      ComputeResources:
+        - Name: compute-resource-2
+          Instances:
+            - InstanceType: {{ instance }}
+          MaxCount: {{ max_cluster_size }}
+      Networking:
+        SubnetIds:
+          {% for private_subnet_id in private_subnet_ids %}
+          - {{ private_subnet_id }}
+          {% endfor %}
+    - Name: queue-2
+      ComputeResources:
+        - Name: compute-resource-2
+          Instances:
+            - InstanceType: {{ instance }}
+          MaxCount: {{ max_cluster_size }}
+      Networking:
+        SubnetIds:
+          {% for private_subnet_id in private_subnet_ids %}
+          - {{ private_subnet_id }}
+          {% endfor %}
diff --git a/tests/integration-tests/tests/performance_tests/test_scaling/test_static_scaling_stress_test/pcluster.config.yaml b/tests/integration-tests/tests/performance_tests/test_scaling/test_static_scaling_stress_test/pcluster.config.yaml
index 0151991381..7ff9714194 100644
--- a/tests/integration-tests/tests/performance_tests/test_scaling/test_static_scaling_stress_test/pcluster.config.yaml
+++ b/tests/integration-tests/tests/performance_tests/test_scaling/test_static_scaling_stress_test/pcluster.config.yaml
@@ -24,3 +24,22 @@ Scheduling:
       Networking:
         SubnetIds:
           - {{ private_subnet_id }}
+    - Name: dynamic-1
+      ComputeResources:
+        - Name: compute-resource-1
+          InstanceType: {{ instance }}
+          MinCount: 0
+          MaxCount: {{ max_cluster_size }}
+      Networking:
+        SubnetIds:
+          - {{ private_subnet_id }}
+    - Name: dynamic-2
+      ComputeResources:
+        - Name: compute-resource-2
+          InstanceType: {{ instance }}
+          MinCount: 0
+          MaxCount: {{ max_cluster_size }}
+      Networking:
+        SubnetIds:
+          - {{ private_subnet_id }}
+
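
Not part of the patch: a minimal sketch of what the new {% for private_subnet_id in private_subnet_ids %} blocks in the queue Networking sections expand to once the test fixture supplies one private subnet per AZ. The subnet IDs below are placeholders, and the Jinja options are chosen only for readable output; the integration tests render the real template through the pcluster_config_reader fixture, which may configure Jinja differently.

# Illustration only: expands a queue's SubnetIds list from a list of private
# subnets, mirroring the loop added to test_scaling_stress_test/pcluster.config.yaml.
from jinja2 import Template

QUEUE_NETWORKING = """\
      Networking:
        SubnetIds:
          {% for private_subnet_id in private_subnet_ids %}
          - {{ private_subnet_id }}
          {% endfor %}
"""

rendered = Template(QUEUE_NETWORKING, trim_blocks=True, lstrip_blocks=True).render(
    # Placeholder subnet IDs, one per Availability Zone.
    private_subnet_ids=["subnet-0aaa", "subnet-0bbb", "subnet-0ccc"]
)
print(rendered)
# One "- subnet-..." entry is emitted per subnet, so each queue can launch
# capacity in every configured AZ rather than being pinned to a single AZ.

With max_cluster_size rendered from the new MAX_QUEUE_SIZE (50000), the three queues in the stress-test config together allow up to 150k dynamic compute nodes, which is where the 150k figure in the commit message comes from.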