From 95371b760cadb8a1ff20636c14da97e5cc0be9f3 Mon Sep 17 00:00:00 2001 From: Enrico Usai Date: Mon, 7 Aug 2023 16:35:53 +0200 Subject: [PATCH] Add new DebugLevel DevSetting parameter This new parameter makes it possible to: * Change the cinc-client log_level * Avoid terminating compute nodes if there are bootstrap issues (applies when the value is anything other than the default, `info`) The value must be a valid cinc-client `log_level`: debug, error, fatal, info, trace, or warn. ### References * https://docs.chef.io/debug/ Signed-off-by: Enrico Usai --- cli/src/pcluster/config/cluster_config.py | 2 ++ .../resources/compute_node/user_data.sh | 22 +++++++++++-------- cli/src/pcluster/schemas/cluster_schema.py | 1 + .../pcluster/templates/cdk_builder_utils.py | 1 + cli/src/pcluster/templates/cluster_stack.py | 12 ++++++---- cli/src/pcluster/templates/queues_stack.py | 4 +++- 6 files changed, 28 insertions(+), 14 deletions(-) diff --git a/cli/src/pcluster/config/cluster_config.py b/cli/src/pcluster/config/cluster_config.py index 69d4de216b..fa9259a7d9 100644 --- a/cli/src/pcluster/config/cluster_config.py +++ b/cli/src/pcluster/config/cluster_config.py @@ -1121,6 +1121,7 @@ def __init__( instance_types_data: str = None, timeouts: Timeouts = None, compute_startup_time_metric_enabled: bool = None, + debug_level: str = None, **kwargs, ): super().__init__(**kwargs) @@ -1131,6 +1132,7 @@ def __init__( self.compute_startup_time_metric_enabled = Resource.init_param( compute_startup_time_metric_enabled, default=False ) + self.debug_level = Resource.init_param(debug_level, default="info") def _register_validators(self, context: ValidatorContext = None): super()._register_validators(context) diff --git a/cli/src/pcluster/resources/compute_node/user_data.sh b/cli/src/pcluster/resources/compute_node/user_data.sh index ab41c30d03..ff57014d49 100644 --- a/cli/src/pcluster/resources/compute_node/user_data.sh +++ b/cli/src/pcluster/resources/compute_node/user_data.sh @@ -104,7 +104,8 @@ write_files: "head_node_private_ip": "${HeadNodePrivateIp}", 
"directory_service": { "enabled": "${DirectoryServiceEnabled}" - } + }, + "debug_level": "${DebugLevel}" } } - path: /etc/chef/client.rb @@ -125,11 +126,14 @@ write_files: function error_exit { echo "Bootstrap failed with error: $1" - # wait logs flush before signaling the failure - sleep 10 - # TODO: add possibility to override this behavior and keep the instance for debugging - shutdown -h now - exit 1 + if [ "${DebugLevel}" != "info" ]; then + echo "Skipping termination because debug_level is set to ${DebugLevel}" + else + # wait logs flush before signaling the failure + sleep 10 + shutdown -h now + exit 1 + fi } function vendor_cookbook { @@ -214,11 +218,11 @@ write_files: jq --argfile f1 /tmp/dna.json --argfile f2 /tmp/extra.json -n '$f1 * $f2' > /etc/chef/dna.json || ( echo "jq not installed or invalid extra_json"; cp /tmp/dna.json /etc/chef/dna.json) { pushd /etc/chef && - cinc-client --local-mode --config /etc/chef/client.rb --log_level info --force-formatter --no-color --chef-zero-port 8889 --json-attributes /etc/chef/dna.json --override-runlist aws-parallelcluster-entrypoints::init && + cinc-client --local-mode --config /etc/chef/client.rb --log_level ${DebugLevel} --force-formatter --no-color --chef-zero-port 8889 --json-attributes /etc/chef/dna.json --override-runlist aws-parallelcluster-entrypoints::init && /opt/parallelcluster/scripts/fetch_and_run -preinstall && - cinc-client --local-mode --config /etc/chef/client.rb --log_level info --force-formatter --no-color --chef-zero-port 8889 --json-attributes /etc/chef/dna.json --override-runlist aws-parallelcluster-entrypoints::config && + cinc-client --local-mode --config /etc/chef/client.rb --log_level ${DebugLevel} --force-formatter --no-color --chef-zero-port 8889 --json-attributes /etc/chef/dna.json --override-runlist aws-parallelcluster-entrypoints::config && /opt/parallelcluster/scripts/fetch_and_run -postinstall && - cinc-client --local-mode --config /etc/chef/client.rb --log_level info 
--force-formatter --no-color --chef-zero-port 8889 --json-attributes /etc/chef/dna.json --override-runlist aws-parallelcluster-entrypoints::finalize && + cinc-client --local-mode --config /etc/chef/client.rb --log_level ${DebugLevel} --force-formatter --no-color --chef-zero-port 8889 --json-attributes /etc/chef/dna.json --override-runlist aws-parallelcluster-entrypoints::finalize && popd } || error_exit 'Failed to run bootstrap recipes. If --norollback was specified, check /var/log/cfn-init.log and /var/log/cloud-init-output.log.' diff --git a/cli/src/pcluster/schemas/cluster_schema.py b/cli/src/pcluster/schemas/cluster_schema.py index cd7ba255d2..b0a9971f25 100644 --- a/cli/src/pcluster/schemas/cluster_schema.py +++ b/cli/src/pcluster/schemas/cluster_schema.py @@ -1070,6 +1070,7 @@ class ClusterDevSettingsSchema(BaseDevSettingsSchema): instance_types_data = fields.Str(metadata={"update_policy": UpdatePolicy.SUPPORTED}) timeouts = fields.Nested(TimeoutsSchema, metadata={"update_policy": UpdatePolicy.SUPPORTED}) compute_startup_time_metric_enabled = fields.Bool(metadata={"update_policy": UpdatePolicy.SUPPORTED}) + debug_level = fields.Str(metadata={"update_policy": UpdatePolicy.SUPPORTED}) @post_load def make_resource(self, data, **kwargs): diff --git a/cli/src/pcluster/templates/cdk_builder_utils.py b/cli/src/pcluster/templates/cdk_builder_utils.py index 7003b40b7e..e132e09dc3 100644 --- a/cli/src/pcluster/templates/cdk_builder_utils.py +++ b/cli/src/pcluster/templates/cdk_builder_utils.py @@ -87,6 +87,7 @@ def get_common_user_data_env(node: Union[HeadNode, SlurmQueue, LoginNodesPool], "CookbookVersion": COOKBOOK_PACKAGES_VERSIONS["cookbook"], "ChefVersion": COOKBOOK_PACKAGES_VERSIONS["chef"], "BerkshelfVersion": COOKBOOK_PACKAGES_VERSIONS["berkshelf"], + "DebugLevel": config.dev_settings.debug_level, } diff --git a/cli/src/pcluster/templates/cluster_stack.py b/cli/src/pcluster/templates/cluster_stack.py index 7afc775963..3460fd6cc1 100644 --- 
a/cli/src/pcluster/templates/cluster_stack.py +++ b/cli/src/pcluster/templates/cluster_stack.py @@ -1362,7 +1362,8 @@ def _add_head_node(self): "commands": { "chef": { "command": ( - "cinc-client --local-mode --config /etc/chef/client.rb --log_level info " + "cinc-client --local-mode --config /etc/chef/client.rb " + f"--log_level {self.config.dev_settings.debug_level} " "--logfile /var/log/chef-client.log --force-formatter --no-color " "--chef-zero-port 8889 --json-attributes /etc/chef/dna.json " "--override-runlist aws-parallelcluster-entrypoints::init" @@ -1378,7 +1379,8 @@ def _add_head_node(self): "commands": { "chef": { "command": ( - "cinc-client --local-mode --config /etc/chef/client.rb --log_level info " + "cinc-client --local-mode --config /etc/chef/client.rb " + f"--log_level {self.config.dev_settings.debug_level} " "--logfile /var/log/chef-client.log --force-formatter --no-color " "--chef-zero-port 8889 --json-attributes /etc/chef/dna.json " "--override-runlist aws-parallelcluster-entrypoints::config" @@ -1394,7 +1396,8 @@ def _add_head_node(self): "commands": { "chef": { "command": ( - "cinc-client --local-mode --config /etc/chef/client.rb --log_level info " + "cinc-client --local-mode --config /etc/chef/client.rb " + f"--log_level {self.config.dev_settings.debug_level} " "--logfile /var/log/chef-client.log --force-formatter --no-color " "--chef-zero-port 8889 --json-attributes /etc/chef/dna.json " "--override-runlist aws-parallelcluster-entrypoints::finalize" @@ -1414,7 +1417,8 @@ def _add_head_node(self): "chef": { "command": ( ". 
/etc/profile.d/pcluster.sh; " - "cinc-client --local-mode --config /etc/chef/client.rb --log_level info" + "cinc-client --local-mode --config /etc/chef/client.rb " + f"--log_level {self.config.dev_settings.debug_level}" " --logfile /var/log/chef-client.log --force-formatter --no-color" " --chef-zero-port 8889 --json-attributes /etc/chef/dna.json" " --override-runlist aws-parallelcluster-entrypoints::update &&" diff --git a/cli/src/pcluster/templates/queues_stack.py b/cli/src/pcluster/templates/queues_stack.py index bb8e5b82c0..e016399a24 100644 --- a/cli/src/pcluster/templates/queues_stack.py +++ b/cli/src/pcluster/templates/queues_stack.py @@ -198,7 +198,9 @@ def _add_compute_resource_launch_template( instance_market_options=self._launch_template_builder.get_instance_market_options( queue, compute_resource ), - instance_initiated_shutdown_behavior="terminate", + instance_initiated_shutdown_behavior="stop" + if self._config.dev_settings.debug_level != "info" + else "terminate", capacity_reservation_specification=self._launch_template_builder.get_capacity_reservation( queue, compute_resource,