Skip to content

Commit

Permalink
Add new DebugLevel DevSetting parameter
Browse files Browse the repository at this point in the history
This new parameter makes it possible to:
* Change the cinc-client `log_level`
* Avoid terminating compute nodes if there are bootstrap issues

The parameter value must be a valid cinc-client `log_level`:
debug, error, fatal, info, trace, or warn.

### References
* https://docs.chef.io/debug/

Signed-off-by: Enrico Usai <[email protected]>
  • Loading branch information
enrico-usai committed Aug 7, 2023
1 parent badbe95 commit 95371b7
Show file tree
Hide file tree
Showing 6 changed files with 28 additions and 14 deletions.
2 changes: 2 additions & 0 deletions cli/src/pcluster/config/cluster_config.py
Original file line number Diff line number Diff line change
Expand Up @@ -1121,6 +1121,7 @@ def __init__(
instance_types_data: str = None,
timeouts: Timeouts = None,
compute_startup_time_metric_enabled: bool = None,
debug_level: str = None,
**kwargs,
):
super().__init__(**kwargs)
Expand All @@ -1131,6 +1132,7 @@ def __init__(
self.compute_startup_time_metric_enabled = Resource.init_param(
compute_startup_time_metric_enabled, default=False
)
self.debug_level = Resource.init_param(debug_level, default="info")

def _register_validators(self, context: ValidatorContext = None):
super()._register_validators(context)
Expand Down
22 changes: 13 additions & 9 deletions cli/src/pcluster/resources/compute_node/user_data.sh
Original file line number Diff line number Diff line change
Expand Up @@ -104,7 +104,8 @@ write_files:
"head_node_private_ip": "${HeadNodePrivateIp}",
"directory_service": {
"enabled": "${DirectoryServiceEnabled}"
}
},
"debug_level": "${DebugLevel}"
}
}
- path: /etc/chef/client.rb
Expand All @@ -125,11 +126,14 @@ write_files:
function error_exit
{
echo "Bootstrap failed with error: $1"
# wait logs flush before signaling the failure
sleep 10
# TODO: add possibility to override this behavior and keep the instance for debugging
shutdown -h now
exit 1
if [ "${DebugLevel}" != "info" ]; then
echo "Skipping termination because debug_level is set to ${DebugLevel}"
else
# wait logs flush before signaling the failure
sleep 10
shutdown -h now
exit 1
fi
}
function vendor_cookbook
{
Expand Down Expand Up @@ -214,11 +218,11 @@ write_files:
jq --argfile f1 /tmp/dna.json --argfile f2 /tmp/extra.json -n '$f1 * $f2' > /etc/chef/dna.json || ( echo "jq not installed or invalid extra_json"; cp /tmp/dna.json /etc/chef/dna.json)
{
pushd /etc/chef &&
cinc-client --local-mode --config /etc/chef/client.rb --log_level info --force-formatter --no-color --chef-zero-port 8889 --json-attributes /etc/chef/dna.json --override-runlist aws-parallelcluster-entrypoints::init &&
cinc-client --local-mode --config /etc/chef/client.rb --log_level ${DebugLevel} --force-formatter --no-color --chef-zero-port 8889 --json-attributes /etc/chef/dna.json --override-runlist aws-parallelcluster-entrypoints::init &&
/opt/parallelcluster/scripts/fetch_and_run -preinstall &&
cinc-client --local-mode --config /etc/chef/client.rb --log_level info --force-formatter --no-color --chef-zero-port 8889 --json-attributes /etc/chef/dna.json --override-runlist aws-parallelcluster-entrypoints::config &&
cinc-client --local-mode --config /etc/chef/client.rb --log_level ${DebugLevel} --force-formatter --no-color --chef-zero-port 8889 --json-attributes /etc/chef/dna.json --override-runlist aws-parallelcluster-entrypoints::config &&
/opt/parallelcluster/scripts/fetch_and_run -postinstall &&
cinc-client --local-mode --config /etc/chef/client.rb --log_level info --force-formatter --no-color --chef-zero-port 8889 --json-attributes /etc/chef/dna.json --override-runlist aws-parallelcluster-entrypoints::finalize &&
cinc-client --local-mode --config /etc/chef/client.rb --log_level ${DebugLevel} --force-formatter --no-color --chef-zero-port 8889 --json-attributes /etc/chef/dna.json --override-runlist aws-parallelcluster-entrypoints::finalize &&
popd
} || error_exit 'Failed to run bootstrap recipes. If --norollback was specified, check /var/log/cfn-init.log and /var/log/cloud-init-output.log.'

Expand Down
1 change: 1 addition & 0 deletions cli/src/pcluster/schemas/cluster_schema.py
Original file line number Diff line number Diff line change
Expand Up @@ -1070,6 +1070,7 @@ class ClusterDevSettingsSchema(BaseDevSettingsSchema):
instance_types_data = fields.Str(metadata={"update_policy": UpdatePolicy.SUPPORTED})
timeouts = fields.Nested(TimeoutsSchema, metadata={"update_policy": UpdatePolicy.SUPPORTED})
compute_startup_time_metric_enabled = fields.Bool(metadata={"update_policy": UpdatePolicy.SUPPORTED})
debug_level = fields.Str(metadata={"update_policy": UpdatePolicy.SUPPORTED})

@post_load
def make_resource(self, data, **kwargs):
Expand Down
1 change: 1 addition & 0 deletions cli/src/pcluster/templates/cdk_builder_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -87,6 +87,7 @@ def get_common_user_data_env(node: Union[HeadNode, SlurmQueue, LoginNodesPool],
"CookbookVersion": COOKBOOK_PACKAGES_VERSIONS["cookbook"],
"ChefVersion": COOKBOOK_PACKAGES_VERSIONS["chef"],
"BerkshelfVersion": COOKBOOK_PACKAGES_VERSIONS["berkshelf"],
"DebugLevel": config.dev_settings.debug_level,
}


Expand Down
12 changes: 8 additions & 4 deletions cli/src/pcluster/templates/cluster_stack.py
Original file line number Diff line number Diff line change
Expand Up @@ -1362,7 +1362,8 @@ def _add_head_node(self):
"commands": {
"chef": {
"command": (
"cinc-client --local-mode --config /etc/chef/client.rb --log_level info "
"cinc-client --local-mode --config /etc/chef/client.rb "
f"--log_level {self.config.dev_settings.debug_level} "
"--logfile /var/log/chef-client.log --force-formatter --no-color "
"--chef-zero-port 8889 --json-attributes /etc/chef/dna.json "
"--override-runlist aws-parallelcluster-entrypoints::init"
Expand All @@ -1378,7 +1379,8 @@ def _add_head_node(self):
"commands": {
"chef": {
"command": (
"cinc-client --local-mode --config /etc/chef/client.rb --log_level info "
"cinc-client --local-mode --config /etc/chef/client.rb "
f"--log_level {self.config.dev_settings.debug_level} "
"--logfile /var/log/chef-client.log --force-formatter --no-color "
"--chef-zero-port 8889 --json-attributes /etc/chef/dna.json "
"--override-runlist aws-parallelcluster-entrypoints::config"
Expand All @@ -1394,7 +1396,8 @@ def _add_head_node(self):
"commands": {
"chef": {
"command": (
"cinc-client --local-mode --config /etc/chef/client.rb --log_level info "
"cinc-client --local-mode --config /etc/chef/client.rb "
f"--log_level {self.config.dev_settings.debug_level} "
"--logfile /var/log/chef-client.log --force-formatter --no-color "
"--chef-zero-port 8889 --json-attributes /etc/chef/dna.json "
"--override-runlist aws-parallelcluster-entrypoints::finalize"
Expand All @@ -1414,7 +1417,8 @@ def _add_head_node(self):
"chef": {
"command": (
". /etc/profile.d/pcluster.sh; "
"cinc-client --local-mode --config /etc/chef/client.rb --log_level info"
"cinc-client --local-mode --config /etc/chef/client.rb "
f"--log_level {self.config.dev_settings.debug_level}"
" --logfile /var/log/chef-client.log --force-formatter --no-color"
" --chef-zero-port 8889 --json-attributes /etc/chef/dna.json"
" --override-runlist aws-parallelcluster-entrypoints::update &&"
Expand Down
4 changes: 3 additions & 1 deletion cli/src/pcluster/templates/queues_stack.py
Original file line number Diff line number Diff line change
Expand Up @@ -198,7 +198,9 @@ def _add_compute_resource_launch_template(
instance_market_options=self._launch_template_builder.get_instance_market_options(
queue, compute_resource
),
instance_initiated_shutdown_behavior="terminate",
instance_initiated_shutdown_behavior="stop"
if self._config.dev_settings.debug_level != "info"
else "terminate",
capacity_reservation_specification=self._launch_template_builder.get_capacity_reservation(
queue,
compute_resource,
Expand Down

0 comments on commit 95371b7

Please sign in to comment.