Skip to content

Commit

Permalink
Merge pull request #892 from gthao313/ephemeral-cluster
Browse files Browse the repository at this point in the history
sonobuoy: re-set the assume role credentials if it expires
  • Loading branch information
gthao313 committed Mar 18, 2024
2 parents f1b58de + 2a84be3 commit c778fb6
Show file tree
Hide file tree
Showing 4 changed files with 66 additions and 9 deletions.
10 changes: 9 additions & 1 deletion bottlerocket/agents/src/bin/k8s-workload-agent/main.rs
Original file line number Diff line number Diff line change
Expand Up @@ -99,6 +99,7 @@ where
&self.config,
&self.results_dir,
info_client,
&self.aws_secret_name.as_ref(),
)
.await
}
Expand All @@ -115,7 +116,14 @@ where
.await?;
info!("Stored kubeconfig in {}", TEST_CLUSTER_KUBECONFIG_PATH);

rerun_failed_workload(TEST_CLUSTER_KUBECONFIG_PATH, &self.results_dir, info_client).await
rerun_failed_workload(
TEST_CLUSTER_KUBECONFIG_PATH,
&self.results_dir,
info_client,
&self.config,
&self.aws_secret_name.as_ref(),
)
.await
}

async fn terminate(&mut self) -> Result<(), Self::E> {
Expand Down
6 changes: 4 additions & 2 deletions bottlerocket/agents/src/bin/sonobuoy-test-agent/main.rs
Original file line number Diff line number Diff line change
Expand Up @@ -46,8 +46,8 @@ use test_agent::{
};
use testsys_model::{SecretName, TestResults};

// Default Sonobuoy agents assume role duration to 4 hours.
const DEFAULT_ASSUME_ROLE_SESSION_DURATION: i32 = 14400;
// Default assume-role session duration for Sonobuoy agents: 1 hour (in seconds).
const DEFAULT_ASSUME_ROLE_SESSION_DURATION: i32 = 3600;

struct SonobuoyTestRunner {
config: SonobuoyConfig,
Expand Down Expand Up @@ -111,6 +111,7 @@ where
&self.config,
&self.results_dir,
info_client,
&self.aws_secret_name.as_ref(),
)
.await
}
Expand Down Expand Up @@ -153,6 +154,7 @@ where
&self.config,
&self.results_dir,
info_client,
&self.aws_secret_name.as_ref(),
)
.await
}
Expand Down
36 changes: 33 additions & 3 deletions bottlerocket/agents/src/sonobuoy.rs
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
use crate::error;
use agent_utils::aws::aws_config;
use bottlerocket_types::agent_config::{SonobuoyConfig, SONOBUOY_RESULTS_FILENAME};
use log::{error, info, trace};
use serde_json::Value;
Expand All @@ -9,7 +10,7 @@ use std::path::Path;
use std::process::Command;
use std::time::Duration;
use test_agent::InfoClient;
use testsys_model::{Outcome, TestResults};
use testsys_model::{Outcome, SecretName, TestResults};

/// Timeout for sonobuoy status to become available (seconds)
const SONOBUOY_STATUS_TIMEOUT: u64 = 900;
Expand All @@ -22,6 +23,7 @@ pub async fn run_sonobuoy<I>(
sonobuoy_config: &SonobuoyConfig,
results_dir: &Path,
info_client: &I,
aws_secret_name: &Option<&SecretName>,
) -> Result<TestResults, error::Error>
where
I: InfoClient,
Expand Down Expand Up @@ -102,7 +104,14 @@ where
.await
.context(error::SonobuoyTimeoutSnafu)??;
info!("Sonobuoy status is available, waiting for test to complete");
wait_for_sonobuoy_results(kubeconfig_path, None, info_client).await?;
wait_for_sonobuoy_results(
kubeconfig_path,
None,
info_client,
&sonobuoy_config.assume_role,
aws_secret_name,
)
.await?;
info!("Sonobuoy testing has completed, checking results");

results_sonobuoy(kubeconfig_path, results_dir)
Expand All @@ -115,6 +124,7 @@ pub async fn rerun_failed_sonobuoy<I>(
sonobuoy_config: &SonobuoyConfig,
results_dir: &Path,
info_client: &I,
aws_secret_name: &Option<&SecretName>,
) -> Result<TestResults, error::Error>
where
I: InfoClient,
Expand Down Expand Up @@ -175,7 +185,14 @@ where
.await
.context(error::SonobuoyTimeoutSnafu)??;
info!("Sonobuoy status is available, waiting for test to complete");
wait_for_sonobuoy_results(kubeconfig_path, None, info_client).await?;
wait_for_sonobuoy_results(
kubeconfig_path,
None,
info_client,
&sonobuoy_config.assume_role,
aws_secret_name,
)
.await?;
info!("Sonobuoy testing has completed, checking results");

results_sonobuoy(kubeconfig_path, results_dir)
Expand Down Expand Up @@ -220,6 +237,8 @@ pub async fn wait_for_sonobuoy_results<I>(
kubeconfig_path: &str,
namespace: Option<&str>,
info_client: &I,
assume_role: &Option<String>,
aws_secret_name: &Option<&SecretName>,
) -> Result<(), error::Error>
where
I: InfoClient,
Expand All @@ -230,10 +249,20 @@ where
..Default::default()
};
let mut retries = 0;
// Assume-role credentials are valid for at most 3600 seconds. Refresh them every
// 50 loop iterations; each iteration sleeps 30 seconds, so a refresh happens after
// about 1500 seconds of sleeping, before the credentials expire.
let mut credential_refresh_countdown = 50;

loop {
if retries > 5 {
return Err(error::Error::SonobuoyStatus { retries });
}

// Refresh the credentials if the countdown is 0
if credential_refresh_countdown == 0 {
aws_config(aws_secret_name, assume_role, &None, &None, &None, true).await?;
credential_refresh_countdown = 50;
}

let kubeconfig_arg = vec!["--kubeconfig", kubeconfig_path];
let namespace_arg = namespace
.map(|namespace| vec!["--namespace", namespace])
Expand Down Expand Up @@ -300,6 +329,7 @@ where
.for_each(|e| error!("Unable to send test update: {}", e));

tokio::time::sleep(Duration::from_secs(30)).await;
credential_refresh_countdown -= 1;
}
}

Expand Down
23 changes: 20 additions & 3 deletions bottlerocket/agents/src/workload.rs
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ use std::path::{Path, PathBuf};
use std::process::Command;
use std::time::Duration;
use test_agent::InfoClient;
use testsys_model::TestResults;
use testsys_model::{SecretName, TestResults};

/// Timeout for sonobuoy status to become available (seconds)
const SONOBUOY_STATUS_TIMEOUT: u64 = 900;
Expand All @@ -24,6 +24,7 @@ pub async fn run_workload<I>(
workload_config: &WorkloadConfig,
results_dir: &Path,
info_client: &I,
aws_secret_name: &Option<&SecretName>,
) -> Result<TestResults, error::Error>
where
I: InfoClient,
Expand Down Expand Up @@ -92,7 +93,14 @@ where
.await
.context(error::SonobuoyTimeoutSnafu)??;
info!("Workload status is available, waiting for test to complete");
wait_for_sonobuoy_results(kubeconfig_path, Some("testsys-workload"), info_client).await?;
wait_for_sonobuoy_results(
kubeconfig_path,
Some("testsys-workload"),
info_client,
&workload_config.assume_role,
aws_secret_name,
)
.await?;
info!("Workload testing has completed, checking results");

results_workload(kubeconfig_path, results_dir)
Expand All @@ -103,6 +111,8 @@ pub async fn rerun_failed_workload<I>(
kubeconfig_path: &str,
results_dir: &Path,
info_client: &I,
workload_config: &WorkloadConfig,
aws_secret_name: &Option<&SecretName>,
) -> Result<TestResults, error::Error>
where
I: InfoClient,
Expand Down Expand Up @@ -138,7 +148,14 @@ where
.await
.context(error::SonobuoyTimeoutSnafu)??;
info!("Workload status is available, waiting for test to complete");
wait_for_sonobuoy_results(kubeconfig_path, Some("testsys-workload"), info_client).await?;
wait_for_sonobuoy_results(
kubeconfig_path,
Some("testsys-workload"),
info_client,
&workload_config.assume_role,
aws_secret_name,
)
.await?;
info!("Workload testing has completed, checking results");

results_workload(kubeconfig_path, results_dir)
Expand Down

0 comments on commit c778fb6

Please sign in to comment.