diff --git a/docs/run_test/runbook.rst b/docs/run_test/runbook.rst index d6d345a146..669548543c 100644 --- a/docs/run_test/runbook.rst +++ b/docs/run_test/runbook.rst @@ -66,6 +66,8 @@ Runbook Reference - `environment <#environment>`__ + - `retry <#retry>`__ + - `environments <#environments>`__ - `name <#name-4>`__ @@ -670,6 +672,11 @@ environment List of environments. For more information, refer to :ref:`write_test/concepts:node and environment`. +retry +^^^^^^^^^^^^ + +Number of retry attempts for failed deployments, default value is 0. + environments ^^^^^^^^^^^^ diff --git a/lisa/environment.py b/lisa/environment.py index cd6d578db2..b0e2865039 100644 --- a/lisa/environment.py +++ b/lisa/environment.py @@ -161,6 +161,7 @@ def __init__( self, is_predefined: bool, warn_as_error: bool, + retry: int, id_: int, runbook: schema.Environment, ) -> None: @@ -173,6 +174,7 @@ def __init__( self.is_new: bool = True self.id: str = str(id_) self.warn_as_error = warn_as_error + self.retry = retry self.platform: Optional[Platform] = None self.log = get_logger("env", self.name) self.source_test_result: Optional[TestResult] = None @@ -185,6 +187,9 @@ def __init__( self._raw_id = id_ self._retries: int = 0 + # Counter for the number of deployment attempts + # increments when deployment fails and retry > 0 + self.tried_times: int = 0 # cost uses to plan order of environments. # cheaper env can fit cases earlier to run more cases on it. # 1. smaller is higher priority, it can be index of candidate environment @@ -380,6 +385,7 @@ def get_guest_environment(self) -> "Environment": env = Environment( is_predefined=self.is_predefined, warn_as_error=self.warn_as_error, + retry=self.retry, id_=self._raw_id, runbook=runbook, ) @@ -462,9 +468,11 @@ class Environments(EnvironmentsDict): def __init__( self, warn_as_error: bool = False, + retry: int = 0, ) -> None: super().__init__() self.warn_as_error = warn_as_error + self.retry = retry def get_or_create(self, requirement: EnvironmentSpace) -> Optional[Environment]: result: Optional[Environment] = None @@ -507,6 +515,7 @@ def from_runbook( env = Environment( is_predefined=is_predefined_runbook, warn_as_error=self.warn_as_error, + retry=self.retry, id_=id_, runbook=copied_runbook, ) @@ -523,6 +532,7 @@ def load_environments( if root_runbook: environments = Environments( warn_as_error=root_runbook.warn_as_error, + retry=root_runbook.retry, ) environments_runbook = root_runbook.environments diff --git a/lisa/runner.py b/lisa/runner.py index d0e7e84a80..ec73708585 100644 --- a/lisa/runner.py +++ b/lisa/runner.py @@ -10,6 +10,7 @@ from lisa import messages, notifier, schema, transformer from lisa.action import Action from lisa.combinator import Combinator +from lisa.environment import Environment from lisa.messages import TestResultMessage, TestResultMessageBase, TestStatus from lisa.notifier import register_notifier from lisa.parameter_parser.runbook import RunbookBuilder @@ -190,6 +191,20 @@ def _reset_awaitable_timer(self, name: str) -> None: self._wait_resource_logged = False self._wait_resource_timers[name] = _wait_resource_timer + def _need_retry(self, environment: Environment) -> bool: + if environment.tried_times >= environment.retry: + if environment.retry > 0: + self._log.info( + f"Tried {environment.tried_times + 1} times, but failed again." + ) + return False + + environment.tried_times += 1 + self._log.info( + f"Retrying... (Attempt {environment.tried_times}/{environment.retry})" + ) + return True + class RootRunner(Action): """ diff --git a/lisa/runners/lisa_runner.py b/lisa/runners/lisa_runner.py index 45bf44c7e8..6de9e38c3e 100644 --- a/lisa/runners/lisa_runner.py +++ b/lisa/runners/lisa_runner.py @@ -292,6 +292,7 @@ def _deploy_environment_task( ) -> None: try: try: + # Attempt to deploy the environment self.platform.deploy_environment(environment) assert ( environment.status == EnvironmentStatus.Deployed @@ -308,12 +309,16 @@ def _deploy_environment_task( # rerun prepare to calculate resource again. environment.status = EnvironmentStatus.New except Exception as identifier: - self._attach_failed_environment_to_result( - environment=environment, - result=test_results[0], - exception=identifier, - ) - self._delete_environment_task(environment=environment, test_results=[]) + if self._need_retry(environment): + environment.status = EnvironmentStatus.New + else: + # Final attempt failed; handle the failure + self._attach_failed_environment_to_result( + environment=environment, + result=test_results[0], + exception=identifier, + ) + self._delete_environment_task(environment=environment, test_results=[]) def _initialize_environment_task( self, environment: Environment, test_results: List[TestResult] diff --git a/lisa/schema.py b/lisa/schema.py index c23b2a08d2..647cd7d1b3 100644 --- a/lisa/schema.py +++ b/lisa/schema.py @@ -1320,6 +1320,13 @@ def reload_requirements(self) -> None: class EnvironmentRoot: warn_as_error: bool = field(default=False) environments: List[Environment] = field(default_factory=list) + # Number of retry attempts for failed deployments (min=0) + retry: int = field( + default=0, + metadata=field_metadata( + field_function=fields.Int, validate=validate.Range(min=0) + ), + ) @dataclass_json() diff --git a/selftests/azure/test_prepare.py b/selftests/azure/test_prepare.py index 00c524a0bf..dcb11e5f9d 100644 --- a/selftests/azure/test_prepare.py +++ b/selftests/azure/test_prepare.py @@ -368,6 +368,7 @@ def verify_eligible_vm_size( def load_environment( self, node_req_count: int = 2, + retry: int = 0, ) -> Environment: runbook = schema.Environment() if node_req_count > 0: @@ -377,7 +378,11 @@ def load_environment( _ = node_req.get_extended_runbook(common.AzureNodeSchema, AZURE) runbook._original_nodes_requirement.append(node_req) environment = Environment( - is_predefined=True, warn_as_error=False, id_=0, runbook=runbook + is_predefined=True, + warn_as_error=False, + id_=0, + runbook=runbook, + retry=retry, ) return environment diff --git a/selftests/runners/test_lisa_runner.py b/selftests/runners/test_lisa_runner.py index 640d002406..3492edce57 100644 --- a/selftests/runners/test_lisa_runner.py +++ b/selftests/runners/test_lisa_runner.py @@ -616,6 +616,51 @@ def test_env_skipped_no_case(self) -> None: test_results=test_results, ) + def test_env_retry_on_deployment_fail(self) -> None: + # retry when fail to deploy + platform_schema = test_platform.MockPlatformSchema(deploy_success=False) + test_testsuite.generate_cases_metadata() + env_runbook = generate_env_runbook() + env_runbook.retry = 2 + runner = generate_runner(env_runbook, platform_schema=platform_schema) + test_result_messages = self._run_all_tests(runner) + deployment_failure = "deployment failed. LisaException: mock deploy failed" + self.verify_env_results( + expected_prepared=[ + "generated_0", + "generated_1", + "generated_2", + "generated_0", + "generated_0", + "generated_1", + "generated_1", + "generated_2", + "generated_2", + ], + expected_deployed_envs=[], + expected_deleted_envs=[ + "generated_0", + "generated_1", + "generated_2", + ], + runner=runner, + ) + self.verify_test_results( + expected_test_order=["mock_ut1", "mock_ut2", "mock_ut3"], + expected_envs=["generated_0", "generated_1", "generated_2"], + expected_status=[ + TestStatus.FAILED, + TestStatus.FAILED, + TestStatus.FAILED, + ], + expected_message=[ + deployment_failure, + deployment_failure, + deployment_failure, + ], + test_results=test_result_messages, + ) + def verify_test_results( self, expected_test_order: List[str], diff --git a/selftests/test_environment.py b/selftests/test_environment.py index 46590d8f1f..ef8691e6dc 100644 --- a/selftests/test_environment.py +++ b/selftests/test_environment.py @@ -198,6 +198,7 @@ def test_load_empty_runbook(self) -> None: envs = load_environments(None) self.assertEqual(0, len(envs)) self.assertEqual(False, envs.warn_as_error) + self.assertEqual(0, envs.retry) def test_create_from_runbook_split(self) -> None: runbook = generate_runbook(local=True, remote=True)