Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

add retry_on_deployment #3417

Merged
merged 3 commits into from
Sep 19, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 7 additions & 0 deletions docs/run_test/runbook.rst
Original file line number Diff line number Diff line change
Expand Up @@ -66,6 +66,8 @@ Runbook Reference

- `environment <#environment>`__

- `retry <#retry>`__

- `environments <#environments>`__

- `name <#name-4>`__
Expand Down Expand Up @@ -670,6 +672,11 @@ environment
List of environments. For more information, refer to
:ref:`write_test/concepts:node and environment`.

retry
^^^^^^^^^^^^

Number of retry attempts for failed deployments, default value is 0.

environments
^^^^^^^^^^^^

Expand Down
10 changes: 10 additions & 0 deletions lisa/environment.py
Original file line number Diff line number Diff line change
Expand Up @@ -161,6 +161,7 @@ def __init__(
self,
is_predefined: bool,
warn_as_error: bool,
retry: int,
id_: int,
runbook: schema.Environment,
) -> None:
Expand All @@ -173,6 +174,7 @@ def __init__(
self.is_new: bool = True
self.id: str = str(id_)
self.warn_as_error = warn_as_error
self.retry = retry
self.platform: Optional[Platform] = None
self.log = get_logger("env", self.name)
self.source_test_result: Optional[TestResult] = None
Expand All @@ -185,6 +187,9 @@ def __init__(
self._raw_id = id_
self._retries: int = 0

# Counter for the number of deployment attempts
# increments when deployment fails and retry > 0
self.tried_times: int = 0
# cost uses to plan order of environments.
# cheaper env can fit cases earlier to run more cases on it.
# 1. smaller is higher priority, it can be index of candidate environment
Expand Down Expand Up @@ -380,6 +385,7 @@ def get_guest_environment(self) -> "Environment":
env = Environment(
is_predefined=self.is_predefined,
warn_as_error=self.warn_as_error,
retry=self.retry,
id_=self._raw_id,
runbook=runbook,
)
Expand Down Expand Up @@ -462,9 +468,11 @@ class Environments(EnvironmentsDict):
def __init__(
self,
warn_as_error: bool = False,
retry: int = 0,
) -> None:
super().__init__()
self.warn_as_error = warn_as_error
self.retry = retry

def get_or_create(self, requirement: EnvironmentSpace) -> Optional[Environment]:
result: Optional[Environment] = None
Expand Down Expand Up @@ -507,6 +515,7 @@ def from_runbook(
env = Environment(
is_predefined=is_predefined_runbook,
warn_as_error=self.warn_as_error,
retry=self.retry,
id_=id_,
runbook=copied_runbook,
)
Expand All @@ -523,6 +532,7 @@ def load_environments(
if root_runbook:
environments = Environments(
warn_as_error=root_runbook.warn_as_error,
retry=root_runbook.retry,
)

environments_runbook = root_runbook.environments
Expand Down
15 changes: 15 additions & 0 deletions lisa/runner.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
from lisa import messages, notifier, schema, transformer
from lisa.action import Action
from lisa.combinator import Combinator
from lisa.environment import Environment
from lisa.messages import TestResultMessage, TestResultMessageBase, TestStatus
from lisa.notifier import register_notifier
from lisa.parameter_parser.runbook import RunbookBuilder
Expand Down Expand Up @@ -190,6 +191,20 @@ def _reset_awaitable_timer(self, name: str) -> None:
self._wait_resource_logged = False
self._wait_resource_timers[name] = _wait_resource_timer

def _need_retry(self, environment: Environment) -> bool:
squirrelsc marked this conversation as resolved.
Show resolved Hide resolved
if environment.tried_times >= environment.retry:
if environment.retry > 0:
self._log.info(
f"Tried {environment.tried_times + 1} times, but failed again."
)
return False

environment.tried_times += 1
self._log.info(
f"Retrying... (Attempt {environment.tried_times}/{environment.retry})"
)
return True


class RootRunner(Action):
"""
Expand Down
17 changes: 11 additions & 6 deletions lisa/runners/lisa_runner.py
Original file line number Diff line number Diff line change
Expand Up @@ -292,6 +292,7 @@ def _deploy_environment_task(
) -> None:
try:
try:
# Attempt to deploy the environment
self.platform.deploy_environment(environment)
assert (
environment.status == EnvironmentStatus.Deployed
Expand All @@ -308,12 +309,16 @@ def _deploy_environment_task(
# rerun prepare to calculate resource again.
environment.status = EnvironmentStatus.New
except Exception as identifier:
self._attach_failed_environment_to_result(
environment=environment,
result=test_results[0],
exception=identifier,
)
self._delete_environment_task(environment=environment, test_results=[])
if self._need_retry(environment):
environment.status = EnvironmentStatus.New
else:
# Final attempt failed; handle the failure
self._attach_failed_environment_to_result(
environment=environment,
result=test_results[0],
exception=identifier,
)
self._delete_environment_task(environment=environment, test_results=[])

def _initialize_environment_task(
self, environment: Environment, test_results: List[TestResult]
Expand Down
7 changes: 7 additions & 0 deletions lisa/schema.py
Original file line number Diff line number Diff line change
Expand Up @@ -1320,6 +1320,13 @@ def reload_requirements(self) -> None:
class EnvironmentRoot:
warn_as_error: bool = field(default=False)
environments: List[Environment] = field(default_factory=list)
# Number of retry attempts for failed deployments (min=0)
retry: int = field(
default=0,
metadata=field_metadata(
field_function=fields.Int, validate=validate.Range(min=0)
),
)


@dataclass_json()
Expand Down
7 changes: 6 additions & 1 deletion selftests/azure/test_prepare.py
Original file line number Diff line number Diff line change
Expand Up @@ -368,6 +368,7 @@ def verify_eligible_vm_size(
def load_environment(
self,
node_req_count: int = 2,
retry: int = 0,
) -> Environment:
runbook = schema.Environment()
if node_req_count > 0:
Expand All @@ -377,7 +378,11 @@ def load_environment(
_ = node_req.get_extended_runbook(common.AzureNodeSchema, AZURE)
runbook._original_nodes_requirement.append(node_req)
environment = Environment(
is_predefined=True, warn_as_error=False, id_=0, runbook=runbook
is_predefined=True,
warn_as_error=False,
id_=0,
runbook=runbook,
retry=retry,
)

return environment
Expand Down
45 changes: 45 additions & 0 deletions selftests/runners/test_lisa_runner.py
Original file line number Diff line number Diff line change
Expand Up @@ -616,6 +616,51 @@ def test_env_skipped_no_case(self) -> None:
test_results=test_results,
)

def test_env_retry_on_deployment_fail(self) -> None:
# retry when fail to deploy
platform_schema = test_platform.MockPlatformSchema(deploy_success=False)
test_testsuite.generate_cases_metadata()
env_runbook = generate_env_runbook()
env_runbook.retry = 2
runner = generate_runner(env_runbook, platform_schema=platform_schema)
test_result_messages = self._run_all_tests(runner)
deployment_failure = "deployment failed. LisaException: mock deploy failed"
self.verify_env_results(
expected_prepared=[
"generated_0",
"generated_1",
"generated_2",
"generated_0",
"generated_0",
"generated_1",
"generated_1",
"generated_2",
"generated_2",
],
expected_deployed_envs=[],
expected_deleted_envs=[
"generated_0",
"generated_1",
"generated_2",
],
runner=runner,
)
self.verify_test_results(
expected_test_order=["mock_ut1", "mock_ut2", "mock_ut3"],
expected_envs=["generated_0", "generated_1", "generated_2"],
expected_status=[
TestStatus.FAILED,
TestStatus.FAILED,
TestStatus.FAILED,
],
expected_message=[
deployment_failure,
deployment_failure,
deployment_failure,
],
test_results=test_result_messages,
)

def verify_test_results(
self,
expected_test_order: List[str],
Expand Down
1 change: 1 addition & 0 deletions selftests/test_environment.py
Original file line number Diff line number Diff line change
Expand Up @@ -198,6 +198,7 @@ def test_load_empty_runbook(self) -> None:
envs = load_environments(None)
self.assertEqual(0, len(envs))
self.assertEqual(False, envs.warn_as_error)
self.assertEqual(0, envs.retry)

def test_create_from_runbook_split(self) -> None:
runbook = generate_runbook(local=True, remote=True)
Expand Down
Loading