Skip to content

Commit

Permalink
add retry_on_deployment
Browse files Browse the repository at this point in the history
  • Loading branch information
LiliDeng committed Sep 14, 2024
1 parent a1358de commit 0f557ab
Show file tree
Hide file tree
Showing 5 changed files with 48 additions and 7 deletions.
7 changes: 7 additions & 0 deletions lisa/environment.py
Original file line number Diff line number Diff line change
Expand Up @@ -161,6 +161,7 @@ def __init__(
self,
is_predefined: bool,
warn_as_error: bool,
retry: int,
id_: int,
runbook: schema.Environment,
) -> None:
Expand All @@ -173,6 +174,7 @@ def __init__(
self.is_new: bool = True
self.id: str = str(id_)
self.warn_as_error = warn_as_error
self.retry = retry
self.platform: Optional[Platform] = None
self.log = get_logger("env", self.name)
self.source_test_result: Optional[TestResult] = None
Expand Down Expand Up @@ -380,6 +382,7 @@ def get_guest_environment(self) -> "Environment":
env = Environment(
is_predefined=self.is_predefined,
warn_as_error=self.warn_as_error,
retry=self.retry,
id_=self._raw_id,
runbook=runbook,
)
Expand Down Expand Up @@ -462,9 +465,11 @@ class Environments(EnvironmentsDict):
def __init__(self, warn_as_error: bool = False, retry: int = 0) -> None:
    """Create an empty environment collection.

    Args:
        warn_as_error: Flag stored on the collection and handed to each
            Environment it constructs.
        retry: Retry count stored on the collection and handed to each
            Environment it constructs.
    """
    super().__init__()
    # Collection-wide settings; kept as plain attributes so the factory
    # methods can forward them unchanged.
    self.retry = retry
    self.warn_as_error = warn_as_error

def get_or_create(self, requirement: EnvironmentSpace) -> Optional[Environment]:
result: Optional[Environment] = None
Expand Down Expand Up @@ -507,6 +512,7 @@ def from_runbook(
env = Environment(
is_predefined=is_predefined_runbook,
warn_as_error=self.warn_as_error,
retry=self.retry,
id_=id_,
runbook=copied_runbook,
)
Expand All @@ -523,6 +529,7 @@ def load_environments(
if root_runbook:
environments = Environments(
warn_as_error=root_runbook.warn_as_error,
retry=root_runbook.retry,
)

environments_runbook = root_runbook.environments
Expand Down
18 changes: 18 additions & 0 deletions lisa/runner.py
Original file line number Diff line number Diff line change
Expand Up @@ -126,6 +126,9 @@ def __init__(
self._wait_resource_timers: Dict[str, Timer] = dict()
self._wait_resource_logged: bool = False

self._max_retries = runbook.environment.retry if runbook.environment else 0
self._current_attempt = 0

self.canceled = False

def __repr__(self) -> str:
Expand Down Expand Up @@ -190,6 +193,21 @@ def _reset_awaitable_timer(self, name: str) -> None:
self._wait_resource_logged = False
self._wait_resource_timers[name] = _wait_resource_timer

def _check_retry_limit(self) -> bool:
    """Decide whether another retry attempt is allowed.

    When a retry is granted, the attempt counter is incremented and the
    attempt number is logged. When the limit has been reached, that is
    logged instead.

    Returns:
        True if a retry may proceed; False when retries are disabled
        (``_max_retries`` is not positive) or already used up.
    """
    limit = self._max_retries

    # Retries are opt-in: a non-positive limit means never retry, silently.
    if limit <= 0:
        return False

    if self._current_attempt >= limit:
        self._log.info(
            f"Retry limit exceeded after {self._current_attempt} attempts."
        )
        return False

    # NOTE(review): the counter is only ever incremented here and is not
    # reset by this method - confirm whether callers expect a per-runner
    # budget rather than a per-environment one.
    self._current_attempt += 1
    self._log.info(f"Retrying... (Attempt {self._current_attempt}/{limit})")
    return True


class RootRunner(Action):
"""
Expand Down
17 changes: 11 additions & 6 deletions lisa/runners/lisa_runner.py
Original file line number Diff line number Diff line change
Expand Up @@ -292,6 +292,7 @@ def _deploy_environment_task(
) -> None:
try:
try:
# Attempt to deploy the environment
self.platform.deploy_environment(environment)
assert (
environment.status == EnvironmentStatus.Deployed
Expand All @@ -308,12 +309,16 @@ def _deploy_environment_task(
# rerun prepare to calculate resource again.
environment.status = EnvironmentStatus.New
except Exception as identifier:
self._attach_failed_environment_to_result(
environment=environment,
result=test_results[0],
exception=identifier,
)
self._delete_environment_task(environment=environment, test_results=[])
if self._check_retry_limit():
environment.status = EnvironmentStatus.New
else:
# Final attempt failed; handle the failure
self._attach_failed_environment_to_result(
environment=environment,
result=test_results[0],
exception=identifier,
)
self._delete_environment_task(environment=environment, test_results=[])

def _initialize_environment_task(
self, environment: Environment, test_results: List[TestResult]
Expand Down
7 changes: 7 additions & 0 deletions lisa/schema.py
Original file line number Diff line number Diff line change
Expand Up @@ -1320,6 +1320,13 @@ def reload_requirements(self) -> None:
class EnvironmentRoot:
warn_as_error: bool = field(default=False)
environments: List[Environment] = field(default_factory=list)
# Number of retry attempts for failed deployments (min=0)
retry: int = field(
default=0,
metadata=field_metadata(
field_function=fields.Int, validate=validate.Range(min=0)
),
)


@dataclass_json()
Expand Down
6 changes: 5 additions & 1 deletion selftests/azure/test_prepare.py
Original file line number Diff line number Diff line change
Expand Up @@ -377,7 +377,11 @@ def load_environment(
_ = node_req.get_extended_runbook(common.AzureNodeSchema, AZURE)
runbook._original_nodes_requirement.append(node_req)
environment = Environment(
is_predefined=True, warn_as_error=False, id_=0, runbook=runbook
is_predefined=True,
warn_as_error=False,
id_=0,
runbook=runbook,
retry=0,
)

return environment
Expand Down

0 comments on commit 0f557ab

Please sign in to comment.